FD.io VPP v18.07.1-11-g31aa6f2 (Vector Packet Processing)
flowhash_template.h
/*
 * Copyright (c) 2012 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * Author: Pierre Pfister <ppfister@cisco.com>
 *
 * DISCLAIMER!
 *
 * This most likely is not the hash table you are looking for!!
 *
 * This structure targets a very specific and quite narrow set of use-cases
 * that are not covered by other hash tables.
 *
 * Read the following text carefully, or ask the author or one of VPP's
 * committers to make sure this is what you are looking for.
 *
 *
 * -- Abstract:
 * This hash table intends to provide very fast lookup and insertion of
 * key-value pairs for flow tables (although it might be used for other
 * purposes), with additional support for lazy timeouts.
 * In particular, it was designed to minimize blocking reads, register usage
 * and cache-line accesses during a typical lookup.
 * This hash table therefore provides stateful packet processing
 * without performance degradation even when every single lookup has to fetch
 * memory from RAM.
 * This hash table is not thread-safe and requires executing a garbage
 * collection function to clean up chained buckets.
 *
 * -- Overview:
 *
 * A first aspect of this hash table is that it is self-contained in a single
 * chunk of memory. Each entry contains a key, a value, and a 32-bit timeout
 * value; occupies a full and single cache line; and is identified by a unique
 * 32-bit index. The entry index zero is reserved and used when an entry
 * could not be found nor inserted, which means it is not necessary to
 * immediately check whether an insertion or lookup was successful before
 * behaving accordingly. One can just keep doing business as usual and
 * check for the error later.
 *
 * Each entry is associated with a timeout value (whose unit or clock is up to
 * the user of the hash table). An entry whose timeout is strictly smaller
 * than the current time is considered empty, whereas an entry whose timeout
 * is greater than or equal to the current time contains a valid key-value
 * pair.
 *
 * Hash table lookup and insertion are equivalent:
 * - An entry index is always returned (possibly index 0 if no entry could be
 *   found nor created).
 * - The returned entry always has its key set to the provided key.
 * - The timeout value will be greater than or equal to the provided current
 *   time whenever a valid entry was found, and strictly smaller otherwise. In
 *   particular, one cannot differentiate between an entry which was just
 *   created and an entry which already existed in the past but timed out in
 *   between.
 *
 * As mentioned earlier, entry index zero is used as an invalid entry which may
 * be manipulated like a normal one. Entries whose indices go from 1 to
 * N (where N is a power of 2) are used as direct buckets, each containing a
 * single entry. In the absence of a hash collision, a single entry, whose
 * location can be deterministically derived from the key hash and the hash
 * table header, is accessed (one single cache line, without indirection). This
 * allows for efficient pre-fetching of the key-value for more than 95% of
 * accesses.
 *
 * In order to handle hash collisions (i.e. when multiple keys
 * end up in the same bucket), entries whose index is greater than N are
 * grouped into M groups of 16 collision entries. Such groups are linked
 * with regular entries whenever a collision needs to be handled.
 * When looking up a key in a bucket where a collision occurred, unused bits
 * from the key hash are used to select two entries (from the collision bucket)
 * where the new entry might be inserted.
 *
 * Once an entry is inserted, it will never be moved as long as the entry
 * timeout value remains greater than or equal to the provided current time
 * value. The entry index can therefore be stored in other data structures as
 * a way to bypass the hash lookup. But when doing so, one should check whether
 * the stored key is indeed the looked-up key.
 *
 * -- Garbage Collection:
 *
 * Since there is no explicit element removal, a garbage collector mechanism
 * is required in order to remove buckets used for hash collisions. This
 * is done by calling the flowhash_gc function on a regular basis. Each call
 * to this function examines a single fixed entry. It shall therefore be called
 * as many times as there are fixed entries in the hash table in order to
 * ensure a full inspection.
 *
 * -- Time and timeout mechanism:
 *
 * The hash table makes use of a time value in [1, 2^32 - 1].
 * The provided time value shall keep increasing; wrapping around is not
 * handled. When seconds are used, the system can run for 136 years without
 * any issue. If milliseconds are used, a shift should be applied to all
 * timeout values on a regular basis (more often than every 49 days).
 */
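/*
 * -- Usage sketch (illustrative only, not part of this header):
 *
 * The snippet below assumes a hypothetical instantiation suffixed "_example"
 * (i.e. FLOWHASH_TYPE was defined to _example before including this file) and
 * shows the typical per-packet flow described above: hash the key, prefetch
 * the bucket, perform the combined lookup/insertion, then check for overflow
 * and refresh the timeout. Names such as lookup_one are made up for the sake
 * of the example.
 *
 *   u32 lookup_one (flowhash_example_t *h, flowhash_lkey_example_t *key,
 *                   u32 time_now)
 *   {
 *     u32 ei;
 *     u32 hash = flowhash_hash_example (key);
 *     flowhash_prefetch (h, hash);       // ideally issued well before the get
 *     // ... other per-packet work can happen here ...
 *     flowhash_get_example (h, key, hash, time_now, &ei);
 *
 *     if (flowhash_is_overflow (ei))
 *       return ~0;                       // no entry could be found or created
 *
 *     if (flowhash_is_timeouted (h, ei, time_now))
 *       {
 *         // Newly proposed entry: initialize its value here.
 *       }
 *
 *     // Keep (or make) the entry valid for 10 more time units.
 *     flowhash_timeout (h, ei) = time_now + 10;
 *     return ei;
 *   }
 */
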

#ifndef __included_flowhash_template_h__
#define __included_flowhash_template_h__

#include <vppinfra/clib.h>
#include <vppinfra/mem.h>
#include <vppinfra/cache.h>

#ifndef FLOWHASH_TYPE
#error FLOWHASH_TYPE not defined
#endif

#define _fv(a,b) a##b
#define __fv(a,b) _fv(a,b)
#define FV(a) __fv(a,FLOWHASH_TYPE)

#define _fvt(a,b) a##b##_t
#define __fvt(a,b) _fvt(a,b)
#define FVT(a) __fvt(a,FLOWHASH_TYPE)

/* Same for all flowhash variants */
#ifndef __included_flowhash_common__

#define FLOWHASH_INVALID_ENTRY_INDEX 0

#define FLOWHASH_ENTRIES_PER_BUCKETS_LOG 4
#define FLOWHASH_ENTRIES_PER_BUCKETS (1 << FLOWHASH_ENTRIES_PER_BUCKETS_LOG)

#endif /* ifndef __included_flowhash_common__ */

/**
 * @brief Compare a stored key with a lookup key.
 *
 * This function must be defined to use this template. It must return 0
 * when the two keys are identical, and a different value otherwise.
 */
static_always_inline
u8 FV(flowhash_cmp_key)(FVT(flowhash_skey) *a, FVT(flowhash_lkey) *b);

/**
 * @brief Hash a lookup key into a 32 bit integer.
 *
 * This function must be defined to use this template.
 * It must provide close to 32 bits of entropy distributed amongst
 * all 32 bits of the provided value.
 * Keys that are equal must have the same hash.
 */
static_always_inline
u32 FV(flowhash_hash)(FVT(flowhash_lkey) *k);

/**
 * @brief Copy a lookup key into a destination stored key.
 *
 * This function must be defined to use this template. It must modify the dst
 * key such that a later call to flowhash_cmp_key with the same arguments
 * would return 0.
 */
static_always_inline
void FV(flowhash_cpy_key)(FVT(flowhash_skey) *dst, FVT(flowhash_lkey) *src);
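
/*
 * -- Instantiation sketch (illustrative only):
 *
 * A minimal, hypothetical instantiation named "_example" could look like the
 * following, typically placed in its own header. The stored key, lookup key
 * and value types must be defined before this template is included; the three
 * functions declared above are then provided by the instantiating header.
 * clib_xxhash (from <vppinfra/xxhash.h>) is just one possible hash function.
 *
 *   typedef struct { u64 a; u64 b; } flowhash_skey_example_t;
 *   typedef struct { u64 a; u64 b; } flowhash_lkey_example_t;
 *   typedef struct { u64 opaque; }   flowhash_value_example_t;
 *
 *   #define FLOWHASH_TYPE _example
 *   #include <vppinfra/flowhash_template.h>
 *   #undef FLOWHASH_TYPE
 *
 *   static_always_inline
 *   u32 flowhash_hash_example (flowhash_lkey_example_t *k)
 *   {
 *     return (u32) clib_xxhash (k->a ^ k->b);
 *   }
 *
 *   static_always_inline
 *   u8 flowhash_cmp_key_example (flowhash_skey_example_t *a,
 *                                flowhash_lkey_example_t *b)
 *   {
 *     return (a->a != b->a) || (a->b != b->b);
 *   }
 *
 *   static_always_inline
 *   void flowhash_cpy_key_example (flowhash_skey_example_t *dst,
 *                                  flowhash_lkey_example_t *src)
 *   {
 *     dst->a = src->a;
 *     dst->b = src->b;
 *   }
 */
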

/**
 * @brief One flow hash entry used for both direct buckets and collision
 * buckets.
 */
typedef struct {
  /* Each entry is cache-line aligned. */
  CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);

  /* Key is first to take advantage of alignment. */
  FVT(flowhash_skey) key;

  /* Entry value. */
  FVT(flowhash_value) value;

  /* Timeout value. */
  u32 timeout;

  /* Entry index to the chained bucket. */
  u32 chained_entry_index;
} FVT(flowhash_entry);

typedef struct FVT(__flowhash_struct) {
  /* Cache aligned to simplify allocation. */
  CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);

  /* Array going downward containing free bucket indices */
  u32 free_buckets_indices[0];

  /* Negative index of the first free bucket */
  i32 free_buckets_position;

  /* Number of fixed buckets minus one */
  u32 fixed_entries_mask;

  /* Allocated pointer for this hash table */
  void *mem;

  u32 collision_buckets_mask;
  u32 total_entries;

  u64 not_enough_buckets_counter;
  u64 collision_lookup_counter;
  u64 garbage_collection_counter;

  u32 gc_counter;

  /* Entry array containing:
   *  - 1 Dummy entry for error return
   *  - (buckets_mask + 1) Fixed buckets
   *  - chained_buckets Chained Buckets
   */
  FVT(flowhash_entry) entries[0];
} FVT(flowhash);

/* Same for all flowhash variants */
#ifndef __included_flowhash_common__
#define __included_flowhash_common__

/**
 * @brief Test whether a returned entry index corresponds to an overflow event.
 */
#define flowhash_is_overflow(ei) \
    ((ei) == FLOWHASH_INVALID_ENTRY_INDEX)

/**
 * @brief Iterate over all entries in the hash table.
 *
 * Iterate over all entries in the hash table, not including the first invalid
 * entry (at index 0), but including all chained hash collision buckets.
 */
#define flowhash_foreach_entry(h, ei) \
    for (ei = 1; \
         ei < (h)->total_entries; \
         ei++)

/**
 * @brief Iterate over all currently valid entries.
 *
 * Iterate over all entries in the hash table whose timeout value is greater
 * than or equal to the current time.
 */
#define flowhash_foreach_valid_entry(h, ei, now) \
    flowhash_foreach_entry(h, ei) \
      if (((now) <= (h)->entries[ei].timeout))

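/*
 * For instance, counting the currently valid entries (which is exactly what
 * the flowhash_elts function below does) can be written as:
 *
 *   u32 ei, count = 0;
 *   flowhash_foreach_valid_entry (h, ei, time_now)
 *     count++;
 */
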
/**
 * @brief Access the timeout value of a given entry (usable as an lvalue).
 */
#define flowhash_timeout(h, ei) (h)->entries[ei].timeout

/**
 * @brief Test whether an entry has timed out (i.e. is not currently in use).
 */
#define flowhash_is_timeouted(h, ei, time_now) \
    ((time_now) > flowhash_timeout(h, ei))

/**
 * @brief Get a pointer to the stored key of a given entry.
 */
#define flowhash_key(h, ei) (&(h)->entries[ei].key)

/**
 * @brief Get a pointer to the value of a given entry.
 */
#define flowhash_value(h, ei) (&(h)->entries[ei].value)

/**
 * @brief Get the number of octets allocated to this structure.
 */
#define flowhash_memory_size(h) clib_mem_size((h)->mem)

/**
 * @brief Test whether the entry index is within hash table boundaries.
 */
#define flowhash_is_valid_entry_index(h, ei) (ei < (h)->total_entries)
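
/*
 * As noted in the overview, an entry index may be cached in another data
 * structure to bypass the hash lookup later. When doing so, the entry must be
 * re-validated, since it may have timed out and been reused by another flow
 * in the meantime. A hypothetical re-validation helper for the "_example"
 * instantiation could look like:
 *
 *   static_always_inline int
 *   flowhash_example_revalidate (flowhash_example_t *h, u32 ei,
 *                                flowhash_lkey_example_t *k, u32 time_now)
 *   {
 *     return flowhash_is_valid_entry_index (h, ei) &&
 *            !flowhash_is_timeouted (h, ei, time_now) &&
 *            flowhash_cmp_key_example (flowhash_key (h, ei), k) == 0;
 *   }
 */
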

/**
 * @brief Adjust, if necessary, the provided parameters so that they are valid
 * flowhash sizes.
 */
static
void flowhash_validate_sizes(u32 *fixed_entries, u32 *collision_buckets)
{
  /* Find power of two greater or equal to the provided value */
  if (*fixed_entries < FLOWHASH_ENTRIES_PER_BUCKETS)
    *fixed_entries = FLOWHASH_ENTRIES_PER_BUCKETS;
  if (*fixed_entries > (1 << (32 - FLOWHASH_ENTRIES_PER_BUCKETS_LOG)))
    *fixed_entries = (1 << (32 - FLOWHASH_ENTRIES_PER_BUCKETS_LOG));

  *fixed_entries -= 1;
  *fixed_entries |= *fixed_entries >> 16;
  *fixed_entries |= *fixed_entries >> 8;
  *fixed_entries |= *fixed_entries >> 4;
  *fixed_entries |= *fixed_entries >> 2;
  *fixed_entries |= *fixed_entries >> 1;
  *fixed_entries += 1;

  if (*collision_buckets != 0)
    {
      if (*collision_buckets < CLIB_CACHE_LINE_BYTES / sizeof(u32))
        *collision_buckets = CLIB_CACHE_LINE_BYTES / sizeof(u32);

      *collision_buckets -= 1;
      *collision_buckets |= *collision_buckets >> 16;
      *collision_buckets |= *collision_buckets >> 8;
      *collision_buckets |= *collision_buckets >> 4;
      *collision_buckets |= *collision_buckets >> 2;
      *collision_buckets |= *collision_buckets >> 1;
      *collision_buckets += 1;
    }
}

/**
 * @brief Prefetch the hash entry bucket.
 *
 * This should be performed approximately 200-300 cycles before lookup
 * if the table is located in RAM, or 30-40 cycles before lookup
 * in case the table is located in L3.
 */
#define flowhash_prefetch(h, hash) \
    CLIB_PREFETCH (&(h)->entries[((hash) & (h)->fixed_entries_mask) + 1], \
                   sizeof((h)->entries[0]), LOAD)
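
/*
 * In a packet processing loop, the intended pattern is to issue the prefetch
 * for a later packet while the current one is being looked up, so the cache
 * miss is resolved by the time flowhash_get is called. A hypothetical sketch,
 * assuming precomputed keys[], hashes[] and eis[] arrays and the "_example"
 * instantiation:
 *
 *   for (i = 0; i < n_packets; i++)
 *     {
 *       if (i + 1 < n_packets)
 *         flowhash_prefetch (h, hashes[i + 1]);  // hide latency of packet i+1
 *       flowhash_get_example (h, &keys[i], hashes[i], time_now, &eis[i]);
 *     }
 */
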

#endif /* ifndef __included_flowhash_common__ */

/**
 * @brief Allocate a flowhash structure.
 *
 * @param[in] fixed_entries The number of fixed entries in the hash table.
 * @param[in] collision_buckets The number of collision buckets.
 *
 * The fixed_entries and collision_buckets parameters may not be used as is;
 * they may be adjusted in order to fit the structure's requirements.
 *
 * Since the flowhash does not support dynamic resizing, it is fairly
 * important to choose the parameters carefully. In particular, the performance
 * gain from using this structure comes from an efficient lookup in the
 * absence of hash collisions.
 * As a rule of thumb, if the number of active entries (flows) is M,
 * there should be about 16*M fixed entries and M/16 collision buckets,
 * which represents 17*M allocated entries in total.
 *
 * For example:
 *   M = 2^20   total_size ~= 1GiB     collision ~= 3%
 *   M = 2^18   total_size ~= 250MiB   collision ~= 3%
 *   M = 2^10   total_size ~= 1MiB     collision ~= 6%
 */
static_always_inline
FVT(flowhash) *FV(flowhash_alloc)(u32 fixed_entries, u32 collision_buckets)
{
  FVT(flowhash) *h;
  u32 entries;
  void *mem;
  uword size;

  flowhash_validate_sizes(&fixed_entries, &collision_buckets);

  entries = 1 + fixed_entries +
      collision_buckets * FLOWHASH_ENTRIES_PER_BUCKETS;
  size = sizeof(*h) + sizeof(h->entries[0]) * entries +
      sizeof(h->free_buckets_indices[0]) * collision_buckets;

  mem = clib_mem_alloc_aligned(size, CLIB_CACHE_LINE_BYTES);
  h = mem + collision_buckets * sizeof(h->free_buckets_indices[0]);
  h->mem = mem;

  /* Fill free elements list */
  int i;
  for (i = 1; i <= collision_buckets; i++)
    {
      h->free_buckets_indices[-i] =
          entries - i * FLOWHASH_ENTRIES_PER_BUCKETS;
    }

  /* Init buckets */
  for (i = 0; i < entries; i++)
    {
      h->entries[i].chained_entry_index = FLOWHASH_INVALID_ENTRY_INDEX;
      h->entries[i].timeout = 0;
    }

  h->free_buckets_position = -collision_buckets;
  h->fixed_entries_mask = fixed_entries - 1;
  h->collision_buckets_mask = collision_buckets - 1;
  h->total_entries = entries;
  h->not_enough_buckets_counter = 0;
  h->collision_lookup_counter = 0;
  h->garbage_collection_counter = 0;
  h->gc_counter = 0;

  return h;
}
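
/*
 * Illustrative sizing, following the rule of thumb above for M = 2^20
 * expected concurrent flows (hypothetical "_example" instantiation):
 *
 *   flowhash_example_t *h;
 *   h = flowhash_alloc_example (1 << 24,     // ~16*M fixed entries
 *                               1 << 16);    // ~M/16 collision buckets
 *   ...
 *   flowhash_free_example (h);
 */
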

/**
 * @brief Free the flow hash memory.
 */
static_always_inline
void FV(flowhash_free)(FVT(flowhash) *h)
{
  clib_mem_free(h->mem);
}

static void
FV(__flowhash_get_chained) (FVT(flowhash) *h, FVT(flowhash_lkey) *k,
                            u32 hash, u32 time_now, u32 *ei);
414 
415 /**
416  * @brief Retrieves an entry index corresponding to a provided key and its hash.
417  *
418  * @param h The hash table pointer.
419  * @param k[in] A pointer to the key value.
420  * @param hash[in] The hash of the key.
421  * @param time_now[in] The current time.
422  * @param ei[out] A pointer set to the found entry index.
423  *
424  * This function always sets ei value to a valid entry index which can then be
425  * used to access the stored value as well as get or set its associated timeout.
426  * The key stored in the returned entry is always set to the provided key.
427  *
428  * In case the provided key is not found, and no entry could be created
429  * (either because there is no hash collision bucket available or
430  * the candidate entries in the collision bucket were already used), ei is
431  * set to the special value FLOWHASH_INVALID_ENTRY_INDEX (which can be tested
432  * with the flowhash_is_overflow macro).
433  *
434  * The timeout value is never modified during a lookup.
435  * - Use the flowhash_is_timeouted macro to test whether the returned entry
436  * was already valid, or is proposed for insertion.
437  * - Use the flowhash_timeout macro to get and set the entry timeout value.
438  *
439  */
static_always_inline
void FV(flowhash_get) (FVT(flowhash) *h, FVT(flowhash_lkey) *k,
                       u32 hash, u32 time_now, u32 *ei)
{
  *ei = (hash & h->fixed_entries_mask) + 1;

  if (PREDICT_FALSE(FV(flowhash_cmp_key)(&h->entries[*ei].key, k) != 0))
    {
      if (PREDICT_TRUE(time_now > h->entries[*ei].timeout &&
                       (h->entries[*ei].chained_entry_index ==
                        FLOWHASH_INVALID_ENTRY_INDEX)))
        {
          /* The direct entry has timed out and has no chained bucket:
           * reuse it for the new key. */
          FV(flowhash_cpy_key)(&h->entries[*ei].key, k);
        }
      else
        {
          FV(__flowhash_get_chained)(h, k, hash, time_now, ei);
        }
    }
}

static void
FV(__flowhash_get_chained) (FVT(flowhash) *h, FVT(flowhash_lkey) *k,
                            u32 hash, u32 time_now, u32 *ei)
{
  h->collision_lookup_counter++;

  if (h->entries[*ei].chained_entry_index == FLOWHASH_INVALID_ENTRY_INDEX)
    {
      /* No chained entry yet. Let's chain one. */
      if (h->free_buckets_position == 0)
        {
          /* Oops. No more buckets available. */
          h->not_enough_buckets_counter++;
          *ei = FLOWHASH_INVALID_ENTRY_INDEX;
          h->entries[FLOWHASH_INVALID_ENTRY_INDEX].timeout =
              time_now - 1;
          FV(flowhash_cpy_key)(
              &h->entries[FLOWHASH_INVALID_ENTRY_INDEX].key, k);
          return;
        }

      /* Forward link */
      h->entries[*ei].chained_entry_index =
          h->free_buckets_indices[h->free_buckets_position];

      /* Backward link (for garbage collection) */
      h->entries[h->free_buckets_indices[h->free_buckets_position]].
          chained_entry_index = *ei;

      /* Move pointer */
      h->free_buckets_position++;
    }

  /* Get the two indexes where to look at. */
  u32 bi0 = h->entries[*ei].chained_entry_index +
      (hash >> (32 - FLOWHASH_ENTRIES_PER_BUCKETS_LOG));
  u32 bi1 = bi0 + 1;
  bi1 = (bi0 & (FLOWHASH_ENTRIES_PER_BUCKETS - 1)) ? bi1 :
      bi1 - FLOWHASH_ENTRIES_PER_BUCKETS;

  /* It is possible that we wait while comparing bi0 key.
   * It's better to prefetch bi1 so we don't wait twice. */
  CLIB_PREFETCH(&h->entries[bi1], sizeof (h->entries[0]), READ);

  if (FV(flowhash_cmp_key)(&h->entries[bi0].key, k) == 0)
    {
      *ei = bi0;
      return;
    }

  if (FV(flowhash_cmp_key)(&h->entries[bi1].key, k) == 0)
    {
      *ei = bi1;
      return;
    }

  if (h->entries[*ei].timeout >= time_now)
    {
      /* The direct entry is still in use by another flow. Fall back to one
       * of the two candidate collision entries if it has timed out,
       * otherwise report an overflow. */
      *ei = FLOWHASH_INVALID_ENTRY_INDEX;
      *ei = (time_now > h->entries[bi0].timeout) ? bi0 : *ei;
      *ei = (time_now > h->entries[bi1].timeout) ? bi1 : *ei;
    }

  FV(flowhash_cpy_key)(&h->entries[*ei].key, k);
}

static_always_inline void
FV(flowhash_gc)(FVT(flowhash) *h, u32 time_now)
{
  u32 ei;
  if (PREDICT_FALSE(h->collision_buckets_mask == (((u32)0) - 1)))
    return;

  /* prefetch two rounds in advance */
  ei = 2 + h->fixed_entries_mask +
      ((h->gc_counter + 2) & h->collision_buckets_mask) *
      FLOWHASH_ENTRIES_PER_BUCKETS;
  CLIB_PREFETCH(&h->entries[ei], sizeof (h->entries[0]), READ);

  /* prefetch one round in advance */
  ei = 2 + h->fixed_entries_mask +
      ((h->gc_counter + 1) & h->collision_buckets_mask) *
      FLOWHASH_ENTRIES_PER_BUCKETS;
  if (h->entries[ei].chained_entry_index != FLOWHASH_INVALID_ENTRY_INDEX)
    {
      CLIB_PREFETCH(&h->entries[ei], 4 * CLIB_CACHE_LINE_BYTES, READ);
    }

  /* do GC */
  ei = 2 + h->fixed_entries_mask +
      ((h->gc_counter) & h->collision_buckets_mask) *
      FLOWHASH_ENTRIES_PER_BUCKETS;
  if (h->entries[ei].chained_entry_index != FLOWHASH_INVALID_ENTRY_INDEX)
    {
      u8 found = 0;
      int i;
      for (i = 0; i < FLOWHASH_ENTRIES_PER_BUCKETS; i++)
        {
          if (time_now <= h->entries[ei + i].timeout)
            {
              found = 1;
              break;
            }
        }

      if (!found)
        {
          /* The bucket is not used. Let's free it. */
          h->free_buckets_position--;
          /* Reset forward link */
          h->entries[h->entries[ei].chained_entry_index].chained_entry_index =
              FLOWHASH_INVALID_ENTRY_INDEX;
          /* Reset back link */
          h->entries[ei].chained_entry_index = FLOWHASH_INVALID_ENTRY_INDEX;
          /* Free element */
          h->free_buckets_indices[h->free_buckets_position] = ei;
          /* Count the garbage collection event */
          h->garbage_collection_counter++;
        }
    }

  h->gc_counter++;
}
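
/*
 * The garbage collector is typically driven from a periodic process, or
 * amortized over the packet path, e.g. a few calls per input frame so that
 * every collision bucket is eventually inspected. A hypothetical sketch,
 * assuming the "_example" instantiation (gc_slice is a made-up helper name):
 *
 *   static inline void gc_slice (flowhash_example_t *h, u32 time_now)
 *   {
 *     int i;
 *     for (i = 0; i < 4; i++)      // inspect a few buckets per call
 *       flowhash_gc_example (h, time_now);
 *   }
 */
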

static_always_inline
u32 FV(flowhash_elts)(FVT(flowhash) *h, u32 time_now)
{
  u32 tot = 0;
  u32 ei;

  flowhash_foreach_valid_entry(h, ei, time_now)
    tot++;

  return tot;
}

#endif /* __included_flowhash_template_h__ */