FD.io VPP  v19.08-27-gf4dcae4
Vector Packet Processing
tcp.h
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2016-2019 Cisco and/or its affiliates.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at:
6  *
7  * http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #ifndef _vnet_tcp_h_
17 #define _vnet_tcp_h_
18 
19 #include <vnet/vnet.h>
20 #include <vnet/ip/ip.h>
21 #include <vnet/tcp/tcp_packet.h>
22 #include <vnet/tcp/tcp_timer.h>
23 #include <vnet/session/transport.h>
24 #include <vnet/session/session.h>
25 #include <vnet/tcp/tcp_debug.h>
26 
/*
 * Time base and sizing constants.
 *
 * Every macro that expands to an expression is fully parenthesized so it
 * keeps its value next to operators of higher precedence (/, %, postfix).
 */
#define TCP_TICK 0.001			/**< TCP tick period (s) */
#define THZ ((u32) (1/TCP_TICK))	/**< TCP tick frequency */
#define TCP_TSTAMP_RESOLUTION TCP_TICK	/**< Time stamp resolution */
#define TCP_PAWS_IDLE (24 * 24 * 60 * 60 * THZ)	/**< 24 days */
#define TCP_FIB_RECHECK_PERIOD (1 * THZ)	/**< Recheck every 1s */
#define TCP_MAX_OPTION_SPACE 40
#define TCP_CC_DATA_SZ 24		/**< Size of the per-connection
					     cc_data[] array */

#define TCP_DUPACK_THRESHOLD 3
#define TCP_IW_N_SEGMENTS 10
#define TCP_ALWAYS_ACK 1		/**< On/off delayed acks */
#define TCP_USE_SACKS 1			/**< Disable only for testing */
39 
40 /** TCP FSM state definitions as per RFC793. */
41 #define foreach_tcp_fsm_state \
42  _(CLOSED, "CLOSED") \
43  _(LISTEN, "LISTEN") \
44  _(SYN_SENT, "SYN_SENT") \
45  _(SYN_RCVD, "SYN_RCVD") \
46  _(ESTABLISHED, "ESTABLISHED") \
47  _(CLOSE_WAIT, "CLOSE_WAIT") \
48  _(FIN_WAIT_1, "FIN_WAIT_1") \
49  _(LAST_ACK, "LAST_ACK") \
50  _(CLOSING, "CLOSING") \
51  _(FIN_WAIT_2, "FIN_WAIT_2") \
52  _(TIME_WAIT, "TIME_WAIT")
53 
54 typedef enum _tcp_state
55 {
56 #define _(sym, str) TCP_STATE_##sym,
58 #undef _
60 } tcp_state_t;
61 
66 
/**
 * TCP timers
 *
 * X-macro list: every entry is _(symbol, description). Users define _ before
 * expanding the list; the enum that follows in this file turns each entry
 * into TCP_TIMER_<symbol>.
 *
 * The last entry deliberately has no trailing line continuation, so the
 * macro ends here and cannot absorb whatever line comes next.
 */
#define foreach_tcp_timer               \
  _(RETRANSMIT, "RETRANSMIT")           \
  _(DELACK, "DELAYED ACK")              \
  _(PERSIST, "PERSIST")                 \
  _(WAITCLOSE, "WAIT CLOSE")            \
  _(RETRANSMIT_SYN, "RETRANSMIT SYN")
74 
75 typedef enum _tcp_timers
76 {
77 #define _(sym, str) TCP_TIMER_##sym,
79 #undef _
81 } tcp_timers_e;
82 
83 typedef void (timer_expiration_handler) (u32 index);
84 
89 
90 #define TCP_TIMER_HANDLE_INVALID ((u32) ~0)
91 
92 #define TCP_TIMER_TICK 0.1 /**< Timer tick in seconds */
93 #define TCP_TO_TIMER_TICK TCP_TICK*10 /**< Factor for converting
94  ticks to timer ticks */
95 
96 #define TCP_RTO_MAX 60 * THZ /* Min max RTO (60s) as per RFC6298 */
97 #define TCP_RTO_MIN 0.2 * THZ /* Min RTO (200ms) - lower than standard */
98 #define TCP_RTT_MAX 30 * THZ /* 30s (probably too much) */
99 #define TCP_RTO_SYN_RETRIES 3 /* SYN retries without doubling RTO */
100 #define TCP_RTO_INIT 1 * THZ /* Initial retransmit timer */
101 #define TCP_RTO_BOFF_MAX 8 /* Max number of retries before reset */
102 #define TCP_ESTABLISH_TIME (60 * THZ) /* Connection establish timeout */
/**
 * TCP connection flags
 *
 * X-macro list: every entry is _(symbol, description). The enums that follow
 * in this file expand it twice, once into bit positions
 * (TCP_CONN_<symbol>_BIT) and once into masks (1 << bit).
 *
 * The flags field of struct _tcp_connection is a u16, so this list must not
 * grow beyond 16 entries without widening that field.
 *
 * The last entry deliberately has no trailing line continuation, so the
 * macro ends here and cannot absorb whatever line comes next.
 */
#define foreach_tcp_connection_flag             \
  _(SNDACK, "Send ACK")                         \
  _(FINSNT, "FIN sent")                         \
  _(RECOVERY, "Recovery")                       \
  _(FAST_RECOVERY, "Fast Recovery")             \
  _(DCNT_PENDING, "Disconnect pending")         \
  _(HALF_OPEN_DONE, "Half-open completed")      \
  _(FINPNDG, "FIN pending")                     \
  _(FRXT_PENDING, "Fast-retransmit pending")    \
  _(FRXT_FIRST, "Fast-retransmit first again")  \
  _(DEQ_PENDING, "Pending dequeue acked")       \
  _(PSH_PENDING, "PSH pending")                 \
  _(FINRCVD, "FIN received")                    \
  _(RATE_SAMPLE, "Conn does rate sampling")     \
  _(TRACK_BURST, "Track burst")                 \
  _(ZERO_RWND_SENT, "Zero RWND sent")
121 
122 typedef enum _tcp_connection_flag_bits
123 {
124 #define _(sym, str) TCP_CONN_##sym##_BIT,
126 #undef _
129 
130 typedef enum _tcp_connection_flag
131 {
132 #define _(sym, str) TCP_CONN_##sym = 1 << TCP_CONN_##sym##_BIT,
134 #undef _
137 
138 #define TCP_SCOREBOARD_TRACE (0)
139 #define TCP_MAX_SACK_BLOCKS 256 /**< Max number of SACK blocks stored */
140 #define TCP_INVALID_SACK_HOLE_INDEX ((u32)~0)
142 typedef struct _scoreboard_trace_elt
143 {
144  u32 start;
145  u32 end;
146  u32 ack;
147  u32 snd_una_max;
148  u32 group;
150 
151 typedef struct _sack_scoreboard_hole
152 {
153  u32 next; /**< Index for next entry in linked list */
154  u32 prev; /**< Index for previous entry in linked list */
155  u32 start; /**< Start sequence number */
156  u32 end; /**< End sequence number */
157  u8 is_lost; /**< Mark hole as lost */
159 
160 typedef struct _sack_scoreboard
161 {
162  sack_scoreboard_hole_t *holes; /**< Pool of holes */
163  u32 head; /**< Index of first entry */
164  u32 tail; /**< Index of last entry */
165  u32 sacked_bytes; /**< Number of bytes sacked in sb */
166  u32 last_sacked_bytes; /**< Number of bytes last sacked */
167  u32 last_bytes_delivered; /**< Sack bytes delivered to app */
168  u32 snd_una_adv; /**< Bytes to add to snd_una */
169  u32 high_sacked; /**< Highest byte sacked (fack) */
170  u32 high_rxt; /**< Highest retransmitted sequence */
171  u32 rescue_rxt; /**< Rescue sequence number */
172  u32 lost_bytes; /**< Bytes lost as per RFC6675 */
173  u32 last_lost_bytes; /**< Number of bytes last lost */
174  u32 cur_rxt_hole; /**< Retransmitting from this hole */
175 
176 #if TCP_SCOREBOARD_TRACE
178 #endif
179 
181 
#if TCP_SCOREBOARD_TRACE
/**
 * Append the connection's received SACK blocks to the scoreboard trace.
 *
 * One trace element is added per block in _tc->rcv_opts.sacks. All elements
 * added by one invocation share the same group number. The ack and
 * snd_una_max fields are only filled in for the block whose end equals _ack;
 * they are 0 otherwise.
 *
 * Both arguments are evaluated more than once, so they must be free of side
 * effects. Locals are underscore-prefixed so they cannot capture a name used
 * in the caller's argument expressions.
 *
 * Compiles to nothing when TCP_SCOREBOARD_TRACE is 0.
 *
 * @param _tc   connection whose sack_sb.trace vector is extended
 * @param _ack  ack number to match against each block's end
 */
#define tcp_scoreboard_trace_add(_tc, _ack)				\
{									\
  static u64 _group = 0;						\
  sack_scoreboard_t *_sb = &(_tc)->sack_sb;				\
  sack_block_t *_sack, *_sacks;						\
  scoreboard_trace_elt_t *_elt;						\
  u32 _i;								\
  _group++;								\
  _sacks = (_tc)->rcv_opts.sacks;					\
  for (_i = 0; _i < vec_len (_sacks); _i++)				\
    {									\
      _sack = &_sacks[_i];						\
      vec_add2 (_sb->trace, _elt, 1);					\
      _elt->start = _sack->start;					\
      _elt->end = _sack->end;						\
      _elt->ack = _elt->end == (_ack) ? (_ack) : 0;			\
      _elt->snd_una_max = _elt->end == (_ack) ? (_tc)->snd_una_max : 0;	\
      _elt->group = _group;						\
    }									\
}
#else
#define tcp_scoreboard_trace_add(_tc, _ack)
#endif
206 
209  start, u8 have_sent_1_smss,
210  u8 * can_rescue,
211  u8 * snd_limited);
213  u32 index);
214 
216  sack_scoreboard_hole_t * hole);
218  sack_scoreboard_hole_t * hole);
223 u8 *format_tcp_scoreboard (u8 * s, va_list * args);
224 
225 #define TCP_BTS_INVALID_INDEX ((u32)~0)
227 typedef enum tcp_bts_flags_
228 {
229  TCP_BTS_IS_RXT = 1,
231 } __clib_packed tcp_bts_flags_t;
232 
233 typedef struct tcp_bt_sample_
234 {
235  u32 next; /**< Next sample index in list */
236  u32 prev; /**< Previous sample index in list */
237  u32 min_seq; /**< Min seq number in sample */
238  u32 max_seq; /**< Max seq number. Set for rxt samples */
239  u64 delivered; /**< Total delivered bytes for sample */
240  f64 delivered_time; /**< Delivered time when sample taken */
241  f64 tx_time; /**< Transmit time for the burst */
242  u64 tx_rate; /**< Tx pacing rate */
243  tcp_bts_flags_t flags; /**< Sample flag */
245 
246 typedef struct tcp_rate_sample_
247 {
248  u64 prior_delivered; /**< Delivered of sample used for rate, i.e.,
249  total bytes delivered at prior_time */
250  f64 prior_time; /**< Delivered time of sample used for rate */
251  f64 interval_time; /**< Time to ack the bytes delivered */
252  f64 rtt_time; /**< RTT for sample */
253  u64 tx_rate; /**< Tx pacing rate */
254  u32 delivered; /**< Bytes delivered in interval_time */
255  u32 acked_and_sacked; /**< Bytes acked + sacked now */
256  u32 lost; /**< Bytes lost now */
257  tcp_bts_flags_t flags; /**< Rate sample flags from bt sample */
259 
260 typedef struct tcp_byte_tracker_
261 {
262  tcp_bt_sample_t *samples; /**< Pool of samples */
263  rb_tree_t sample_lookup; /**< Rbtree for sample lookup by min_seq */
264  u32 head; /**< Head of samples linked list */
265  u32 tail; /**< Tail of samples linked list */
266  u32 last_ooo; /**< Cached last ooo sample */
268 
269 typedef enum _tcp_cc_algorithm_type
270 {
275 
276 typedef struct _tcp_cc_algorithm tcp_cc_algorithm_t;
278 typedef enum _tcp_cc_ack_t
279 {
280  TCP_CC_ACK,
284 
285 typedef enum tcp_cc_event_
286 {
289 
290 /*
291  * As per RFC4898 tcpEStatsStackSoftErrors
292  */
293 typedef struct tcp_errors_
294 {
295  u32 below_data_wnd; /**< All data in seg is below snd_una */
296  u32 above_data_wnd; /**< Some data in segment is above snd_wnd */
297  u32 below_ack_wnd; /**< Acks for data below snd_una */
298  u32 above_ack_wnd; /**< Acks for data not sent */
300 
301 typedef struct _tcp_connection
302 {
303  CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);
304  transport_connection_t connection; /**< Common transport data. First! */
305 
306  u8 state; /**< TCP state as per tcp_state_t */
307  u16 flags; /**< Connection flags (see tcp_conn_flags_e) */
308  u32 timers[TCP_N_TIMERS]; /**< Timer handles into timer wheel */
309 
310  u64 segs_in; /** RFC4022/4898 tcpHCInSegs/tcpEStatsPerfSegsIn */
311  u64 bytes_in; /** RFC4898 tcpEStatsPerfHCDataOctetsIn */
312  u64 segs_out; /** RFC4898 tcpEStatsPerfSegsOut */
313  u64 bytes_out; /** RFC4898 tcpEStatsPerfHCDataOctetsOut */
314 
315  /** Send sequence variables RFC793 */
316  u32 snd_una; /**< oldest unacknowledged sequence number */
317  u32 snd_una_max; /**< newest unacknowledged sequence number + 1*/
318  u32 snd_wnd; /**< send window */
319  u32 snd_wl1; /**< seq number used for last snd.wnd update */
320  u32 snd_wl2; /**< ack number used for last snd.wnd update */
321  u32 snd_nxt; /**< next seq number to be sent */
322  u16 snd_mss; /**< Effective send max seg (data) size */
323 
324  u64 data_segs_in; /** RFC4898 tcpEStatsPerfDataSegsIn */
325  u64 data_segs_out; /** RFC4898 tcpEStatsPerfDataSegsOut */
326 
327  /** Receive sequence variables RFC793 */
328  u32 rcv_nxt; /**< next sequence number expected */
329  u32 rcv_wnd; /**< receive window we expect */
330 
331  u32 rcv_las; /**< rcv_nxt at last ack sent/rcv_wnd update */
332  u32 iss; /**< initial sent sequence */
333  u32 irs; /**< initial remote sequence */
334 
335  /* Options */
336  u8 snd_opts_len; /**< Tx options len */
337  u8 rcv_wscale; /**< Window scale to advertise to peer */
338  u8 snd_wscale; /**< Window scale to use when sending */
339  u32 tsval_recent; /**< Last timestamp received */
340  u32 tsval_recent_age; /**< When last updated tstamp_recent*/
341  tcp_options_t snd_opts; /**< Tx options for connection */
342  tcp_options_t rcv_opts; /**< Rx options for connection */
343 
344  sack_block_t *snd_sacks; /**< Vector of SACKs to send. XXX Fixed size? */
345  u8 snd_sack_pos; /**< Position in vec of first block to send */
346  sack_block_t *snd_sacks_fl; /**< Vector for building new list */
347  sack_scoreboard_t sack_sb; /**< SACK "scoreboard" that tracks holes */
348 
349  u16 rcv_dupacks; /**< Number of recent DUPACKs received */
350  u32 dupacks_in; /**< RFC4898 tcpEStatsStackDupAcksIn*/
351  u8 pending_dupacks; /**< Number of DUPACKs to be sent */
352  u32 dupacks_out; /**< RFC4898 tcpEStatsPathDupAcksOut */
353 
354  /* Congestion control */
355  u32 cwnd; /**< Congestion window */
356  u32 cwnd_acc_bytes; /**< Bytes accumulated for cwnd increment */
357  u32 ssthresh; /**< Slow-start threshold */
358  u32 prev_ssthresh; /**< ssthresh before congestion */
359  u32 prev_cwnd; /**< cwnd before congestion */
360  u32 bytes_acked; /**< Bytes acknowledged by current segment */
361  u32 burst_acked; /**< Bytes acknowledged in current burst */
362  u32 snd_rxt_bytes; /**< Retransmitted bytes */
363  u32 snd_rxt_ts; /**< Timestamp when first packet is retransmitted */
364  u32 tsecr_last_ack; /**< Timestamp echoed to us in last healthy ACK */
365  u32 snd_congestion; /**< snd_una_max when congestion is detected */
366  u32 tx_fifo_size; /**< Tx fifo size. Used to constrain cwnd */
367  tcp_cc_algorithm_t *cc_algo; /**< Congestion control algorithm */
368  u8 cc_data[TCP_CC_DATA_SZ]; /**< Congestion control algo private data */
369 
370  u32 fr_occurences; /**< fast-retransmit occurrences RFC4898
371  tcpEStatsStackFastRetran */
372  u32 tr_occurences; /**< timer-retransmit occurrences */
373  u64 bytes_retrans; /**< RFC4898 tcpEStatsPerfOctetsRetrans */
374  u64 segs_retrans; /**< RFC4898 tcpEStatsPerfSegsRetrans*/
375 
376  /* RTT and RTO */
377  u32 rto; /**< Retransmission timeout */
378  u32 rto_boff; /**< Index for RTO backoff */
379  u32 srtt; /**< Smoothed RTT */
380  u32 rttvar; /**< Smoothed mean RTT difference. Approximates variance */
381  u32 rtt_seq; /**< Sequence number for tracked ACK */
382  f64 rtt_ts; /**< Timestamp for tracked ACK */
383  f64 mrtt_us; /**< High precision mrtt from tracked acks */
384 
385  u32 psh_seq; /**< Add psh header for seg that includes this */
386  u32 next_node_index; /**< Can be used to control next node in output */
387  u32 next_node_opaque; /**< Opaque to pass to next node */
388  u32 limited_transmit; /**< snd_nxt when limited transmit starts */
389  u32 sw_if_index; /**< Interface for the connection */
390 
391  /* Delivery rate estimation */
392  u64 delivered; /**< Total bytes delivered to peer */
393  u64 app_limited; /**< Delivered when app-limited detected */
394  f64 delivered_time; /**< Time last bytes were acked */
395  tcp_byte_tracker_t *bt; /**< Tx byte tracker */
396 
397  tcp_errors_t errors; /**< Soft connection errors */
398 
399  f64 start_ts; /**< Timestamp when connection initialized */
400  u32 last_fib_check; /**< Last time we checked fib route for peer */
401  u16 mss; /**< Our max seg size that includes options */
402  u32 timestamp_delta; /**< Offset for timestamp */
404 
405 /* *INDENT-OFF* */
/** Callback table (plus a name) describing one congestion control algorithm; a connection references its algorithm through its cc_algo pointer. */
406 struct _tcp_cc_algorithm
407 {
408  const char *name; /**< Algorithm name */
409  uword (*unformat_cfg) (unformat_input_t * input); /**< NOTE(review): presumably parses algorithm-specific config; caller not visible here */
410  void (*init) (tcp_connection_t * tc); /**< NOTE(review): presumably per-connection setup; caller not visible here */
411  void (*cleanup) (tcp_connection_t * tc); /**< NOTE(review): presumably per-connection teardown; caller not visible here */
412  void (*rcv_ack) (tcp_connection_t * tc, tcp_rate_sample_t *rs); /**< Called with a rate sample; invoked without a NULL check in this header */
413  void (*rcv_cong_ack) (tcp_connection_t * tc, tcp_cc_ack_t ack,
414  tcp_rate_sample_t *rs); /**< Called with the ack type and a rate sample; invoked without a NULL check in this header */
415  void (*congestion) (tcp_connection_t * tc); /**< No caller visible in this header */
416  void (*loss) (tcp_connection_t * tc); /**< Invoked without a NULL check in this header */
417  void (*recovered) (tcp_connection_t * tc); /**< Invoked without a NULL check in this header */
418  void (*undo_recovery) (tcp_connection_t * tc); /**< Optional: NULL-checked before being called in this header */
419  void (*event) (tcp_connection_t *tc, tcp_cc_event_t evt); /**< Optional: NULL-checked before being called in this header */
420 };
421 /* *INDENT-ON* */
422 
/*
 * Connection flag helpers.
 *
 * The *_on / *_off macros set or clear one bit in (tc)->flags, the others
 * test bits. tcp_in_slowstart() is true while cwnd is below ssthresh.
 *
 * Every expansion is wrapped in parentheses and every use of the argument is
 * parenthesized, so the macros behave as single expressions whatever is
 * passed in (e.g. &conn) and whatever operators surround them.
 */
#define tcp_fastrecovery_on(tc) ((tc)->flags |= TCP_CONN_FAST_RECOVERY)
#define tcp_fastrecovery_off(tc) ((tc)->flags &= ~TCP_CONN_FAST_RECOVERY)
#define tcp_recovery_on(tc) ((tc)->flags |= TCP_CONN_RECOVERY)
#define tcp_recovery_off(tc) ((tc)->flags &= ~TCP_CONN_RECOVERY)
#define tcp_in_fastrecovery(tc) ((tc)->flags & TCP_CONN_FAST_RECOVERY)
#define tcp_in_recovery(tc) ((tc)->flags & (TCP_CONN_RECOVERY))
#define tcp_in_slowstart(tc) ((tc)->cwnd < (tc)->ssthresh)
#define tcp_disconnect_pending(tc) ((tc)->flags & TCP_CONN_DCNT_PENDING)
#define tcp_disconnect_pending_on(tc) ((tc)->flags |= TCP_CONN_DCNT_PENDING)
#define tcp_disconnect_pending_off(tc) ((tc)->flags &= ~TCP_CONN_DCNT_PENDING)
#define tcp_fastrecovery_first(tc) ((tc)->flags & TCP_CONN_FRXT_FIRST)
#define tcp_fastrecovery_first_on(tc) ((tc)->flags |= TCP_CONN_FRXT_FIRST)
#define tcp_fastrecovery_first_off(tc) ((tc)->flags &= ~TCP_CONN_FRXT_FIRST)

/* Non-zero while in either fast recovery or (timeout) recovery */
#define tcp_in_cong_recovery(tc) ((tc)->flags &				\
	  (TCP_CONN_FAST_RECOVERY | TCP_CONN_RECOVERY))
439 
440 always_inline void
442 {
443  tc->flags &= ~(TCP_CONN_FAST_RECOVERY | TCP_CONN_RECOVERY);
445 }
446 
/*
 * Zero-receive-window flag helpers: test, set and clear
 * TCP_CONN_ZERO_RWND_SENT in (tc)->flags. Expansions are fully parenthesized
 * so they stay single expressions next to any surrounding operator.
 */
#define tcp_zero_rwnd_sent(tc) ((tc)->flags & TCP_CONN_ZERO_RWND_SENT)
#define tcp_zero_rwnd_sent_on(tc) ((tc)->flags |= TCP_CONN_ZERO_RWND_SENT)
#define tcp_zero_rwnd_sent_off(tc) ((tc)->flags &= ~TCP_CONN_ZERO_RWND_SENT)
451 typedef enum _tcp_error
452 {
453 #define tcp_error(n,s) TCP_ERROR_##n,
455 #undef tcp_error
456  TCP_N_ERROR,
458 
459 typedef struct _tcp_lookup_dispatch
460 {
461  u8 next, error;
463 
464 typedef struct tcp_worker_ctx_
465 {
466  CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);
467  /** worker time */
468  u32 time_now;
470  /** worker timer wheel */
471  tw_timer_wheel_16t_2w_512sl_t timer_wheel;
473  /** tx buffer free list */
474  u32 *tx_buffers;
476  /** tx frames for tcp 4/6 output nodes */
477  vlib_frame_t *tx_frames[2];
479  /** tx frames for ip 4/6 lookup nodes */
480  vlib_frame_t *ip_lookup_tx_frames[2];
482  /** vector of pending ack dequeues */
483  u32 *pending_deq_acked;
485  /** vector of pending disconnect notifications */
486  u32 *pending_disconnects;
488  /** convenience pointer to this thread's vlib main */
489  vlib_main_t *vm;
491  CLIB_CACHE_LINE_ALIGN_MARK (cacheline1);
492 
493  /** cached 'on the wire' options for bursts */
494  u8 cached_opts[40];
497 
498 typedef struct tcp_iss_seed_
499 {
500  u64 first;
501  u64 second;
503 
504 typedef struct tcp_configuration_
505 {
506  /** Max rx fifo size for a session (in bytes). It is used to compute the
507  * rfc 7323 window scaling factor */
508  u32 max_rx_fifo;
510  /** Min rx fifo for a session (in bytes) */
511  u32 min_rx_fifo;
513  /** Default MTU to be used when establishing connections */
514  u16 default_mtu;
516  /** Initial CWND multiplier, which multiplies MSS to determine initial CWND.
517  * Set 0 to determine the initial CWND by another way */
518  u16 initial_cwnd_multiplier;
520  /** Enable tx pacing for new connections */
521  u8 enable_tx_pacing;
523  /** Default congestion control algorithm type */
524  tcp_cc_algorithm_type_e cc_algo;
526  /** Delayed ack time (disabled) */
527  u16 delack_time;
529  /** Timer ticks to wait for close from app */
530  u16 closewait_time;
532  /** Timer ticks to wait in time-wait. Also known as 2MSL */
533  u16 timewait_time;
535  /** Timer ticks to wait in fin-wait1 to send fin and rcv fin-ack */
536  u16 finwait1_time;
538  /** Timer ticks to wait in last ack for ack */
539  u16 lastack_time;
541  /** Timer ticks to wait in fin-wait2 for fin */
542  u16 finwait2_time;
544  /** Timer ticks to wait in closing for fin ack */
545  u16 closing_time;
547  /** Timer ticks to wait before cleaning up the connection */
548  u16 cleanup_time;
550  /** Number of preallocated connections */
551  u32 preallocated_connections;
553  /** Number of preallocated half-open connections */
554  u32 preallocated_half_open_connections;
556  /** Vectors of src addresses. Optional unless one needs > 63K active-opens */
557  ip4_address_t *ip4_src_addrs;
558  ip6_address_t *ip6_src_addrs;
560  /** Fault-injection. Debug only */
561  f64 buffer_fail_fraction;
563 
564 typedef struct _tcp_main
565 {
566  /* Per-worker thread tcp connection pools */
568 
569  /* Pool of listeners. */
570  tcp_connection_t *listener_pool;
571 
572  /** Dispatch table by state and flags */
573  tcp_lookup_dispatch_t dispatch_table[TCP_N_STATES][64];
574 
575  u8 log2_tstamp_clocks_per_tick;
576  f64 tstamp_ticks_per_clock;
577 
578  /** per-worker context */
579  tcp_worker_ctx_t *wrk_ctx;
580 
581  /** Pool of half-open connections on which we've sent a SYN */
582  tcp_connection_t *half_open_connections;
583  clib_spinlock_t half_open_lock;
584 
585  /** vlib buffer size */
586  u32 bytes_per_buffer;
587 
588  /** Seed used to generate random iss */
589  tcp_iss_seed_t iss_seed;
590 
591  /** Congestion control algorithms registered */
592  tcp_cc_algorithm_t *cc_algos;
593 
594  /** Hash table of cc algorithms by name */
595  uword *cc_algo_by_name;
596 
597  /** Last cc algo registered */
598  tcp_cc_algorithm_type_e cc_last_type;
599 
600  /** Flag that indicates if stack is on or off */
601  u8 is_enabled;
602 
603  /** Flag that indicates if v4 punting is enabled */
604  u8 punt_unknown4;
605 
606  /** Flag that indicates if v6 punting is enabled */
607  u8 punt_unknown6;
608 
609  /** Rotor for v4 source addresses */
610  u32 last_v4_addr_rotor;
611 
612  /** Rotor for v6 source addresses */
613  u32 last_v6_addr_rotor;
614 
615  /** Protocol configuration */
617 } tcp_main_t;
618 
619 extern tcp_main_t tcp_main;
632 
633 #define tcp_cfg tcp_main.cfg
634 #define tcp_node_index(node_id, is_ip4) \
635  ((is_ip4) ? tcp4_##node_id##_node.index : tcp6_##node_id##_node.index)
636 
639 {
640  return &tcp_main;
641 }
642 
644 tcp_get_worker (u32 thread_index)
645 {
646  return &tcp_main.wrk_ctx[thread_index];
647 }
648 
651 {
652  ASSERT ((signed) b->current_data >= (signed) -VLIB_BUFFER_PRE_DATA_SIZE);
653  return (tcp_header_t *) (b->data + b->current_data
654  + vnet_buffer (b)->tcp.hdr_offset);
655 }
656 
657 #if (VLIB_BUFFER_TRACE_TRAJECTORY)
658 #define tcp_trajectory_add_start(b, start) \
659 { \
660  (*vlib_buffer_trace_trajectory_cb) (b, start); \
661 }
662 #else
663 #define tcp_trajectory_add_start(b, start)
664 #endif
665 
667 
668 void tcp_punt_unknown (vlib_main_t * vm, u8 is_ip4, u8 is_add);
669 
671 tcp_connection_get (u32 conn_index, u32 thread_index)
672 {
673  if (PREDICT_FALSE
674  (pool_is_free_index (tcp_main.connections[thread_index], conn_index)))
675  return 0;
676  return pool_elt_at_index (tcp_main.connections[thread_index], conn_index);
677 }
678 
680 tcp_connection_get_if_valid (u32 conn_index, u32 thread_index)
681 {
682  if (tcp_main.connections[thread_index] == 0)
683  return 0;
684  if (pool_is_free_index (tcp_main.connections[thread_index], conn_index))
685  return 0;
686  return pool_elt_at_index (tcp_main.connections[thread_index], conn_index);
687 }
688 
691 {
692  return (tcp_connection_t *) tconn;
693 }
694 
695 always_inline void
697 {
698  tc->state = state;
699  TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc);
700 }
701 
710  ip4_address_t * start,
711  ip4_address_t * end, u32 table_id);
713  ip6_address_t * start,
714  ip6_address_t * end, u32 table_id);
715 void tcp_api_reference (void);
716 u8 *format_tcp_connection (u8 * s, va_list * args);
717 
719 tcp_listener_get (u32 tli)
720 {
721  return pool_elt_at_index (tcp_main.listener_pool, tli);
722 }
723 
726 {
727  tcp_connection_t *tc = 0;
728  clib_spinlock_lock_if_init (&tcp_main.half_open_lock);
729  if (!pool_is_free_index (tcp_main.half_open_connections, conn_index))
730  tc = pool_elt_at_index (tcp_main.half_open_connections, conn_index);
731  clib_spinlock_unlock_if_init (&tcp_main.half_open_lock);
732  return tc;
733 }
734 
738  u32 thread_index, u8 is_ip4);
739 void tcp_send_reset (tcp_connection_t * tc);
740 void tcp_send_syn (tcp_connection_t * tc);
742 void tcp_send_fin (tcp_connection_t * tc);
743 void tcp_init_mss (tcp_connection_t * tc);
745 void tcp_update_rto (tcp_connection_t * tc);
746 void tcp_flush_frame_to_output (tcp_worker_ctx_t * wrk, u8 is_ip4);
749 
753 
754 /*
755  * Rate estimation
756  */
757 
758 /**
759  * Byte tracker initialize
760  *
761  * @param tc connection for which the byte tracker should be allocated and
762  * initialized
763  */
764 void tcp_bt_init (tcp_connection_t * tc);
765 /**
766  * Byte tracker cleanup
767  *
768  * @param tc connection for which the byte tracker should be cleaned up
769  */
770 void tcp_bt_cleanup (tcp_connection_t * tc);
771 /**
772  * Flush byte tracker samples
773  *
774  * @param tc tcp connection for which samples should be flushed
775  */
777 /**
778  * Track a tcp tx burst
779  *
780  * @param tc tcp connection
781  */
783 /**
784  * Track a tcp retransmission
785  *
786  * @param tc tcp connection
787  * @param start start sequence number
788  * @param end end sequence number
789  */
790 void tcp_bt_track_rxt (tcp_connection_t * tc, u32 start, u32 end);
791 /**
792  * Generate a delivery rate sample from recently acked bytes
793  *
794  * @param tc tcp connection
795  * @param rs resulting rate sample
796  */
798  tcp_rate_sample_t * rs);
799 /**
800  * Check if sample to be generated is app limited
801  *
802  * @param tc tcp connection
803  */
805 /**
806  * Check if the byte tracker is in sane state
807  *
808  * Should be used only for testing
809  *
810  * @param bt byte tracker
811  */
813 
816 {
817  return th->seq_number + tcp_is_syn (th) + tcp_is_fin (th) + len;
818 }
819 
820 /* Modulo arithmetic for TCP sequence numbers */
821 #define seq_lt(_s1, _s2) ((i32)((_s1)-(_s2)) < 0)
822 #define seq_leq(_s1, _s2) ((i32)((_s1)-(_s2)) <= 0)
823 #define seq_gt(_s1, _s2) ((i32)((_s1)-(_s2)) > 0)
824 #define seq_geq(_s1, _s2) ((i32)((_s1)-(_s2)) >= 0)
825 #define seq_max(_s1, _s2) (seq_gt((_s1), (_s2)) ? (_s1) : (_s2))
827 /* Modulo arithmetic for timestamps */
828 #define timestamp_lt(_t1, _t2) ((i32)((_t1)-(_t2)) < 0)
829 #define timestamp_leq(_t1, _t2) ((i32)((_t1)-(_t2)) <= 0)
831 /**
832  * Our estimate of the number of bytes that have left the network
833  */
835 tcp_bytes_out (const tcp_connection_t * tc)
836 {
837  if (tcp_opts_sack_permitted (&tc->rcv_opts))
838  return tc->sack_sb.sacked_bytes + tc->sack_sb.lost_bytes;
839  else
840  return tc->rcv_dupacks * tc->snd_mss;
841 }
842 
843 /**
844  * Our estimate of the number of bytes in flight (pipe size)
845  */
848 {
849  int flight_size;
850 
851  flight_size = (int) (tc->snd_nxt - tc->snd_una) - tcp_bytes_out (tc)
852  + tc->snd_rxt_bytes;
853 
854  if (flight_size < 0)
855  {
856  if (0)
858  ("Negative: %u %u %u dupacks %u sacked bytes %u flags %d",
859  tc->snd_una_max - tc->snd_una, tcp_bytes_out (tc),
860  tc->snd_rxt_bytes, tc->rcv_dupacks, tc->sack_sb.sacked_bytes,
861  tc->rcv_opts.flags);
862  return 0;
863  }
864 
865  return flight_size;
866 }
867 
868 /**
869  * Initial cwnd as per RFC5681
870  */
873 {
874  if (tcp_cfg.initial_cwnd_multiplier > 0)
875  return tcp_cfg.initial_cwnd_multiplier * tc->snd_mss;
876 
877  if (tc->snd_mss > 2190)
878  return 2 * tc->snd_mss;
879  else if (tc->snd_mss > 1095)
880  return 3 * tc->snd_mss;
881  else
882  return 4 * tc->snd_mss;
883 }
884 
885 /*
886  * Accumulate acked bytes for cwnd increase
887  *
888  * Once threshold bytes are accumulated, snd_mss bytes are added
889  * to the cwnd.
890  */
891 always_inline void
892 tcp_cwnd_accumulate (tcp_connection_t * tc, u32 thresh, u32 bytes)
893 {
894  tc->cwnd_acc_bytes += bytes;
895  if (tc->cwnd_acc_bytes >= thresh)
896  {
897  u32 inc = tc->cwnd_acc_bytes / thresh;
898  tc->cwnd_acc_bytes -= inc * thresh;
899  tc->cwnd += inc * tc->snd_mss;
900  tc->cwnd = clib_min (tc->cwnd, tc->tx_fifo_size);
901  }
902 }
903 
905 tcp_loss_wnd (const tcp_connection_t * tc)
906 {
907  return tc->snd_mss;
908 }
909 
912 {
913  return clib_min (tc->cwnd, tc->snd_wnd);
914 }
915 
918 {
919  u32 available_wnd = tcp_available_snd_wnd (tc);
920  int flight_size = (int) (tc->snd_nxt - tc->snd_una);
921 
922  if (available_wnd <= flight_size)
923  return 0;
924 
925  return available_wnd - flight_size;
926 }
927 
928 /**
929  * Estimate of how many bytes we can still push into the network
930  */
933 {
934  u32 available_wnd = tcp_available_snd_wnd (tc);
935  u32 flight_size = tcp_flight_size (tc);
936 
937  if (available_wnd <= flight_size)
938  return 0;
939 
940  return available_wnd - flight_size;
941 }
942 
945 {
946  if ((tc->flags & TCP_CONN_FINSNT) && tc->snd_una_max - tc->snd_una == 1)
947  return 1;
948  return 0;
949 }
950 
953  tcp_connection_t * tc);
955  tcp_connection_t * tc, u32 burst_size);
957  u32 burst_size);
959  u32 burst_size);
962 
964 
965 /* Made public for unit testing only */
966 void tcp_update_sack_list (tcp_connection_t * tc, u32 start, u32 end);
968 
970 tcp_time_now (void)
971 {
972  return tcp_main.wrk_ctx[vlib_get_thread_index ()].time_now;
973 }
974 
976 tcp_time_now_w_thread (u32 thread_index)
977 {
978  return tcp_main.wrk_ctx[thread_index].time_now;
979 }
980 
981 /**
982  * Generate timestamp for tcp connection
983  */
986 {
987  return (tcp_main.wrk_ctx[tc->c_thread_index].time_now -
988  tc->timestamp_delta);
989 }
990 
992 tcp_time_now_us (u32 thread_index)
993 {
994  return transport_time_now (thread_index);
995 }
996 
999 {
1000  wrk->time_now = clib_cpu_time_now () * tcp_main.tstamp_ticks_per_clock;
1001  return wrk->time_now;
1002 }
1003 
1005  vlib_buffer_t * b);
1006 int tcp_session_custom_tx (void *conn, u32 max_burst_size);
1007 
1014  u32 start_bucket);
1015 
1016 always_inline void
1019  tc->cc_algo->rcv_ack (tc, rs);
1020  tc->tsecr_last_ack = tc->rcv_opts.tsecr;
1021 }
1022 
1023 static inline void
1026 {
1027  tc->cc_algo->rcv_cong_ack (tc, ack_type, rs);
1028 }
1029 
1030 static inline void
1033  tc->cc_algo->loss (tc);
1034 }
1035 
1036 static inline void
1039  tc->cc_algo->recovered (tc);
1040 }
1041 
1042 static inline void
1045  if (tc->cc_algo->undo_recovery)
1046  tc->cc_algo->undo_recovery (tc);
1047 }
1048 
1049 static inline void
1052  if (tc->cc_algo->event)
1053  tc->cc_algo->event (tc, evt);
1054 }
1055 
1056 always_inline void
1057 tcp_timer_set (tcp_connection_t * tc, u8 timer_id, u32 interval)
1059  ASSERT (tc->c_thread_index == vlib_get_thread_index ());
1060  ASSERT (tc->timers[timer_id] == TCP_TIMER_HANDLE_INVALID);
1061  tc->timers[timer_id] =
1062  tw_timer_start_16t_2w_512sl (&tcp_main.
1063  wrk_ctx[tc->c_thread_index].timer_wheel,
1064  tc->c_c_index, timer_id, interval);
1065 }
1066 
1067 always_inline void
1068 tcp_timer_reset (tcp_connection_t * tc, u8 timer_id)
1070  ASSERT (tc->c_thread_index == vlib_get_thread_index ());
1071  if (tc->timers[timer_id] == TCP_TIMER_HANDLE_INVALID)
1072  return;
1073 
1074  tw_timer_stop_16t_2w_512sl (&tcp_main.
1075  wrk_ctx[tc->c_thread_index].timer_wheel,
1076  tc->timers[timer_id]);
1077  tc->timers[timer_id] = TCP_TIMER_HANDLE_INVALID;
1078 }
1079 
1080 always_inline void
1081 tcp_timer_update (tcp_connection_t * tc, u8 timer_id, u32 interval)
1083  ASSERT (tc->c_thread_index == vlib_get_thread_index ());
1084  if (tc->timers[timer_id] != TCP_TIMER_HANDLE_INVALID)
1085  tw_timer_update_16t_2w_512sl (&tcp_main.
1086  wrk_ctx[tc->c_thread_index].timer_wheel,
1087  tc->timers[timer_id], interval);
1088  else
1089  tc->timers[timer_id] =
1090  tw_timer_start_16t_2w_512sl (&tcp_main.
1091  wrk_ctx[tc->c_thread_index].timer_wheel,
1092  tc->c_c_index, timer_id, interval);
1093 }
1094 
1095 always_inline void
1098  ASSERT (tc->snd_una != tc->snd_una_max);
1099  tcp_timer_set (tc, TCP_TIMER_RETRANSMIT,
1100  clib_max (tc->rto * TCP_TO_TIMER_TICK, 1));
1101 }
1102 
1103 always_inline void
1106  tcp_timer_reset (tc, TCP_TIMER_RETRANSMIT);
1107 }
1108 
1109 always_inline void
1112  tcp_timer_update (tc, TCP_TIMER_RETRANSMIT,
1113  clib_max (tc->rto * TCP_TO_TIMER_TICK, 1));
1114 }
1115 
1116 always_inline void
1119  /* Reuse RTO. It's backed off in handler */
1120  tcp_timer_set (tc, TCP_TIMER_PERSIST,
1121  clib_max (tc->rto * TCP_TO_TIMER_TICK, 1));
1122 }
1123 
1124 always_inline void
1127  tcp_timer_update (tc, TCP_TIMER_PERSIST,
1128  clib_max (tc->rto * TCP_TO_TIMER_TICK, 1));
1129 }
1130 
1131 always_inline void
1134  tcp_timer_reset (tc, TCP_TIMER_PERSIST);
1135 }
1136 
1137 always_inline void
1140  if (tc->snd_una == tc->snd_nxt)
1141  {
1143  if (tc->snd_wnd < tc->snd_mss)
1145  }
1146  else
1147  tcp_timer_update (tc, TCP_TIMER_RETRANSMIT,
1148  clib_max (tc->rto * TCP_TO_TIMER_TICK, 1));
1149 }
1150 
1154  return tc->timers[timer] != TCP_TIMER_HANDLE_INVALID;
1155 }
1156 
/*
 * Assert that, for a connection in ESTABLISHED state, the transport can
 * dequeue at least _a bytes for tx. Connections in any other state pass
 * unconditionally. Both arguments are parenthesized so expressions such as
 * &conn or a ?: can be passed safely.
 */
#define tcp_validate_txf_size(_tc, _a)					\
  ASSERT((_tc)->state != TCP_STATE_ESTABLISHED				\
	 || transport_max_tx_dequeue (&(_tc)->connection) >= (_a))
1160 
1161 void tcp_rcv_sacks (tcp_connection_t * tc, u32 ack);
1162 u8 *tcp_scoreboard_replay (u8 * s, tcp_connection_t * tc, u8 verbose);
1163 
1164 /**
1165  * Register existing cc algo type
1166  */
1168  const tcp_cc_algorithm_t * vft);
1169 
1170 /**
1171  * Register new cc algo type
1172  */
1175 
1176 static inline void *
1179  return (void *) tc->cc_data;
1180 }
1181 
1183  tcp_rate_sample_t * rs);
1184 
1185 /**
1186  * Push TCP header to buffer
1187  *
1188  * @param vm - vlib_main
1189  * @param b - buffer to write the header to
1190  * @param sp_net - source port net order
1191  * @param dp_net - destination port net order
1192  * @param seq - sequence number net order
1193  * @param ack - ack number net order
1194  * @param tcp_hdr_opts_len - header and options length in bytes
1195  * @param flags - header flags
1196  * @param wnd - window size
1197  *
1198  * @return - pointer to start of TCP header
1199  */
1200 always_inline void *
1202  u32 ack, u8 tcp_hdr_opts_len, u8 flags,
1203  u16 wnd)
1204 {
1205  tcp_header_t *th;
1206 
1207  th = vlib_buffer_push_uninit (b, tcp_hdr_opts_len);
1208 
1209  th->src_port = sp;
1210  th->dst_port = dp;
1211  th->seq_number = seq;
1212  th->ack_number = ack;
1213  th->data_offset_and_reserved = (tcp_hdr_opts_len >> 2) << 4;
1214  th->flags = flags;
1215  th->window = wnd;
1216  th->checksum = 0;
1217  th->urgent_pointer = 0;
1218  return th;
1219 }
1220 
1221 /**
1222  * Push TCP header to buffer
1223  *
1224  * @param b - buffer to write the header to
1225  * @param sp_net - source port net order
1226  * @param dp_net - destination port net order
1227  * @param seq - sequence number host order
1228  * @param ack - ack number host order
1229  * @param tcp_hdr_opts_len - header and options length in bytes
1230  * @param flags - header flags
1231  * @param wnd - window size
1232  *
1233  * @return - pointer to start of TCP header
1234  */
1235 always_inline void *
1236 vlib_buffer_push_tcp (vlib_buffer_t * b, u16 sp_net, u16 dp_net, u32 seq,
1237  u32 ack, u8 tcp_hdr_opts_len, u8 flags, u16 wnd)
1238 {
1239  return vlib_buffer_push_tcp_net_order (b, sp_net, dp_net,
1240  clib_host_to_net_u32 (seq),
1241  clib_host_to_net_u32 (ack),
1242  tcp_hdr_opts_len, flags,
1243  clib_host_to_net_u16 (wnd));
1244 }
1245 
1246 #endif /* _vnet_tcp_h_ */
1247 
1248 /*
1249  * fd.io coding-style-patch-verification: ON
1250  *
1251  * Local Variables:
1252  * eval: (c-set-style "gnu")
1253  * End:
1254  */
tcp_main_t tcp_main
Definition: tcp.c:30
vlib_node_registration_t tcp6_listen_node
(constructor) VLIB_REGISTER_NODE (tcp6_listen_node)
Definition: tcp_input.c:3216
static vlib_cli_command_t trace
(constructor) VLIB_CLI_COMMAND (trace)
Definition: vlib_api_cli.c:870
#define CLIB_CACHE_LINE_ALIGN_MARK(mark)
Definition: cache.h:60
u32 flags
Definition: vhost_user.h:141
void tcp_send_window_update_ack(tcp_connection_t *tc)
Send a Window Update ACK, ensuring that it is sent only once, when RWND becomes non-zero after a zero RWND has previously been advertised in an ACK.
Definition: tcp_output.c:1242
#define clib_min(x, y)
Definition: clib.h:295
void scoreboard_clear(sack_scoreboard_t *sb)
Definition: tcp_input.c:906
static f64 tcp_time_now_us(u32 thread_index)
Definition: tcp.h:993
static void tcp_retransmit_timer_set(tcp_connection_t *tc)
Definition: tcp.h:1097
void tcp_make_fin(tcp_connection_t *tc, vlib_buffer_t *b)
Convert buffer to FIN-ACK.
Definition: tcp_output.c:525
struct _sack_block sack_block_t
void tcp_cc_init_congestion(tcp_connection_t *tc)
Init loss recovery/fast recovery.
Definition: tcp_input.c:1171
struct _scoreboard_trace_elt scoreboard_trace_elt_t
void tcp_connection_timers_reset(tcp_connection_t *tc)
Stop all connection timers.
Definition: tcp.c:488
static f64 transport_time_now(u32 thread_index)
Definition: session.h:474
#define TCP_TO_TIMER_TICK
Factor for converting ticks to timer ticks.
Definition: tcp.h:93
vlib_node_registration_t tcp4_output_node
(constructor) VLIB_REGISTER_NODE (tcp4_output_node)
Definition: tcp_output.c:2305
void scoreboard_init(sack_scoreboard_t *sb)
Definition: tcp_input.c:898
static u32 tcp_bytes_out(const tcp_connection_t *tc)
Our estimate of the number of bytes that have left the network.
Definition: tcp.h:836
i16 current_data
signed offset in data[], pre_data[] that we are currently processing.
Definition: buffer.h:110
unsigned long u64
Definition: types.h:89
static tcp_connection_t * tcp_connection_get_if_valid(u32 conn_index, u32 thread_index)
Definition: tcp.h:681
void tcp_connection_del(tcp_connection_t *tc)
Connection removal.
Definition: tcp.c:290
struct _sack_scoreboard sack_scoreboard_t
void tcp_update_sack_list(tcp_connection_t *tc, u32 start, u32 end)
Build SACK list as per RFC2018.
Definition: tcp_input.c:1683
enum tcp_bts_flags_ tcp_bts_flags_t
u32 tcp_snd_space(tcp_connection_t *tc)
Definition: tcp.c:1191
int tcp_bt_is_sane(tcp_byte_tracker_t *bt)
Check if the byte tracker is in sane state.
Definition: tcp_bt.c:182
static tcp_connection_t * tcp_half_open_connection_get(u32 conn_index)
Definition: tcp.h:726
sack_scoreboard_hole_t * scoreboard_last_hole(sack_scoreboard_t *sb)
Definition: tcp_input.c:694
sack_scoreboard_hole_t * scoreboard_next_hole(sack_scoreboard_t *sb, sack_scoreboard_hole_t *hole)
Definition: tcp_input.c:670
#define VLIB_BUFFER_PRE_DATA_SIZE
Definition: buffer.h:51
static_always_inline void clib_spinlock_unlock_if_init(clib_spinlock_t *p)
Definition: lock.h:108
struct _tcp_main tcp_main_t
vlib_node_registration_t tcp6_output_node
(constructor) VLIB_REGISTER_NODE (tcp6_output_node)
Definition: tcp_output.c:2325
u8 data[0]
Packet data.
Definition: buffer.h:181
static u64 clib_cpu_time_now(void)
Definition: time.h:75
timer_expiration_handler tcp_timer_retransmit_handler
u8 * format_tcp_scoreboard(u8 *s, va_list *args)
Definition: tcp.c:1071
u32 tcp_session_push_header(transport_connection_t *tconn, vlib_buffer_t *b)
Definition: tcp_output.c:1138
struct _tcp_lookup_dispatch tcp_lookup_dispatch_t
void tcp_connection_tx_pacer_update(tcp_connection_t *tc)
Definition: tcp.c:1264
enum tcp_cc_event_ tcp_cc_event_t
void tcp_update_burst_snd_vars(tcp_connection_t *tc)
Update burst send vars.
Definition: tcp_output.c:400
struct _tcp_connection tcp_connection_t
static u32 tcp_available_cc_snd_space(const tcp_connection_t *tc)
Estimate of how many bytes we can still push into the network.
Definition: tcp.h:933
static u32 tcp_available_snd_wnd(const tcp_connection_t *tc)
Definition: tcp.h:912
static tcp_connection_t * tcp_get_connection_from_transport(transport_connection_t *tconn)
Definition: tcp.h:691
void tcp_connection_cleanup(tcp_connection_t *tc)
Cleans up connection state.
Definition: tcp.c:239
u8 *( format_function_t)(u8 *s, va_list *args)
Definition: format.h:48
format_function_t format_tcp_flags
Definition: tcp.h:63
struct _tcp_header tcp_header_t
void tcp_connection_reset(tcp_connection_t *tc)
Notify session that connection has been reset.
Definition: tcp.c:328
unsigned char u8
Definition: types.h:56
struct _sack_scoreboard_hole sack_scoreboard_hole_t
double f64
Definition: types.h:142
vlib_node_registration_t tcp4_syn_sent_node
(constructor) VLIB_REGISTER_NODE (tcp4_syn_sent_node)
Definition: tcp_input.c:2569
u8 * tcp_scoreboard_replay(u8 *s, tcp_connection_t *tc, u8 verbose)
Definition: tcp.c:1998
#define tcp_is_fin(_th)
Definition: tcp_packet.h:90
static void tcp_connection_set_state(tcp_connection_t *tc, tcp_state_t state)
Definition: tcp.h:697
#define tcp_cfg
Definition: tcp.h:634
vl_api_interface_index_t sw_if_index
Definition: gre.api:50
sack_scoreboard_hole_t * scoreboard_get_hole(sack_scoreboard_t *sb, u32 index)
Definition: tcp_input.c:662
#define always_inline
Definition: clib.h:98
struct tcp_bt_sample_ tcp_bt_sample_t
static u32 tcp_available_output_snd_space(const tcp_connection_t *tc)
Definition: tcp.h:918
int tcp_retransmit_first_unacked(tcp_worker_ctx_t *wrk, tcp_connection_t *tc)
Retransmit first unacked segment.
Definition: tcp_output.c:1761
static void tcp_cc_loss(tcp_connection_t *tc)
Definition: tcp.h:1032
static tcp_header_t * tcp_buffer_hdr(vlib_buffer_t *b)
Definition: tcp.h:651
enum _tcp_state tcp_state_t
vhost_vring_state_t state
Definition: vhost_user.h:146
timer_expiration_handler tcp_timer_retransmit_syn_handler
static u32 tcp_time_now(void)
Definition: tcp.h:971
unsigned int u32
Definition: types.h:88
static ct_connection_t * connections
sack_scoreboard_hole_t * scoreboard_next_rxt_hole(sack_scoreboard_t *sb, sack_scoreboard_hole_t *start, u8 have_sent_1_smss, u8 *can_rescue, u8 *snd_limited)
Figure out the next hole to retransmit.
Definition: tcp_input.c:828
void tcp_api_reference(void)
Definition: tcp_api.c:109
#define TCP_EVT_DBG(_evt, _args...)
Definition: tcp_debug.h:243
struct tcp_byte_tracker_ tcp_byte_tracker_t
static void tcp_timer_set(tcp_connection_t *tc, u8 timer_id, u32 interval)
Definition: tcp.h:1058
static heap_elt_t * first(heap_header_t *h)
Definition: heap.c:59
vl_api_fib_path_type_t type
Definition: fib_types.api:123
struct tcp_worker_ctx_ tcp_worker_ctx_t
#define pool_elt_at_index(p, i)
Returns pointer to element at given index.
Definition: pool.h:514
void tcp_bt_init(tcp_connection_t *tc)
Byte tracker initialize.
Definition: tcp_bt.c:575
timer_expiration_handler tcp_timer_persist_handler
vlib_node_registration_t tcp6_syn_sent_node
(constructor) VLIB_REGISTER_NODE (tcp6_syn_sent_node)
Definition: tcp_input.c:2588
u32 tcp_sack_list_bytes(tcp_connection_t *tc)
Definition: tcp_input.c:1732
static void * vlib_buffer_push_tcp_net_order(vlib_buffer_t *b, u16 sp, u16 dp, u32 seq, u32 ack, u8 tcp_hdr_opts_len, u8 flags, u16 wnd)
Push TCP header to buffer.
Definition: tcp.h:1202
void tcp_rcv_sacks(tcp_connection_t *tc, u32 ack)
Definition: tcp_input.c:945
int tcp_fast_retransmit_sack(tcp_worker_ctx_t *wrk, tcp_connection_t *tc, u32 burst_size)
Do fast retransmit with SACKs.
Definition: tcp_output.c:1818
clib_error_t * vnet_tcp_enable_disable(vlib_main_t *vm, u8 is_en)
Definition: tcp.c:1490
void tcp_send_syn(tcp_connection_t *tc)
Send SYN.
Definition: tcp_output.c:926
static void * tcp_cc_data(tcp_connection_t *tc)
Definition: tcp.h:1178
struct _unformat_input_t unformat_input_t
unsigned short u16
Definition: types.h:57
sack_scoreboard_hole_t * scoreboard_prev_hole(sack_scoreboard_t *sb, sack_scoreboard_hole_t *hole)
Definition: tcp_input.c:678
static void cleanup(void)
Definition: client.c:131
#define TCP_TIMER_HANDLE_INVALID
Definition: tcp.h:90
static void tcp_cc_rcv_cong_ack(tcp_connection_t *tc, tcp_cc_ack_t ack_type, tcp_rate_sample_t *rs)
Definition: tcp.h:1025
static u32 tcp_flight_size(const tcp_connection_t *tc)
Our estimate of the number of bytes in flight (pipe size)
Definition: tcp.h:848
#define PREDICT_FALSE(x)
Definition: clib.h:111
tcp_connection_t * tcp_connection_alloc(u8 thread_index)
Definition: tcp.c:297
void tcp_connection_tx_pacer_reset(tcp_connection_t *tc, u32 window, u32 start_bucket)
Definition: tcp.c:1280
void tcp_cc_algo_register(tcp_cc_algorithm_type_e type, const tcp_cc_algorithm_t *vft)
Register existing cc algo type.
Definition: tcp.c:88
void( timer_expiration_handler)(u32 index)
Definition: tcp.h:83
u8 name[64]
Definition: memclnt.api:152
enum _tcp_cc_ack_t tcp_cc_ack_t
static void tcp_cwnd_accumulate(tcp_connection_t *tc, u32 thresh, u32 bytes)
Definition: tcp.h:893
void newreno_rcv_cong_ack(tcp_connection_t *tc, tcp_cc_ack_t ack_type, tcp_rate_sample_t *rs)
Definition: tcp_newreno.c:52
u8 len
Definition: ip_types.api:90
tcp_bts_flags_
Definition: tcp.h:228
static void tcp_timer_reset(tcp_connection_t *tc, u8 timer_id)
Definition: tcp.h:1069
vlib_node_registration_t tcp4_listen_node
(constructor) VLIB_REGISTER_NODE (tcp4_listen_node)
Definition: tcp_input.c:3197
enum _tcp_error tcp_error_t
static void * vlib_buffer_push_tcp(vlib_buffer_t *b, u16 sp_net, u16 dp_net, u32 seq, u32 ack, u8 tcp_hdr_opts_len, u8 flags, u16 wnd)
Push TCP header to buffer.
Definition: tcp.h:1237
int tcp_configure_v4_source_address_range(vlib_main_t *vm, ip4_address_t *start, ip4_address_t *end, u32 table_id)
Configure an ipv4 source address range.
Definition: tcp.c:1687
static_always_inline uword vlib_get_thread_index(void)
Definition: threads.h:213
vlib_main_t * vm
Definition: buffer.c:312
void tcp_program_dupack(tcp_connection_t *tc)
Definition: tcp_output.c:1199
void tcp_send_reset(tcp_connection_t *tc)
Build and set reset packet for connection.
Definition: tcp_output.c:846
static u32 tcp_tstamp(tcp_connection_t *tc)
Generate timestamp for tcp connection.
Definition: tcp.h:986
void tcp_bt_track_tx(tcp_connection_t *tc)
Track a tcp tx burst.
Definition: tcp_bt.c:266
void tcp_punt_unknown(vlib_main_t *vm, u8 is_ip4, u8 is_add)
Definition: tcp.c:1508
format_function_t format_tcp_state
Definition: tcp.h:62
static void tcp_cc_undo_recovery(tcp_connection_t *tc)
Definition: tcp.h:1044
#define clib_warning(format, args...)
Definition: error.h:59
enum _tcp_timers tcp_timers_e
struct _transport_connection transport_connection_t
int tcp_half_open_connection_cleanup(tcp_connection_t *tc)
Try to cleanup half-open connection.
Definition: tcp.c:211
u32 fib_node_index_t
A typedef of a node index.
Definition: fib_types.h:30
void tcp_connection_timers_init(tcp_connection_t *tc)
Initialize all connection timers as invalid.
Definition: tcp.c:471
void tcp_make_synack(tcp_connection_t *ts, vlib_buffer_t *b)
Convert buffer to SYN-ACK.
Definition: tcp_output.c:559
#define pool_is_free_index(P, I)
Use free bitmap to query whether given index is free.
Definition: pool.h:283
format_function_t format_tcp_rcv_sacks
Definition: tcp.h:65
vlib_node_registration_t tcp6_input_node
(constructor) VLIB_REGISTER_NODE (tcp6_input_node)
Definition: tcp_input.c:3630
fib_node_index_t tcp_lookup_rmt_in_fib(tcp_connection_t *tc)
void tcp_send_synack(tcp_connection_t *tc)
Definition: tcp_output.c:961
static void tcp_timer_update(tcp_connection_t *tc, u8 timer_id, u32 interval)
Definition: tcp.h:1082
#define ASSERT(truth)
enum _tcp_cc_algorithm_type tcp_cc_algorithm_type_e
struct tcp_rate_sample_ tcp_rate_sample_t
void tcp_flush_frames_to_output(tcp_worker_ctx_t *wrk)
Flush v4 and v6 tcp and ip-lookup tx frames for thread index.
Definition: tcp_output.c:1019
enum _tcp_connection_flag_bits tcp_connection_flag_bits_e
void tcp_connection_init_vars(tcp_connection_t *tc)
Initialize tcp connection variables.
Definition: tcp.c:652
void tcp_bt_sample_delivery_rate(tcp_connection_t *tc, tcp_rate_sample_t *rs)
Generate a delivery rate sample from recently acked bytes.
Definition: tcp_bt.c:508
vlib_node_registration_t tcp4_rcv_process_node
(constructor) VLIB_REGISTER_NODE (tcp4_rcv_process_node)
Definition: tcp_input.c:2995
u8 * format_tcp_connection(u8 *s, va_list *args)
Definition: tcp.c:954
static u32 tcp_end_seq(tcp_header_t *th, u32 len)
Definition: tcp.h:816
static void init(void)
Definition: client.c:116
vlib_node_registration_t tcp6_established_node
(constructor) VLIB_REGISTER_NODE (tcp6_established_node)
Definition: tcp_input.c:2203
#define tcp_fastrecovery_first_off(tc)
Definition: tcp.h:436
void tcp_flush_frame_to_output(tcp_worker_ctx_t *wrk, u8 is_ip4)
Flush tx frame populated by retransmits and timer pops.
Definition: tcp_output.c:988
struct _tcp_cc_algorithm tcp_cc_algorithm_t
Definition: tcp.h:277
#define clib_max(x, y)
Definition: clib.h:288
static u32 tcp_time_now_w_thread(u32 thread_index)
Definition: tcp.h:977
void tcp_cc_fastrecovery_clear(tcp_connection_t *tc)
Definition: tcp_input.c:1198
tcp_cc_event_
Definition: tcp.h:286
struct _vlib_node_registration vlib_node_registration_t
#define tcp_is_syn(_th)
Definition: tcp_packet.h:89
struct tcp_iss_seed_ tcp_iss_seed_t
static void tcp_persist_timer_update(tcp_connection_t *tc)
Definition: tcp.h:1126
void tcp_send_reset_w_pkt(tcp_connection_t *tc, vlib_buffer_t *pkt, u32 thread_index, u8 is_ip4)
Send reset without reusing existing buffer.
Definition: tcp_output.c:764
static void * vlib_buffer_push_uninit(vlib_buffer_t *b, u8 size)
Prepend uninitialized data to buffer.
Definition: buffer.h:335
void tcp_bt_check_app_limited(tcp_connection_t *tc)
Check if sample to be generated is app limited.
Definition: tcp_bt.c:251
vlib_node_registration_t tcp4_input_node
(constructor) VLIB_REGISTER_NODE (tcp4_input_node)
Definition: tcp_input.c:3610
tcp_cc_algorithm_t * tcp_cc_algo_get(tcp_cc_algorithm_type_e type)
Definition: tcp.c:99
static u32 tcp_initial_cwnd(const tcp_connection_t *tc)
Initial cwnd as per RFC5681.
Definition: tcp.h:873
vlib_node_registration_t tcp6_rcv_process_node
(constructor) VLIB_REGISTER_NODE (tcp6_rcv_process_node)
Definition: tcp_input.c:3014
#define TCP_CC_DATA_SZ
Definition: tcp.h:33
#define foreach_tcp_fsm_state
TCP FSM state definitions as per RFC793.
Definition: tcp.h:41
void tcp_send_fin(tcp_connection_t *tc)
Send FIN.
Definition: tcp_output.c:1031
#define foreach_tcp_connection_flag
TCP connection flags.
Definition: tcp.h:106
#define foreach_tcp_timer
TCP timers.
Definition: tcp.h:68
static u8 tcp_is_lost_fin(tcp_connection_t *tc)
Definition: tcp.h:945
static void tcp_cc_rcv_ack(tcp_connection_t *tc, tcp_rate_sample_t *rs)
Definition: tcp.h:1018
sack_scoreboard_hole_t * scoreboard_first_hole(sack_scoreboard_t *sb)
Definition: tcp_input.c:686
static tcp_worker_ctx_t * tcp_get_worker(u32 thread_index)
Definition: tcp.h:645
static void tcp_retransmit_timer_update(tcp_connection_t *tc)
Definition: tcp.h:1139
VLIB buffer representation.
Definition: buffer.h:102
u64 uword
Definition: types.h:112
u32 time_now
worker time
Definition: tcp.h:469
int tcp_fast_retransmit_no_sack(tcp_worker_ctx_t *wrk, tcp_connection_t *tc, u32 burst_size)
Fast retransmit without SACK info.
Definition: tcp_output.c:1921
int tcp_fast_retransmit(tcp_worker_ctx_t *wrk, tcp_connection_t *tc, u32 burst_size)
Do fast retransmit.
Definition: tcp_output.c:1985
static void tcp_cc_recovered(tcp_connection_t *tc)
Definition: tcp.h:1038
void tcp_init_snd_vars(tcp_connection_t *tc)
Initialize connection send variables.
Definition: tcp.c:619
struct tcp_configuration_ tcp_configuration_t
void tcp_connection_close(tcp_connection_t *tc)
Begin connection closing procedure.
Definition: tcp.c:385
void tcp_program_fastretransmit(tcp_connection_t *tc)
Definition: tcp_output.c:1211
static void tcp_retransmit_timer_force_update(tcp_connection_t *tc)
Definition: tcp.h:1111
#define vnet_buffer(b)
Definition: buffer.h:361
static tcp_connection_t * tcp_connection_get(u32 conn_index, u32 thread_index)
Definition: tcp.h:672
static void tcp_cc_event(tcp_connection_t *tc, tcp_cc_event_t evt)
Definition: tcp.h:1051
void tcp_update_rto(tcp_connection_t *tc)
Definition: tcp_input.c:469
void tcp_init_mss(tcp_connection_t *tc)
Definition: tcp_output.c:426
enum _tcp_connection_flag tcp_connection_flags_e
format_function_t format_tcp_sacks
Definition: tcp.h:64
tcp_cc_algorithm_type_e tcp_cc_algo_new_type(const tcp_cc_algorithm_t *vft)
Register new cc algo type.
Definition: tcp.c:106
void tcp_program_ack(tcp_connection_t *tc)
Definition: tcp_output.c:1189
#define tcp_opts_sack_permitted(_to)
Definition: tcp_packet.h:160
static u32 tcp_loss_wnd(const tcp_connection_t *tc)
Definition: tcp.h:906
struct tcp_errors_ tcp_errors_t
int tcp_configure_v6_source_address_range(vlib_main_t *vm, ip6_address_t *start, ip6_address_t *end, u32 table_id)
Configure an ipv6 source address range.
Definition: tcp.c:1775
u32 table_id
Definition: fib_types.api:118
static void tcp_persist_timer_set(tcp_connection_t *tc)
Definition: tcp.h:1118
static tcp_main_t * vnet_get_tcp_main()
Definition: tcp.h:639
static void tcp_cong_recovery_off(tcp_connection_t *tc)
Definition: tcp.h:442
timer_expiration_handler tcp_timer_delack_handler
void tcp_connection_free(tcp_connection_t *tc)
Definition: tcp.c:310
static_always_inline void clib_spinlock_lock_if_init(clib_spinlock_t *p)
Definition: lock.h:93
static void tcp_retransmit_timer_reset(tcp_connection_t *tc)
Definition: tcp.h:1105
static u32 tcp_set_time_now(tcp_worker_ctx_t *wrk)
Definition: tcp.h:999
void tcp_bt_track_rxt(tcp_connection_t *tc, u32 start, u32 end)
Track a tcp retransmission.
Definition: tcp_bt.c:291
void tcp_bt_flush_samples(tcp_connection_t *tc)
Flush byte tracker samples.
Definition: tcp_bt.c:540
static u8 tcp_timer_is_active(tcp_connection_t *tc, tcp_timers_e timer)
Definition: tcp.h:1153
void tcp_bt_cleanup(tcp_connection_t *tc)
Byte tracker cleanup.
Definition: tcp_bt.c:564
static tcp_connection_t * tcp_listener_get(u32 tli)
Definition: tcp.h:720
vlib_node_registration_t tcp4_established_node
(constructor) VLIB_REGISTER_NODE (tcp4_established_node)
Definition: tcp_input.c:2184
static void tcp_persist_timer_reset(tcp_connection_t *tc)
Definition: tcp.h:1133
int tcp_session_custom_tx(void *conn, u32 max_burst_size)
Definition: tcp_output.c:2066