FD.io VPP  v17.07-30-g839fa73
Vector Packet Processing
tcp_input.c
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2016 Cisco and/or its affiliates.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at:
6  *
7  * http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #include <vppinfra/sparse_vec.h>
17 #include <vnet/tcp/tcp_packet.h>
18 #include <vnet/tcp/tcp.h>
19 #include <vnet/session/session.h>
20 #include <math.h>
21 
22 static char *tcp_error_strings[] = {
23 #define tcp_error(n,s) s,
24 #include <vnet/tcp/tcp_error.def>
25 #undef tcp_error
26 };
27 
/* All TCP nodes have the same outgoing arcs */
#define foreach_tcp_state_next                  \
  _ (DROP, "error-drop")                        \
  _ (TCP4_OUTPUT, "tcp4-output")                \
  _ (TCP6_OUTPUT, "tcp6-output")
34 typedef enum _tcp_established_next
35 {
36 #define _(s,n) TCP_ESTABLISHED_NEXT_##s,
38 #undef _
41 
42 typedef enum _tcp_rcv_process_next
43 {
44 #define _(s,n) TCP_RCV_PROCESS_NEXT_##s,
46 #undef _
49 
50 typedef enum _tcp_syn_sent_next
51 {
52 #define _(s,n) TCP_SYN_SENT_NEXT_##s,
54 #undef _
57 
58 typedef enum _tcp_listen_next
59 {
60 #define _(s,n) TCP_LISTEN_NEXT_##s,
62 #undef _
65 
66 /* Generic, state independent indices */
67 typedef enum _tcp_state_next
68 {
69 #define _(s,n) TCP_NEXT_##s,
71 #undef _
74 
/* Pick the output next-node matching the connection's IP version. */
#define tcp_next_output(is_ip4) (is_ip4 ? TCP_NEXT_TCP4_OUTPUT \
					: TCP_NEXT_TCP6_OUTPUT)
77 
80 
81 /**
82  * Validate segment sequence number. As per RFC793:
83  *
84  * Segment Receive Test
85  * Length Window
86  * ------- ------- -------------------------------------------
87  * 0 0 SEG.SEQ = RCV.NXT
88  * 0 >0 RCV.NXT =< SEG.SEQ < RCV.NXT+RCV.WND
89  * >0 0 not acceptable
90  * >0 >0 RCV.NXT =< SEG.SEQ < RCV.NXT+RCV.WND
91  * or RCV.NXT =< SEG.SEQ+SEG.LEN-1 < RCV.NXT+RCV.WND
92  *
93  * This ultimately consists in checking if segment falls within the window.
94  * The one important difference compared to RFC793 is that we use rcv_las,
95  * or the rcv_nxt at last ack sent instead of rcv_nxt since that's the
96  * peer's reference when computing our receive window.
97  *
98  * This:
99  * seq_leq (end_seq, tc->rcv_las + tc->rcv_wnd) && seq_geq (seq, tc->rcv_las)
100  * however, is too strict when we have retransmits. Instead we just check that
101  * the seq is not beyond the right edge and that the end of the segment is not
102  * less than the left edge.
103  *
104  * N.B. rcv_nxt and rcv_wnd are both updated in this node if acks are sent, so
105  * use rcv_nxt in the right edge window test instead of rcv_las.
106  *
107  */
110 {
111  return (seq_geq (end_seq, tc->rcv_las)
112  && seq_leq (seq, tc->rcv_nxt + tc->rcv_wnd));
113 }
114 
115 /**
116  * Parse TCP header options.
117  *
118  * @param th TCP header
119  * @param to TCP options data structure to be populated
120  * @return -1 if parsing failed
121  */
122 int
124 {
125  const u8 *data;
126  u8 opt_len, opts_len, kind;
127  int j;
128  sack_block_t b;
129 
130  opts_len = (tcp_doff (th) << 2) - sizeof (tcp_header_t);
131  data = (const u8 *) (th + 1);
132 
133  /* Zero out all flags but those set in SYN */
134  to->flags &= (TCP_OPTS_FLAG_SACK_PERMITTED | TCP_OPTS_FLAG_WSCALE);
135 
136  for (; opts_len > 0; opts_len -= opt_len, data += opt_len)
137  {
138  kind = data[0];
139 
140  /* Get options length */
141  if (kind == TCP_OPTION_EOL)
142  break;
143  else if (kind == TCP_OPTION_NOOP)
144  {
145  opt_len = 1;
146  continue;
147  }
148  else
149  {
150  /* broken options */
151  if (opts_len < 2)
152  return -1;
153  opt_len = data[1];
154 
155  /* weird option length */
156  if (opt_len < 2 || opt_len > opts_len)
157  return -1;
158  }
159 
160  /* Parse options */
161  switch (kind)
162  {
163  case TCP_OPTION_MSS:
164  if ((opt_len == TCP_OPTION_LEN_MSS) && tcp_syn (th))
165  {
166  to->flags |= TCP_OPTS_FLAG_MSS;
167  to->mss = clib_net_to_host_u16 (*(u16 *) (data + 2));
168  }
169  break;
171  if ((opt_len == TCP_OPTION_LEN_WINDOW_SCALE) && tcp_syn (th))
172  {
173  to->flags |= TCP_OPTS_FLAG_WSCALE;
174  to->wscale = data[2];
175  if (to->wscale > TCP_MAX_WND_SCALE)
176  {
177  clib_warning ("Illegal window scaling value: %d",
178  to->wscale);
180  }
181  }
182  break;
184  if (opt_len == TCP_OPTION_LEN_TIMESTAMP)
185  {
186  to->flags |= TCP_OPTS_FLAG_TSTAMP;
187  to->tsval = clib_net_to_host_u32 (*(u32 *) (data + 2));
188  to->tsecr = clib_net_to_host_u32 (*(u32 *) (data + 6));
189  }
190  break;
192  if (opt_len == TCP_OPTION_LEN_SACK_PERMITTED && tcp_syn (th))
193  to->flags |= TCP_OPTS_FLAG_SACK_PERMITTED;
194  break;
196  /* If SACK permitted was not advertised or a SYN, break */
197  if ((to->flags & TCP_OPTS_FLAG_SACK_PERMITTED) == 0 || tcp_syn (th))
198  break;
199 
200  /* If too short or not correctly formatted, break */
201  if (opt_len < 10 || ((opt_len - 2) % TCP_OPTION_LEN_SACK_BLOCK))
202  break;
203 
204  to->flags |= TCP_OPTS_FLAG_SACK;
205  to->n_sack_blocks = (opt_len - 2) / TCP_OPTION_LEN_SACK_BLOCK;
206  vec_reset_length (to->sacks);
207  for (j = 0; j < to->n_sack_blocks; j++)
208  {
209  b.start = clib_net_to_host_u32 (*(u32 *) (data + 2 + 4 * j));
210  b.end = clib_net_to_host_u32 (*(u32 *) (data + 6 + 4 * j));
211  vec_add1 (to->sacks, b);
212  }
213  break;
214  default:
215  /* Nothing to see here */
216  continue;
217  }
218  }
219  return 0;
220 }
221 
222 /**
223  * RFC1323: Check against wrapped sequence numbers (PAWS). If we have
224  * timestamp to echo and it's less than tsval_recent, drop segment
225  * but still send an ACK in order to retain TCP's mechanism for detecting
226  * and recovering from half-open connections
227  *
228  * Or at least that's what the theory says. It seems that this might not work
229  * very well with packet reordering and fast retransmit. XXX
230  */
231 always_inline int
233 {
234  return tcp_opts_tstamp (&tc->rcv_opts) && tc->tsval_recent
235  && timestamp_lt (tc->rcv_opts.tsval, tc->tsval_recent);
236 }
237 
238 /**
239  * Update tsval recent
240  */
241 always_inline void
243 {
244  /*
245  * RFC1323: If Last.ACK.sent falls within the range of sequence numbers
246  * of an incoming segment:
247  * SEG.SEQ <= Last.ACK.sent < SEG.SEQ + SEG.LEN
248  * then the TSval from the segment is copied to TS.Recent;
249  * otherwise, the TSval is ignored.
250  */
251  if (tcp_opts_tstamp (&tc->rcv_opts) && tc->tsval_recent
252  && seq_leq (seq, tc->rcv_las) && seq_leq (tc->rcv_las, seq_end))
253  {
254  tc->tsval_recent = tc->rcv_opts.tsval;
255  tc->tsval_recent_age = tcp_time_now ();
256  }
257 }
258 
259 /**
260  * Validate incoming segment as per RFC793 p. 69 and RFC1323 p. 19
261  *
262  * It first verifies if segment has a wrapped sequence number (PAWS) and then
263  * does the processing associated to the first four steps (ignoring security
264  * and precedence): sequence number, rst bit and syn bit checks.
265  *
266  * @return 0 if segments passes validation.
267  */
268 static int
270  vlib_buffer_t * b0, tcp_header_t * th0, u32 * next0)
271 {
272  if (PREDICT_FALSE (!tcp_ack (th0) && !tcp_rst (th0) && !tcp_syn (th0)))
273  return -1;
274 
275  if (PREDICT_FALSE (tcp_options_parse (th0, &tc0->rcv_opts)))
276  {
277  return -1;
278  }
279 
280  if (tcp_segment_check_paws (tc0))
281  {
282  if (CLIB_DEBUG > 2)
283  {
284  clib_warning ("paws failed\n%U", format_tcp_connection, tc0, 2);
285  clib_warning ("seq %u seq_end %u ack %u",
286  vnet_buffer (b0)->tcp.seq_number - tc0->irs,
287  vnet_buffer (b0)->tcp.seq_end - tc0->irs,
288  vnet_buffer (b0)->tcp.ack_number - tc0->iss);
289  }
290  TCP_EVT_DBG (TCP_EVT_PAWS_FAIL, tc0, vnet_buffer (b0)->tcp.seq_number,
291  vnet_buffer (b0)->tcp.seq_end);
292 
293  /* If it just so happens that a segment updates tsval_recent for a
294  * segment over 24 days old, invalidate tsval_recent. */
295  if (timestamp_lt (tc0->tsval_recent_age + TCP_PAWS_IDLE,
296  tcp_time_now ()))
297  {
298  /* Age isn't reset until we get a valid tsval (bsd inspired) */
299  tc0->tsval_recent = 0;
300  clib_warning ("paws failed - really old segment. REALLY?");
301  }
302  else
303  {
304  /* Drop after ack if not rst */
305  if (!tcp_rst (th0))
306  {
307  tcp_make_ack (tc0, b0);
308  *next0 = tcp_next_output (tc0->c_is_ip4);
309  TCP_EVT_DBG (TCP_EVT_DUPACK_SENT, tc0);
310  return -1;
311  }
312  }
313  }
314 
315  /* 1st: check sequence number */
316  if (!tcp_segment_in_rcv_wnd (tc0, vnet_buffer (b0)->tcp.seq_number,
317  vnet_buffer (b0)->tcp.seq_end))
318  {
319  /* If our window is 0 and the packet is in sequence, let it pass
320  * through for ack processing. It should be dropped later.*/
321  if (tc0->rcv_wnd == 0
322  && tc0->rcv_nxt == vnet_buffer (b0)->tcp.seq_number)
323  {
324  /* TODO Should segment be tagged? */
325  }
326  else
327  {
328  /* If not RST, send dup ack */
329  if (!tcp_rst (th0))
330  {
331  tcp_make_ack (tc0, b0);
332  *next0 = tcp_next_output (tc0->c_is_ip4);
333  TCP_EVT_DBG (TCP_EVT_DUPACK_SENT, tc0);
334  }
335  return -1;
336  }
337  }
338 
339  /* 2nd: check the RST bit */
340  if (tcp_rst (th0))
341  {
342  tcp_connection_reset (tc0);
343  return -1;
344  }
345 
346  /* 3rd: check security and precedence (skip) */
347 
348  /* 4th: check the SYN bit */
349  if (tcp_syn (th0))
350  {
351  tcp_send_reset (b0, tc0->c_is_ip4);
352  return -1;
353  }
354 
355  /* If segment in window, save timestamp */
356  tcp_update_timestamp (tc0, vnet_buffer (b0)->tcp.seq_number,
357  vnet_buffer (b0)->tcp.seq_end);
358  return 0;
359 }
360 
361 always_inline int
363 {
364  /* SND.UNA =< SEG.ACK =< SND.NXT */
365  return (seq_leq (tc0->snd_una, vnet_buffer (tb0)->tcp.ack_number)
366  && seq_leq (vnet_buffer (tb0)->tcp.ack_number, tc0->snd_nxt));
367 }
368 
369 /**
370  * Compute smoothed RTT as per VJ's '88 SIGCOMM and RFC6298
371  *
372  * Note that although the original article, srtt and rttvar are scaled
373  * to minimize round-off errors, here we don't. Instead, we rely on
374  * better precision time measurements.
375  *
376  * TODO support us rtt resolution
377  */
378 static void
380 {
381  int err, diff;
382 
383  if (tc->srtt != 0)
384  {
385  err = mrtt - tc->srtt;
386 // tc->srtt += err >> 3;
387 
388  /* XXX Drop in RTT results in RTTVAR increase and bigger RTO.
389  * The increase should be bound */
390 // tc->rttvar += ((int) clib_abs (err) - (int) tc->rttvar) >> 2;
391 
392  tc->srtt = clib_max ((int) tc->srtt + (err >> 3), 1);
393  diff = (clib_abs (err) - (int) tc->rttvar) >> 2;
394  tc->rttvar = clib_max ((int) tc->rttvar + diff, 1);
395  }
396  else
397  {
398  /* First measurement. */
399  tc->srtt = mrtt;
400  tc->rttvar = mrtt >> 1;
401  }
402 }
403 
404 void
406 {
407  tc->rto = clib_min (tc->srtt + (tc->rttvar << 2), TCP_RTO_MAX);
408  tc->rto = clib_max (tc->rto, TCP_RTO_MIN);
409 }
410 
411 /** Update RTT estimate and RTO timer
412  *
413  * Measure RTT: We have two sources of RTT measurements: TSOPT and ACK
414  * timing. Middle boxes are known to fiddle with TCP options so we
415  * should give higher priority to ACK timing.
416  *
417  * return 1 if valid rtt 0 otherwise
418  */
419 static int
421 {
422  u32 mrtt = 0;
423  u8 rtx_acked;
424 
425  /* Determine if only rtx bytes are acked. */
426  rtx_acked = tcp_in_cong_recovery (tc) || !tc->bytes_acked;
427 
428  /* Karn's rule, part 1. Don't use retransmitted segments to estimate
429  * RTT because they're ambiguous. */
430  if (tc->rtt_ts && seq_geq (ack, tc->rtt_seq) && !rtx_acked)
431  {
432  mrtt = tcp_time_now () - tc->rtt_ts;
433  }
434  /* As per RFC7323 TSecr can be used for RTTM only if the segment advances
435  * snd_una, i.e., the left side of the send window:
436  * seq_lt (tc->snd_una, ack). */
437  else if (tcp_opts_tstamp (&tc->rcv_opts) && tc->rcv_opts.tsecr
438  && tc->bytes_acked)
439  {
440  mrtt = tcp_time_now () - tc->rcv_opts.tsecr;
441  }
442 
443  /* Allow measuring of a new RTT */
444  tc->rtt_ts = 0;
445 
446  /* If ACK moves left side of the wnd make sure boff is 0, even if mrtt is
447  * not valid */
448  if (tc->bytes_acked)
449  tc->rto_boff = 0;
450 
451  /* Ignore dubious measurements */
452  if (mrtt == 0 || mrtt > TCP_RTT_MAX)
453  return 0;
454 
455  tcp_estimate_rtt (tc, mrtt);
456  tcp_update_rto (tc);
457 
458  return 0;
459 }
460 
461 /**
462  * Dequeue bytes that have been acked and while at it update RTT estimates.
463  */
464 static void
466 {
467  /* Dequeue the newly ACKed add SACKed bytes */
468  stream_session_dequeue_drop (&tc->connection,
469  tc->bytes_acked + tc->sack_sb.snd_una_adv);
470 
471  tcp_validate_txf_size (tc, tc->snd_una_max - tc->snd_una);
472 
473  /* Update rtt and rto */
474  tcp_update_rtt (tc, ack);
475 
476  /* If everything has been acked, stop retransmit timer
477  * otherwise update. */
479 }
480 
481 /**
482  * Check if duplicate ack as per RFC5681 Sec. 2
483  */
484 static u8
486  u32 prev_snd_una)
487 {
488  return ((vnet_buffer (b)->tcp.ack_number == prev_snd_una)
489  && seq_gt (tc->snd_una_max, tc->snd_una)
490  && (vnet_buffer (b)->tcp.seq_end == vnet_buffer (b)->tcp.seq_number)
491  && (prev_snd_wnd == tc->snd_wnd));
492 }
493 
494 /**
495  * Checks if ack is a congestion control event.
496  */
497 static u8
499  u32 prev_snd_wnd, u32 prev_snd_una, u8 * is_dack)
500 {
501  /* Check if ack is duplicate. Per RFC 6675, ACKs that SACK new data are
502  * defined to be 'duplicate' */
503  *is_dack = tc->sack_sb.last_sacked_bytes
504  || tcp_ack_is_dupack (tc, b, prev_snd_wnd, prev_snd_una);
505 
506  return (*is_dack || tcp_in_cong_recovery (tc));
507 }
508 
509 void
511 {
512  sack_scoreboard_hole_t *next, *prev;
513 
514  if (hole->next != TCP_INVALID_SACK_HOLE_INDEX)
515  {
516  next = pool_elt_at_index (sb->holes, hole->next);
517  next->prev = hole->prev;
518  }
519  else
520  {
521  sb->tail = hole->prev;
522  }
523 
524  if (hole->prev != TCP_INVALID_SACK_HOLE_INDEX)
525  {
526  prev = pool_elt_at_index (sb->holes, hole->prev);
527  prev->next = hole->next;
528  }
529  else
530  {
531  sb->head = hole->next;
532  }
533 
534  if (scoreboard_hole_index (sb, hole) == sb->cur_rxt_hole)
535  sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX;
536 
537  pool_put (sb->holes, hole);
538 }
539 
542  u32 start, u32 end)
543 {
544  sack_scoreboard_hole_t *hole, *next, *prev;
545  u32 hole_index;
546 
547  pool_get (sb->holes, hole);
548  memset (hole, 0, sizeof (*hole));
549 
550  hole->start = start;
551  hole->end = end;
552  hole_index = hole - sb->holes;
553 
554  prev = scoreboard_get_hole (sb, prev_index);
555  if (prev)
556  {
557  hole->prev = prev_index;
558  hole->next = prev->next;
559 
560  if ((next = scoreboard_next_hole (sb, hole)))
561  next->prev = hole_index;
562  else
563  sb->tail = hole_index;
564 
565  prev->next = hole_index;
566  }
567  else
568  {
569  sb->head = hole_index;
570  hole->prev = TCP_INVALID_SACK_HOLE_INDEX;
571  hole->next = TCP_INVALID_SACK_HOLE_INDEX;
572  }
573 
574  return hole;
575 }
576 
577 void
579 {
580  sack_scoreboard_hole_t *hole, *prev;
581  u32 bytes = 0, blks = 0;
582 
583  sb->lost_bytes = 0;
584  sb->sacked_bytes = 0;
585  hole = scoreboard_last_hole (sb);
586  if (!hole)
587  return;
588 
589  if (seq_gt (sb->high_sacked, hole->end))
590  {
591  bytes = sb->high_sacked - hole->end;
592  blks = 1;
593  }
594 
595  while ((prev = scoreboard_prev_hole (sb, hole))
596  && (bytes < (TCP_DUPACK_THRESHOLD - 1) * tc->snd_mss
597  && blks < TCP_DUPACK_THRESHOLD))
598  {
599  bytes += hole->start - prev->end;
600  blks++;
601  hole = prev;
602  }
603 
604  while (hole)
605  {
606  sb->lost_bytes += scoreboard_hole_bytes (hole);
607  hole->is_lost = 1;
608  prev = hole;
609  hole = scoreboard_prev_hole (sb, hole);
610  if (hole)
611  bytes += prev->start - hole->end;
612  }
613  sb->sacked_bytes = bytes;
614 }
615 
616 /**
617  * Figure out the next hole to retransmit
618  *
619  * Follows logic proposed in RFC6675 Sec. 4, NextSeg()
620  */
623  sack_scoreboard_hole_t * start,
624  u8 have_sent_1_smss,
625  u8 * can_rescue, u8 * snd_limited)
626 {
627  sack_scoreboard_hole_t *hole = 0;
628 
629  hole = start ? start : scoreboard_first_hole (sb);
630  while (hole && seq_leq (hole->end, sb->high_rxt) && hole->is_lost)
631  hole = scoreboard_next_hole (sb, hole);
632 
633  /* Nothing, return */
634  if (!hole)
635  {
636  sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX;
637  return 0;
638  }
639 
640  /* Rule (1): if higher than rxt, less than high_sacked and lost */
641  if (hole->is_lost && seq_lt (hole->start, sb->high_sacked))
642  {
643  sb->cur_rxt_hole = scoreboard_hole_index (sb, hole);
644  }
645  else
646  {
647  /* Rule (2): output takes care of transmitting new data */
648  if (!have_sent_1_smss)
649  {
650  hole = 0;
651  sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX;
652  }
653  /* Rule (3): if hole not lost */
654  else if (seq_lt (hole->start, sb->high_sacked))
655  {
656  *snd_limited = 1;
657  sb->cur_rxt_hole = scoreboard_hole_index (sb, hole);
658  }
659  /* Rule (4): if hole beyond high_sacked */
660  else
661  {
662  ASSERT (seq_geq (hole->start, sb->high_sacked));
663  *snd_limited = 1;
664  *can_rescue = 1;
665  /* HighRxt MUST NOT be updated */
666  return 0;
667  }
668  }
669 
670  if (hole && seq_lt (sb->high_rxt, hole->start))
671  sb->high_rxt = hole->start;
672 
673  return hole;
674 }
675 
676 void
678 {
680  hole = scoreboard_first_hole (sb);
681  sb->high_rxt = hole->start;
682  sb->cur_rxt_hole = sb->head;
683 }
684 
685 void
687 {
688  sack_scoreboard_t *sb = &tc->sack_sb;
689  sack_block_t *blk, tmp;
690  sack_scoreboard_hole_t *hole, *next_hole, *last_hole;
691  u32 blk_index = 0, old_sacked_bytes, hole_index;
692  int i, j;
693 
694  sb->last_sacked_bytes = 0;
695  sb->snd_una_adv = 0;
696  old_sacked_bytes = sb->sacked_bytes;
697  sb->last_bytes_delivered = 0;
698 
699  if (!tcp_opts_sack (&tc->rcv_opts)
700  && sb->head == TCP_INVALID_SACK_HOLE_INDEX)
701  return;
702 
703  /* Remove invalid blocks */
704  blk = tc->rcv_opts.sacks;
705  while (blk < vec_end (tc->rcv_opts.sacks))
706  {
707  if (seq_lt (blk->start, blk->end)
708  && seq_gt (blk->start, tc->snd_una)
709  && seq_gt (blk->start, ack) && seq_leq (blk->end, tc->snd_nxt))
710  {
711  blk++;
712  continue;
713  }
714  vec_del1 (tc->rcv_opts.sacks, blk - tc->rcv_opts.sacks);
715  }
716 
717  /* Add block for cumulative ack */
718  if (seq_gt (ack, tc->snd_una))
719  {
720  tmp.start = tc->snd_una;
721  tmp.end = ack;
722  vec_add1 (tc->rcv_opts.sacks, tmp);
723  }
724 
725  if (vec_len (tc->rcv_opts.sacks) == 0)
726  return;
727 
728  /* Make sure blocks are ordered */
729  for (i = 0; i < vec_len (tc->rcv_opts.sacks); i++)
730  for (j = i + 1; j < vec_len (tc->rcv_opts.sacks); j++)
731  if (seq_lt (tc->rcv_opts.sacks[j].start, tc->rcv_opts.sacks[i].start))
732  {
733  tmp = tc->rcv_opts.sacks[i];
734  tc->rcv_opts.sacks[i] = tc->rcv_opts.sacks[j];
735  tc->rcv_opts.sacks[j] = tmp;
736  }
737 
738  if (sb->head == TCP_INVALID_SACK_HOLE_INDEX)
739  {
740  /* If no holes, insert the first that covers all outstanding bytes */
742  tc->snd_una, tc->snd_una_max);
743  sb->tail = scoreboard_hole_index (sb, last_hole);
744  tmp = tc->rcv_opts.sacks[vec_len (tc->rcv_opts.sacks) - 1];
745  sb->high_sacked = tmp.end;
746  }
747  else
748  {
749  /* If we have holes but snd_una_max is beyond the last hole, update
750  * last hole end */
751  tmp = tc->rcv_opts.sacks[vec_len (tc->rcv_opts.sacks) - 1];
752  last_hole = scoreboard_last_hole (sb);
753  if (seq_gt (tc->snd_una_max, sb->high_sacked)
754  && seq_gt (tc->snd_una_max, last_hole->end))
755  last_hole->end = tc->snd_una_max;
756  /* keep track of max byte sacked for when the last hole
757  * is acked */
758  if (seq_gt (tmp.end, sb->high_sacked))
759  sb->high_sacked = tmp.end;
760  }
761 
762  /* Walk the holes with the SACK blocks */
763  hole = pool_elt_at_index (sb->holes, sb->head);
764  while (hole && blk_index < vec_len (tc->rcv_opts.sacks))
765  {
766  blk = &tc->rcv_opts.sacks[blk_index];
767 
768  if (seq_leq (blk->start, hole->start))
769  {
770  /* Block covers hole. Remove hole */
771  if (seq_geq (blk->end, hole->end))
772  {
773  next_hole = scoreboard_next_hole (sb, hole);
774 
775  /* Byte accounting: snd_una needs to be advanced */
776  if (blk->end == ack)
777  {
778  if (next_hole)
779  {
780  if (seq_lt (ack, next_hole->start))
781  sb->snd_una_adv = next_hole->start - ack;
782  sb->last_bytes_delivered +=
783  next_hole->start - hole->end;
784  }
785  else if (!next_hole)
786  {
787  sb->snd_una_adv = sb->high_sacked - ack;
788  sb->last_bytes_delivered += sb->high_sacked - hole->end;
789  }
790  }
791 
792  scoreboard_remove_hole (sb, hole);
793  hole = next_hole;
794  }
795  /* Partial 'head' overlap */
796  else
797  {
798  if (seq_gt (blk->end, hole->start))
799  {
800  hole->start = blk->end;
801  }
802  blk_index++;
803  }
804  }
805  else
806  {
807  /* Hole must be split */
808  if (seq_lt (blk->end, hole->end))
809  {
810  hole_index = scoreboard_hole_index (sb, hole);
811  scoreboard_insert_hole (sb, hole_index, blk->end, hole->end);
812 
813  /* Pool might've moved */
814  hole = scoreboard_get_hole (sb, hole_index);
815  hole->end = blk->start;
816  blk_index++;
817  }
818  else if (seq_lt (blk->start, hole->end))
819  {
820  hole->end = blk->start;
821  }
822 
823  hole = scoreboard_next_hole (sb, hole);
824  }
825  }
826 
827  scoreboard_update_bytes (tc, sb);
828  sb->last_sacked_bytes = sb->sacked_bytes
829  - (old_sacked_bytes - sb->last_bytes_delivered);
830  ASSERT (sb->sacked_bytes == 0
831  || sb->sacked_bytes < tc->snd_una_max - seq_max (tc->snd_una, ack));
832  ASSERT (sb->last_sacked_bytes + sb->lost_bytes <= tc->snd_una_max
833  - seq_max (tc->snd_una, ack));
834 }
835 
836 /**
837  * Try to update snd_wnd based on feedback received from peer.
838  *
839  * If successful, and new window is 'effectively' 0, activate persist
840  * timer.
841  */
842 static void
843 tcp_update_snd_wnd (tcp_connection_t * tc, u32 seq, u32 ack, u32 snd_wnd)
844 {
845  /* If (SND.WL1 < SEG.SEQ or (SND.WL1 = SEG.SEQ and SND.WL2 =< SEG.ACK)), set
846  * SND.WND <- SEG.WND, set SND.WL1 <- SEG.SEQ, and set SND.WL2 <- SEG.ACK */
847  if (seq_lt (tc->snd_wl1, seq)
848  || (tc->snd_wl1 == seq && seq_leq (tc->snd_wl2, ack)))
849  {
850  tc->snd_wnd = snd_wnd;
851  tc->snd_wl1 = seq;
852  tc->snd_wl2 = ack;
853  TCP_EVT_DBG (TCP_EVT_SND_WND, tc);
854 
855  if (tc->snd_wnd < tc->snd_mss)
856  {
857  /* Set persist timer if not set and we just got 0 wnd */
858  if (!tcp_timer_is_active (tc, TCP_TIMER_PERSIST)
859  && !tcp_timer_is_active (tc, TCP_TIMER_RETRANSMIT))
861  }
862  else
863  {
865  if (!tcp_in_recovery (tc) && tc->rto_boff > 0)
866  {
867  tc->rto_boff = 0;
868  tcp_update_rto (tc);
869  }
870  }
871  }
872 }
873 
874 void
876 {
877  tcp_fastrecovery_on (tc);
878  tc->snd_congestion = tc->snd_una_max;
879  tc->cc_algo->congestion (tc);
880  TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 4);
881 }
882 
883 static void
885 {
886  /* Deflate rto */
887  tcp_update_rto (tc);
888  tc->rto_boff = 0;
889  tc->snd_rxt_ts = 0;
890  tcp_recovery_off (tc);
891 }
892 
893 void
895 {
896  tc->cc_algo->recovered (tc);
897  tc->snd_rxt_bytes = 0;
898  tc->rcv_dupacks = 0;
901 }
902 
903 static void
905 {
906  tc->cwnd = tc->prev_cwnd;
907  tc->ssthresh = tc->prev_ssthresh;
908  tc->snd_nxt = tc->snd_una_max;
909  tc->rcv_dupacks = 0;
910  if (tcp_in_recovery (tc))
912  ASSERT (tc->rto_boff == 0);
913  /* TODO extend for fastrecovery */
914 }
915 
916 static u8
918 {
919  return (tc->snd_rxt_ts
920  && tcp_opts_tstamp (&tc->rcv_opts)
921  && timestamp_lt (tc->rcv_opts.tsecr, tc->snd_rxt_ts));
922 }
923 
924 int
926 {
929  {
931  return 1;
932  }
933 
934  if (tcp_in_recovery (tc))
936  else if (tcp_in_fastrecovery (tc))
938 
939  ASSERT (tc->rto_boff == 0);
941 
942  TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 3);
943  return 0;
944 }
945 
946 static void
948 {
950 
951  /* Congestion avoidance */
952  tc->cc_algo->rcv_ack (tc);
953  tc->tsecr_last_ack = tc->rcv_opts.tsecr;
954 
955  /* If a cumulative ack, make sure dupacks is 0 */
956  tc->rcv_dupacks = 0;
957 
958  /* When dupacks hits the threshold we only enter fast retransmit if
959  * cumulative ack covers more than snd_congestion. Should snd_una
960  * wrap this test may fail under otherwise valid circumstances.
961  * Therefore, proactively update snd_congestion when wrap detected. */
962  if (PREDICT_FALSE
963  (seq_leq (tc->snd_congestion, tc->snd_una - tc->bytes_acked)
964  && seq_gt (tc->snd_congestion, tc->snd_una)))
965  tc->snd_congestion = tc->snd_una - 1;
966 }
967 
968 static u8
970 {
971  return (TCP_DUPACK_THRESHOLD - 1) * tc->snd_mss < tc->sack_sb.sacked_bytes;
972 }
973 
974 static u8
976 {
977  return (tc->rcv_dupacks == TCP_DUPACK_THRESHOLD
979 }
980 
981 /**
982  * One function to rule them all ... and in the darkness bind them
983  */
984 static void
986 {
987  u32 rxt_delivered;
988 
989  /*
990  * Duplicate ACK. Check if we should enter fast recovery, or if already in
991  * it account for the bytes that left the network.
992  */
993  if (is_dack)
994  {
995  ASSERT (tc->snd_una != tc->snd_una_max
996  || tc->sack_sb.last_sacked_bytes);
997  tc->rcv_dupacks++;
998 
999  if (tc->rcv_dupacks > TCP_DUPACK_THRESHOLD && !tc->bytes_acked)
1000  {
1001  ASSERT (tcp_in_fastrecovery (tc));
1002  /* Pure duplicate ack. If some data got acked, it's handled lower */
1003  tc->cc_algo->rcv_cong_ack (tc, TCP_CC_DUPACK);
1004  return;
1005  }
1006  else if (tcp_should_fastrecover (tc))
1007  {
1008  /* Things are already bad */
1009  if (tcp_in_cong_recovery (tc))
1010  {
1011  tc->rcv_dupacks = 0;
1012  goto partial_ack_test;
1013  }
1014 
1015  /* If of of the two conditions lower hold, reset dupacks
1016  * 1) Cumulative ack does not cover more than congestion threshold,
1017  * and the following doesn't hold: the congestion window is
1018  * greater than SMSS bytes and the difference between highest_ack
1019  * and prev_highest_ack is at most 4*SMSS bytes (XXX)
1020  * 2) RFC6582 heuristic to avoid multiple fast retransmits
1021  */
1022  if ((seq_gt (tc->snd_una, tc->snd_congestion)
1023  || !(tc->cwnd > tc->snd_mss
1024  && tc->bytes_acked <= 4 * tc->snd_mss))
1025  || tc->rcv_opts.tsecr != tc->tsecr_last_ack)
1026  {
1027  tc->rcv_dupacks = 0;
1028  return;
1029  }
1030 
1032  tc->cc_algo->rcv_cong_ack (tc, TCP_CC_DUPACK);
1033 
1034  /* The first segment MUST be retransmitted */
1036 
1037  /* Post retransmit update cwnd to ssthresh and account for the
1038  * three segments that have left the network and should've been
1039  * buffered at the receiver XXX */
1040  tc->cwnd = tc->ssthresh + tc->rcv_dupacks * tc->snd_mss;
1041 
1042  /* If cwnd allows, send more data */
1043  if (tcp_opts_sack_permitted (&tc->rcv_opts)
1044  && scoreboard_first_hole (&tc->sack_sb))
1045  {
1046  scoreboard_init_high_rxt (&tc->sack_sb);
1048  }
1049  else
1050  {
1052  }
1053 
1054  return;
1055  }
1056  else if (!tc->bytes_acked
1057  || (tc->bytes_acked && !tcp_in_cong_recovery (tc)))
1058  {
1059  tc->cc_algo->rcv_cong_ack (tc, TCP_CC_DUPACK);
1060  return;
1061  }
1062  else
1063  goto partial_ack;
1064  }
1065 
1066 partial_ack_test:
1067 
1068  if (!tc->bytes_acked)
1069  return;
1070 
1071 partial_ack:
1072  /*
1073  * Legitimate ACK. 1) See if we can exit recovery
1074  */
1075  /* XXX limit this only to first partial ack? */
1077 
1078  if (seq_geq (tc->snd_una, tc->snd_congestion))
1079  {
1080  /* If spurious return, we've already updated everything */
1081  if (tcp_cc_recover (tc))
1082  {
1083  tc->tsecr_last_ack = tc->rcv_opts.tsecr;
1084  return;
1085  }
1086 
1087  tc->snd_nxt = tc->snd_una_max;
1088 
1089  /* Treat as congestion avoidance ack */
1090  tc->cc_algo->rcv_ack (tc);
1091  tc->tsecr_last_ack = tc->rcv_opts.tsecr;
1092  return;
1093  }
1094 
1095  /*
1096  * Legitimate ACK. 2) If PARTIAL ACK try to retransmit
1097  */
1098  TCP_EVT_DBG (TCP_EVT_CC_PACK, tc);
1099 
1100  /* RFC6675: If the incoming ACK is a cumulative acknowledgment,
1101  * reset dupacks to 0 */
1102  tc->rcv_dupacks = 0;
1103 
1105 
1106  /* Post RTO timeout don't try anything fancy */
1107  if (tcp_in_recovery (tc))
1108  return;
1109 
1110  /* Remove retransmitted bytes that have been delivered */
1111  ASSERT (tc->bytes_acked + tc->sack_sb.snd_una_adv
1112  >= tc->sack_sb.last_bytes_delivered);
1113  rxt_delivered = tc->bytes_acked + tc->sack_sb.snd_una_adv
1114  - tc->sack_sb.last_bytes_delivered;
1115  if (rxt_delivered && seq_gt (tc->sack_sb.high_rxt, tc->snd_una))
1116  {
1117  /* If we have sacks and we haven't gotten an ack beyond high_rxt,
1118  * remove sacked bytes delivered */
1119  ASSERT (tc->snd_rxt_bytes >= rxt_delivered);
1120  tc->snd_rxt_bytes -= rxt_delivered;
1121  }
1122  else
1123  {
1124  /* Either all retransmitted holes have been acked, or we're
1125  * "in the blind" and retransmitting segment by segment */
1126  tc->snd_rxt_bytes = 0;
1127  }
1128 
1129  tc->cc_algo->rcv_cong_ack (tc, TCP_CC_PARTIALACK);
1130 
1131  /*
1132  * Since this was a partial ack, try to retransmit some more data
1133  */
1134  tcp_fast_retransmit (tc);
1135 }
1136 
1137 void
1139 {
1140  tc->cc_algo = tcp_cc_algo_get (TCP_CC_NEWRENO);
1141  tc->cc_algo->init (tc);
1142 }
1143 
1144 /**
1145  * Process incoming ACK
1146  */
1147 static int
1149  tcp_header_t * th, u32 * next, u32 * error)
1150 {
1151  u32 prev_snd_wnd, prev_snd_una;
1152  u8 is_dack;
1153 
1154  TCP_EVT_DBG (TCP_EVT_CC_STAT, tc);
1155 
1156  /* If the ACK acks something not yet sent (SEG.ACK > SND.NXT) */
1157  if (PREDICT_FALSE (seq_gt (vnet_buffer (b)->tcp.ack_number, tc->snd_nxt)))
1158  {
1159  /* If we have outstanding data and this is within the window, accept it,
1160  * probably retransmit has timed out. Otherwise ACK segment and then
1161  * drop it */
1162  if (seq_gt (vnet_buffer (b)->tcp.ack_number, tc->snd_una_max))
1163  {
1164  tcp_make_ack (tc, b);
1165  *next = tcp_next_output (tc->c_is_ip4);
1166  *error = TCP_ERROR_ACK_INVALID;
1167  TCP_EVT_DBG (TCP_EVT_ACK_RCV_ERR, tc, 0,
1168  vnet_buffer (b)->tcp.ack_number);
1169  return -1;
1170  }
1171 
1172  TCP_EVT_DBG (TCP_EVT_ACK_RCV_ERR, tc, 2,
1173  vnet_buffer (b)->tcp.ack_number);
1174 
1175  tc->snd_nxt = vnet_buffer (b)->tcp.ack_number;
1176  *error = TCP_ERROR_ACK_FUTURE;
1177  }
1178 
1179  /* If old ACK, probably it's an old dupack */
1180  if (PREDICT_FALSE (seq_lt (vnet_buffer (b)->tcp.ack_number, tc->snd_una)))
1181  {
1182  *error = TCP_ERROR_ACK_OLD;
1183  TCP_EVT_DBG (TCP_EVT_ACK_RCV_ERR, tc, 1,
1184  vnet_buffer (b)->tcp.ack_number);
1185  if (tcp_in_fastrecovery (tc) && tc->rcv_dupacks == TCP_DUPACK_THRESHOLD)
1186  {
1187  TCP_EVT_DBG (TCP_EVT_DUPACK_RCVD, tc);
1188  tcp_cc_handle_event (tc, 1);
1189  }
1190  /* Don't drop yet */
1191  return 0;
1192  }
1193 
1194  /*
1195  * Looks okay, process feedback
1196  */
1197 
1198  TCP_EVT_DBG (TCP_EVT_ACK_RCVD, tc);
1199 
1200  if (tcp_opts_sack_permitted (&tc->rcv_opts))
1201  tcp_rcv_sacks (tc, vnet_buffer (b)->tcp.ack_number);
1202 
1203  prev_snd_wnd = tc->snd_wnd;
1204  prev_snd_una = tc->snd_una;
1205  tcp_update_snd_wnd (tc, vnet_buffer (b)->tcp.seq_number,
1206  vnet_buffer (b)->tcp.ack_number,
1207  clib_net_to_host_u16 (th->window) << tc->snd_wscale);
1208  tc->bytes_acked = vnet_buffer (b)->tcp.ack_number - tc->snd_una;
1209  tc->snd_una = vnet_buffer (b)->tcp.ack_number + tc->sack_sb.snd_una_adv;
1210  tcp_validate_txf_size (tc, tc->bytes_acked);
1211 
1212  if (tc->bytes_acked)
1213  tcp_dequeue_acked (tc, vnet_buffer (b)->tcp.ack_number);
1214 
1215  /*
1216  * Check if we have congestion event
1217  */
1218 
1219  if (tcp_ack_is_cc_event (tc, b, prev_snd_wnd, prev_snd_una, &is_dack))
1220  {
1221  tcp_cc_handle_event (tc, is_dack);
1222  *error = TCP_ERROR_ACK_DUP;
1223  TCP_EVT_DBG (TCP_EVT_DUPACK_RCVD, tc, 1);
1224  return vnet_buffer (b)->tcp.data_len ? 0 : -1;
1225  }
1226 
1227  /*
1228  * Update congestion control (slow start/congestion avoidance)
1229  */
1230  tcp_cc_update (tc, b);
1231 
1232  return 0;
1233 }
1234 
1235 /**
1236  * Build SACK list as per RFC2018.
1237  *
1238  * Makes sure the first block contains the segment that generated the current
1239  * ACK and the following ones are the ones most recently reported in SACK
1240  * blocks.
1241  *
1242  * @param tc TCP connection for which the SACK list is updated
1243  * @param start Start sequence number of the newest SACK block
1244  * @param end End sequence of the newest SACK block
1245  */
1246 void
1248 {
1249  sack_block_t *new_list = 0, *block = 0;
1250  int i;
1251 
1252  /* If the first segment is ooo add it to the list. Last write might've moved
1253  * rcv_nxt over the first segment. */
1254  if (seq_lt (tc->rcv_nxt, start))
1255  {
1256  vec_add2 (new_list, block, 1);
1257  block->start = start;
1258  block->end = end;
1259  }
1260 
1261  /* Find the blocks still worth keeping. */
1262  for (i = 0; i < vec_len (tc->snd_sacks); i++)
1263  {
1264  /* Discard if rcv_nxt advanced beyond current block */
1265  if (seq_leq (tc->snd_sacks[i].start, tc->rcv_nxt))
1266  continue;
1267 
1268  /* Merge or drop if segment overlapped by the new segment */
1269  if (block && (seq_geq (tc->snd_sacks[i].end, new_list[0].start)
1270  && seq_leq (tc->snd_sacks[i].start, new_list[0].end)))
1271  {
1272  if (seq_lt (tc->snd_sacks[i].start, new_list[0].start))
1273  new_list[0].start = tc->snd_sacks[i].start;
1274  if (seq_lt (new_list[0].end, tc->snd_sacks[i].end))
1275  new_list[0].end = tc->snd_sacks[i].end;
1276  continue;
1277  }
1278 
1279  /* Save to new SACK list if we have space. */
1280  if (vec_len (new_list) < TCP_MAX_SACK_BLOCKS)
1281  {
1282  vec_add1 (new_list, tc->snd_sacks[i]);
1283  }
1284  else
1285  {
1286  clib_warning ("sack discarded");
1287  }
1288  }
1289 
1290  ASSERT (vec_len (new_list) <= TCP_MAX_SACK_BLOCKS);
1291 
1292  /* Replace old vector with new one */
1293  vec_free (tc->snd_sacks);
1294  tc->snd_sacks = new_list;
1295 }
1296 
1297 /** Enqueue data for delivery to application */
always_inline int
/* NOTE(review): the declaration line with the function name was lost in
 * extraction; the call site in tcp_segment_rcv invokes this as
 * tcp_session_enqueue_data (tc, b, n_data_bytes) — confirm against upstream. */
		 u16 data_len)
{
  int written;

  /* Pure ACK. Update rcv_nxt and be done. */
  if (PREDICT_FALSE (data_len == 0))
    {
      tc->rcv_nxt = vnet_buffer (b)->tcp.seq_end;
      return TCP_ERROR_PURE_ACK;
    }

  /* Hand the in-order bytes to the session layer at offset 0 and ask it to
   * queue an event for the application */
  written = stream_session_enqueue_data (&tc->connection, b, 0,
					 1 /* queue event */ , 1);

  TCP_EVT_DBG (TCP_EVT_INPUT, tc, 0, data_len, written);

  /* Update rcv_nxt */
  if (PREDICT_TRUE (written == data_len))
    {
      /* Everything fit: advance rcv_nxt to the end of this segment */
      tc->rcv_nxt = vnet_buffer (b)->tcp.seq_end;
    }
  /* If more data written than expected, account for out-of-order bytes. */
  else if (written > data_len)
    {
      /* The fifo also absorbed previously buffered out-of-order segments, so
       * rcv_nxt jumps past the end of this segment by the extra amount */
      tc->rcv_nxt = vnet_buffer (b)->tcp.seq_end + written - data_len;

      /* Send ACK confirming the update */
      tc->flags |= TCP_CONN_SNDACK;
    }
  else if (written > 0)
    {
      /* We've written something but FIFO is probably full now */
      tc->rcv_nxt += written;

      /* Depending on how fast the app is, all remaining buffers in burst will
       * not be enqueued. Inform peer */
      tc->flags |= TCP_CONN_SNDACK;

      return TCP_ERROR_PARTIALLY_ENQUEUED;
    }
  else
    {
      /* Nothing fit at all; still request an ACK so the (closed) receive
       * window gets advertised to the peer */
      tc->flags |= TCP_CONN_SNDACK;
      return TCP_ERROR_FIFO_FULL;
    }

  /* Update SACK list if need be */
  if (tcp_opts_sack_permitted (&tc->rcv_opts))
    {
      /* Remove SACK blocks that have been delivered */
      tcp_update_sack_list (tc, tc->rcv_nxt, tc->rcv_nxt);
    }

  return TCP_ERROR_ENQUEUED;
}
1355 
1356 /** Enqueue out-of-order data */
always_inline int
/* NOTE(review): the declaration line with the function name was lost in
 * extraction; the call site in tcp_segment_rcv invokes this as
 * tcp_session_enqueue_ooo (tc, b, n_data_bytes) — confirm against upstream. */
		 u16 data_len)
{
  stream_session_t *s0;
  int rv;

  /* Caller guarantees this segment starts beyond rcv_nxt */
  ASSERT (seq_gt (vnet_buffer (b)->tcp.seq_number, tc->rcv_nxt));

  /* Pure ACK. Do nothing */
  if (PREDICT_FALSE (data_len == 0))
    {
      return TCP_ERROR_PURE_ACK;
    }

  /* Enqueue out-of-order data with relative offset; no app event is queued
   * since the data is not yet deliverable */
  rv = stream_session_enqueue_data (&tc->connection, b,
				    vnet_buffer (b)->tcp.seq_number -
				    tc->rcv_nxt, 0 /* queue event */ , 0);

  /* Nothing written */
  if (rv)
    {
      TCP_EVT_DBG (TCP_EVT_INPUT, tc, 1, data_len, 0);
      return TCP_ERROR_FIFO_FULL;
    }

  TCP_EVT_DBG (TCP_EVT_INPUT, tc, 1, data_len, data_len);

  /* Update SACK list if in use */
  if (tcp_opts_sack_permitted (&tc->rcv_opts))
    {
      ooo_segment_t *newest;
      u32 start, end;

      s0 = stream_session_get (tc->c_s_index, tc->c_thread_index);

      /* Get the newest segment from the fifo */
      newest = svm_fifo_newest_ooo_segment (s0->server_rx_fifo);
      if (newest)
	{
	  /* Convert fifo-relative offsets back to sequence numbers for the
	   * SACK block */
	  start =
	    tc->rcv_nxt + ooo_segment_offset (s0->server_rx_fifo, newest);
	  end = start + ooo_segment_length (s0->server_rx_fifo, newest);
	  tcp_update_sack_list (tc, start, end);

	  ASSERT (seq_gt (start, tc->rcv_nxt));
	}
    }

  return TCP_ERROR_ENQUEUED;
}
1409 
1410 /**
1411  * Check if ACK could be delayed. If ack can be delayed, it should return
1412  * true for a full frame. If we're always acking return 0.
1413  */
1414 always_inline int
1416 {
1417  /* Send ack if ... */
1418  if (TCP_ALWAYS_ACK
1419  /* just sent a rcv wnd 0 */
1420  || (tc->flags & TCP_CONN_SENT_RCV_WND0) != 0
1421  /* constrained to send ack */
1422  || (tc->flags & TCP_CONN_SNDACK) != 0
1423  /* we're almost out of tx wnd */
1424  || tcp_available_snd_space (tc) < 4 * tc->snd_mss)
1425  return 0;
1426 
1427  return 1;
1428 }
1429 
static int
/* NOTE(review): the declaration line with the function name was lost in
 * extraction; call sites invoke this as
 * tcp_segment_rcv (tm, tc, b, data_len, &next) — confirm against upstream. */
		 u16 n_data_bytes, u32 * next0)
{
  u32 error = 0, n_bytes_to_drop;

  /* Handle out-of-order data */
  if (PREDICT_FALSE (vnet_buffer (b)->tcp.seq_number != tc->rcv_nxt))
    {
      /* Old sequence numbers allowed through because they overlapped
       * the rx window */
      if (seq_lt (vnet_buffer (b)->tcp.seq_number, tc->rcv_nxt))
	{
	  error = TCP_ERROR_SEGMENT_OLD;
	  *next0 = TCP_NEXT_DROP;

	  /* Completely in the past (possible retransmit) */
	  if (seq_leq (vnet_buffer (b)->tcp.seq_end, tc->rcv_nxt))
	    goto done;

	  /* Chop off the bytes in the past */
	  n_bytes_to_drop = tc->rcv_nxt - vnet_buffer (b)->tcp.seq_number;
	  n_data_bytes -= n_bytes_to_drop;
	  vlib_buffer_advance (b, n_bytes_to_drop);

	  /* Remainder is now in order: process it below */
	  goto in_order;
	}

      /* Segment starts in the future: stash it in the ooo fifo */
      error = tcp_session_enqueue_ooo (tc, b, n_data_bytes);

      /* N.B. Should not filter burst of dupacks. Two issues 1) dupacks open
       * cwnd on remote peer when congested 2) acks leaving should have the
       * latest rcv_wnd since the burst may eaten up all of it, so only the
       * old ones could be filtered.
       */

      /* RFC2581: Send DUPACK for fast retransmit */
      tcp_make_ack (tc, b);
      *next0 = tcp_next_output (tc->c_is_ip4);

      /* Mark as DUPACK. We may filter these in output if
       * the burst fills the holes. */
      if (n_data_bytes)
	vnet_buffer (b)->tcp.flags = TCP_BUF_FLAG_DUPACK;

      TCP_EVT_DBG (TCP_EVT_DUPACK_SENT, tc);
      goto done;
    }

in_order:

  /* In order data, enqueue. Fifo figures out by itself if any out-of-order
   * segments can be enqueued after fifo tail offset changes. */
  error = tcp_session_enqueue_data (tc, b, n_data_bytes);

  /* Nothing left to deliver (e.g. pure ACK): drop the buffer */
  if (n_data_bytes == 0)
    {
      *next0 = TCP_NEXT_DROP;
      goto done;
    }

  /* Check if ACK can be delayed */
  if (tcp_can_delack (tc))
    {
      if (!tcp_timer_is_active (tc, TCP_TIMER_DELACK))
	tcp_timer_set (tc, TCP_TIMER_DELACK, TCP_DELACK_TIME);
      goto done;
    }

  /* Immediate ack: rewrite the buffer into an ACK and send it out */
  *next0 = tcp_next_output (tc->c_is_ip4);
  tcp_make_ack (tc, b);

done:
  return error;
}
1505 
/* Per-packet rx trace record captured by the tcp input nodes. */
typedef struct
{
  /* NOTE(review): the member declaration lines were lost in extraction; the
   * formatters below read t->tcp_header and t->tcp_connection, so the struct
   * evidently holds snapshots of both — confirm against upstream. */
} tcp_rx_trace_t;
1511 
/** Long-form trace formatter: tcp header plus connection state snapshot. */
u8 *
format_tcp_rx_trace (u8 * s, va_list * args)
{
  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
  tcp_rx_trace_t *t = va_arg (*args, tcp_rx_trace_t *);
  uword indent = format_get_indent (s);

  s = format (s, "%U\n%U%U",
	      format_tcp_header, &t->tcp_header, 128,
	      format_white_space, indent,
	      /* NOTE(review): the final format argument line (presumably
	       * format_tcp_connection and its arguments, closing the call)
	       * was lost in extraction — confirm against upstream. */

  return s;
}
1527 
1528 u8 *
1529 format_tcp_rx_trace_short (u8 * s, va_list * args)
1530 {
1531  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
1532  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
1533  tcp_rx_trace_t *t = va_arg (*args, tcp_rx_trace_t *);
1534 
1535  s = format (s, "%d -> %d (%U)",
1536  clib_net_to_host_u16 (t->tcp_header.src_port),
1537  clib_net_to_host_u16 (t->tcp_header.dst_port), format_tcp_state,
1538  t->tcp_connection.state);
1539 
1540  return s;
1541 }
1542 
void
/* NOTE(review): the declaration line with the function name was lost in
 * extraction; call sites invoke this as
 * tcp_set_rx_trace_data (t0, tc0, th0, b0, is_ip4) — confirm upstream. */
	 tcp_header_t * th0, vlib_buffer_t * b0, u8 is_ip4)
{
  /* Snapshot the connection when we have one ... */
  if (tc0)
    {
      clib_memcpy (&t0->tcp_connection, tc0, sizeof (t0->tcp_connection));
    }
  else
    {
      /* ... otherwise at least recover the header from the buffer */
      th0 = tcp_buffer_hdr (b0);
    }
  /* Always record the tcp header */
  clib_memcpy (&t0->tcp_header, th0, sizeof (t0->tcp_header));
}
1557 
always_inline void
/* NOTE(review): the declaration line with the function name was lost in
 * extraction; the call site below invokes this as
 * tcp_established_inc_counter (vm, is_ip4, evt, val) — confirm upstream. */
{
  /* Common case: nothing to count */
  if (PREDICT_TRUE (!val))
    return;

  /* Attribute the counter to the v4 or v6 established node */
  if (is_ip4)
    vlib_node_increment_counter (vm, tcp4_established_node.index, evt, val);
  else
    vlib_node_increment_counter (vm, tcp6_established_node.index, evt, val);
}
1569 
			  vlib_frame_t * from_frame, int is_ip4)
/* NOTE(review): the return-type and name lines (tcp46_established_inline
 * (vlib_main_t * vm, vlib_node_runtime_t * node, ...)) were lost in
 * extraction — confirm against upstream.
 *
 * Shared ESTABLISHED-state input handler for the v4/v6 nodes: validates each
 * segment, processes the ACK, enqueues payload to the session layer and
 * handles a received FIN. */
{
  u32 n_left_from, next_index, *from, *to_next;
  u32 my_thread_index = vm->thread_index, errors = 0;
  tcp_main_t *tm = vnet_get_tcp_main ();
  u8 is_fin = 0;

  from = vlib_frame_vector_args (from_frame);
  n_left_from = from_frame->n_vectors;

  next_index = node->cached_next_index;

  /* Standard vlib dual-loop-less dispatch: one buffer at a time */
  while (n_left_from > 0)
    {
      u32 n_left_to_next;

      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
      while (n_left_from > 0 && n_left_to_next > 0)
	{
	  u32 bi0;
	  vlib_buffer_t *b0;
	  tcp_header_t *th0 = 0;
	  tcp_connection_t *tc0;
	  u32 next0 = TCP_ESTABLISHED_NEXT_DROP, error0 = TCP_ERROR_ENQUEUED;

	  bi0 = from[0];
	  to_next[0] = bi0;
	  from += 1;
	  to_next += 1;
	  n_left_from -= 1;
	  n_left_to_next -= 1;

	  b0 = vlib_get_buffer (vm, bi0);
	  tc0 = tcp_connection_get (vnet_buffer (b0)->tcp.connection_index,
				    my_thread_index);

	  if (PREDICT_FALSE (tc0 == 0))
	    {
	      error0 = TCP_ERROR_INVALID_CONNECTION;
	      goto done;
	    }

	  th0 = tcp_buffer_hdr (b0);
	  is_fin = (th0->flags & TCP_FLAG_FIN) != 0;

	  /* SYNs, FINs and data consume sequence numbers */
	  vnet_buffer (b0)->tcp.seq_end = vnet_buffer (b0)->tcp.seq_number
	    + tcp_is_syn (th0) + is_fin + vnet_buffer (b0)->tcp.data_len;

	  /* TODO header prediction fast path */

	  /* 1-4: check SEQ, RST, SYN */
	  if (PREDICT_FALSE (tcp_segment_validate (vm, tc0, b0, th0, &next0)))
	    {
	      error0 = TCP_ERROR_SEGMENT_INVALID;
	      TCP_EVT_DBG (TCP_EVT_SEG_INVALID, tc0,
			   vnet_buffer (b0)->tcp.seq_number,
			   vnet_buffer (b0)->tcp.seq_end);
	      goto done;
	    }

	  /* 5: check the ACK field */
	  if (tcp_rcv_ack (tc0, b0, th0, &next0, &error0))
	    {
	      goto done;
	    }

	  /* 6: check the URG bit TODO */

	  /* 7: process the segment text */

	  vlib_buffer_advance (b0, vnet_buffer (b0)->tcp.data_offset);
	  error0 = tcp_segment_rcv (tm, tc0, b0,
				    vnet_buffer (b0)->tcp.data_len, &next0);

	  /* N.B. buffer is rewritten if segment is ooo. Thus, th0 becomes a
	   * dangling reference. */

	  /* 8: check the FIN bit */
	  if (is_fin)
	    {
	      /* Enter CLOSE-WAIT and notify session. Don't send ACK, instead
	       * wait for session to call close. To avoid lingering
	       * in CLOSE-WAIT, set timer (reuse WAITCLOSE). */
	      tc0->state = TCP_STATE_CLOSE_WAIT;
	      TCP_EVT_DBG (TCP_EVT_FIN_RCVD, tc0);
	      stream_session_disconnect_notify (&tc0->connection);
	      tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, TCP_CLOSEWAIT_TIME);
	    }

	done:
	  b0->error = node->errors[error0];
	  /* NOTE(review): the trace-enable condition line (presumably
	   * checking b0->flags & VLIB_BUFFER_IS_TRACED) was lost in
	   * extraction — confirm against upstream. */
	    {
	      tcp_rx_trace_t *t0 =
		vlib_add_trace (vm, node, b0, sizeof (*t0));
	      tcp_set_rx_trace_data (t0, tc0, th0, b0, is_ip4);
	    }

	  vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
					   n_left_to_next, bi0, next0);
	}

      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
    }

  /* Flush queued app events; count any fifo-full failures on the node */
  errors = session_manager_flush_enqueue_events (my_thread_index);
  tcp_established_inc_counter (vm, is_ip4, TCP_ERROR_EVENT_FIFO_FULL, errors);
  return from_frame->n_vectors;
}
1682 
static uword
/* NOTE(review): the name/parameter line was lost in extraction; per the node
 * registration below this is tcp4_established (vlib_main_t * vm,
 * vlib_node_runtime_t * node, ...). */
		  vlib_frame_t * from_frame)
{
  /* Thin wrapper: run the shared established handler in IPv4 mode */
  return tcp46_established_inline (vm, node, from_frame, 1 /* is_ip4 */ );
}
1689 
static uword
/* NOTE(review): the name/parameter line was lost in extraction; per the node
 * registration below this is tcp6_established (vlib_main_t * vm,
 * vlib_node_runtime_t * node, ...). */
		  vlib_frame_t * from_frame)
{
  /* Thin wrapper: run the shared established handler in IPv6 mode */
  return tcp46_established_inline (vm, node, from_frame, 0 /* is_ip4 */ );
}
1696 
/* *INDENT-OFF* */
/* Graph-node registration for the IPv4 established-state input node.
 * NOTE(review): the VLIB_REGISTER_NODE (tcp4_established_node) line was lost
 * in extraction — confirm against upstream. */
{
  .function = tcp4_established,
  .name = "tcp4-established",
  /* Takes a vector of packets. */
  .vector_size = sizeof (u32),
  .n_errors = TCP_N_ERROR,
  .error_strings = tcp_error_strings,
  .n_next_nodes = TCP_ESTABLISHED_N_NEXT,
  .next_nodes =
  {
#define _(s,n) [TCP_ESTABLISHED_NEXT_##s] = n,
    /* NOTE(review): the foreach_tcp_state_next expansion line was lost in
     * extraction */
#undef _
  },
  .format_trace = format_tcp_rx_trace_short,
};
/* *INDENT-ON* */
1716 
1718 
/* *INDENT-OFF* */
/* Graph-node registration for the IPv6 established-state input node.
 * NOTE(review): the VLIB_REGISTER_NODE (tcp6_established_node) line was lost
 * in extraction — confirm against upstream. */
{
  .function = tcp6_established,
  .name = "tcp6-established",
  /* Takes a vector of packets. */
  .vector_size = sizeof (u32),
  .n_errors = TCP_N_ERROR,
  .error_strings = tcp_error_strings,
  .n_next_nodes = TCP_ESTABLISHED_N_NEXT,
  .next_nodes =
  {
#define _(s,n) [TCP_ESTABLISHED_NEXT_##s] = n,
    /* NOTE(review): the foreach_tcp_state_next expansion line was lost in
     * extraction */
#undef _
  },
  .format_trace = format_tcp_rx_trace_short,
};
/* *INDENT-ON* */
1738 
1739 
1741 
1744 
		      vlib_frame_t * from_frame, int is_ip4)
/* NOTE(review): the return-type and name lines (tcp46_syn_sent_inline
 * (vlib_main_t * vm, vlib_node_runtime_t * node, ...)) were lost in
 * extraction — confirm against upstream.
 *
 * SYN-SENT state handler: processes SYN/SYN-ACK replies to our active open,
 * moves the half-open connection to the current thread's pool and switches
 * to ESTABLISHED or SYN-RCVD as per RFC793 p. 66. */
{
  tcp_main_t *tm = vnet_get_tcp_main ();
  u32 n_left_from, next_index, *from, *to_next;
  u32 my_thread_index = vm->thread_index, errors = 0;
  u8 sst = is_ip4 ? SESSION_TYPE_IP4_TCP : SESSION_TYPE_IP6_TCP;

  from = vlib_frame_vector_args (from_frame);
  n_left_from = from_frame->n_vectors;

  next_index = node->cached_next_index;

  while (n_left_from > 0)
    {
      u32 n_left_to_next;

      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

      while (n_left_from > 0 && n_left_to_next > 0)
	{
	  u32 bi0, ack0, seq0;
	  vlib_buffer_t *b0;
	  tcp_rx_trace_t *t0;
	  tcp_header_t *tcp0 = 0;
	  tcp_connection_t *tc0;
	  tcp_connection_t *new_tc0;
	  u32 next0 = TCP_SYN_SENT_NEXT_DROP, error0 = TCP_ERROR_ENQUEUED;

	  bi0 = from[0];
	  to_next[0] = bi0;
	  from += 1;
	  to_next += 1;
	  n_left_from -= 1;
	  n_left_to_next -= 1;

	  b0 = vlib_get_buffer (vm, bi0);
	  tc0 =
	    /* NOTE(review): the half-open connection lookup call line
	     * (presumably tcp_half_open_connection_get (vnet_buffer (b0)->)
	     * was lost in extraction — confirm against upstream. */
	    tcp.connection_index);

	  ack0 = vnet_buffer (b0)->tcp.ack_number;
	  seq0 = vnet_buffer (b0)->tcp.seq_number;
	  tcp0 = tcp_buffer_hdr (b0);

	  /* Must carry at least one of ACK/RST/SYN to be meaningful here */
	  if (PREDICT_FALSE
	      (!tcp_ack (tcp0) && !tcp_rst (tcp0) && !tcp_syn (tcp0)))
	    goto drop;

	  /* SYNs, FINs and data consume sequence numbers */
	  vnet_buffer (b0)->tcp.seq_end = seq0 + tcp_is_syn (tcp0)
	    + tcp_is_fin (tcp0) + vnet_buffer (b0)->tcp.data_len;

	  /*
	   * 1. check the ACK bit
	   */

	  /*
	   * If the ACK bit is set
	   * If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send a reset (unless
	   * the RST bit is set, if so drop the segment and return)
	   * <SEQ=SEG.ACK><CTL=RST>
	   * and discard the segment. Return.
	   * If SND.UNA =< SEG.ACK =< SND.NXT then the ACK is acceptable.
	   */
	  if (tcp_ack (tcp0))
	    {
	      if (ack0 <= tc0->iss || ack0 > tc0->snd_nxt)
		{
		  if (!tcp_rst (tcp0))
		    tcp_send_reset (b0, is_ip4);

		  goto drop;
		}

	      /* Make sure ACK is valid */
	      if (tc0->snd_una > ack0)
		goto drop;
	    }

	  /*
	   * 2. check the RST bit
	   */

	  if (tcp_rst (tcp0))
	    {
	      /* If ACK is acceptable, signal client that peer is not
	       * willing to accept connection and drop connection*/
	      if (tcp_ack (tcp0))
		{
		  stream_session_connect_notify (&tc0->connection, sst,
						 1 /* fail */ );
		  tcp_connection_cleanup (tc0);
		}
	      goto drop;
	    }

	  /*
	   * 3. check the security and precedence (skipped)
	   */

	  /*
	   * 4. check the SYN bit
	   */

	  /* No SYN flag. Drop. */
	  if (!tcp_syn (tcp0))
	    goto drop;

	  /* Stop connection establishment and retransmit timers */
	  tcp_timer_reset (tc0, TCP_TIMER_ESTABLISH);
	  tcp_timer_reset (tc0, TCP_TIMER_RETRANSMIT_SYN);

	  /* Valid SYN or SYN-ACK. Move connection from half-open pool to
	   * current thread pool. */
	  pool_get (tm->connections[my_thread_index], new_tc0);
	  clib_memcpy (new_tc0, tc0, sizeof (*new_tc0));

	  new_tc0->c_thread_index = my_thread_index;
	  new_tc0->c_c_index = new_tc0 - tm->connections[my_thread_index];

	  /* Cleanup half-open connection XXX lock */
	  pool_put (tm->half_open_connections, tc0);

	  new_tc0->rcv_nxt = vnet_buffer (b0)->tcp.seq_end;
	  new_tc0->irs = seq0;

	  /* Parse options */
	  if (tcp_options_parse (tcp0, &new_tc0->rcv_opts))
	    goto drop;

	  if (tcp_opts_tstamp (&new_tc0->rcv_opts))
	    {
	      new_tc0->tsval_recent = new_tc0->rcv_opts.tsval;
	      new_tc0->tsval_recent_age = tcp_time_now ();
	    }

	  if (tcp_opts_wscale (&new_tc0->rcv_opts))
	    new_tc0->snd_wscale = new_tc0->rcv_opts.wscale;

	  /* Peer's advertised window, scaled */
	  new_tc0->snd_wnd = clib_net_to_host_u16 (tcp0->window)
	    << new_tc0->snd_wscale;
	  new_tc0->snd_wl1 = seq0;
	  new_tc0->snd_wl2 = ack0;

	  tcp_connection_init_vars (new_tc0);

	  /* SYN-ACK: See if we can switch to ESTABLISHED state */
	  if (tcp_ack (tcp0))
	    {
	      /* Our SYN is ACKed: we have iss < ack = snd_una */

	      /* TODO Dequeue acknowledged segments if we support Fast Open */
	      new_tc0->snd_una = ack0;
	      new_tc0->state = TCP_STATE_ESTABLISHED;

	      /* Make sure las is initialized for the wnd computation */
	      new_tc0->rcv_las = new_tc0->rcv_nxt;

	      /* Notify app that we have connection. If session layer can't
	       * allocate session send reset */
	      if (stream_session_connect_notify (&new_tc0->connection, sst,
						 0))
		{
		  tcp_connection_cleanup (new_tc0);
		  tcp_send_reset (b0, is_ip4);
		  goto drop;
		}

	      stream_session_init_fifos_pointers (&new_tc0->connection,
						  new_tc0->irs + 1,
						  new_tc0->iss + 1);
	      /* Make sure after data segment processing ACK is sent */
	      new_tc0->flags |= TCP_CONN_SNDACK;
	    }
	  /* SYN: Simultaneous open. Change state to SYN-RCVD and send SYN-ACK */
	  else
	    {
	      new_tc0->state = TCP_STATE_SYN_RCVD;

	      /* Notify app that we have connection */
	      /* NOTE(review): the call line (presumably
	       * if (stream_session_connect_notify) was lost in extraction —
	       * confirm against upstream. */
	      (&new_tc0->connection, sst, 0))
	      {
		tcp_connection_cleanup (new_tc0);
		tcp_send_reset (b0, is_ip4);
		goto drop;
	      }

	      stream_session_init_fifos_pointers (&new_tc0->connection,
						  new_tc0->irs + 1,
						  new_tc0->iss + 1);
	      /* Answer the simultaneous open with a SYN-ACK */
	      tcp_make_synack (new_tc0, b0);
	      next0 = tcp_next_output (is_ip4);

	      goto drop;
	    }

	  /* Read data, if any */
	  if (vnet_buffer (b0)->tcp.data_len)
	    {
	      vlib_buffer_advance (b0, vnet_buffer (b0)->tcp.data_offset);
	      error0 = tcp_segment_rcv (tm, new_tc0, b0,
					vnet_buffer (b0)->tcp.data_len,
					&next0);
	      if (error0 == TCP_ERROR_PURE_ACK)
		error0 = TCP_ERROR_SYN_ACKS_RCVD;
	    }
	  else
	    {
	      /* No payload: just ack the SYN-ACK */
	      tcp_make_ack (new_tc0, b0);
	      next0 = tcp_next_output (new_tc0->c_is_ip4);
	    }

	drop:

	  b0->error = error0 ? node->errors[error0] : 0;
	  /* NOTE(review): the trace-enable condition line was lost in
	   * extraction — confirm against upstream. */
	    {
	      t0 = vlib_add_trace (vm, node, b0, sizeof (*t0));
	      clib_memcpy (&t0->tcp_header, tcp0, sizeof (t0->tcp_header));
	      clib_memcpy (&t0->tcp_connection, tc0,
			   sizeof (t0->tcp_connection));
	    }

	  vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
					   n_left_to_next, bi0, next0);
	}

      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
    }

  /* Flush queued app events; count fifo-full failures on the right node */
  errors = session_manager_flush_enqueue_events (my_thread_index);
  if (errors)
    {
      if (is_ip4)
	/* NOTE(review): vlib_node_increment_counter call lines were lost in
	 * extraction — confirm against upstream. */
		     TCP_ERROR_EVENT_FIFO_FULL, errors);
      else
		     TCP_ERROR_EVENT_FIFO_FULL, errors);
    }

  return from_frame->n_vectors;
}
1991 
static uword
/* NOTE(review): the name/parameter line was lost in extraction; per the node
 * registration below this is tcp4_syn_sent (vlib_main_t * vm,
 * vlib_node_runtime_t * node, ...). */
	       vlib_frame_t * from_frame)
{
  /* Thin wrapper: run the shared SYN-SENT handler in IPv4 mode */
  return tcp46_syn_sent_inline (vm, node, from_frame, 1 /* is_ip4 */ );
}
1998 
static uword
/* NOTE(review): the name/parameter line was lost in extraction; per the node
 * registration below this is tcp6_syn_sent_rcv (vlib_main_t * vm,
 * vlib_node_runtime_t * node, ...). */
		   vlib_frame_t * from_frame)
{
  /* Thin wrapper: run the shared SYN-SENT handler in IPv6 mode */
  return tcp46_syn_sent_inline (vm, node, from_frame, 0 /* is_ip4 */ );
}
2005 
/* *INDENT-OFF* */
/* Graph-node registration for the IPv4 SYN-SENT input node.
 * NOTE(review): the VLIB_REGISTER_NODE (tcp4_syn_sent_node) line was lost in
 * extraction — confirm against upstream. */
{
  .function = tcp4_syn_sent,
  .name = "tcp4-syn-sent",
  /* Takes a vector of packets. */
  .vector_size = sizeof (u32),
  .n_errors = TCP_N_ERROR,
  .error_strings = tcp_error_strings,
  .n_next_nodes = TCP_SYN_SENT_N_NEXT,
  .next_nodes =
  {
#define _(s,n) [TCP_SYN_SENT_NEXT_##s] = n,
    /* NOTE(review): the foreach_tcp_state_next expansion line was lost in
     * extraction */
#undef _
  },
  .format_trace = format_tcp_rx_trace_short,
};
/* *INDENT-ON* */
2025 
2027 
/* *INDENT-OFF* */
/* Graph-node registration for the IPv6 SYN-SENT input node.
 * NOTE(review): the VLIB_REGISTER_NODE (tcp6_syn_sent_node) line was lost in
 * extraction — confirm against upstream. */
{
  .function = tcp6_syn_sent_rcv,
  .name = "tcp6-syn-sent",
  /* Takes a vector of packets. */
  .vector_size = sizeof (u32),
  .n_errors = TCP_N_ERROR,
  .error_strings = tcp_error_strings,
  .n_next_nodes = TCP_SYN_SENT_N_NEXT,
  .next_nodes =
  {
#define _(s,n) [TCP_SYN_SENT_NEXT_##s] = n,
    /* NOTE(review): the foreach_tcp_state_next expansion line was lost in
     * extraction */
#undef _
  },
  .format_trace = format_tcp_rx_trace_short,
};
/* *INDENT-ON* */
2047 
2049 /**
2050  * Handles reception for all states except LISTEN, SYN-SENT and ESTABLISHED
2051  * as per RFC793 p. 64
2052  */
			 vlib_frame_t * from_frame, int is_ip4)
/* NOTE(review): the return-type and name lines (tcp46_rcv_process_inline
 * (vlib_main_t * vm, vlib_node_runtime_t * node, ...)) were lost in
 * extraction — confirm against upstream.
 *
 * State machine for everything except LISTEN, SYN-SENT and ESTABLISHED
 * (RFC793 p. 64): SYN-RCVD, FIN-WAIT-1/2, CLOSE-WAIT, CLOSING, LAST-ACK,
 * TIME-WAIT and CLOSED. */
{
  tcp_main_t *tm = vnet_get_tcp_main ();
  u32 n_left_from, next_index, *from, *to_next;
  u32 my_thread_index = vm->thread_index, errors = 0;

  from = vlib_frame_vector_args (from_frame);
  n_left_from = from_frame->n_vectors;

  next_index = node->cached_next_index;

  while (n_left_from > 0)
    {
      u32 n_left_to_next;

      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

      while (n_left_from > 0 && n_left_to_next > 0)
	{
	  u32 bi0;
	  vlib_buffer_t *b0;
	  tcp_header_t *tcp0 = 0;
	  tcp_connection_t *tc0;
	  u32 next0 = TCP_RCV_PROCESS_NEXT_DROP, error0 = TCP_ERROR_ENQUEUED;

	  bi0 = from[0];
	  to_next[0] = bi0;
	  from += 1;
	  to_next += 1;
	  n_left_from -= 1;
	  n_left_to_next -= 1;

	  b0 = vlib_get_buffer (vm, bi0);
	  tc0 = tcp_connection_get (vnet_buffer (b0)->tcp.connection_index,
				    my_thread_index);
	  if (PREDICT_FALSE (tc0 == 0))
	    {
	      error0 = TCP_ERROR_INVALID_CONNECTION;
	      goto drop;
	    }

	  tcp0 = tcp_buffer_hdr (b0);

	  /* SYNs, FINs and data consume sequence numbers */
	  vnet_buffer (b0)->tcp.seq_end = vnet_buffer (b0)->tcp.seq_number
	    + tcp_is_syn (tcp0) + tcp_is_fin (tcp0)
	    + vnet_buffer (b0)->tcp.data_len;

	  /*
	   * Special treatment for CLOSED
	   */
	  switch (tc0->state)
	    {
	    case TCP_STATE_CLOSED:
	      goto drop;
	      break;
	    }

	  /*
	   * For all other states (except LISTEN)
	   */

	  /* 1-4: check SEQ, RST, SYN */
	  if (PREDICT_FALSE
	      (tcp_segment_validate (vm, tc0, b0, tcp0, &next0)))
	    {
	      error0 = TCP_ERROR_SEGMENT_INVALID;
	      goto drop;
	    }

	  /* 5: check the ACK field */
	  switch (tc0->state)
	    {
	    case TCP_STATE_SYN_RCVD:
	      /*
	       * If the segment acknowledgment is not acceptable, form a
	       * reset segment,
	       * <SEQ=SEG.ACK><CTL=RST>
	       * and send it.
	       */
	      if (!tcp_rcv_ack_is_acceptable (tc0, b0))
		{
		  tcp_send_reset (b0, is_ip4);
		  goto drop;
		}

	      /* Update rtt and rto */
	      tc0->bytes_acked = 1;
	      tcp_update_rtt (tc0, vnet_buffer (b0)->tcp.ack_number);

	      /* Switch state to ESTABLISHED */
	      tc0->state = TCP_STATE_ESTABLISHED;

	      /* Initialize session variables */
	      tc0->snd_una = vnet_buffer (b0)->tcp.ack_number;
	      tc0->snd_wnd = clib_net_to_host_u16 (tcp0->window)
		<< tc0->rcv_opts.wscale;
	      tc0->snd_wl1 = vnet_buffer (b0)->tcp.seq_number;
	      tc0->snd_wl2 = vnet_buffer (b0)->tcp.ack_number;

	      /* Shoulder tap the server */
	      stream_session_accept_notify (&tc0->connection);

	      /* Reset SYN-ACK retransmit timer */
	      /* NOTE(review): the timer reset call line was lost in
	       * extraction — confirm against upstream. */
	      break;
	    case TCP_STATE_ESTABLISHED:
	      /* We can get packets in established state here because they
	       * were enqueued before state change */
	      if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0))
		goto drop;

	      break;
	    case TCP_STATE_FIN_WAIT_1:
	      /* In addition to the processing for the ESTABLISHED state, if
	       * our FIN is now acknowledged then enter FIN-WAIT-2 and
	       * continue processing in that state. */
	      if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0))
		goto drop;

	      /* If FIN is ACKed */
	      if (tc0->snd_una == tc0->snd_una_max)
		{
		  tc0->state = TCP_STATE_FIN_WAIT_2;
		  /* Stop all timers, 2MSL will be set lower */
		  /* NOTE(review): the timers-reset call line was lost in
		   * extraction — confirm against upstream. */
		}
	      break;
	    case TCP_STATE_FIN_WAIT_2:
	      /* In addition to the processing for the ESTABLISHED state, if
	       * the retransmission queue is empty, the user's CLOSE can be
	       * acknowledged ("ok") but do not delete the TCB. */
	      if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0))
		goto drop;
	      /* check if rtx queue is empty and ack CLOSE TODO */
	      break;
	    case TCP_STATE_CLOSE_WAIT:
	      /* Do the same processing as for the ESTABLISHED state. */
	      if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0))
		goto drop;
	      break;
	    case TCP_STATE_CLOSING:
	      /* In addition to the processing for the ESTABLISHED state, if
	       * the ACK acknowledges our FIN then enter the TIME-WAIT state,
	       * otherwise ignore the segment. */
	      if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0))
		goto drop;

	      /* XXX test that send queue empty */
	      tc0->state = TCP_STATE_TIME_WAIT;
	      goto drop;

	      break;
	    case TCP_STATE_LAST_ACK:
	      /* The only thing that [should] arrive in this state is an
	       * acknowledgment of our FIN. If our FIN is now acknowledged,
	       * delete the TCB, enter the CLOSED state, and return. */

	      if (!tcp_rcv_ack_is_acceptable (tc0, b0))
		goto drop;

	      /* Apparently our FIN was lost */
	      if (tcp_fin (tcp0))
		{
		  /* Don't "make" fin since that increments snd_nxt */
		  tcp_send_fin (tc0);
		  goto drop;
		}

	      tc0->state = TCP_STATE_CLOSED;

	      /* Don't delete the connection/session yet. Instead, wait a
	       * reasonable amount of time until the pipes are cleared. In
	       * particular, this makes sure that we won't have dead sessions
	       * when processing events on the tx path */
	      tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_CLEANUP_TIME);

	      /* Stop retransmit */
	      /* NOTE(review): the retransmit-timer reset call line was lost
	       * in extraction — confirm against upstream. */

	      goto drop;

	      break;
	    case TCP_STATE_TIME_WAIT:
	      /* The only thing that can arrive in this state is a
	       * retransmission of the remote FIN. Acknowledge it, and restart
	       * the 2 MSL timeout. */

	      if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0))
		goto drop;

	      tcp_make_ack (tc0, b0);
	      tcp_timer_reset (tc0, TCP_TIMER_WAITCLOSE);
	      tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME);

	      goto drop;

	      break;
	    default:
	      ASSERT (0);
	    }

	  /* 6: check the URG bit TODO */

	  /* 7: process the segment text */
	  switch (tc0->state)
	    {
	    case TCP_STATE_ESTABLISHED:
	    case TCP_STATE_FIN_WAIT_1:
	    case TCP_STATE_FIN_WAIT_2:
	      vlib_buffer_advance (b0, vnet_buffer (b0)->tcp.data_offset);
	      error0 = tcp_segment_rcv (tm, tc0, b0,
					vnet_buffer (b0)->tcp.data_len,
					&next0);
	      break;
	    case TCP_STATE_CLOSE_WAIT:
	    case TCP_STATE_CLOSING:
	    case TCP_STATE_LAST_ACK:
	    case TCP_STATE_TIME_WAIT:
	      /* This should not occur, since a FIN has been received from the
	       * remote side. Ignore the segment text. */
	      break;
	    }

	  /* 8: check the FIN bit */
	  if (!tcp_fin (tcp0))
	    goto drop;

	  switch (tc0->state)
	    {
	    case TCP_STATE_ESTABLISHED:
	    case TCP_STATE_SYN_RCVD:
	      /* Send FIN-ACK notify app and enter CLOSE-WAIT */
	      /* NOTE(review): a call line was lost in extraction here —
	       * confirm against upstream. */
	      tcp_make_fin (tc0, b0);
	      next0 = tcp_next_output (tc0->c_is_ip4);
	      stream_session_disconnect_notify (&tc0->connection);
	      tc0->state = TCP_STATE_CLOSE_WAIT;
	      break;
	    case TCP_STATE_CLOSE_WAIT:
	    case TCP_STATE_CLOSING:
	    case TCP_STATE_LAST_ACK:
	      /* move along .. */
	      break;
	    case TCP_STATE_FIN_WAIT_1:
	      tc0->state = TCP_STATE_TIME_WAIT;
	      /* NOTE(review): a timer call line was lost in extraction here
	       * — confirm against upstream. */
	      tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME);
	      break;
	    case TCP_STATE_FIN_WAIT_2:
	      /* Got FIN, send ACK! */
	      tc0->state = TCP_STATE_TIME_WAIT;
	      /* NOTE(review): a timer call line was lost in extraction here
	       * — confirm against upstream. */
	      tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, TCP_CLOSEWAIT_TIME);
	      tcp_make_ack (tc0, b0);
	      next0 = tcp_next_output (is_ip4);
	      break;
	    case TCP_STATE_TIME_WAIT:
	      /* Remain in the TIME-WAIT state. Restart the 2 MSL time-wait
	       * timeout.
	       */
	      tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME);
	      break;
	    }
	  TCP_EVT_DBG (TCP_EVT_FIN_RCVD, tc0);

	drop:
	  b0->error = error0 ? node->errors[error0] : 0;

	  /* NOTE(review): the trace-enable condition line was lost in
	   * extraction — confirm against upstream. */
	    {
	      tcp_rx_trace_t *t0 =
		vlib_add_trace (vm, node, b0, sizeof (*t0));
	      tcp_set_rx_trace_data (t0, tc0, tcp0, b0, is_ip4);
	    }

	  vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
					   n_left_to_next, bi0, next0);
	}

      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
    }

  /* Flush queued app events; count fifo-full failures on the right node */
  errors = session_manager_flush_enqueue_events (my_thread_index);
  if (errors)
    {
      if (is_ip4)
	/* NOTE(review): vlib_node_increment_counter call lines were lost in
	 * extraction — confirm against upstream. */
		     TCP_ERROR_EVENT_FIFO_FULL, errors);
      else
		     TCP_ERROR_EVENT_FIFO_FULL, errors);
    }

  return from_frame->n_vectors;
}
2351 
static uword
/* NOTE(review): the name/parameter line was lost in extraction; per the node
 * registration below this is tcp4_rcv_process (vlib_main_t * vm,
 * vlib_node_runtime_t * node, ...). */
		  vlib_frame_t * from_frame)
{
  /* Thin wrapper: run the shared rcv-process handler in IPv4 mode */
  return tcp46_rcv_process_inline (vm, node, from_frame, 1 /* is_ip4 */ );
}
2358 
static uword
/* NOTE(review): the name/parameter line was lost in extraction; per the node
 * registration below this is tcp6_rcv_process (vlib_main_t * vm,
 * vlib_node_runtime_t * node, ...). */
		  vlib_frame_t * from_frame)
{
  /* Thin wrapper: run the shared rcv-process handler in IPv6 mode */
  return tcp46_rcv_process_inline (vm, node, from_frame, 0 /* is_ip4 */ );
}
2365 
/* *INDENT-OFF* */
/* Graph-node registration for the IPv4 rcv-process input node.
 * NOTE(review): the VLIB_REGISTER_NODE (tcp4_rcv_process_node) line was lost
 * in extraction — confirm against upstream. */
{
  .function = tcp4_rcv_process,
  .name = "tcp4-rcv-process",
  /* Takes a vector of packets. */
  .vector_size = sizeof (u32),
  .n_errors = TCP_N_ERROR,
  .error_strings = tcp_error_strings,
  .n_next_nodes = TCP_RCV_PROCESS_N_NEXT,
  .next_nodes =
  {
#define _(s,n) [TCP_RCV_PROCESS_NEXT_##s] = n,
    /* NOTE(review): the foreach_tcp_state_next expansion line was lost in
     * extraction */
#undef _
  },
  .format_trace = format_tcp_rx_trace_short,
};
/* *INDENT-ON* */
2385 
2387 
2388 /* *INDENT-OFF* */
2390 {
2391  .function = tcp6_rcv_process,
2392  .name = "tcp6-rcv-process",
2393  /* Takes a vector of packets. */
2394  .vector_size = sizeof (u32),
2395  .n_errors = TCP_N_ERROR,
2396  .error_strings = tcp_error_strings,
2397  .n_next_nodes = TCP_RCV_PROCESS_N_NEXT,
2398  .next_nodes =
2399  {
2400 #define _(s,n) [TCP_RCV_PROCESS_NEXT_##s] = n,
2402 #undef _
2403  },
2404  .format_trace = format_tcp_rx_trace_short,
2405 };
2406 /* *INDENT-ON* */
2407 
2409 
2412 
2413 /**
2414  * LISTEN state processing as per RFC 793 p. 65
2415  */
2418  vlib_frame_t * from_frame, int is_ip4)
2419 {
2420  u32 n_left_from, next_index, *from, *to_next;
2421  u32 my_thread_index = vm->thread_index;
2422  tcp_main_t *tm = vnet_get_tcp_main ();
2423  u8 sst = is_ip4 ? SESSION_TYPE_IP4_TCP : SESSION_TYPE_IP6_TCP;
2424 
2425  from = vlib_frame_vector_args (from_frame);
2426  n_left_from = from_frame->n_vectors;
2427 
2428  next_index = node->cached_next_index;
2429 
2430  while (n_left_from > 0)
2431  {
2432  u32 n_left_to_next;
2433 
2434  vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
2435 
2436  while (n_left_from > 0 && n_left_to_next > 0)
2437  {
2438  u32 bi0;
2439  vlib_buffer_t *b0;
2440  tcp_rx_trace_t *t0;
2441  tcp_header_t *th0 = 0;
2442  tcp_connection_t *lc0;
2443  ip4_header_t *ip40;
2444  ip6_header_t *ip60;
2445  tcp_connection_t *child0;
2446  u32 error0 = TCP_ERROR_SYNS_RCVD, next0 = TCP_LISTEN_NEXT_DROP;
2447 
2448  bi0 = from[0];
2449  to_next[0] = bi0;
2450  from += 1;
2451  to_next += 1;
2452  n_left_from -= 1;
2453  n_left_to_next -= 1;
2454 
2455  b0 = vlib_get_buffer (vm, bi0);
2456  lc0 = tcp_listener_get (vnet_buffer (b0)->tcp.connection_index);
2457 
2458  if (is_ip4)
2459  {
2460  ip40 = vlib_buffer_get_current (b0);
2461  th0 = ip4_next_header (ip40);
2462  }
2463  else
2464  {
2465  ip60 = vlib_buffer_get_current (b0);
2466  th0 = ip6_next_header (ip60);
2467  }
2468 
2469  /* Create child session. For syn-flood protection use filter */
2470 
2471  /* 1. first check for an RST: handled in dispatch */
2472  /* if (tcp_rst (th0))
2473  goto drop; */
2474 
2475  /* 2. second check for an ACK: handled in dispatch */
2476  /* if (tcp_ack (th0))
2477  {
2478  tcp_send_reset (b0, is_ip4);
2479  goto drop;
2480  } */
2481 
2482  /* 3. check for a SYN (did that already) */
2483 
2484  /* Create child session and send SYN-ACK */
2485  pool_get (tm->connections[my_thread_index], child0);
2486  memset (child0, 0, sizeof (*child0));
2487 
2488  child0->c_c_index = child0 - tm->connections[my_thread_index];
2489  child0->c_lcl_port = lc0->c_lcl_port;
2490  child0->c_rmt_port = th0->src_port;
2491  child0->c_is_ip4 = is_ip4;
2492  child0->c_thread_index = my_thread_index;
2493  child0->state = TCP_STATE_SYN_RCVD;
2494 
2495  if (is_ip4)
2496  {
2497  child0->c_lcl_ip4.as_u32 = ip40->dst_address.as_u32;
2498  child0->c_rmt_ip4.as_u32 = ip40->src_address.as_u32;
2499  }
2500  else
2501  {
2502  clib_memcpy (&child0->c_lcl_ip6, &ip60->dst_address,
2503  sizeof (ip6_address_t));
2504  clib_memcpy (&child0->c_rmt_ip6, &ip60->src_address,
2505  sizeof (ip6_address_t));
2506  }
2507 
2508  if (stream_session_accept (&child0->connection, lc0->c_s_index, sst,
2509  0 /* notify */ ))
2510  {
2511  error0 = TCP_ERROR_CREATE_SESSION_FAIL;
2512  goto drop;
2513  }
2514 
2515  if (tcp_options_parse (th0, &child0->rcv_opts))
2516  {
2517  goto drop;
2518  }
2519 
2520  child0->irs = vnet_buffer (b0)->tcp.seq_number;
2521  child0->rcv_nxt = vnet_buffer (b0)->tcp.seq_number + 1;
2522  child0->rcv_las = child0->rcv_nxt;
2523 
2524  /* RFC1323: TSval timestamps sent on {SYN} and {SYN,ACK}
2525  * segments are used to initialize PAWS. */
2526  if (tcp_opts_tstamp (&child0->rcv_opts))
2527  {
2528  child0->tsval_recent = child0->rcv_opts.tsval;
2529  child0->tsval_recent_age = tcp_time_now ();
2530  }
2531 
2532  if (tcp_opts_wscale (&child0->rcv_opts))
2533  child0->snd_wscale = child0->rcv_opts.wscale;
2534 
2535  child0->snd_wnd = clib_net_to_host_u16 (th0->window)
2536  << child0->snd_wscale;
2537  child0->snd_wl1 = vnet_buffer (b0)->tcp.seq_number;
2538  child0->snd_wl2 = vnet_buffer (b0)->tcp.ack_number;
2539 
2540  tcp_connection_init_vars (child0);
2541 
2542  TCP_EVT_DBG (TCP_EVT_SYN_RCVD, child0);
2543 
2544  /* Reuse buffer to make syn-ack and send */
2545  tcp_make_synack (child0, b0);
2546  next0 = tcp_next_output (is_ip4);
2547 
2548  /* Init fifo pointers after we have iss */
2549  stream_session_init_fifos_pointers (&child0->connection,
2550  child0->irs + 1,
2551  child0->iss + 1);
2552  drop:
2554  {
2555  t0 = vlib_add_trace (vm, node, b0, sizeof (*t0));
2556  clib_memcpy (&t0->tcp_header, th0, sizeof (t0->tcp_header));
2557  clib_memcpy (&t0->tcp_connection, lc0,
2558  sizeof (t0->tcp_connection));
2559  }
2560 
2561  b0->error = node->errors[error0];
2562 
2563  vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
2564  n_left_to_next, bi0, next0);
2565  }
2566 
2567  vlib_put_next_frame (vm, node, next_index, n_left_to_next);
2568  }
2569  return from_frame->n_vectors;
2570 }
2571 
2572 static uword
2574  vlib_frame_t * from_frame)
2575 {
2576  return tcp46_listen_inline (vm, node, from_frame, 1 /* is_ip4 */ );
2577 }
2578 
2579 static uword
2581  vlib_frame_t * from_frame)
2582 {
2583  return tcp46_listen_inline (vm, node, from_frame, 0 /* is_ip4 */ );
2584 }
2585 
2586 /* *INDENT-OFF* */
2588 {
2589  .function = tcp4_listen,
2590  .name = "tcp4-listen",
2591  /* Takes a vector of packets. */
2592  .vector_size = sizeof (u32),
2593  .n_errors = TCP_N_ERROR,
2594  .error_strings = tcp_error_strings,
2595  .n_next_nodes = TCP_LISTEN_N_NEXT,
2596  .next_nodes =
2597  {
2598 #define _(s,n) [TCP_LISTEN_NEXT_##s] = n,
2600 #undef _
2601  },
2602  .format_trace = format_tcp_rx_trace_short,
2603 };
2604 /* *INDENT-ON* */
2605 
2607 
2608 /* *INDENT-OFF* */
2610 {
2611  .function = tcp6_listen,
2612  .name = "tcp6-listen",
2613  /* Takes a vector of packets. */
2614  .vector_size = sizeof (u32),
2615  .n_errors = TCP_N_ERROR,
2616  .error_strings = tcp_error_strings,
2617  .n_next_nodes = TCP_LISTEN_N_NEXT,
2618  .next_nodes =
2619  {
2620 #define _(s,n) [TCP_LISTEN_NEXT_##s] = n,
2622 #undef _
2623  },
2624  .format_trace = format_tcp_rx_trace_short,
2625 };
2626 /* *INDENT-ON* */
2627 
2629 
2632 
/* Next-node indices for the tcp-input dispatch node; member order must
 * match the arc order in foreach_tcp4_input_next/foreach_tcp6_input_next,
 * since those lists initialize .next_nodes via [TCP_INPUT_NEXT_##s] = n */
typedef enum _tcp_input_next
{
  TCP_INPUT_NEXT_DROP,
  TCP_INPUT_NEXT_LISTEN,
  TCP_INPUT_NEXT_RCV_PROCESS,
  TCP_INPUT_NEXT_SYN_SENT,
  TCP_INPUT_NEXT_ESTABLISHED,
  TCP_INPUT_NEXT_RESET,
  TCP_INPUT_N_NEXT
} tcp_input_next_t;
2644 #define foreach_tcp4_input_next \
2645  _ (DROP, "error-drop") \
2646  _ (LISTEN, "tcp4-listen") \
2647  _ (RCV_PROCESS, "tcp4-rcv-process") \
2648  _ (SYN_SENT, "tcp4-syn-sent") \
2649  _ (ESTABLISHED, "tcp4-established") \
2650  _ (RESET, "tcp4-reset")
2651 
2652 #define foreach_tcp6_input_next \
2653  _ (DROP, "error-drop") \
2654  _ (LISTEN, "tcp6-listen") \
2655  _ (RCV_PROCESS, "tcp6-rcv-process") \
2656  _ (SYN_SENT, "tcp6-syn-sent") \
2657  _ (ESTABLISHED, "tcp6-established") \
2658  _ (RESET, "tcp6-reset")
2659 
2660 #define filter_flags (TCP_FLAG_SYN|TCP_FLAG_ACK|TCP_FLAG_RST|TCP_FLAG_FIN)
2661 
2664  vlib_frame_t * from_frame, int is_ip4)
2665 {
2666  u32 n_left_from, next_index, *from, *to_next;
2667  u32 my_thread_index = vm->thread_index;
2668  tcp_main_t *tm = vnet_get_tcp_main ();
2669 
2670  from = vlib_frame_vector_args (from_frame);
2671  n_left_from = from_frame->n_vectors;
2672 
2673  next_index = node->cached_next_index;
2674 
2675  while (n_left_from > 0)
2676  {
2677  u32 n_left_to_next;
2678 
2679  vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
2680 
2681  while (n_left_from > 0 && n_left_to_next > 0)
2682  {
2683  int n_advance_bytes0, n_data_bytes0;
2684  u32 bi0;
2685  vlib_buffer_t *b0;
2686  tcp_header_t *tcp0 = 0;
2687  tcp_connection_t *tc0;
2688  ip4_header_t *ip40;
2689  ip6_header_t *ip60;
2690  u32 error0 = TCP_ERROR_NO_LISTENER, next0 = TCP_INPUT_NEXT_DROP;
2691  u8 flags0;
2692 
2693  bi0 = from[0];
2694  to_next[0] = bi0;
2695  from += 1;
2696  to_next += 1;
2697  n_left_from -= 1;
2698  n_left_to_next -= 1;
2699 
2700  b0 = vlib_get_buffer (vm, bi0);
2701  vnet_buffer (b0)->tcp.flags = 0;
2702 
2703  /* Checksum computed by ipx_local no need to compute again */
2704 
2705  if (is_ip4)
2706  {
2707  ip40 = vlib_buffer_get_current (b0);
2708  tcp0 = ip4_next_header (ip40);
2709  n_advance_bytes0 = (ip4_header_bytes (ip40)
2710  + tcp_header_bytes (tcp0));
2711  n_data_bytes0 = clib_net_to_host_u16 (ip40->length)
2712  - n_advance_bytes0;
2713 
2714  /* lookup session */
2715  tc0 =
2716  (tcp_connection_t *)
2718  &ip40->src_address,
2719  tcp0->dst_port,
2720  tcp0->src_port,
2721  SESSION_TYPE_IP4_TCP,
2722  my_thread_index);
2723  }
2724  else
2725  {
2726  ip60 = vlib_buffer_get_current (b0);
2727  tcp0 = ip6_next_header (ip60);
2728  n_advance_bytes0 = tcp_header_bytes (tcp0);
2729  n_data_bytes0 = clib_net_to_host_u16 (ip60->payload_length)
2730  - n_advance_bytes0;
2731  n_advance_bytes0 += sizeof (ip60[0]);
2732 
2733  tc0 =
2734  (tcp_connection_t *)
2736  &ip60->dst_address,
2737  tcp0->src_port,
2738  tcp0->dst_port,
2739  SESSION_TYPE_IP6_TCP,
2740  my_thread_index);
2741  }
2742 
2743  /* Length check */
2744  if (PREDICT_FALSE (n_advance_bytes0 < 0))
2745  {
2746  error0 = TCP_ERROR_LENGTH;
2747  goto done;
2748  }
2749 
2750  /* Session exists */
2751  if (PREDICT_TRUE (0 != tc0))
2752  {
2753  /* Save connection index */
2754  vnet_buffer (b0)->tcp.connection_index = tc0->c_c_index;
2755  vnet_buffer (b0)->tcp.seq_number =
2756  clib_net_to_host_u32 (tcp0->seq_number);
2757  vnet_buffer (b0)->tcp.ack_number =
2758  clib_net_to_host_u32 (tcp0->ack_number);
2759 
2760  vnet_buffer (b0)->tcp.hdr_offset = (u8 *) tcp0
2761  - (u8 *) vlib_buffer_get_current (b0);
2762  vnet_buffer (b0)->tcp.data_offset = n_advance_bytes0;
2763  vnet_buffer (b0)->tcp.data_len = n_data_bytes0;
2764 
2765  flags0 = tcp0->flags & filter_flags;
2766  next0 = tm->dispatch_table[tc0->state][flags0].next;
2767  error0 = tm->dispatch_table[tc0->state][flags0].error;
2768 
2769  if (PREDICT_FALSE (error0 == TCP_ERROR_DISPATCH
2770  || next0 == TCP_INPUT_NEXT_RESET))
2771  {
2772  /* Overload tcp flags to store state */
2773  tcp_state_t state0 = tc0->state;
2774  vnet_buffer (b0)->tcp.flags = tc0->state;
2775 
2776  if (error0 == TCP_ERROR_DISPATCH)
2777  clib_warning ("disp error state %U flags %U",
2779  (int) flags0);
2780  }
2781  }
2782  else
2783  {
2784  /* Send reset */
2785  next0 = TCP_INPUT_NEXT_RESET;
2786  error0 = TCP_ERROR_NO_LISTENER;
2787  }
2788 
2789  done:
2790  b0->error = error0 ? node->errors[error0] : 0;
2791 
2793  {
2794  tcp_rx_trace_t *t0 =
2795  vlib_add_trace (vm, node, b0, sizeof (*t0));
2796  tcp_set_rx_trace_data (t0, tc0, tcp0, b0, is_ip4);
2797  }
2798  vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
2799  n_left_to_next, bi0, next0);
2800  }
2801 
2802  vlib_put_next_frame (vm, node, next_index, n_left_to_next);
2803  }
2804 
2805  return from_frame->n_vectors;
2806 }
2807 
2808 static uword
2810  vlib_frame_t * from_frame)
2811 {
2812  return tcp46_input_inline (vm, node, from_frame, 1 /* is_ip4 */ );
2813 }
2814 
2815 static uword
2817  vlib_frame_t * from_frame)
2818 {
2819  return tcp46_input_inline (vm, node, from_frame, 0 /* is_ip4 */ );
2820 }
2821 
2822 /* *INDENT-OFF* */
2824 {
2825  .function = tcp4_input,
2826  .name = "tcp4-input",
2827  /* Takes a vector of packets. */
2828  .vector_size = sizeof (u32),
2829  .n_errors = TCP_N_ERROR,
2830  .error_strings = tcp_error_strings,
2831  .n_next_nodes = TCP_INPUT_N_NEXT,
2832  .next_nodes =
2833  {
2834 #define _(s,n) [TCP_INPUT_NEXT_##s] = n,
2836 #undef _
2837  },
2838  .format_buffer = format_tcp_header,
2839  .format_trace = format_tcp_rx_trace,
2840 };
2841 /* *INDENT-ON* */
2842 
2844 
2845 /* *INDENT-OFF* */
2847 {
2848  .function = tcp6_input,
2849  .name = "tcp6-input",
2850  /* Takes a vector of packets. */
2851  .vector_size = sizeof (u32),
2852  .n_errors = TCP_N_ERROR,
2853  .error_strings = tcp_error_strings,
2854  .n_next_nodes = TCP_INPUT_N_NEXT,
2855  .next_nodes =
2856  {
2857 #define _(s,n) [TCP_INPUT_NEXT_##s] = n,
2859 #undef _
2860  },
2861  .format_buffer = format_tcp_header,
2862  .format_trace = format_tcp_rx_trace,
2863 };
2864 /* *INDENT-ON* */
2865 
2867 
2868 static void
2870 {
2871  int i, j;
2872  for (i = 0; i < ARRAY_LEN (tm->dispatch_table); i++)
2873  for (j = 0; j < ARRAY_LEN (tm->dispatch_table[i]); j++)
2874  {
2875  tm->dispatch_table[i][j].next = TCP_INPUT_NEXT_DROP;
2876  tm->dispatch_table[i][j].error = TCP_ERROR_DISPATCH;
2877  }
2878 
2879 #define _(t,f,n,e) \
2880 do { \
2881  tm->dispatch_table[TCP_STATE_##t][f].next = (n); \
2882  tm->dispatch_table[TCP_STATE_##t][f].error = (e); \
2883 } while (0)
2884 
2885  /* SYNs for new connections -> tcp-listen. */
2886  _(LISTEN, TCP_FLAG_SYN, TCP_INPUT_NEXT_LISTEN, TCP_ERROR_NONE);
2887  _(LISTEN, TCP_FLAG_ACK, TCP_INPUT_NEXT_RESET, TCP_ERROR_NONE);
2888  _(LISTEN, TCP_FLAG_RST, TCP_INPUT_NEXT_DROP, TCP_ERROR_NONE);
2889  /* ACK for for a SYN-ACK -> tcp-rcv-process. */
2890  _(SYN_RCVD, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
2891  _(SYN_RCVD, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
2892  /* SYN-ACK for a SYN */
2894  TCP_ERROR_NONE);
2895  _(SYN_SENT, TCP_FLAG_ACK, TCP_INPUT_NEXT_SYN_SENT, TCP_ERROR_NONE);
2896  _(SYN_SENT, TCP_FLAG_RST, TCP_INPUT_NEXT_SYN_SENT, TCP_ERROR_NONE);
2898  TCP_ERROR_NONE);
2899  /* ACK for for established connection -> tcp-established. */
2900  _(ESTABLISHED, TCP_FLAG_ACK, TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE);
2901  /* FIN for for established connection -> tcp-established. */
2902  _(ESTABLISHED, TCP_FLAG_FIN, TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE);
2904  TCP_ERROR_NONE);
2905  _(ESTABLISHED, TCP_FLAG_RST, TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE);
2907  TCP_ERROR_NONE);
2908  /* ACK or FIN-ACK to our FIN */
2909  _(FIN_WAIT_1, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
2911  TCP_ERROR_NONE);
2912  /* FIN in reply to our FIN from the other side */
2913  _(FIN_WAIT_1, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
2914  /* FIN confirming that the peer (app) has closed */
2915  _(FIN_WAIT_2, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
2916  _(FIN_WAIT_2, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
2918  TCP_ERROR_NONE);
2919  _(CLOSE_WAIT, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
2921  TCP_ERROR_NONE);
2922  _(LAST_ACK, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
2923  _(LAST_ACK, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
2925  TCP_ERROR_NONE);
2926  _(LAST_ACK, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
2927  _(TIME_WAIT, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
2929  TCP_ERROR_NONE);
2930  _(CLOSED, TCP_FLAG_ACK, TCP_INPUT_NEXT_RESET, TCP_ERROR_CONNECTION_CLOSED);
2931  _(CLOSED, TCP_FLAG_RST, TCP_INPUT_NEXT_DROP, TCP_ERROR_CONNECTION_CLOSED);
2932 #undef _
2933 }
2934 
2935 clib_error_t *
2937 {
2938  clib_error_t *error = 0;
2939  tcp_main_t *tm = vnet_get_tcp_main ();
2940 
2941  if ((error = vlib_call_init_function (vm, tcp_init)))
2942  return error;
2943 
2944  /* Initialize dispatch table. */
2946 
2947  return error;
2948 }
2949 
2951 
2952 /*
2953  * fd.io coding-style-patch-verification: ON
2954  *
2955  * Local Variables:
2956  * eval: (c-set-style "gnu")
2957  * End:
2958  */
#define tcp_in_cong_recovery(tc)
Definition: tcp.h:284
static int tcp_session_enqueue_ooo(tcp_connection_t *tc, vlib_buffer_t *b, u16 data_len)
Enqueue out-of-order data.
Definition: tcp_input.c:1358
static void tcp_update_timestamp(tcp_connection_t *tc, u32 seq, u32 seq_end)
Update tsval recent.
Definition: tcp_input.c:242
int session_manager_flush_enqueue_events(u32 thread_index)
Flushes queue of sessions that are to be notified of new data enqueued events.
Definition: session.c:656
static u8 tcp_should_fastrecover(tcp_connection_t *tc)
Definition: tcp_input.c:975
#define TCP_2MSL_TIME
Definition: tcp.h:98
End of options.
Definition: tcp_packet.h:104
sll srl srl sll sra u16x4 i
Definition: vector_sse2.h:337
#define tcp_fastrecovery_1_smss_off(tc)
Definition: tcp.h:282
#define clib_min(x, y)
Definition: clib.h:332
#define CLIB_UNUSED(x)
Definition: clib.h:79
static void tcp_cc_update(tcp_connection_t *tc, vlib_buffer_t *b)
Definition: tcp_input.c:947
vlib_node_registration_t tcp6_rcv_process_node
(constructor) VLIB_REGISTER_NODE (tcp6_rcv_process_node)
Definition: tcp_input.c:2389
#define tcp_in_recovery(tc)
Definition: tcp.h:278
#define TCP_OPTION_LEN_SACK_PERMITTED
Definition: tcp_packet.h:167
static int tcp_rcv_ack_is_acceptable(tcp_connection_t *tc0, vlib_buffer_t *tb0)
Definition: tcp_input.c:362
#define seq_leq(_s1, _s2)
Definition: tcp.h:438
void tcp_make_fin(tcp_connection_t *tc, vlib_buffer_t *b)
Convert buffer to FIN-ACK.
Definition: tcp_output.c:518
void tcp_send_reset(vlib_buffer_t *pkt, u8 is_ip4)
Send reset without reusing existing buffer.
Definition: tcp_output.c:684
struct _sack_block sack_block_t
int stream_session_accept(transport_connection_t *tc, u32 listener_index, u8 sst, u8 notify)
Accept a stream session.
Definition: session.c:838
void tcp_rcv_sacks(tcp_connection_t *tc, u32 ack)
Definition: tcp_input.c:686
ip4_address_t src_address
Definition: ip4_packet.h:164
static u8 tcp_cc_is_spurious_retransmit(tcp_connection_t *tc)
Definition: tcp_input.c:917
static uword tcp46_input_inline(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *from_frame, int is_ip4)
Definition: tcp_input.c:2663
enum _tcp_state_next tcp_state_next_t
#define tcp_rst(_th)
Definition: tcp_packet.h:81
static uword tcp6_input(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *from_frame)
Definition: tcp_input.c:2816
Selective Ack permitted.
Definition: tcp_packet.h:108
#define TCP_FLAG_SYN
Definition: fa_node.h:8
#define tcp_opts_tstamp(_to)
Definition: tcp_packet.h:157
#define PREDICT_TRUE(x)
Definition: clib.h:98
void tcp_fast_retransmit(tcp_connection_t *tc)
Do fast retransmit.
Definition: tcp_output.c:1408
static int tcp_segment_validate(vlib_main_t *vm, tcp_connection_t *tc0, vlib_buffer_t *b0, tcp_header_t *th0, u32 *next0)
Validate incoming segment as per RFC793 p.
Definition: tcp_input.c:269
static void tcp_dispatch_table_init(tcp_main_t *tm)
Definition: tcp_input.c:2869
int stream_session_enqueue_data(transport_connection_t *tc, vlib_buffer_t *b, u32 offset, u8 queue_event, u8 is_in_order)
Definition: session.c:494
static int ip4_header_bytes(ip4_header_t *i)
Definition: ip4_packet.h:227
struct _sack_scoreboard sack_scoreboard_t
static tcp_connection_t * tcp_half_open_connection_get(u32 conn_index)
Definition: tcp.h:415
void tcp_update_rto(tcp_connection_t *tc)
Definition: tcp_input.c:405
void vlib_put_next_frame(vlib_main_t *vm, vlib_node_runtime_t *r, u32 next_index, u32 n_vectors_left)
Release pointer to next frame vector data.
Definition: main.c:459
void scoreboard_update_bytes(tcp_connection_t *tc, sack_scoreboard_t *sb)
Definition: tcp_input.c:578
#define tcp_doff(_th)
Definition: tcp_packet.h:78
struct _tcp_main tcp_main_t
u32 thread_index
Definition: main.h:159
void tcp_connection_timers_reset(tcp_connection_t *tc)
Stop all connection timers.
Definition: tcp.c:337
#define vec_add1(V, E)
Add 1 element to end of vector (unspecified alignment).
Definition: vec.h:522
#define tcp_recovery_off(tc)
Definition: tcp.h:276
#define clib_abs(x)
Definition: clib.h:339
struct _vlib_node_registration vlib_node_registration_t
static sack_scoreboard_hole_t * scoreboard_prev_hole(sack_scoreboard_t *sb, sack_scoreboard_hole_t *hole)
Definition: tcp.h:699
static int tcp_update_rtt(tcp_connection_t *tc, u32 ack)
Update RTT estimate and RTO timer.
Definition: tcp_input.c:420
#define vec_add2(V, P, N)
Add N elements to end of vector V, return pointer to new elements in P.
Definition: vec.h:561
vlib_node_registration_t tcp4_rcv_process_node
(constructor) VLIB_REGISTER_NODE (tcp4_rcv_process_node)
Definition: tcp_input.c:2367
struct _tcp_connection tcp_connection_t
u8 * format(u8 *s, const char *fmt,...)
Definition: format.c:419
#define tcp_opts_sack(_to)
Definition: tcp_packet.h:159
#define tcp_fin(_th)
Definition: tcp_packet.h:79
void tcp_fast_retransmit_sack(tcp_connection_t *tc)
Do fast retransmit with SACKs.
Definition: tcp_output.c:1291
tcp_connection_t tcp_connection
Definition: tcp_input.c:1509
static void tcp_cc_congestion_undo(tcp_connection_t *tc)
Definition: tcp_input.c:904
vlib_error_t * errors
Vector of errors for this node.
Definition: node.h:419
No operation.
Definition: tcp_packet.h:105
format_function_t format_tcp_flags
Definition: tcp.h:63
#define pool_get(P, E)
Allocate an object E from a pool P (unspecified alignment).
Definition: pool.h:200
u8 n_sack_blocks
Number of SACKs blocks.
Definition: tcp_packet.h:152
struct _tcp_header tcp_header_t
static u32 tcp_available_snd_space(const tcp_connection_t *tc)
Definition: tcp.h:511
static uword tcp6_listen(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *from_frame)
Definition: tcp_input.c:2580
ip6_address_t src_address
Definition: ip6_packet.h:341
vlib_node_registration_t tcp6_syn_sent_node
(constructor) VLIB_REGISTER_NODE (tcp6_syn_sent_node)
Definition: tcp_input.c:1743
struct _sack_scoreboard_hole sack_scoreboard_hole_t
u8 wscale
Window scale advertised.
Definition: tcp_packet.h:148
static void tcp_established_inc_counter(vlib_main_t *vm, u8 is_ip4, u8 evt, u8 val)
Definition: tcp_input.c:1559
#define vec_reset_length(v)
Reset vector length to zero NULL-pointer tolerant.
static void tcp_dequeue_acked(tcp_connection_t *tc, u32 ack)
Dequeue bytes that have been acked and while at it update RTT estimates.
Definition: tcp_input.c:465
struct _stream_session_t stream_session_t
#define tcp_fastrecovery_on(tc)
Definition: tcp.h:273
Limit MSS.
Definition: tcp_packet.h:106
static uword tcp4_listen(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *from_frame)
Definition: tcp_input.c:2573
static u32 scoreboard_hole_index(sack_scoreboard_t *sb, sack_scoreboard_hole_t *hole)
Definition: tcp.h:748
#define tcp_is_fin(_th)
Definition: tcp_packet.h:90
static uword tcp6_rcv_process(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *from_frame)
Definition: tcp_input.c:2360
static uword tcp4_syn_sent(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *from_frame)
Definition: tcp_input.c:1993
#define seq_gt(_s1, _s2)
Definition: tcp.h:439
#define VLIB_INIT_FUNCTION(x)
Definition: init.h:111
vlib_node_registration_t tcp4_established_node
(constructor) VLIB_REGISTER_NODE (tcp4_established_node)
Definition: tcp_input.c:78
#define TCP_CLOSEWAIT_TIME
Definition: tcp.h:99
void stream_session_accept_notify(transport_connection_t *tc)
Definition: session.c:753
#define always_inline
Definition: clib.h:84
static uword format_get_indent(u8 *s)
Definition: format.h:72
#define TCP_OPTION_LEN_SACK_BLOCK
Definition: tcp_packet.h:169
ip4_address_t dst_address
Definition: ip4_packet.h:164
#define TCP_FLAG_ACK
Definition: fa_node.h:11
u8 * format_white_space(u8 *s, va_list *va)
Definition: std-formats.c:113
#define TCP_DELACK_TIME
Definition: tcp.h:96
static tcp_header_t * tcp_buffer_hdr(vlib_buffer_t *b)
Definition: tcp.h:372
static void tcp_cc_recovery_exit(tcp_connection_t *tc)
Definition: tcp_input.c:884
enum _tcp_state tcp_state_t
#define TCP_ALWAYS_ACK
On/off delayed acks.
Definition: tcp.h:37
vlib_node_registration_t tcp6_input_node
(constructor) VLIB_REGISTER_NODE (tcp6_input_node)
Definition: tcp_input.c:2631
static u8 tcp_ack_is_dupack(tcp_connection_t *tc, vlib_buffer_t *b, u32 prev_snd_wnd, u32 prev_snd_una)
Check if duplicate ack as per RFC5681 Sec.
Definition: tcp_input.c:485
#define TCP_RTO_MAX
Definition: tcp.h:103
static u32 ooo_segment_length(svm_fifo_t *f, ooo_segment_t *s)
Definition: svm_fifo.h:158
static void * ip4_next_header(ip4_header_t *i)
Definition: ip4_packet.h:233
static u32 tcp_time_now(void)
Definition: tcp.h:540
sack_block_t * sacks
SACK blocks.
Definition: tcp_packet.h:151
#define vec_end(v)
End (last data address) of vector.
static tcp_cc_algorithm_t * tcp_cc_algo_get(tcp_cc_algorithm_type_e type)
Definition: tcp.h:774
static u32 scoreboard_hole_bytes(sack_scoreboard_hole_t *hole)
Definition: tcp.h:742
#define vlib_call_init_function(vm, x)
Definition: init.h:162
#define TCP_MAX_SACK_BLOCKS
Max number of SACK blocks stored.
Definition: tcp.h:155
#define tcp_validate_txf_size(_tc, _a)
Definition: tcp.h:664
int stream_session_connect_notify(transport_connection_t *tc, u8 sst, u8 is_fail)
Definition: session.c:704
static int tcp_segment_rcv(tcp_main_t *tm, tcp_connection_t *tc, vlib_buffer_t *b, u16 n_data_bytes, u32 *next0)
Definition: tcp_input.c:1431
#define TCP_EVT_DBG(_evt, _args...)
Definition: tcp_debug.h:183
transport_connection_t * stream_session_lookup_transport4(ip4_address_t *lcl, ip4_address_t *rmt, u16 lcl_port, u16 rmt_port, u8 proto, u32 my_thread_index)
Definition: session.c:314
#define timestamp_lt(_t1, _t2)
Definition: tcp.h:444
static void tcp_timer_set(tcp_connection_t *tc, u8 timer_id, u32 interval)
Definition: tcp.h:571
#define TCP_OPTION_LEN_WINDOW_SCALE
Definition: tcp_packet.h:166
u32 stream_session_dequeue_drop(transport_connection_t *tc, u32 max_bytes)
Definition: session.c:583
#define TCP_INVALID_SACK_HOLE_INDEX
Definition: tcp.h:156
#define pool_elt_at_index(p, i)
Returns pointer to element at given index.
Definition: pool.h:397
void tcp_cc_init(tcp_connection_t *tc)
Definition: tcp_input.c:1138
u8 * format_tcp_rx_trace(u8 *s, va_list *args)
Definition: tcp_input.c:1513
void tcp_cc_fastrecovery_exit(tcp_connection_t *tc)
Definition: tcp_input.c:894
static uword tcp46_listen_inline(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *from_frame, int is_ip4)
LISTEN state processing as per RFC 793 p.
Definition: tcp_input.c:2417
#define tcp_in_fastrecovery(tc)
Definition: tcp.h:277
void tcp_retransmit_first_unacked(tcp_connection_t *tc)
Retransmit first unacked segment.
Definition: tcp_output.c:1258
void scoreboard_init_high_rxt(sack_scoreboard_t *sb)
Definition: tcp_input.c:677
#define foreach_tcp4_input_next
Definition: tcp_input.c:2644
static sack_scoreboard_hole_t * scoreboard_next_hole(sack_scoreboard_t *sb, sack_scoreboard_hole_t *hole)
Definition: tcp.h:691
static u32 ooo_segment_offset(svm_fifo_t *f, ooo_segment_t *s)
Definition: svm_fifo.h:146
static void * vlib_buffer_get_current(vlib_buffer_t *b)
Get pointer to current data to process.
Definition: buffer.h:188
#define filter_flags
Definition: tcp_input.c:2660
#define pool_put(P, E)
Free an object E in pool P.
Definition: pool.h:241
#define foreach_tcp6_input_next
Definition: tcp_input.c:2652
void tcp_fast_retransmit_no_sack(tcp_connection_t *tc)
Fast retransmit without SACK info.
Definition: tcp_output.c:1365
#define TCP_CLEANUP_TIME
Definition: tcp.h:100
#define PREDICT_FALSE(x)
Definition: clib.h:97
#define vec_del1(v, i)
Delete the element at index I.
Definition: vec.h:805
int tcp_options_parse(tcp_header_t *th, tcp_options_t *to)
Parse TCP header options.
Definition: tcp_input.c:123
#define TCP_FLAG_FIN
Definition: fa_node.h:7
#define vlib_validate_buffer_enqueue_x1(vm, node, next_index, to_next, n_left_to_next, bi0, next0)
Finish enqueueing one buffer forward in the graph.
Definition: buffer_node.h:216
static sack_scoreboard_hole_t * scoreboard_last_hole(sack_scoreboard_t *sb)
Definition: tcp.h:715
#define vlib_get_next_frame(vm, node, next_index, vectors, n_vectors_left)
Get pointer to next frame vector data by (vlib_node_runtime_t, next_index).
Definition: node_funcs.h:366
vlib_node_registration_t tcp4_listen_node
(constructor) VLIB_REGISTER_NODE (tcp4_listen_node)
Definition: tcp_input.c:2410
#define TCP_OPTION_LEN_TIMESTAMP
Definition: tcp_packet.h:168
static ooo_segment_t * svm_fifo_newest_ooo_segment(svm_fifo_t *f)
Definition: svm_fifo.h:128
vlib_error_t error
Error code for buffers to be enqueued to error handler.
Definition: buffer.h:113
Selective Ack block.
Definition: tcp_packet.h:109
vlib_node_registration_t tcp6_established_node
(constructor) VLIB_REGISTER_NODE (tcp6_established_node)
Definition: tcp_input.c:79
static int tcp_can_delack(tcp_connection_t *tc)
Check if ACK could be delayed.
Definition: tcp_input.c:1415
static void vlib_node_increment_counter(vlib_main_t *vm, u32 node_index, u32 counter_index, u64 increment)
Definition: node_funcs.h:1131
#define TCP_FLAG_RST
Definition: fa_node.h:9
#define TCP_MAX_WND_SCALE
Definition: tcp_packet.h:172
static void tcp_timer_reset(tcp_connection_t *tc, u8 timer_id)
Definition: tcp.h:579
static sack_scoreboard_hole_t * scoreboard_first_hole(sack_scoreboard_t *sb)
Definition: tcp.h:707
static uword tcp6_syn_sent_rcv(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *from_frame)
Definition: tcp_input.c:2000
vlib_node_registration_t tcp4_syn_sent_node
(constructor) VLIB_REGISTER_NODE (tcp4_syn_sent_node)
Definition: tcp_input.c:1742
u16 n_vectors
Definition: node.h:345
#define vec_free(V)
Free vector&#39;s memory (no header).
Definition: vec.h:340
#define TCP_DUPACK_THRESHOLD
Definition: tcp.h:34
format_function_t format_tcp_state
Definition: tcp.h:62
#define clib_warning(format, args...)
Definition: error.h:59
#define VLIB_BUFFER_IS_TRACED
Definition: buffer.h:85
void stream_session_init_fifos_pointers(transport_connection_t *tc, u32 rx_pointer, u32 tx_pointer)
Init fifo tail and head pointers.
Definition: session.c:694
#define clib_memcpy(a, b, c)
Definition: string.h:69
static int tcp_rcv_ack(tcp_connection_t *tc, vlib_buffer_t *b, tcp_header_t *th, u32 *next, u32 *error)
Process incoming ACK.
Definition: tcp_input.c:1148
tcp_header_t tcp_header
Definition: tcp_input.c:1508
format_function_t format_tcp_header
Definition: format.h:102
void tcp_make_synack(tcp_connection_t *ts, vlib_buffer_t *b)
Convert buffer to SYN-ACK.
Definition: tcp_output.c:538
#define ARRAY_LEN(x)
Definition: clib.h:59
#define TCP_RTT_MAX
Definition: tcp.h:105
sack_scoreboard_hole_t * scoreboard_next_rxt_hole(sack_scoreboard_t *sb, sack_scoreboard_hole_t *start, u8 have_sent_1_smss, u8 *can_rescue, u8 *snd_limited)
Figure out the next hole to retransmit.
Definition: tcp_input.c:622
u16 mss
Option flags, see above.
Definition: tcp_packet.h:147
static void * ip6_next_header(ip6_header_t *i)
Definition: ip6_packet.h:351
void tcp_make_ack(tcp_connection_t *ts, vlib_buffer_t *b)
Convert buffer to ACK.
Definition: tcp_output.c:503
void stream_session_disconnect_notify(transport_connection_t *tc)
Notification from transport that connection is being closed.
Definition: session.c:771
static void tcp_timer_update(tcp_connection_t *tc, u8 timer_id, u32 interval)
Definition: tcp.h:590
#define TCP_PAWS_IDLE
24 days
Definition: tcp.h:30
u16 cached_next_index
Next frame index that vector arguments were last enqueued to last time this node ran.
Definition: node.h:460
clib_error_t * tcp_input_init(vlib_main_t *vm)
Definition: tcp_input.c:2936
#define ASSERT(truth)
#define tcp_syn(_th)
Definition: tcp_packet.h:80
unsigned int u32
Definition: types.h:88
static void tcp_estimate_rtt(tcp_connection_t *tc, u32 mrtt)
Compute smoothed RTT as per VJ&#39;s &#39;88 SIGCOMM and RFC6298.
Definition: tcp_input.c:379
enum _tcp_rcv_process_next tcp_rcv_process_next_t
static uword tcp4_established(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *from_frame)
Definition: tcp_input.c:1684
#define seq_geq(_s1, _s2)
Definition: tcp.h:440
static uword tcp46_established_inline(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *from_frame, int is_ip4)
Definition: tcp_input.c:1571
static void vlib_buffer_advance(vlib_buffer_t *b, word l)
Advance current data pointer by the supplied (signed!) amount.
Definition: buffer.h:201
static int tcp_segment_check_paws(tcp_connection_t *tc)
RFC1323: Check against wrapped sequence numbers (PAWS).
Definition: tcp_input.c:232
static void tcp_cc_handle_event(tcp_connection_t *tc, u32 is_dack)
One function to rule them all ...
Definition: tcp_input.c:985
enum _tcp_input_next tcp_input_next_t
void tcp_update_sack_list(tcp_connection_t *tc, u32 start, u32 end)
Build SACK list as per RFC2018.
Definition: tcp_input.c:1247
Out-of-order segment.
Definition: svm_fifo.h:27
static u8 tcp_segment_in_rcv_wnd(tcp_connection_t *tc, u32 seq, u32 end_seq)
Validate segment sequence number.
Definition: tcp_input.c:109
#define clib_max(x, y)
Definition: clib.h:325
u64 uword
Definition: types.h:112
static void * vlib_add_trace(vlib_main_t *vm, vlib_node_runtime_t *r, vlib_buffer_t *b, u32 n_data_bytes)
Definition: trace_funcs.h:55
VLIB_NODE_FUNCTION_MULTIARCH(tcp4_established_node, tcp4_established)
static uword tcp6_established(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *from_frame)
Definition: tcp_input.c:1691
#define seq_lt(_s1, _s2)
Definition: tcp.h:437
#define tcp_is_syn(_th)
Definition: tcp_packet.h:89
#define tcp_opts_wscale(_to)
Definition: tcp_packet.h:158
enum _tcp_syn_sent_next tcp_syn_sent_next_t
static sack_scoreboard_hole_t * scoreboard_get_hole(sack_scoreboard_t *sb, u32 index)
Definition: tcp.h:683
static void tcp_update_snd_wnd(tcp_connection_t *tc, u32 seq, u32 ack, u32 snd_wnd)
Try to update snd_wnd based on feedback received from peer.
Definition: tcp_input.c:843
unsigned short u16
Definition: types.h:57
void tcp_connection_reset(tcp_connection_t *tc)
Notify session that connection has been reset.
Definition: tcp.c:152
u32 tsval
Timestamp value.
Definition: tcp_packet.h:149
enum _tcp_established_next tcp_established_next_t
u16 payload_length
Definition: ip6_packet.h:332
sack_scoreboard_hole_t * scoreboard_insert_hole(sack_scoreboard_t *sb, u32 prev_index, u32 start, u32 end)
Definition: tcp_input.c:541
u32 tsecr
Echoed/reflected time stamp.
Definition: tcp_packet.h:150
vlib_node_registration_t tcp4_input_node
(constructor) VLIB_REGISTER_NODE (tcp4_input_node)
Definition: tcp_input.c:2630
void tcp_send_fin(tcp_connection_t *tc)
Send FIN.
Definition: tcp_output.c:866
#define vec_len(v)
Number of elements in vector (rvalue-only, NULL tolerant)
unsigned char u8
Definition: types.h:56
enum _tcp_listen_next tcp_listen_next_t
#define foreach_tcp_state_next
Definition: tcp_input.c:29
static uword tcp4_rcv_process(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *from_frame)
Definition: tcp_input.c:2353
static void tcp_retransmit_timer_update(tcp_connection_t *tc)
Definition: tcp.h:645
static int tcp_session_enqueue_data(tcp_connection_t *tc, vlib_buffer_t *b, u16 data_len)
Enqueue data for delivery to application.
Definition: tcp_input.c:1299
static u8 tcp_should_fastrecover_sack(tcp_connection_t *tc)
Definition: tcp_input.c:969
#define seq_max(_s1, _s2)
Definition: tcp.h:441
static void * vlib_frame_vector_args(vlib_frame_t *f)
Get pointer to frame vector data.
Definition: node_funcs.h:269
void tcp_connection_init_vars(tcp_connection_t *tc)
Initialize tcp connection variables.
Definition: tcp.c:444
#define TCP_OPTION_LEN_MSS
Definition: tcp_packet.h:165
#define tcp_next_output(is_ip4)
Definition: tcp_input.c:75
clib_error_t * tcp_init(vlib_main_t *vm)
Definition: tcp.c:1115
#define TCP_RTO_MIN
Definition: tcp.h:104
u8 * format_tcp_connection(u8 *s, va_list *args)
Definition: tcp.c:669
void tcp_set_rx_trace_data(tcp_rx_trace_t *t0, tcp_connection_t *tc0, tcp_header_t *th0, vlib_buffer_t *b0, u8 is_ip4)
Definition: tcp_input.c:1544
#define vnet_buffer(b)
Definition: buffer.h:303
static tcp_connection_t * tcp_connection_get(u32 conn_index, u32 thread_index)
Definition: tcp.h:382
#define VLIB_REGISTER_NODE(x,...)
Definition: node.h:144
static int tcp_header_bytes(tcp_header_t *t)
Definition: tcp_packet.h:93
void tcp_connection_cleanup(tcp_connection_t *tc)
Cleans up connection state.
Definition: tcp.c:104
static uword tcp4_input(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *from_frame)
Definition: tcp_input.c:2809
Window scale.
Definition: tcp_packet.h:107
vlib_node_registration_t tcp6_listen_node
(constructor) VLIB_REGISTER_NODE (tcp6_listen_node)
Definition: tcp_input.c:2411
#define tcp_opts_sack_permitted(_to)
Definition: tcp_packet.h:160
int tcp_cc_recover(tcp_connection_t *tc)
Definition: tcp_input.c:925
Timestamps.
Definition: tcp_packet.h:110
void scoreboard_remove_hole(sack_scoreboard_t *sb, sack_scoreboard_hole_t *hole)
Definition: tcp_input.c:510
transport_connection_t * stream_session_lookup_transport6(ip6_address_t *lcl, ip6_address_t *rmt, u16 lcl_port, u16 rmt_port, u8 proto, u32 my_thread_index)
Definition: session.c:348
u32 flags
buffer flags: VLIB_BUFFER_IS_TRACED: trace this buffer.
Definition: buffer.h:74
static void tcp_persist_timer_set(tcp_connection_t *tc)
Definition: tcp.h:622
static tcp_main_t * vnet_get_tcp_main()
Definition: tcp.h:366
static uword tcp46_syn_sent_inline(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *from_frame, int is_ip4)
Definition: tcp_input.c:1746
#define tcp_fastrecovery_off(tc)
Definition: tcp.h:274
static uword tcp46_rcv_process_inline(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *from_frame, int is_ip4)
Handles reception for all states except LISTEN, SYN-SENT and ESTABLISHED as per RFC793 p...
Definition: tcp_input.c:2054
static void tcp_retransmit_timer_reset(tcp_connection_t *tc)
Definition: tcp.h:609
static vlib_buffer_t * vlib_get_buffer(vlib_main_t *vm, u32 buffer_index)
Translate buffer index into buffer pointer.
Definition: buffer_funcs.h:57
u8 * format_tcp_rx_trace_short(u8 *s, va_list *args)
Definition: tcp_input.c:1529
#define tcp_ack(_th)
Definition: tcp_packet.h:83
static u8 tcp_timer_is_active(tcp_connection_t *tc, tcp_timers_e timer)
Definition: tcp.h:659
static tcp_connection_t * tcp_listener_get(u32 tli)
Definition: tcp.h:409
ip6_address_t dst_address
Definition: ip6_packet.h:341
static u8 tcp_ack_is_cc_event(tcp_connection_t *tc, vlib_buffer_t *b, u32 prev_snd_wnd, u32 prev_snd_una, u8 *is_dack)
Checks if ack is a congestion control event.
Definition: tcp_input.c:498
static stream_session_t * stream_session_get(u32 si, u32 thread_index)
Definition: session.h:281
void tcp_cc_init_congestion(tcp_connection_t *tc)
Definition: tcp_input.c:875
static void tcp_persist_timer_reset(tcp_connection_t *tc)
Definition: tcp.h:639
static char * tcp_error_strings[]
Definition: tcp_input.c:22