xref: /dragonfly/sys/netinet/tcp_sack.c (revision 63f17add1cf6119ec8f692990df2892d86244f2f)
1 /*
2  * Copyright (c) 2003, 2004 Jeffrey M. Hsu.  All rights reserved.
3  * Copyright (c) 2003, 2004 The DragonFly Project.  All rights reserved.
4  *
5  * This code is derived from software contributed to The DragonFly Project
6  * by Jeffrey M. Hsu.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of The DragonFly Project nor the names of its
17  *    contributors may be used to endorse or promote products derived
18  *    from this software without specific, prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
23  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
24  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
25  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
26  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
28  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
30  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  *
33  * $DragonFly: src/sys/netinet/tcp_sack.c,v 1.8 2008/08/15 21:37:16 nth Exp $
34  */
35 
36 #include <sys/param.h>
37 #include <sys/systm.h>
38 #include <sys/kernel.h>
39 #include <sys/malloc.h>
40 #include <sys/queue.h>
41 #include <sys/thread.h>
42 #include <sys/types.h>
43 #include <sys/socket.h>
44 #include <sys/socketvar.h>
45 #include <sys/resourcevar.h>
46 
47 #include <net/if.h>
48 
49 #include <netinet/in.h>
50 #include <netinet/in_systm.h>
51 #include <netinet/ip.h>
52 #include <netinet/in_var.h>
53 #include <netinet/in_pcb.h>
54 #include <netinet/ip_var.h>
55 #include <netinet/tcp.h>
56 #include <netinet/tcp_seq.h>
57 #include <netinet/tcp_var.h>
58 
59 /*
60  * Implemented:
61  *
62  * RFC 2018
63  * RFC 2883
64  * RFC 3517
65  * RFC 6675
66  */
67 
68 struct sackblock {
69           tcp_seq                       sblk_start;
70           tcp_seq                       sblk_end;
71           TAILQ_ENTRY(sackblock)        sblk_list;
72 };
73 
74 #define   MAXSAVEDBLOCKS      8                             /* per connection limit */
75 
76 static int insert_block(struct scoreboard *scb,
77                               const struct raw_sackblock *raw_sb, boolean_t *update);
78 
79 static MALLOC_DEFINE(M_SACKBLOCK, "sblk", "sackblock struct");
80 
81 /*
82  * Per-tcpcb initialization.
83  */
84 void
tcp_sack_tcpcb_init(struct tcpcb * tp)85 tcp_sack_tcpcb_init(struct tcpcb *tp)
86 {
87           struct scoreboard *scb = &tp->scb;
88 
89           scb->nblocks = 0;
90           TAILQ_INIT(&scb->sackblocks);
91           scb->lastfound = NULL;
92 }
93 
94 /*
95  * Find the SACK block containing or immediately preceding "seq".
96  * The boolean result indicates whether the sequence is actually
97  * contained in the SACK block.
98  */
99 static boolean_t
sack_block_lookup(struct scoreboard * scb,tcp_seq seq,struct sackblock ** sb)100 sack_block_lookup(struct scoreboard *scb, tcp_seq seq, struct sackblock **sb)
101 {
102           static struct krate sackkrate = { .freq = 1 };
103           struct sackblock *hint = scb->lastfound;
104           struct sackblock *cur, *last, *prev;
105 
106           if (TAILQ_EMPTY(&scb->sackblocks)) {
107                     *sb = NULL;
108                     return FALSE;
109           }
110 
111           if (hint == NULL) {
112                     /* No hint.  Search from start to end. */
113                     cur = TAILQ_FIRST(&scb->sackblocks);
114                     last = NULL;
115                     prev = TAILQ_LAST(&scb->sackblocks, sackblock_list);
116           } else  {
117                     if (SEQ_GEQ(seq, hint->sblk_start)) {
118                               /* Search from hint to end of list. */
119                               cur = hint;
120                               last = NULL;
121                               prev = TAILQ_LAST(&scb->sackblocks, sackblock_list);
122                     } else {
123                               /* Search from front of list to hint. */
124                               cur = TAILQ_FIRST(&scb->sackblocks);
125                               last = hint;
126                               prev = TAILQ_PREV(hint, sackblock_list, sblk_list);
127                     }
128           }
129 
130           do {
131                     /*
132                      * Ensure we can't crash if the list really blows up due to
133                      * delta sign wraps when comparing seq against sblk_start vs
134                      * sblk_end.
135                      */
136                     if (cur == NULL) {
137                               krateprintf(&sackkrate,
138                                             "tcp_sack: fatal corrupt seq\n");
139                               *sb = NULL;
140                               return FALSE;
141                     }
142 
143                     /*
144                      * Check completion
145                      */
146                     if (SEQ_GT(cur->sblk_end, seq)) {
147                               if (SEQ_GEQ(seq, cur->sblk_start)) {
148                                         *sb = scb->lastfound = cur;
149                                         return TRUE;
150                               } else {
151                                         *sb = scb->lastfound =
152                                             TAILQ_PREV(cur, sackblock_list, sblk_list);
153                                         return FALSE;
154                               }
155                     }
156 
157                     /*
158                      * seq is greater than sblk_end, nominally proceed to the
159                      * next block.
160                      *
161                      * It is possible for an overflow to cause the comparison
162                      * between seq an sblk_start vs sblk_end to make it appear
163                      * that seq is less than sblk_start and also greater than
164                      * sblk_end.  If we allow the case to fall through we can
165                      * end up with cur == NULL on the next loop.
166                      */
167                     if (SEQ_LT(seq, cur->sblk_start)) {
168                               krateprintf(&sackkrate,
169                                             "tcp_sack: corrupt seq "
170                                             "0x%08x vs 0x%08x-0x%08x\n",
171                                             seq, cur->sblk_start, cur->sblk_end);
172                               if (SEQ_GEQ(seq, cur->sblk_start)) {
173                                         *sb = scb->lastfound = cur;
174                                         return TRUE;
175                               } else {
176                                         *sb = scb->lastfound =
177                                             TAILQ_PREV(cur, sackblock_list, sblk_list);
178                                         return FALSE;
179                               }
180                     }
181                     cur = TAILQ_NEXT(cur, sblk_list);
182           } while (cur != last);
183 
184           *sb = scb->lastfound = prev;
185           return FALSE;
186 }
187 
188 /*
189  * Allocate a SACK block.
190  */
191 static __inline struct sackblock *
alloc_sackblock(struct scoreboard * scb,const struct raw_sackblock * raw_sb)192 alloc_sackblock(struct scoreboard *scb, const struct raw_sackblock *raw_sb)
193 {
194           struct sackblock *sb;
195 
196           if (scb->freecache != NULL) {
197                     sb = scb->freecache;
198                     scb->freecache = NULL;
199                     tcpstat.tcps_sacksbfast++;
200           } else {
201                     sb = kmalloc(sizeof(struct sackblock), M_SACKBLOCK, M_NOWAIT);
202                     if (sb == NULL) {
203                               tcpstat.tcps_sacksbfailed++;
204                               return NULL;
205                     }
206           }
207           sb->sblk_start = raw_sb->rblk_start;
208           sb->sblk_end = raw_sb->rblk_end;
209           return sb;
210 }
211 
212 static __inline struct sackblock *
alloc_sackblock_limit(struct scoreboard * scb,const struct raw_sackblock * raw_sb)213 alloc_sackblock_limit(struct scoreboard *scb,
214     const struct raw_sackblock *raw_sb)
215 {
216           if (scb->nblocks == MAXSAVEDBLOCKS) {
217                     /*
218                      * Should try to kick out older blocks XXX JH
219                      * May be able to coalesce with existing block.
220                      * Or, go other way and free all blocks if we hit
221                      * this limit.
222                      */
223                     tcpstat.tcps_sacksboverflow++;
224                     return NULL;
225           }
226           return alloc_sackblock(scb, raw_sb);
227 }
228 
229 /*
230  * Free a SACK block.
231  */
232 static __inline void
free_sackblock(struct scoreboard * scb,struct sackblock * s)233 free_sackblock(struct scoreboard *scb, struct sackblock *s)
234 {
235           if (scb->freecache == NULL) {
236                     /* YYY Maybe use the latest freed block? */
237                     scb->freecache = s;
238                     return;
239           }
240           kfree(s, M_SACKBLOCK);
241 }
242 
243 /*
244  * Free up SACK blocks for data that's been acked.
245  */
246 static void
tcp_sack_ack_blocks(struct tcpcb * tp,tcp_seq th_ack)247 tcp_sack_ack_blocks(struct tcpcb *tp, tcp_seq th_ack)
248 {
249           struct scoreboard *scb = &tp->scb;
250           struct sackblock *sb, *nb;
251 
252           sb = TAILQ_FIRST(&scb->sackblocks);
253           while (sb && SEQ_LEQ(sb->sblk_end, th_ack)) {
254                     nb = TAILQ_NEXT(sb, sblk_list);
255                     if (scb->lastfound == sb)
256                               scb->lastfound = NULL;
257                     TAILQ_REMOVE(&scb->sackblocks, sb, sblk_list);
258                     free_sackblock(scb, sb);
259                     --scb->nblocks;
260                     KASSERT(scb->nblocks >= 0,
261                         ("SACK block count underflow: %d < 0", scb->nblocks));
262                     sb = nb;
263           }
264           if (sb && SEQ_GEQ(th_ack, sb->sblk_start)) {
265                     /* Other side reneged? XXX */
266                     tcpstat.tcps_sackrenege++;
267                     tcp_sack_discard(tp);
268           }
269 }
270 
271 /*
272  * Delete and free SACK blocks saved in scoreboard.
273  */
274 static void
tcp_sack_cleanup(struct scoreboard * scb)275 tcp_sack_cleanup(struct scoreboard *scb)
276 {
277           struct sackblock *sb, *nb;
278 
279           TAILQ_FOREACH_MUTABLE(sb, &scb->sackblocks, sblk_list, nb) {
280                     free_sackblock(scb, sb);
281                     --scb->nblocks;
282           }
283           KASSERT(scb->nblocks == 0,
284               ("SACK block %d count not zero", scb->nblocks));
285           TAILQ_INIT(&scb->sackblocks);
286           scb->lastfound = NULL;
287 }
288 
289 /*
290  * Discard SACK scoreboard, HighRxt, RescueRxt and LostSeq.
291  */
292 void
tcp_sack_discard(struct tcpcb * tp)293 tcp_sack_discard(struct tcpcb *tp)
294 {
295           tcp_sack_cleanup(&tp->scb);
296           tp->rexmt_high = tp->snd_una;
297           tp->sack_flags &= ~TSACK_F_SACKRESCUED;
298           tp->scb.lostseq = tp->snd_una;
299 }
300 
301 /*
302  * Delete and free SACK blocks saved in scoreboard.
303  * Delete the one slot block cache.
304  */
305 void
tcp_sack_destroy(struct scoreboard * scb)306 tcp_sack_destroy(struct scoreboard *scb)
307 {
308           tcp_sack_cleanup(scb);
309           if (scb->freecache != NULL) {
310                     kfree(scb->freecache, M_SACKBLOCK);
311                     scb->freecache = NULL;
312           }
313 }
314 
315 /*
316  * Cleanup the reported SACK block information
317  */
318 void
tcp_sack_report_cleanup(struct tcpcb * tp)319 tcp_sack_report_cleanup(struct tcpcb *tp)
320 {
321           tp->sack_flags &=
322               ~(TSACK_F_DUPSEG | TSACK_F_ENCLOSESEG | TSACK_F_SACKLEFT);
323           tp->reportblk.rblk_start = tp->reportblk.rblk_end;
324 }
325 
326 /*
327  * Whether SACK report is needed or not
328  */
329 boolean_t
tcp_sack_report_needed(const struct tcpcb * tp)330 tcp_sack_report_needed(const struct tcpcb *tp)
331 {
332           if ((tp->sack_flags &
333                (TSACK_F_DUPSEG | TSACK_F_ENCLOSESEG | TSACK_F_SACKLEFT)) ||
334               tp->reportblk.rblk_start != tp->reportblk.rblk_end)
335                     return TRUE;
336           else
337                     return FALSE;
338 }
339 
340 /*
341  * Returns          0 if not D-SACK block,
342  *                  1 if D-SACK,
343  *                  2 if duplicate of out-of-order D-SACK block.
344  */
345 int
tcp_sack_ndsack_blocks(const struct raw_sackblock * blocks,const int numblocks,tcp_seq snd_una)346 tcp_sack_ndsack_blocks(const struct raw_sackblock *blocks, const int numblocks,
347     tcp_seq snd_una)
348 {
349           if (numblocks == 0)
350                     return 0;
351 
352           if (SEQ_LT(blocks[0].rblk_start, snd_una))
353                     return 1;
354 
355           /* block 0 inside block 1 */
356           if (numblocks > 1 &&
357               SEQ_GEQ(blocks[0].rblk_start, blocks[1].rblk_start) &&
358               SEQ_LEQ(blocks[0].rblk_end, blocks[1].rblk_end))
359                     return 2;
360 
361           return 0;
362 }
363 
364 /*
365  * Update scoreboard on new incoming ACK.
366  */
367 static void
tcp_sack_add_blocks(struct tcpcb * tp,struct tcpopt * to)368 tcp_sack_add_blocks(struct tcpcb *tp, struct tcpopt *to)
369 {
370           const int numblocks = to->to_nsackblocks;
371           struct raw_sackblock *blocks = to->to_sackblocks;
372           struct scoreboard *scb = &tp->scb;
373           int startblock, i;
374 
375           if (tcp_sack_ndsack_blocks(blocks, numblocks, tp->snd_una) > 0)
376                     startblock = 1;
377           else
378                     startblock = 0;
379 
380           to->to_flags |= TOF_SACK_REDUNDANT;
381           for (i = startblock; i < numblocks; i++) {
382                     struct raw_sackblock *newsackblock = &blocks[i];
383                     boolean_t update;
384                     int error;
385 
386                     /* Guard against ACK reordering */
387                     if (SEQ_LEQ(newsackblock->rblk_start, tp->snd_una))
388                               continue;
389 
390                     /* Don't accept bad SACK blocks */
391                     if (SEQ_GT(newsackblock->rblk_end, tp->snd_max)) {
392                               tcpstat.tcps_rcvbadsackopt++;
393                               break;              /* skip all other blocks */
394                     }
395                     tcpstat.tcps_sacksbupdate++;
396 
397                     error = insert_block(scb, newsackblock, &update);
398                     if (update)
399                               to->to_flags &= ~TOF_SACK_REDUNDANT;
400                     if (error)
401                               break;
402           }
403 }
404 
405 void
tcp_sack_update_scoreboard(struct tcpcb * tp,struct tcpopt * to)406 tcp_sack_update_scoreboard(struct tcpcb *tp, struct tcpopt *to)
407 {
408           struct scoreboard *scb = &tp->scb;
409           int rexmt_high_update = 0;
410 
411           tcp_sack_ack_blocks(tp, tp->snd_una);
412           tcp_sack_add_blocks(tp, to);
413           tcp_sack_update_lostseq(scb, tp->snd_una, tp->t_maxseg,
414               tp->t_rxtthresh);
415           if (SEQ_LT(tp->rexmt_high, tp->snd_una)) {
416                     tp->rexmt_high = tp->snd_una;
417                     rexmt_high_update = 1;
418           }
419           if (tp->sack_flags & TSACK_F_SACKRESCUED) {
420                     if (SEQ_LEQ(tp->rexmt_rescue, tp->snd_una)) {
421                               tp->sack_flags &= ~TSACK_F_SACKRESCUED;
422                     } else if (tcp_aggressive_rescuesack && rexmt_high_update &&
423                         SEQ_LT(tp->rexmt_rescue, tp->rexmt_high)) {
424                               /* Drag RescueRxt along with HighRxt */
425                               tp->rexmt_rescue = tp->rexmt_high;
426                     }
427           }
428 }
429 
430 /*
431  * Insert SACK block into sender's scoreboard.
432  */
433 static int
insert_block(struct scoreboard * scb,const struct raw_sackblock * raw_sb,boolean_t * update)434 insert_block(struct scoreboard *scb, const struct raw_sackblock *raw_sb,
435     boolean_t *update)
436 {
437           struct sackblock *sb, *workingblock;
438           boolean_t overlap_front;
439 
440           *update = TRUE;
441           if (TAILQ_EMPTY(&scb->sackblocks)) {
442                     struct sackblock *newblock;
443 
444                     KASSERT(scb->nblocks == 0, ("emply scb w/ blocks"));
445 
446                     newblock = alloc_sackblock(scb, raw_sb);
447                     if (newblock == NULL)
448                               return ENOMEM;
449                     TAILQ_INSERT_HEAD(&scb->sackblocks, newblock, sblk_list);
450                     scb->nblocks = 1;
451                     return 0;
452           }
453 
454           KASSERT(scb->nblocks > 0, ("insert_block() called w/ no blocks"));
455           KASSERT(scb->nblocks <= MAXSAVEDBLOCKS,
456               ("too many SACK blocks %d", scb->nblocks));
457 
458           overlap_front = sack_block_lookup(scb, raw_sb->rblk_start, &sb);
459 
460           if (sb == NULL) {
461                     workingblock = alloc_sackblock_limit(scb, raw_sb);
462                     if (workingblock == NULL)
463                               return ENOMEM;
464                     TAILQ_INSERT_HEAD(&scb->sackblocks, workingblock, sblk_list);
465                     ++scb->nblocks;
466           } else {
467                     if (overlap_front || sb->sblk_end == raw_sb->rblk_start) {
468                               tcpstat.tcps_sacksbreused++;
469 
470                               /* Extend old block */
471                               workingblock = sb;
472                               if (SEQ_GT(raw_sb->rblk_end, sb->sblk_end)) {
473                                         sb->sblk_end = raw_sb->rblk_end;
474                               } else {
475                                         /* Exact match, nothing to consolidate */
476                                         *update = FALSE;
477                                         return 0;
478                               }
479                     } else {
480                               workingblock = alloc_sackblock_limit(scb, raw_sb);
481                               if (workingblock == NULL)
482                                         return ENOMEM;
483                               TAILQ_INSERT_AFTER(&scb->sackblocks, sb, workingblock,
484                                   sblk_list);
485                               ++scb->nblocks;
486                     }
487           }
488 
489           /* Consolidate right-hand side. */
490           sb = TAILQ_NEXT(workingblock, sblk_list);
491           while (sb != NULL &&
492               SEQ_GEQ(workingblock->sblk_end, sb->sblk_end)) {
493                     struct sackblock *nextblock;
494 
495                     nextblock = TAILQ_NEXT(sb, sblk_list);
496                     if (scb->lastfound == sb)
497                               scb->lastfound = NULL;
498                     /* Remove completely overlapped block */
499                     TAILQ_REMOVE(&scb->sackblocks, sb, sblk_list);
500                     free_sackblock(scb, sb);
501                     --scb->nblocks;
502                     KASSERT(scb->nblocks > 0,
503                         ("removed overlapped block: %d blocks left", scb->nblocks));
504                     sb = nextblock;
505           }
506           if (sb != NULL &&
507               SEQ_GEQ(workingblock->sblk_end, sb->sblk_start)) {
508                     /* Extend new block to cover partially overlapped old block. */
509                     workingblock->sblk_end = sb->sblk_end;
510                     if (scb->lastfound == sb)
511                               scb->lastfound = NULL;
512                     TAILQ_REMOVE(&scb->sackblocks, sb, sblk_list);
513                     free_sackblock(scb, sb);
514                     --scb->nblocks;
515                     KASSERT(scb->nblocks > 0,
516                         ("removed partial right: %d blocks left", scb->nblocks));
517           }
518           return 0;
519 }
520 
521 #ifdef DEBUG_SACK_BLOCKS
522 static void
tcp_sack_dump_blocks(const struct scoreboard * scb)523 tcp_sack_dump_blocks(const struct scoreboard *scb)
524 {
525           const struct sackblock *sb;
526 
527           kprintf("%d blocks:", scb->nblocks);
528           TAILQ_FOREACH(sb, &scb->sackblocks, sblk_list)
529                     kprintf(" [%u, %u)", sb->sblk_start, sb->sblk_end);
530           kprintf("\n");
531 }
532 #else
533 static __inline void
tcp_sack_dump_blocks(const struct scoreboard * scb)534 tcp_sack_dump_blocks(const struct scoreboard *scb)
535 {
536 }
537 #endif
538 
539 /*
540  * Optimization to quickly determine which packets are lost.
541  */
542 void
tcp_sack_update_lostseq(struct scoreboard * scb,tcp_seq snd_una,u_int maxseg,int rxtthresh)543 tcp_sack_update_lostseq(struct scoreboard *scb, tcp_seq snd_una, u_int maxseg,
544     int rxtthresh)
545 {
546           struct sackblock *sb;
547           int nsackblocks = 0;
548           int bytes_sacked = 0;
549           int rxtthresh_bytes;
550 
551           if (tcp_do_rfc6675)
552                     rxtthresh_bytes = (rxtthresh - 1) * maxseg;
553           else
554                     rxtthresh_bytes = rxtthresh * maxseg;
555 
556           sb = TAILQ_LAST(&scb->sackblocks, sackblock_list);
557           while (sb != NULL) {
558                     ++nsackblocks;
559                     bytes_sacked += sb->sblk_end - sb->sblk_start;
560                     if (nsackblocks == rxtthresh ||
561                         bytes_sacked >= rxtthresh_bytes) {
562                               scb->lostseq = sb->sblk_start;
563                               return;
564                     }
565                     sb = TAILQ_PREV(sb, sackblock_list, sblk_list);
566           }
567           scb->lostseq = snd_una;
568 }
569 
570 /*
571  * Return whether the given sequence number is considered lost.
572  */
573 boolean_t
tcp_sack_islost(const struct scoreboard * scb,tcp_seq seqnum)574 tcp_sack_islost(const struct scoreboard *scb, tcp_seq seqnum)
575 {
576           return SEQ_LT(seqnum, scb->lostseq);
577 }
578 
579 /*
580  * True if at least "amount" has been SACKed.  Used by Early Retransmit.
581  */
582 boolean_t
tcp_sack_has_sacked(const struct scoreboard * scb,u_int amount)583 tcp_sack_has_sacked(const struct scoreboard *scb, u_int amount)
584 {
585           const struct sackblock *sb;
586           int bytes_sacked = 0;
587 
588           TAILQ_FOREACH(sb, &scb->sackblocks, sblk_list) {
589                     bytes_sacked += sb->sblk_end - sb->sblk_start;
590                     if (bytes_sacked >= amount)
591                               return TRUE;
592           }
593           return FALSE;
594 }
595 
596 /*
597  * Number of bytes SACKed below seq.
598  */
599 int
tcp_sack_bytes_below(const struct scoreboard * scb,tcp_seq seq)600 tcp_sack_bytes_below(const struct scoreboard *scb, tcp_seq seq)
601 {
602           const struct sackblock *sb;
603           int bytes_sacked = 0;
604 
605           sb = TAILQ_FIRST(&scb->sackblocks);
606           while (sb && SEQ_GT(seq, sb->sblk_start)) {
607                     bytes_sacked += seq_min(seq, sb->sblk_end) - sb->sblk_start;
608                     sb = TAILQ_NEXT(sb, sblk_list);
609           }
610           return bytes_sacked;
611 }
612 
613 /*
614  * Return estimate of the number of bytes outstanding in the network.
615  */
616 uint32_t
tcp_sack_compute_pipe(const struct tcpcb * tp)617 tcp_sack_compute_pipe(const struct tcpcb *tp)
618 {
619           const struct scoreboard *scb = &tp->scb;
620           const struct sackblock *sb;
621           int nlost, nretransmitted;
622           tcp_seq end;
623 
624           nlost = tp->snd_max - scb->lostseq;
625           nretransmitted = tp->rexmt_high - tp->snd_una;
626 
627           TAILQ_FOREACH(sb, &scb->sackblocks, sblk_list) {
628                     if (SEQ_LT(sb->sblk_start, tp->rexmt_high)) {
629                               end = seq_min(sb->sblk_end, tp->rexmt_high);
630                               nretransmitted -= end - sb->sblk_start;
631                     }
632                     if (SEQ_GEQ(sb->sblk_start, scb->lostseq))
633                               nlost -= sb->sblk_end - sb->sblk_start;
634           }
635 
636           return (nlost + nretransmitted);
637 }
638 
639 /*
640  * Return the sequence number and length of the next segment to transmit
641  * when in Fast Recovery.
642  */
643 boolean_t
tcp_sack_nextseg(struct tcpcb * tp,tcp_seq * nextrexmt,uint32_t * plen,boolean_t * rescue)644 tcp_sack_nextseg(struct tcpcb *tp, tcp_seq *nextrexmt, uint32_t *plen,
645     boolean_t *rescue)
646 {
647           struct scoreboard *scb = &tp->scb;
648           struct socket *so = tp->t_inpcb->inp_socket;
649           struct sackblock *sb;
650           const struct sackblock *lastblock =
651               TAILQ_LAST(&scb->sackblocks, sackblock_list);
652           tcp_seq torexmt;
653           long len, off, sendwin;
654 
655           /* skip SACKed data */
656           tcp_sack_skip_sacked(scb, &tp->rexmt_high);
657 
658           /* Look for lost data. */
659           torexmt = tp->rexmt_high;
660           *rescue = FALSE;
661           if (lastblock != NULL) {
662                     if (SEQ_LT(torexmt, lastblock->sblk_end) &&
663                         tcp_sack_islost(scb, torexmt)) {
664 sendunsacked:
665                               *nextrexmt = torexmt;
666                               /* If the left-hand edge has been SACKed, pull it in. */
667                               if (sack_block_lookup(scb, torexmt + tp->t_maxseg, &sb))
668                                         *plen = sb->sblk_start - torexmt;
669                               else
670                                         *plen = tp->t_maxseg;
671                               return TRUE;
672                     }
673           }
674 
675           /* See if unsent data available within send window. */
676           off = tp->snd_max - tp->snd_una;
677           sendwin = min(tp->snd_wnd, tp->snd_bwnd);
678           len = (long) ulmin(so->so_snd.ssb_cc, sendwin) - off;
679           if (len > 0) {
680                     *nextrexmt = tp->snd_max;     /* Send new data. */
681                     *plen = tp->t_maxseg;
682                     return TRUE;
683           }
684 
685           /* We're less certain this data has been lost. */
686           if (lastblock != NULL && SEQ_LT(torexmt, lastblock->sblk_end))
687                     goto sendunsacked;
688 
689           /* Rescue retransmission */
690           if (tcp_do_rescuesack || tcp_do_rfc6675) {
691                     tcpstat.tcps_sackrescue_try++;
692                     if (tp->sack_flags & TSACK_F_SACKRESCUED) {
693                               if (!tcp_aggressive_rescuesack)
694                                         return FALSE;
695 
696                               /*
697                                * Aggressive variant of the rescue retransmission.
698                                *
699                                * The idea of the rescue retransmission is to sustain
700                                * the ACK clock thus to avoid timeout retransmission.
701                                *
702                                * Under some situations, the conservative approach
703                                * suggested in the draft
704                                * http://tools.ietf.org/html/
705                                * draft-nishida-tcpm-rescue-retransmission-00
706                                * could not sustain ACK clock, since it only allows
707                                * one rescue retransmission before a cumulative ACK
708                                * covers the segement transmitted by rescue
709                                * retransmission.
710                                *
711                                * We try to locate the next unSACKed segment which
712                                * follows the previously sent rescue segment.  If
713                                * there is no such segment, we loop back to the first
714                                * unacknowledged segment.
715                                */
716 
717                               /*
718                                * Skip SACKed data, but here we follow
719                                * the last transmitted rescue segment.
720                                */
721                               torexmt = tp->rexmt_rescue;
722                               tcp_sack_skip_sacked(scb, &torexmt);
723                     }
724                     if (torexmt == tp->snd_max) {
725                               /* Nothing left to retransmit; restart */
726                               torexmt = tp->snd_una;
727                     }
728                     *rescue = TRUE;
729                     goto sendunsacked;
730           } else if (tcp_do_smartsack && lastblock == NULL) {
731                     tcpstat.tcps_sackrescue_try++;
732                     *rescue = TRUE;
733                     goto sendunsacked;
734           }
735 
736           return FALSE;
737 }
738 
739 /*
740  * Return the next sequence number higher than "*prexmt" that has
741  * not been SACKed.
742  */
743 void
tcp_sack_skip_sacked(struct scoreboard * scb,tcp_seq * prexmt)744 tcp_sack_skip_sacked(struct scoreboard *scb, tcp_seq *prexmt)
745 {
746           struct sackblock *sb;
747 
748           /* skip SACKed data */
749           if (sack_block_lookup(scb, *prexmt, &sb))
750                     *prexmt = sb->sblk_end;
751 }
752 
753 /*
754  * The length of the first amount of unSACKed data
755  */
756 uint32_t
tcp_sack_first_unsacked_len(const struct tcpcb * tp)757 tcp_sack_first_unsacked_len(const struct tcpcb *tp)
758 {
759           const struct sackblock *sb;
760 
761           sb = TAILQ_FIRST(&tp->scb.sackblocks);
762           if (sb == NULL)
763                     return tp->t_maxseg;
764 
765           KASSERT(SEQ_LT(tp->snd_una, sb->sblk_start),
766               ("invalid sb start %u, snd_una %u",
767                sb->sblk_start, tp->snd_una));
768           return (sb->sblk_start - tp->snd_una);
769 }
770 
771 #ifdef later
772 void
tcp_sack_save_scoreboard(struct scoreboard * scb)773 tcp_sack_save_scoreboard(struct scoreboard *scb)
774 {
775           struct scoreboard *scb = &tp->scb;
776 
777           scb->sackblocks_prev = scb->sackblocks;
778           TAILQ_INIT(&scb->sackblocks);
779 }
780 
781 void
tcp_sack_revert_scoreboard(struct scoreboard * scb,tcp_seq snd_una,u_int maxseg)782 tcp_sack_revert_scoreboard(struct scoreboard *scb, tcp_seq snd_una,
783                                  u_int maxseg)
784 {
785           struct sackblock *sb;
786 
787           scb->sackblocks = scb->sackblocks_prev;
788           scb->nblocks = 0;
789           TAILQ_FOREACH(sb, &scb->sackblocks, sblk_list)
790                     ++scb->nblocks;
791           tcp_sack_ack_blocks(scb, snd_una);
792           scb->lastfound = NULL;
793 }
794 #endif
795 
796 #ifdef DEBUG_SACK_HISTORY
797 static void
tcp_sack_dump_history(const char * msg,const struct tcpcb * tp)798 tcp_sack_dump_history(const char *msg, const struct tcpcb *tp)
799 {
800           int i;
801           static int ndumped;
802 
803           /* only need a couple of these to debug most problems */
804           if (++ndumped > 900)
805                     return;
806 
807           kprintf("%s:\tnsackhistory %d: ", msg, tp->nsackhistory);
808           for (i = 0; i < tp->nsackhistory; ++i)
809                     kprintf("[%u, %u) ", tp->sackhistory[i].rblk_start,
810                         tp->sackhistory[i].rblk_end);
811           kprintf("\n");
812 }
813 #else
814 static __inline void
tcp_sack_dump_history(const char * msg,const struct tcpcb * tp)815 tcp_sack_dump_history(const char *msg, const struct tcpcb *tp)
816 {
817 }
818 #endif
819 
820 /*
821  * Remove old SACK blocks from the SACK history that have already been ACKed.
822  */
823 static void
tcp_sack_ack_history(struct tcpcb * tp)824 tcp_sack_ack_history(struct tcpcb *tp)
825 {
826           int i, nblocks, openslot;
827 
828           tcp_sack_dump_history("before tcp_sack_ack_history", tp);
829           nblocks = tp->nsackhistory;
830           for (i = openslot = 0; i < nblocks; ++i) {
831                     if (SEQ_LEQ(tp->sackhistory[i].rblk_end, tp->rcv_nxt)) {
832                               --tp->nsackhistory;
833                               continue;
834                     }
835                     if (SEQ_LT(tp->sackhistory[i].rblk_start, tp->rcv_nxt))
836                               tp->sackhistory[i].rblk_start = tp->rcv_nxt;
837                     if (i == openslot)
838                               ++openslot;
839                     else
840                               tp->sackhistory[openslot++] = tp->sackhistory[i];
841           }
842           tcp_sack_dump_history("after tcp_sack_ack_history", tp);
843           KASSERT(openslot == tp->nsackhistory,
844               ("tcp_sack_ack_history miscounted: %d != %d",
845               openslot, tp->nsackhistory));
846 }
847 
848 /*
849  * Add or merge newblock into reported history.
850  * Also remove or update SACK blocks that will be acked.
851  */
852 static void
tcp_sack_update_reported_history(struct tcpcb * tp,tcp_seq start,tcp_seq end)853 tcp_sack_update_reported_history(struct tcpcb *tp, tcp_seq start, tcp_seq end)
854 {
855           struct raw_sackblock copy[MAX_SACK_REPORT_BLOCKS];
856           int i, cindex;
857 
858           tcp_sack_dump_history("before tcp_sack_update_reported_history", tp);
859           /*
860            * Six cases:
861            *        0) no overlap
862            *        1) newblock == oldblock
863            *        2) oldblock contains newblock
864            *        3) newblock contains oldblock
865            *        4) tail of oldblock overlaps or abuts start of newblock
866            *        5) tail of newblock overlaps or abuts head of oldblock
867            */
868           for (i = cindex = 0; i < tp->nsackhistory; ++i) {
869                     struct raw_sackblock *oldblock = &tp->sackhistory[i];
870                     tcp_seq old_start = oldblock->rblk_start;
871                     tcp_seq old_end = oldblock->rblk_end;
872 
873                     if (SEQ_LT(end, old_start) || SEQ_GT(start, old_end)) {
874                               /* Case 0:  no overlap.  Copy old block. */
875                               copy[cindex++] = *oldblock;
876                               continue;
877                     }
878 
879                     if (SEQ_GEQ(start, old_start) && SEQ_LEQ(end, old_end)) {
880                               /* Cases 1 & 2.  Move block to front of history. */
881                               int j;
882 
883                               start = old_start;
884                               end = old_end;
885                               /* no need to check rest of blocks */
886                               for (j = i + 1; j < tp->nsackhistory; ++j)
887                                         copy[cindex++] = tp->sackhistory[j];
888                               break;
889                     }
890 
891                     if (SEQ_GEQ(old_end, start) && SEQ_LT(old_start, start)) {
892                               /* Case 4:  extend start of new block. */
893                               start = old_start;
894                     } else if (SEQ_GEQ(end, old_start) && SEQ_GT(old_end, end)) {
895                               /* Case 5: extend end of new block */
896                               end = old_end;
897                     } else {
898                               /* Case 3.  Delete old block by not copying it. */
899                               KASSERT(SEQ_LEQ(start, old_start) &&
900                                         SEQ_GEQ(end, old_end),
901                                   ("bad logic: old [%u, %u), new [%u, %u)",
902                                    old_start, old_end, start, end));
903                     }
904           }
905 
906           /* insert new block */
907           tp->sackhistory[0].rblk_start = start;
908           tp->sackhistory[0].rblk_end = end;
909           cindex = min(cindex, MAX_SACK_REPORT_BLOCKS - 1);
910           for (i = 0; i < cindex; ++i)
911                     tp->sackhistory[i + 1] = copy[i];
912           tp->nsackhistory = cindex + 1;
913           tcp_sack_dump_history("after tcp_sack_update_reported_history", tp);
914 }
915 
916 /*
917  * Fill in SACK report to return to data sender.
918  */
919 void
tcp_sack_fill_report(struct tcpcb * tp,u_char * opt,u_int * plen)920 tcp_sack_fill_report(struct tcpcb *tp, u_char *opt, u_int *plen)
921 {
922           u_int optlen = *plen;
923           uint32_t *lp = (uint32_t *)(opt + optlen);
924           uint32_t *olp;
925           tcp_seq hstart = tp->rcv_nxt, hend;
926           int nblocks;
927 
928           KASSERT(TCP_MAXOLEN - optlen >=
929               TCPOLEN_SACK_ALIGNED + TCPOLEN_SACK_BLOCK,
930               ("no room for SACK header and one block: optlen %d", optlen));
931 
932           if (tp->sack_flags & TSACK_F_DUPSEG)
933                     tcpstat.tcps_snddsackopt++;
934           else
935                     tcpstat.tcps_sndsackopt++;
936 
937           olp = lp++;
938           optlen += TCPOLEN_SACK_ALIGNED;
939 
940           tcp_sack_ack_history(tp);
941           if (tp->reportblk.rblk_start != tp->reportblk.rblk_end) {
942                     *lp++ = htonl(tp->reportblk.rblk_start);
943                     *lp++ = htonl(tp->reportblk.rblk_end);
944                     optlen += TCPOLEN_SACK_BLOCK;
945                     hstart = tp->reportblk.rblk_start;
946                     hend = tp->reportblk.rblk_end;
947                     if (tp->sack_flags & TSACK_F_ENCLOSESEG) {
948                               KASSERT(TCP_MAXOLEN - optlen >= TCPOLEN_SACK_BLOCK,
949                                   ("no room for enclosing SACK block: oplen %d",
950                                   optlen));
951                               *lp++ = htonl(tp->encloseblk.rblk_start);
952                               *lp++ = htonl(tp->encloseblk.rblk_end);
953                               optlen += TCPOLEN_SACK_BLOCK;
954                               hstart = tp->encloseblk.rblk_start;
955                               hend = tp->encloseblk.rblk_end;
956                     }
957                     if (SEQ_GT(hstart, tp->rcv_nxt))
958                               tcp_sack_update_reported_history(tp, hstart, hend);
959           }
960           if (tcp_do_smartsack && (tp->sack_flags & TSACK_F_SACKLEFT)) {
961                     /* Fill in from left!  Walk re-assembly queue. */
962                     struct tseg_qent *q;
963 
964                     q = TAILQ_FIRST(&tp->t_segq);
965                     while (q != NULL &&
966                         TCP_MAXOLEN - optlen >= TCPOLEN_SACK_BLOCK) {
967                               *lp++ = htonl(q->tqe_th->th_seq);
968                               *lp++ = htonl(TCP_SACK_BLKEND(
969                                   q->tqe_th->th_seq + q->tqe_len,
970                                   q->tqe_th->th_flags));
971                               optlen += TCPOLEN_SACK_BLOCK;
972                               q = TAILQ_NEXT(q, tqe_q);
973                     }
974           } else {
975                     int n = 0;
976 
977                     /* Fill in SACK blocks from right side. */
978                     while (n < tp->nsackhistory &&
979                         TCP_MAXOLEN - optlen >= TCPOLEN_SACK_BLOCK) {
980                               if (tp->sackhistory[n].rblk_start != hstart) {
981                                         *lp++ = htonl(tp->sackhistory[n].rblk_start);
982                                         *lp++ = htonl(tp->sackhistory[n].rblk_end);
983                                         optlen += TCPOLEN_SACK_BLOCK;
984                               }
985                               ++n;
986                     }
987           }
988           tp->reportblk.rblk_start = tp->reportblk.rblk_end;
989           tp->sack_flags &=
990               ~(TSACK_F_DUPSEG | TSACK_F_ENCLOSESEG | TSACK_F_SACKLEFT);
991           nblocks = (lp - olp - 1) / 2;
992           *olp = htonl(TCPOPT_SACK_ALIGNED |
993                          (TCPOLEN_SACK + nblocks * TCPOLEN_SACK_BLOCK));
994           *plen = optlen;
995 }
996