1 /* $NetBSD: tcp_sack.c,v 1.36 2018/05/18 18:58:51 maxv Exp $ */
2 
3 /*
4  * Copyright (c) 2005 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Kentaro A. Kurahone.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*
33  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
34  *        The Regents of the University of California.  All rights reserved.
35  *
36  * Redistribution and use in source and binary forms, with or without
37  * modification, are permitted provided that the following conditions
38  * are met:
39  * 1. Redistributions of source code must retain the above copyright
40  *    notice, this list of conditions and the following disclaimer.
41  * 2. Redistributions in binary form must reproduce the above copyright
42  *    notice, this list of conditions and the following disclaimer in the
43  *    documentation and/or other materials provided with the distribution.
44  * 4. Neither the name of the University nor the names of its contributors
45  *    may be used to endorse or promote products derived from this software
46  *    without specific prior written permission.
47  *
48  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58  * SUCH DAMAGE.
59  *
60  *        @(#)tcp_sack.c      8.12 (Berkeley) 5/24/95
61  * $FreeBSD: src/sys/netinet/tcp_sack.c,v 1.3.2.2 2004/12/25 23:02:57 rwatson Exp $
62  */
63 
64 /*
65  *        @@(#)COPYRIGHT      1.1 (NRL) 17 January 1995
66  *
67  * NRL grants permission for redistribution and use in source and binary
68  * forms, with or without modification, of the software and documentation
69  * created at NRL provided that the following conditions are met:
70  *
71  * 1. Redistributions of source code must retain the above copyright
72  *    notice, this list of conditions and the following disclaimer.
73  * 2. Redistributions in binary form must reproduce the above copyright
74  *    notice, this list of conditions and the following disclaimer in the
75  *    documentation and/or other materials provided with the distribution.
76  * 3. All advertising materials mentioning features or use of this software
77  *    must display the following acknowledgements:
78  *        This product includes software developed by the University of
79  *        California, Berkeley and its contributors.
80  *        This product includes software developed at the Information
81  *        Technology Division, US Naval Research Laboratory.
82  * 4. Neither the name of the NRL nor the names of its contributors
83  *    may be used to endorse or promote products derived from this software
84  *    without specific prior written permission.
85  *
86  * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
87  * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
88  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
89  * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL NRL OR
90  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
91  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
92  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
93  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
94  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
95  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
96  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
97  *
98  * The views and conclusions contained in the software and documentation
99  * are those of the authors and should not be interpreted as representing
100  * official policies, either expressed or implied, of the US Naval
101  * Research Laboratory (NRL).
102  */
103 
104 #include <sys/cdefs.h>
105 __KERNEL_RCSID(0, "$NetBSD: tcp_sack.c,v 1.36 2018/05/18 18:58:51 maxv Exp $");
106 
107 #ifdef _KERNEL_OPT
108 #include "opt_inet.h"
109 #include "opt_inet_csum.h"
110 #include "opt_tcp_debug.h"
111 #include "opt_ddb.h"
112 #endif
113 
114 #include <sys/param.h>
115 #include <sys/systm.h>
116 #include <sys/mbuf.h>
117 #include <sys/protosw.h>
118 #include <sys/socket.h>
119 #include <sys/socketvar.h>
120 #include <sys/errno.h>
121 #include <sys/syslog.h>
122 #include <sys/pool.h>
123 #include <sys/domain.h>
124 #include <sys/kernel.h>
125 
126 #include <net/if.h>
127 #include <net/route.h>
128 #include <net/if_types.h>
129 
130 #include <netinet/in.h>
131 #include <netinet/in_systm.h>
132 #include <netinet/ip.h>
133 #include <netinet/in_pcb.h>
134 #include <netinet/in_var.h>
135 #include <netinet/ip_var.h>
136 
137 #ifdef INET6
138 #include <netinet/ip6.h>
139 #include <netinet6/ip6_var.h>
140 #include <netinet6/in6_pcb.h>
141 #include <netinet6/ip6_var.h>
142 #include <netinet6/in6_var.h>
143 #include <netinet/icmp6.h>
144 #endif
145 
146 #ifndef INET6
147 #include <netinet/ip6.h>
148 #endif
149 
150 #include <netinet/tcp.h>
151 #include <netinet/tcp_fsm.h>
152 #include <netinet/tcp_seq.h>
153 #include <netinet/tcp_timer.h>
154 #include <netinet/tcp_var.h>
155 #include <netinet/tcp_debug.h>
156 
157 /* SACK block pool. */
158 static struct pool sackhole_pool;
159 
160 void
tcp_sack_init(void)161 tcp_sack_init(void)
162 {
163 
164           pool_init(&sackhole_pool, sizeof(struct sackhole), 0, 0, 0,
165               "sackholepl", NULL, IPL_SOFTNET);
166 }
167 
168 static struct sackhole *
sack_allochole(struct tcpcb * tp)169 sack_allochole(struct tcpcb *tp)
170 {
171           struct sackhole *hole;
172 
173           if (tp->snd_numholes >= tcp_sack_tp_maxholes ||
174               tcp_sack_globalholes >= tcp_sack_globalmaxholes) {
175                     return NULL;
176           }
177           hole = pool_get(&sackhole_pool, PR_NOWAIT);
178           if (hole == NULL) {
179                     return NULL;
180           }
181           tp->snd_numholes++;
182           tcp_sack_globalholes++;
183 
184           return hole;
185 }
186 
187 static struct sackhole *
sack_inserthole(struct tcpcb * tp,tcp_seq start,tcp_seq end,struct sackhole * prev)188 sack_inserthole(struct tcpcb *tp, tcp_seq start, tcp_seq end,
189     struct sackhole *prev)
190 {
191           struct sackhole *hole;
192 
193           hole = sack_allochole(tp);
194           if (hole == NULL) {
195                     return NULL;
196           }
197           hole->start = hole->rxmit = start;
198           hole->end = end;
199           if (prev != NULL) {
200                     TAILQ_INSERT_AFTER(&tp->snd_holes, prev, hole, sackhole_q);
201           } else {
202                     TAILQ_INSERT_TAIL(&tp->snd_holes, hole, sackhole_q);
203           }
204           return hole;
205 }
206 
207 static struct sackhole *
sack_removehole(struct tcpcb * tp,struct sackhole * hole)208 sack_removehole(struct tcpcb *tp, struct sackhole *hole)
209 {
210           struct sackhole *next;
211 
212           next = TAILQ_NEXT(hole, sackhole_q);
213           tp->snd_numholes--;
214           tcp_sack_globalholes--;
215           TAILQ_REMOVE(&tp->snd_holes, hole, sackhole_q);
216           pool_put(&sackhole_pool, hole);
217 
218           return next;
219 }
220 
221 /*
222  * tcp_new_dsack: record the reception of a duplicated segment.
223  */
224 
225 void
tcp_new_dsack(struct tcpcb * tp,tcp_seq seq,u_int32_t len)226 tcp_new_dsack(struct tcpcb *tp, tcp_seq seq, u_int32_t len)
227 {
228 
229           if (TCP_SACK_ENABLED(tp)) {
230                     tp->rcv_dsack_block.left = seq;
231                     tp->rcv_dsack_block.right = seq + len;
232                     tp->rcv_sack_flags |= TCPSACK_HAVED;
233           }
234 }
235 
236 /*
237  * tcp_sack_option: parse the given SACK option and update the scoreboard.
238  */
239 
240 void
tcp_sack_option(struct tcpcb * tp,const struct tcphdr * th,const u_char * cp,int optlen)241 tcp_sack_option(struct tcpcb *tp, const struct tcphdr *th, const u_char *cp,
242     int optlen)
243 {
244           struct sackblk
245               t_sack_block[(MAX_TCPOPTLEN - 2) / (sizeof(u_int32_t) * 2)];
246           struct sackblk *sack = NULL;
247           struct sackhole *cur = NULL;
248           struct sackhole *tmp = NULL;
249           const char *lp = cp + 2;
250           int i, j, num_sack_blks;
251           tcp_seq left, right, acked;
252 
253           /*
254            * If we aren't processing SACK responses, this is not an ACK
255            * or the peer sends us a sack option with invalid length, don't
256            * update the scoreboard.
257            */
258           if (!TCP_SACK_ENABLED(tp) || ((th->th_flags & TH_ACK) == 0) ||
259                               (optlen % 8 != 2 || optlen < 10)) {
260                     return;
261           }
262 
263           /*
264            * If we don't want any SACK holes to be allocated, just return.
265            */
266           if (tcp_sack_globalmaxholes == 0 || tcp_sack_tp_maxholes == 0) {
267                     return;
268           }
269 
270           /* If the ACK is outside [snd_una, snd_max], ignore the SACK options. */
271           if (SEQ_LT(th->th_ack, tp->snd_una) || SEQ_GT(th->th_ack, tp->snd_max))
272                     return;
273 
274           /*
275            * Extract SACK blocks.
276            *
277            * Note that t_sack_block is sorted so that we only need to do
278            * one pass over the sequence number space. (SACK "fast-path")
279            */
280           num_sack_blks = optlen / 8;
281           acked = (SEQ_GT(th->th_ack, tp->snd_una)) ? th->th_ack : tp->snd_una;
282           for (i = 0; i < num_sack_blks; i++, lp += sizeof(uint32_t) * 2) {
283                     memcpy(&left, lp, sizeof(uint32_t));
284                     memcpy(&right, lp + sizeof(uint32_t), sizeof(uint32_t));
285                     left = ntohl(left);
286                     right = ntohl(right);
287 
288                     if (SEQ_LEQ(right, acked) || SEQ_GT(right, tp->snd_max) ||
289                         SEQ_GEQ(left, right)) {
290                               /* SACK entry that's old, or invalid. */
291                               i--;
292                               num_sack_blks--;
293                               continue;
294                     }
295 
296                     /* Insertion sort. */
297                     for (j = i; (j > 0) && SEQ_LT(left, t_sack_block[j - 1].left);
298                         j--) {
299                               t_sack_block[j].left = t_sack_block[j - 1].left;
300                               t_sack_block[j].right = t_sack_block[j - 1].right;
301                     }
302                     t_sack_block[j].left = left;
303                     t_sack_block[j].right = right;
304           }
305 
306           /* Update the scoreboard. */
307           cur = TAILQ_FIRST(&tp->snd_holes);
308           for (i = 0; i < num_sack_blks; i++) {
309                     sack = &t_sack_block[i];
310                     /*
311                      * FACK TCP.  Update snd_fack so we can enter Fast
312                      * Recovery early.
313                      */
314                     if (SEQ_GEQ(sack->right, tp->snd_fack))
315                               tp->snd_fack = sack->right;
316 
317                     if (TAILQ_EMPTY(&tp->snd_holes)) {
318                               /* First hole. */
319                               cur = sack_inserthole(tp, th->th_ack, sack->left, NULL);
320                               if (cur == NULL) {
321                                         /* ENOBUFS, bail out*/
322                                         return;
323                               }
324                               tp->rcv_lastsack = sack->right;
325                               continue; /* With next sack block */
326                     }
327 
328                     /* Go through the list of holes. */
329                     while (cur) {
330                               if (SEQ_LEQ(sack->right, cur->start))
331                                         /* SACKs data before the current hole */
332                                         break; /* No use going through more holes */
333 
334                               if (SEQ_GEQ(sack->left, cur->end)) {
335                                         /* SACKs data beyond the current hole */
336                                         cur = TAILQ_NEXT(cur, sackhole_q);
337                                         continue;
338                               }
339 
340                               if (SEQ_LEQ(sack->left, cur->start)) {
341                                         /* Data acks at least the beginning of hole */
342                                         if (SEQ_GEQ(sack->right, cur->end)) {
343                                                   /* Acks entire hole, so delete hole */
344                                                   cur = sack_removehole(tp, cur);
345                                                   break;
346                                         }
347 
348                                         /* Otherwise, move start of hole forward */
349                                         cur->start = sack->right;
350                                         cur->rxmit = SEQ_MAX(cur->rxmit, cur->start);
351                                         break;
352                               }
353 
354                               if (SEQ_GEQ(sack->right, cur->end)) {
355                                         /* Move end of hole backward. */
356                                         cur->end = sack->left;
357                                         cur->rxmit = SEQ_MIN(cur->rxmit, cur->end);
358                                         cur = TAILQ_NEXT(cur, sackhole_q);
359                                         break;
360                               }
361 
362                               if (SEQ_LT(cur->start, sack->left) &&
363                                   SEQ_GT(cur->end, sack->right)) {
364                                         /*
365                                          * ACKs some data in middle of a hole; need to
366                                          * split current hole
367                                          */
368                                         tmp = sack_inserthole(tp, sack->right, cur->end,
369                                             cur);
370                                         if (tmp == NULL) {
371                                                   return;
372                                         }
373                                         tmp->rxmit = SEQ_MAX(cur->rxmit, tmp->start);
374                                         cur->end = sack->left;
375                                         cur->rxmit = SEQ_MIN(cur->rxmit, cur->end);
376                                         cur = tmp;
377                                         break;
378                               }
379                     }
380 
381                     /* At this point, we have reached the tail of the list. */
382                     if (SEQ_LT(tp->rcv_lastsack, sack->left)) {
383                               /*
384                                * Need to append new hole at end.
385                                */
386                               cur = sack_inserthole(tp, tp->rcv_lastsack, sack->left,
387                                   NULL);
388                               if (cur == NULL) {
389                                         return;
390                               }
391                     }
392                     if (SEQ_LT(tp->rcv_lastsack, sack->right)) {
393                               tp->rcv_lastsack = sack->right;
394                     }
395           }
396 }
397 
398 /*
399  * tcp_del_sackholes: remove holes covered by a cumulative ACK.
400  */
401 
402 void
tcp_del_sackholes(struct tcpcb * tp,const struct tcphdr * th)403 tcp_del_sackholes(struct tcpcb *tp, const struct tcphdr *th)
404 {
405           /* Max because this could be an older ack that just arrived. */
406           tcp_seq lastack = SEQ_GT(th->th_ack, tp->snd_una) ?
407                     th->th_ack : tp->snd_una;
408           struct sackhole *cur = TAILQ_FIRST(&tp->snd_holes);
409 
410           while (cur) {
411                     if (SEQ_LEQ(cur->end, lastack)) {
412                               cur = sack_removehole(tp, cur);
413                     } else if (SEQ_LT(cur->start, lastack)) {
414                               cur->start = lastack;
415                               if (SEQ_LT(cur->rxmit, cur->start))
416                                         cur->rxmit = cur->start;
417                               break;
418                     } else
419                               break;
420           }
421 }
422 
423 /*
424  * tcp_free_sackholes: clear the scoreboard.
425  */
426 
427 void
tcp_free_sackholes(struct tcpcb * tp)428 tcp_free_sackholes(struct tcpcb *tp)
429 {
430           struct sackhole *sack;
431 
432           /* Free up the SACK hole list. */
433           while ((sack = TAILQ_FIRST(&tp->snd_holes)) != NULL) {
434                     sack_removehole(tp, sack);
435           }
436           KASSERT(tp->snd_numholes == 0);
437 }
438 
439 /*
440  * Returns pointer to a sackhole if there are any pending retransmissions;
441  * NULL otherwise.
442  */
443 struct sackhole *
tcp_sack_output(struct tcpcb * tp,int * sack_bytes_rexmt)444 tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt)
445 {
446           struct sackhole *cur = NULL;
447 
448           if (!TCP_SACK_ENABLED(tp))
449                     return (NULL);
450 
451           *sack_bytes_rexmt = 0;
452           TAILQ_FOREACH(cur, &tp->snd_holes, sackhole_q) {
453                     if (SEQ_LT(cur->rxmit, cur->end)) {
454                               if (SEQ_LT(cur->rxmit, tp->snd_una)) {
455                                         /* old SACK hole */
456                                         continue;
457                               }
458                               *sack_bytes_rexmt += (cur->rxmit - cur->start);
459                               break;
460                     }
461                     *sack_bytes_rexmt += (cur->rxmit - cur->start);
462           }
463 
464           return (cur);
465 }
466 
/*
 * tcp_sack_adjust: skip snd_nxt over data the peer has already SACKed.
 *
 * After a timeout, the SACK list may be rebuilt.  That information is
 * used here to avoid retransmitting SACKed data: the hole list is walked
 * and snd_nxt is moved forward when it points at SACKed data, namely
 *   i)  between the end of one hole and the start of the next, in which
 *       case it is moved to the start of that next hole, or
 *   ii) between the end of the last hole and rcv_lastsack, in which case
 *       it is moved to rcv_lastsack.
 * Nothing changes when there are no holes, when snd_nxt is already at or
 * beyond rcv_lastsack, or when snd_nxt is below the end of the hole being
 * examined.
 */
void
tcp_sack_adjust(struct tcpcb *tp)
{
	struct sackhole *hole, *succ;

	hole = TAILQ_FIRST(&tp->snd_holes);
	if (hole == NULL)
		return; /* No holes */
	if (SEQ_GEQ(tp->snd_nxt, tp->rcv_lastsack))
		return; /* We're already beyond any SACKed blocks */

	for (;;) {
		/* snd_nxt has not passed this hole yet; leave it alone. */
		if (SEQ_LT(tp->snd_nxt, hole->end))
			return;

		succ = TAILQ_NEXT(hole, sackhole_q);
		if (succ == NULL)
			break;

		if (SEQ_LT(tp->snd_nxt, succ->start)) {
			/* Case i): jump to the start of the next hole. */
			tp->snd_nxt = succ->start;
			return;
		}
		hole = succ;
	}

	/* Case ii): past the last hole but short of rcv_lastsack. */
	tp->snd_nxt = tp->rcv_lastsack;
}
504 
505 /*
506  * tcp_sack_numblks: return the number of SACK blocks to send.
507  */
508 
509 int
tcp_sack_numblks(const struct tcpcb * tp)510 tcp_sack_numblks(const struct tcpcb *tp)
511 {
512           int numblks;
513 
514           if (!TCP_SACK_ENABLED(tp)) {
515                     return 0;
516           }
517 
518           numblks = (((tp->rcv_sack_flags & TCPSACK_HAVED) != 0) ? 1 : 0) +
519               tp->t_segqlen;
520 
521           if (numblks == 0) {
522                     return 0;
523           }
524 
525           if (numblks > TCP_SACK_MAX) {
526                     numblks = TCP_SACK_MAX;
527           }
528 
529           return numblks;
530 }
531 
#if defined(DDB)
void sack_dump(const struct tcpcb *);

/*
 * sack_dump: print the SACK sender state of tp.
 *
 * Only built when DDB is defined.  Prints snd_una, snd_max, rcv_lastsack,
 * snd_fack and the hole count, followed by one line per hole giving its
 * start, end and rxmit values.  Nothing is modified.
 */
void
sack_dump(const struct tcpcb *tp)
{
	const struct sackhole *hole;

	printf("snd_una=%" PRIu32 ", snd_max=%" PRIu32 "\n",
	    tp->snd_una, tp->snd_max);
	printf("rcv_lastsack=%" PRIu32 ", snd_fack=%" PRIu32 "\n",
	    tp->rcv_lastsack, tp->snd_fack);
	printf("numholes=%d\n", tp->snd_numholes);

	for (hole = TAILQ_FIRST(&tp->snd_holes); hole != NULL;
	    hole = TAILQ_NEXT(hole, sackhole_q)) {
		printf("\t%" PRIu32 "-%" PRIu32 ", rxmit=%" PRIu32 "\n",
		    hole->start, hole->end, hole->rxmit);
	}
}
#endif /* defined(DDB) */
551