xref: /freebsd-11-stable/sys/netinet/tcp_lro.c (revision a992e81ef0755176c30e79a1bdfc731764f8f3eb)
1 /*-
2  * Copyright (c) 2007, Myricom Inc.
3  * Copyright (c) 2008, Intel Corporation.
4  * Copyright (c) 2012 The FreeBSD Foundation
5  * Copyright (c) 2016 Mellanox Technologies.
6  * All rights reserved.
7  *
8  * Portions of this software were developed by Bjoern Zeeb
9  * under sponsorship from the FreeBSD Foundation.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  */
32 
33 #include <sys/cdefs.h>
34 __FBSDID("$FreeBSD$");
35 
36 #include "opt_inet.h"
37 #include "opt_inet6.h"
38 
39 #include <sys/param.h>
40 #include <sys/systm.h>
41 #include <sys/kernel.h>
42 #include <sys/malloc.h>
43 #include <sys/mbuf.h>
44 #include <sys/socket.h>
45 #include <sys/sysctl.h>
46 
47 #include <net/if.h>
48 #include <net/if_var.h>
49 #include <net/ethernet.h>
50 #include <net/vnet.h>
51 
52 #include <netinet/in_systm.h>
53 #include <netinet/in.h>
54 #include <netinet/ip6.h>
55 #include <netinet/ip.h>
56 #include <netinet/ip_var.h>
57 #include <netinet/tcp.h>
58 #include <netinet/tcp_seq.h>
59 #include <netinet/tcp_lro.h>
60 #include <netinet/tcp_var.h>
61 
62 #include <netinet6/ip6_var.h>
63 
64 #include <machine/in_cksum.h>
65 
/* Allocation type tag passed to the malloc()/free() calls in this file. */
66 static MALLOC_DEFINE(M_LRO, "LRO", "LRO control structures");
67 
/*
 * Compile-time switch.  While TCP_LRO_UPDATE_CSUM is defined, the flush
 * path recomputes the IPv4 header checksum and the TCP checksum of an
 * aggregated packet.  When it is not defined, both fields are instead
 * overwritten with TCP_LRO_INVALID_CSUM.
 */
68 #define	TCP_LRO_UPDATE_CSUM	1
69 #ifndef	TCP_LRO_UPDATE_CSUM
70 #define	TCP_LRO_INVALID_CSUM	0x0000
71 #endif
72 
/* Forward declaration; flushes every entry on the active list. */
73 static void	tcp_lro_rx_done(struct lro_ctrl *lc);
74 
/* Parent sysctl node for the LRO knobs declared below. */
75 SYSCTL_NODE(_net_inet_tcp, OID_AUTO, lro,  CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
76     "TCP LRO");
77 
/*
 * Number of LRO entries tcp_lro_init() requests from tcp_lro_init_args().
 * Exported as the "entries" leaf with CTLFLAG_RDTUN.
 */
78 static unsigned	tcp_lro_entries = TCP_LRO_ENTRIES;
79 SYSCTL_UINT(_net_inet_tcp_lro, OID_AUTO, entries,
80     CTLFLAG_RDTUN | CTLFLAG_MPSAFE, &tcp_lro_entries, 0,
81     "default number of LRO entries");
82 
83 static __inline void
tcp_lro_active_insert(struct lro_ctrl * lc,struct lro_entry * le)84 tcp_lro_active_insert(struct lro_ctrl *lc, struct lro_entry *le)
85 {
86 
87 	LIST_INSERT_HEAD(&lc->lro_active, le, next);
88 }
89 
90 static __inline void
tcp_lro_active_remove(struct lro_entry * le)91 tcp_lro_active_remove(struct lro_entry *le)
92 {
93 
94 	LIST_REMOVE(le, next);
95 }
96 
97 int
tcp_lro_init(struct lro_ctrl * lc)98 tcp_lro_init(struct lro_ctrl *lc)
99 {
100 	return (tcp_lro_init_args(lc, NULL, tcp_lro_entries, 0));
101 }
102 
103 int
tcp_lro_init_args(struct lro_ctrl * lc,struct ifnet * ifp,unsigned lro_entries,unsigned lro_mbufs)104 tcp_lro_init_args(struct lro_ctrl *lc, struct ifnet *ifp,
105     unsigned lro_entries, unsigned lro_mbufs)
106 {
107 	struct lro_entry *le;
108 	size_t size;
109 	unsigned i;
110 
111 	lc->lro_bad_csum = 0;
112 	lc->lro_queued = 0;
113 	lc->lro_flushed = 0;
114 	lc->lro_mbuf_count = 0;
115 	lc->lro_mbuf_max = lro_mbufs;
116 	lc->lro_cnt = lro_entries;
117 	lc->lro_ackcnt_lim = TCP_LRO_ACKCNT_MAX;
118 	lc->lro_length_lim = TCP_LRO_LENGTH_MAX;
119 	lc->ifp = ifp;
120 	LIST_INIT(&lc->lro_free);
121 	LIST_INIT(&lc->lro_active);
122 
123 	/* compute size to allocate */
124 	size = (lro_mbufs * sizeof(struct lro_mbuf_sort)) +
125 	    (lro_entries * sizeof(*le));
126 	lc->lro_mbuf_data = (struct lro_mbuf_sort *)
127 	    malloc(size, M_LRO, M_NOWAIT | M_ZERO);
128 
129 	/* check for out of memory */
130 	if (lc->lro_mbuf_data == NULL) {
131 		memset(lc, 0, sizeof(*lc));
132 		return (ENOMEM);
133 	}
134 	/* compute offset for LRO entries */
135 	le = (struct lro_entry *)
136 	    (lc->lro_mbuf_data + lro_mbufs);
137 
138 	/* setup linked list */
139 	for (i = 0; i != lro_entries; i++)
140 		LIST_INSERT_HEAD(&lc->lro_free, le + i, next);
141 
142 	return (0);
143 }
144 
145 void
tcp_lro_free(struct lro_ctrl * lc)146 tcp_lro_free(struct lro_ctrl *lc)
147 {
148 	struct lro_entry *le;
149 	unsigned x;
150 
151 	/* reset LRO free list */
152 	LIST_INIT(&lc->lro_free);
153 
154 	/* free active mbufs, if any */
155 	while ((le = LIST_FIRST(&lc->lro_active)) != NULL) {
156 		tcp_lro_active_remove(le);
157 		m_freem(le->m_head);
158 	}
159 
160 	/* free mbuf array, if any */
161 	for (x = 0; x != lc->lro_mbuf_count; x++)
162 		m_freem(lc->lro_mbuf_data[x].mb);
163 	lc->lro_mbuf_count = 0;
164 
165 	/* free allocated memory, if any */
166 	free(lc->lro_mbuf_data, M_LRO);
167 	lc->lro_mbuf_data = NULL;
168 }
169 
170 #ifdef TCP_LRO_UPDATE_CSUM
171 static uint16_t
tcp_lro_csum_th(struct tcphdr * th)172 tcp_lro_csum_th(struct tcphdr *th)
173 {
174 	uint32_t ch;
175 	uint16_t *p, l;
176 
177 	ch = th->th_sum = 0x0000;
178 	l = th->th_off;
179 	p = (uint16_t *)th;
180 	while (l > 0) {
181 		ch += *p;
182 		p++;
183 		ch += *p;
184 		p++;
185 		l--;
186 	}
187 	while (ch > 0xffff)
188 		ch = (ch >> 16) + (ch & 0xffff);
189 
190 	return (ch & 0xffff);
191 }
192 
193 static uint16_t
tcp_lro_rx_csum_fixup(struct lro_entry * le,void * l3hdr,struct tcphdr * th,uint16_t tcp_data_len,uint16_t csum)194 tcp_lro_rx_csum_fixup(struct lro_entry *le, void *l3hdr, struct tcphdr *th,
195     uint16_t tcp_data_len, uint16_t csum)
196 {
197 	uint32_t c;
198 	uint16_t cs;
199 
200 	c = csum;
201 
202 	/* Remove length from checksum. */
203 	switch (le->eh_type) {
204 #ifdef INET6
205 	case ETHERTYPE_IPV6:
206 	{
207 		struct ip6_hdr *ip6;
208 
209 		ip6 = (struct ip6_hdr *)l3hdr;
210 		if (le->append_cnt == 0)
211 			cs = ip6->ip6_plen;
212 		else {
213 			uint32_t cx;
214 
215 			cx = ntohs(ip6->ip6_plen);
216 			cs = in6_cksum_pseudo(ip6, cx, ip6->ip6_nxt, 0);
217 		}
218 		break;
219 	}
220 #endif
221 #ifdef INET
222 	case ETHERTYPE_IP:
223 	{
224 		struct ip *ip4;
225 
226 		ip4 = (struct ip *)l3hdr;
227 		if (le->append_cnt == 0)
228 			cs = ip4->ip_len;
229 		else {
230 			cs = in_addword(ntohs(ip4->ip_len) - sizeof(*ip4),
231 			    IPPROTO_TCP);
232 			cs = in_pseudo(ip4->ip_src.s_addr, ip4->ip_dst.s_addr,
233 			    htons(cs));
234 		}
235 		break;
236 	}
237 #endif
238 	default:
239 		cs = 0;		/* Keep compiler happy. */
240 	}
241 
242 	cs = ~cs;
243 	c += cs;
244 
245 	/* Remove TCP header csum. */
246 	cs = ~tcp_lro_csum_th(th);
247 	c += cs;
248 	while (c > 0xffff)
249 		c = (c >> 16) + (c & 0xffff);
250 
251 	return (c & 0xffff);
252 }
253 #endif
254 
255 static void
tcp_lro_rx_done(struct lro_ctrl * lc)256 tcp_lro_rx_done(struct lro_ctrl *lc)
257 {
258 	struct lro_entry *le;
259 
260 	while ((le = LIST_FIRST(&lc->lro_active)) != NULL) {
261 		tcp_lro_active_remove(le);
262 		tcp_lro_flush(lc, le);
263 	}
264 }
265 
266 void
tcp_lro_flush_inactive(struct lro_ctrl * lc,const struct timeval * timeout)267 tcp_lro_flush_inactive(struct lro_ctrl *lc, const struct timeval *timeout)
268 {
269 	struct lro_entry *le, *le_tmp;
270 	struct timeval tv;
271 
272 	if (LIST_EMPTY(&lc->lro_active))
273 		return;
274 
275 	getmicrotime(&tv);
276 	timevalsub(&tv, timeout);
277 	LIST_FOREACH_SAFE(le, &lc->lro_active, next, le_tmp) {
278 		if (timevalcmp(&tv, &le->mtime, >=)) {
279 			tcp_lro_active_remove(le);
280 			tcp_lro_flush(lc, le);
281 		}
282 	}
283 }
284 
285 void
tcp_lro_flush(struct lro_ctrl * lc,struct lro_entry * le)286 tcp_lro_flush(struct lro_ctrl *lc, struct lro_entry *le)
287 {
288 
289 	if (le->append_cnt > 0) {
290 		struct tcphdr *th;
291 		uint16_t p_len;
292 
293 		p_len = htons(le->p_len);
294 		switch (le->eh_type) {
295 #ifdef INET6
296 		case ETHERTYPE_IPV6:
297 		{
298 			struct ip6_hdr *ip6;
299 
300 			ip6 = le->le_ip6;
301 			ip6->ip6_plen = p_len;
302 			th = (struct tcphdr *)(ip6 + 1);
303 			le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID |
304 			    CSUM_PSEUDO_HDR;
305 			le->p_len += ETHER_HDR_LEN + sizeof(*ip6);
306 			break;
307 		}
308 #endif
309 #ifdef INET
310 		case ETHERTYPE_IP:
311 		{
312 			struct ip *ip4;
313 #ifdef TCP_LRO_UPDATE_CSUM
314 			uint32_t cl;
315 			uint16_t c;
316 #endif
317 
318 			ip4 = le->le_ip4;
319 #ifdef TCP_LRO_UPDATE_CSUM
320 			/* Fix IP header checksum for new length. */
321 			c = ~ip4->ip_sum;
322 			cl = c;
323 			c = ~ip4->ip_len;
324 			cl += c + p_len;
325 			while (cl > 0xffff)
326 				cl = (cl >> 16) + (cl & 0xffff);
327 			c = cl;
328 			ip4->ip_sum = ~c;
329 #else
330 			ip4->ip_sum = TCP_LRO_INVALID_CSUM;
331 #endif
332 			ip4->ip_len = p_len;
333 			th = (struct tcphdr *)(ip4 + 1);
334 			le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID |
335 			    CSUM_PSEUDO_HDR | CSUM_IP_CHECKED | CSUM_IP_VALID;
336 			le->p_len += ETHER_HDR_LEN;
337 			break;
338 		}
339 #endif
340 		default:
341 			th = NULL;	/* Keep compiler happy. */
342 		}
343 		le->m_head->m_pkthdr.csum_data = 0xffff;
344 		le->m_head->m_pkthdr.len = le->p_len;
345 
346 		/* Incorporate the latest ACK into the TCP header. */
347 		th->th_ack = le->ack_seq;
348 		th->th_win = le->window;
349 		/* Incorporate latest timestamp into the TCP header. */
350 		if (le->timestamp != 0) {
351 			uint32_t *ts_ptr;
352 
353 			ts_ptr = (uint32_t *)(th + 1);
354 			ts_ptr[1] = htonl(le->tsval);
355 			ts_ptr[2] = le->tsecr;
356 		}
357 #ifdef TCP_LRO_UPDATE_CSUM
358 		/* Update the TCP header checksum. */
359 		le->ulp_csum += p_len;
360 		le->ulp_csum += tcp_lro_csum_th(th);
361 		while (le->ulp_csum > 0xffff)
362 			le->ulp_csum = (le->ulp_csum >> 16) +
363 			    (le->ulp_csum & 0xffff);
364 		th->th_sum = (le->ulp_csum & 0xffff);
365 		th->th_sum = ~th->th_sum;
366 #else
367 		th->th_sum = TCP_LRO_INVALID_CSUM;
368 #endif
369 	}
370 
371 	(*lc->ifp->if_input)(lc->ifp, le->m_head);
372 	lc->lro_queued += le->append_cnt + 1;
373 	lc->lro_flushed++;
374 	bzero(le, sizeof(*le));
375 	LIST_INSERT_HEAD(&lc->lro_free, le, next);
376 }
377 
/*
 * Return "x" with every bit cleared except its most significant set
 * bit; returns 0 when "x" is 0.
 *
 * Both build variants are real functions with the same result for every
 * input.  In particular the flsll() variant is guarded so that a zero
 * argument can never turn into a shift by -1.
 */
static inline uint64_t
tcp_lro_msb_64(uint64_t x)
{
#ifdef HAVE_INLINE_FLSLL
	if (x == 0)
		return (0);
	return (1ULL << (flsll(x) - 1));
#else
	/* smear the top set bit into every lower position ... */
	x |= (x >> 1);
	x |= (x >> 2);
	x |= (x >> 4);
	x |= (x >> 8);
	x |= (x >> 16);
	x |= (x >> 32);
	/* ... then keep only the highest one */
	return (x & ~(x >> 1));
#endif
}
393 
394 /*
395  * The tcp_lro_sort() routine is comparable to qsort(), except it has
396  * a worst case complexity limit of O(MIN(N,64)*N), where N is the
397  * number of elements to sort and 64 is the number of sequence bits
398  * available. The algorithm is bit-slicing the 64-bit sequence number,
399  * sorting one bit at a time from the most significant bit until the
400  * least significant one, skipping the constant bits. This is
401  * typically called a radix sort.
402  */
403 static void
tcp_lro_sort(struct lro_mbuf_sort * parray,uint32_t size)404 tcp_lro_sort(struct lro_mbuf_sort *parray, uint32_t size)
405 {
406 	struct lro_mbuf_sort temp;
407 	uint64_t ones;
408 	uint64_t zeros;
409 	uint32_t x;
410 	uint32_t y;
411 
412 repeat:
413 	/* for small arrays insertion sort is faster */
414 	if (size <= 12) {
415 		for (x = 1; x < size; x++) {
416 			temp = parray[x];
417 			for (y = x; y > 0 && temp.seq < parray[y - 1].seq; y--)
418 				parray[y] = parray[y - 1];
419 			parray[y] = temp;
420 		}
421 		return;
422 	}
423 
424 	/* compute sequence bits which are constant */
425 	ones = 0;
426 	zeros = 0;
427 	for (x = 0; x != size; x++) {
428 		ones |= parray[x].seq;
429 		zeros |= ~parray[x].seq;
430 	}
431 
432 	/* compute bits which are not constant into "ones" */
433 	ones &= zeros;
434 	if (ones == 0)
435 		return;
436 
437 	/* pick the most significant bit which is not constant */
438 	ones = tcp_lro_msb_64(ones);
439 
440 	/*
441 	 * Move entries having cleared sequence bits to the beginning
442 	 * of the array:
443 	 */
444 	for (x = y = 0; y != size; y++) {
445 		/* skip set bits */
446 		if (parray[y].seq & ones)
447 			continue;
448 		/* swap entries */
449 		temp = parray[x];
450 		parray[x] = parray[y];
451 		parray[y] = temp;
452 		x++;
453 	}
454 
455 	KASSERT(x != 0 && x != size, ("Memory is corrupted\n"));
456 
457 	/* sort zeros */
458 	tcp_lro_sort(parray, x);
459 
460 	/* sort ones */
461 	parray += x;
462 	size -= x;
463 	goto repeat;
464 }
465 
466 void
tcp_lro_flush_all(struct lro_ctrl * lc)467 tcp_lro_flush_all(struct lro_ctrl *lc)
468 {
469 	uint64_t seq;
470 	uint64_t nseq;
471 	unsigned x;
472 
473 	/* check if no mbufs to flush */
474 	if (lc->lro_mbuf_count == 0)
475 		goto done;
476 
477 	/* sort all mbufs according to stream */
478 	tcp_lro_sort(lc->lro_mbuf_data, lc->lro_mbuf_count);
479 
480 	/* input data into LRO engine, stream by stream */
481 	seq = 0;
482 	for (x = 0; x != lc->lro_mbuf_count; x++) {
483 		struct mbuf *mb;
484 
485 		/* get mbuf */
486 		mb = lc->lro_mbuf_data[x].mb;
487 
488 		/* get sequence number, masking away the packet index */
489 		nseq = lc->lro_mbuf_data[x].seq & (-1ULL << 24);
490 
491 		/* check for new stream */
492 		if (seq != nseq) {
493 			seq = nseq;
494 
495 			/* flush active streams */
496 			tcp_lro_rx_done(lc);
497 		}
498 
499 		/* add packet to LRO engine */
500 		if (tcp_lro_rx(lc, mb, 0) != 0) {
501 			/* input packet to network layer */
502 			(*lc->ifp->if_input)(lc->ifp, mb);
503 			lc->lro_queued++;
504 			lc->lro_flushed++;
505 		}
506 	}
507 done:
508 	/* flush active streams */
509 	tcp_lro_rx_done(lc);
510 
511 	lc->lro_mbuf_count = 0;
512 }
513 
#ifdef INET6
/*
 * IPv6 pre-check for tcp_lro_rx().
 *
 * Accepts only packets whose IPv6 header is followed directly by TCP.
 * On success stores the TCP header address in *th and returns 0;
 * otherwise returns TCP_LRO_NOT_SUPPORTED.  "lc" and "m" are not used.
 */
static int
tcp_lro_rx_ipv6(struct lro_ctrl *lc, struct mbuf *m, struct ip6_hdr *ip6,
    struct tcphdr **th)
{

	/* XXX-BZ we should check the flow-label. */

	/* XXX-BZ We do not yet support ext. hdrs. */
	if (ip6->ip6_nxt == IPPROTO_TCP) {
		/* The TCP header follows the fixed IPv6 header. */
		*th = (struct tcphdr *)(ip6 + 1);
		return (0);
	}

	return (TCP_LRO_NOT_SUPPORTED);
}
#endif
532 
#ifdef INET
/*
 * IPv4 pre-check for tcp_lro_rx().
 *
 * Returns TCP_LRO_NOT_SUPPORTED for non-TCP packets.  Returns
 * TCP_LRO_CANNOT for packets with IP options, for fragments, and for
 * packets with a bad header checksum (these also bump lc->lro_bad_csum).
 * On success stores the TCP header address in *th and returns 0.
 */
static int
tcp_lro_rx_ipv4(struct lro_ctrl *lc, struct mbuf *m, struct ip *ip4,
    struct tcphdr **th)
{
	int flags;
	int bad;
	uint16_t csum;

	if (ip4->ip_p != IPPROTO_TCP)
		return (TCP_LRO_NOT_SUPPORTED);

	/* Only a plain header without options is handled. */
	if ((ip4->ip_hl << 2) != sizeof (*ip4))
		return (TCP_LRO_CANNOT);

	/* Fragments are not handled either. */
	if ((ip4->ip_off & htons(IP_MF|IP_OFFMASK)) != 0)
		return (TCP_LRO_CANNOT);

	/*
	 * The header checksum must be good.  Use the verdict recorded in
	 * the mbuf if there is one, else verify in software.
	 */
	flags = m->m_pkthdr.csum_flags;
	if ((flags & CSUM_IP_CHECKED) != 0) {
		bad = ((flags & CSUM_IP_VALID) == 0);
	} else {
		csum = in_cksum_hdr(ip4);
		bad = (csum != 0);
	}
	if (__predict_false(bad)) {
		lc->lro_bad_csum++;
		return (TCP_LRO_CANNOT);
	}

	/* No options, so the TCP header follows the fixed IP header. */
	*th = (struct tcphdr *)(ip4 + 1);

	return (0);
}
#endif
573 
574 int
tcp_lro_rx(struct lro_ctrl * lc,struct mbuf * m,uint32_t csum)575 tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum)
576 {
577 	struct lro_entry *le;
578 	struct ether_header *eh;
579 #ifdef INET6
580 	struct ip6_hdr *ip6 = NULL;	/* Keep compiler happy. */
581 #endif
582 #ifdef INET
583 	struct ip *ip4 = NULL;		/* Keep compiler happy. */
584 #endif
585 	struct tcphdr *th;
586 	void *l3hdr = NULL;		/* Keep compiler happy. */
587 	uint32_t *ts_ptr;
588 	tcp_seq seq;
589 	int error, ip_len, l;
590 	uint16_t eh_type, tcp_data_len;
591 	int force_flush = 0;
592 
593 	/* We expect a contiguous header [eh, ip, tcp]. */
594 
595 	eh = mtod(m, struct ether_header *);
596 	eh_type = ntohs(eh->ether_type);
597 	switch (eh_type) {
598 #ifdef INET6
599 	case ETHERTYPE_IPV6:
600 	{
601 		CURVNET_SET(lc->ifp->if_vnet);
602 		if (V_ip6_forwarding != 0) {
603 			/* XXX-BZ stats but changing lro_ctrl is a problem. */
604 			CURVNET_RESTORE();
605 			return (TCP_LRO_CANNOT);
606 		}
607 		CURVNET_RESTORE();
608 		l3hdr = ip6 = (struct ip6_hdr *)(eh + 1);
609 		error = tcp_lro_rx_ipv6(lc, m, ip6, &th);
610 		if (error != 0)
611 			return (error);
612 		tcp_data_len = ntohs(ip6->ip6_plen);
613 		ip_len = sizeof(*ip6) + tcp_data_len;
614 		break;
615 	}
616 #endif
617 #ifdef INET
618 	case ETHERTYPE_IP:
619 	{
620 		CURVNET_SET(lc->ifp->if_vnet);
621 		if (V_ipforwarding != 0) {
622 			/* XXX-BZ stats but changing lro_ctrl is a problem. */
623 			CURVNET_RESTORE();
624 			return (TCP_LRO_CANNOT);
625 		}
626 		CURVNET_RESTORE();
627 		l3hdr = ip4 = (struct ip *)(eh + 1);
628 		error = tcp_lro_rx_ipv4(lc, m, ip4, &th);
629 		if (error != 0)
630 			return (error);
631 		ip_len = ntohs(ip4->ip_len);
632 		tcp_data_len = ip_len - sizeof(*ip4);
633 		break;
634 	}
635 #endif
636 	/* XXX-BZ what happens in case of VLAN(s)? */
637 	default:
638 		return (TCP_LRO_NOT_SUPPORTED);
639 	}
640 
641 	/*
642 	 * If the frame is padded beyond the end of the IP packet, then we must
643 	 * trim the extra bytes off.
644 	 */
645 	l = m->m_pkthdr.len - (ETHER_HDR_LEN + ip_len);
646 	if (l != 0) {
647 		if (l < 0)
648 			/* Truncated packet. */
649 			return (TCP_LRO_CANNOT);
650 
651 		m_adj(m, -l);
652 	}
653 
654 	/*
655 	 * Check TCP header constraints.
656 	 */
657 	/* Ensure no bits set besides ACK or PSH. */
658 	if ((th->th_flags & ~(TH_ACK | TH_PUSH)) != 0) {
659 		if (th->th_flags & TH_SYN)
660 			return (TCP_LRO_CANNOT);
661 		/*
662 		 * Make sure that previously seen segements/ACKs are delivered
663 		 * before this segement, e.g. FIN.
664 		 */
665 		force_flush = 1;
666 	}
667 
668 	/* XXX-BZ We lose a ACK|PUSH flag concatenating multiple segments. */
669 	/* XXX-BZ Ideally we'd flush on PUSH? */
670 
671 	/*
672 	 * Check for timestamps.
673 	 * Since the only option we handle are timestamps, we only have to
674 	 * handle the simple case of aligned timestamps.
675 	 */
676 	l = (th->th_off << 2);
677 	tcp_data_len -= l;
678 	l -= sizeof(*th);
679 	ts_ptr = (uint32_t *)(th + 1);
680 	if (l != 0 && (__predict_false(l != TCPOLEN_TSTAMP_APPA) ||
681 	    (*ts_ptr != ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16|
682 	    TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)))) {
683 		/*
684 		 * Make sure that previously seen segements/ACKs are delivered
685 		 * before this segement.
686 		 */
687 		force_flush = 1;
688 	}
689 
690 	/* If the driver did not pass in the checksum, set it now. */
691 	if (csum == 0x0000)
692 		csum = th->th_sum;
693 
694 	seq = ntohl(th->th_seq);
695 
696 	/* Try to find a matching previous segment. */
697 	LIST_FOREACH(le, &lc->lro_active, next) {
698 		if (le->eh_type != eh_type)
699 			continue;
700 		if (le->source_port != th->th_sport ||
701 		    le->dest_port != th->th_dport)
702 			continue;
703 		switch (eh_type) {
704 #ifdef INET6
705 		case ETHERTYPE_IPV6:
706 			if (bcmp(&le->source_ip6, &ip6->ip6_src,
707 			    sizeof(struct in6_addr)) != 0 ||
708 			    bcmp(&le->dest_ip6, &ip6->ip6_dst,
709 			    sizeof(struct in6_addr)) != 0)
710 				continue;
711 			break;
712 #endif
713 #ifdef INET
714 		case ETHERTYPE_IP:
715 			if (le->source_ip4 != ip4->ip_src.s_addr ||
716 			    le->dest_ip4 != ip4->ip_dst.s_addr)
717 				continue;
718 			break;
719 #endif
720 		}
721 
722 		if (force_flush) {
723 			/* Timestamps mismatch; this is a FIN, etc */
724 			tcp_lro_active_remove(le);
725 			tcp_lro_flush(lc, le);
726 			return (TCP_LRO_CANNOT);
727 		}
728 
729 		/* Flush now if appending will result in overflow. */
730 		if (le->p_len > (lc->lro_length_lim - tcp_data_len)) {
731 			tcp_lro_active_remove(le);
732 			tcp_lro_flush(lc, le);
733 			break;
734 		}
735 
736 		/* Try to append the new segment. */
737 		if (__predict_false(seq != le->next_seq ||
738 		    (tcp_data_len == 0 &&
739 		    le->ack_seq == th->th_ack &&
740 		    le->window == th->th_win))) {
741 			/* Out of order packet or duplicate ACK. */
742 			tcp_lro_active_remove(le);
743 			tcp_lro_flush(lc, le);
744 			return (TCP_LRO_CANNOT);
745 		}
746 
747 		if (l != 0) {
748 			uint32_t tsval = ntohl(*(ts_ptr + 1));
749 			/* Make sure timestamp values are increasing. */
750 			/* XXX-BZ flip and use TSTMP_GEQ macro for this? */
751 			if (__predict_false(le->tsval > tsval ||
752 			    *(ts_ptr + 2) == 0))
753 				return (TCP_LRO_CANNOT);
754 			le->tsval = tsval;
755 			le->tsecr = *(ts_ptr + 2);
756 		}
757 		if (tcp_data_len || SEQ_GT(ntohl(th->th_ack), ntohl(le->ack_seq))) {
758 			le->next_seq += tcp_data_len;
759 			le->ack_seq = th->th_ack;
760 			le->window = th->th_win;
761 			le->append_cnt++;
762 		} else if (th->th_ack == le->ack_seq) {
763 			le->window = WIN_MAX(le->window, th->th_win);
764 			le->append_cnt++;
765 		} else {
766 			/* no data and old ack */
767 			le->append_cnt++;
768 			m_freem(m);
769 			return (0);
770 		}
771 #ifdef TCP_LRO_UPDATE_CSUM
772 		le->ulp_csum += tcp_lro_rx_csum_fixup(le, l3hdr, th,
773 		    tcp_data_len, ~csum);
774 #endif
775 
776 		if (tcp_data_len == 0) {
777 			m_freem(m);
778 			/*
779 			 * Flush this LRO entry, if this ACK should not
780 			 * be further delayed.
781 			 */
782 			if (le->append_cnt >= lc->lro_ackcnt_lim) {
783 				tcp_lro_active_remove(le);
784 				tcp_lro_flush(lc, le);
785 			}
786 			return (0);
787 		}
788 
789 		le->p_len += tcp_data_len;
790 
791 		/*
792 		 * Adjust the mbuf so that m_data points to the first byte of
793 		 * the ULP payload.  Adjust the mbuf to avoid complications and
794 		 * append new segment to existing mbuf chain.
795 		 */
796 		m_adj(m, m->m_pkthdr.len - tcp_data_len);
797 		m_demote_pkthdr(m);
798 
799 		le->m_tail->m_next = m;
800 		le->m_tail = m_last(m);
801 
802 		/*
803 		 * If a possible next full length packet would cause an
804 		 * overflow, pro-actively flush now.
805 		 */
806 		if (le->p_len > (lc->lro_length_lim - lc->ifp->if_mtu)) {
807 			tcp_lro_active_remove(le);
808 			tcp_lro_flush(lc, le);
809 		} else
810 			getmicrotime(&le->mtime);
811 
812 		return (0);
813 	}
814 
815 	if (force_flush) {
816 		/*
817 		 * Nothing to flush, but this segment can not be further
818 		 * aggregated/delayed.
819 		 */
820 		return (TCP_LRO_CANNOT);
821 	}
822 
823 	/* Try to find an empty slot. */
824 	if (LIST_EMPTY(&lc->lro_free))
825 		return (TCP_LRO_NO_ENTRIES);
826 
827 	/* Start a new segment chain. */
828 	le = LIST_FIRST(&lc->lro_free);
829 	LIST_REMOVE(le, next);
830 	tcp_lro_active_insert(lc, le);
831 	getmicrotime(&le->mtime);
832 
833 	/* Start filling in details. */
834 	switch (eh_type) {
835 #ifdef INET6
836 	case ETHERTYPE_IPV6:
837 		le->le_ip6 = ip6;
838 		le->source_ip6 = ip6->ip6_src;
839 		le->dest_ip6 = ip6->ip6_dst;
840 		le->eh_type = eh_type;
841 		le->p_len = m->m_pkthdr.len - ETHER_HDR_LEN - sizeof(*ip6);
842 		break;
843 #endif
844 #ifdef INET
845 	case ETHERTYPE_IP:
846 		le->le_ip4 = ip4;
847 		le->source_ip4 = ip4->ip_src.s_addr;
848 		le->dest_ip4 = ip4->ip_dst.s_addr;
849 		le->eh_type = eh_type;
850 		le->p_len = m->m_pkthdr.len - ETHER_HDR_LEN;
851 		break;
852 #endif
853 	}
854 	le->source_port = th->th_sport;
855 	le->dest_port = th->th_dport;
856 
857 	le->next_seq = seq + tcp_data_len;
858 	le->ack_seq = th->th_ack;
859 	le->window = th->th_win;
860 	if (l != 0) {
861 		le->timestamp = 1;
862 		le->tsval = ntohl(*(ts_ptr + 1));
863 		le->tsecr = *(ts_ptr + 2);
864 	}
865 
866 #ifdef TCP_LRO_UPDATE_CSUM
867 	/*
868 	 * Do not touch the csum of the first packet.  However save the
869 	 * "adjusted" checksum of just the source and destination addresses,
870 	 * the next header and the TCP payload.  The length and TCP header
871 	 * parts may change, so we remove those from the saved checksum and
872 	 * re-add with final values on tcp_lro_flush() if needed.
873 	 */
874 	KASSERT(le->ulp_csum == 0, ("%s: le=%p le->ulp_csum=0x%04x\n",
875 	    __func__, le, le->ulp_csum));
876 
877 	le->ulp_csum = tcp_lro_rx_csum_fixup(le, l3hdr, th, tcp_data_len,
878 	    ~csum);
879 	th->th_sum = csum;	/* Restore checksum on first packet. */
880 #endif
881 
882 	le->m_head = m;
883 	le->m_tail = m_last(m);
884 
885 	return (0);
886 }
887 
888 void
tcp_lro_queue_mbuf(struct lro_ctrl * lc,struct mbuf * mb)889 tcp_lro_queue_mbuf(struct lro_ctrl *lc, struct mbuf *mb)
890 {
891 	/* sanity checks */
892 	if (__predict_false(lc->ifp == NULL || lc->lro_mbuf_data == NULL ||
893 	    lc->lro_mbuf_max == 0)) {
894 		/* packet drop */
895 		m_freem(mb);
896 		return;
897 	}
898 
899 	/* check if packet is not LRO capable */
900 	if (__predict_false(mb->m_pkthdr.csum_flags == 0 ||
901 	    (lc->ifp->if_capenable & IFCAP_LRO) == 0)) {
902 
903 		/* input packet to network layer */
904 		(*lc->ifp->if_input) (lc->ifp, mb);
905 		return;
906 	}
907 
908 	/* create sequence number */
909 	lc->lro_mbuf_data[lc->lro_mbuf_count].seq =
910 	    (((uint64_t)M_HASHTYPE_GET(mb)) << 56) |
911 	    (((uint64_t)mb->m_pkthdr.flowid) << 24) |
912 	    ((uint64_t)lc->lro_mbuf_count);
913 
914 	/* enter mbuf */
915 	lc->lro_mbuf_data[lc->lro_mbuf_count].mb = mb;
916 
917 	/* flush if array is full */
918 	if (__predict_false(++lc->lro_mbuf_count == lc->lro_mbuf_max))
919 		tcp_lro_flush_all(lc);
920 }
921 
922 /* end */
923