1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3 *
4 * Copyright (c) 2010-2016 Solarflare Communications Inc.
5 * All rights reserved.
6 *
7 * This software was developed in part by Philip Paeps under contract for
8 * Solarflare Communications, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions are met:
12 *
13 * 1. Redistributions of source code must retain the above copyright notice,
14 * this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright notice,
16 * this list of conditions and the following disclaimer in the documentation
17 * and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
21 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
23 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
24 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
26 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
28 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
29 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 *
31 * The views and conclusions contained in the software and documentation are
32 * those of the authors and should not be interpreted as representing official
33 * policies, either expressed or implied, of the FreeBSD Project.
34 */
35
36 #include <sys/cdefs.h>
37 __FBSDID("$FreeBSD: stable/12/sys/dev/sfxge/sfxge_rx.c 350405 2019-07-29 09:25:16Z arybchik $");
38
39 #include "opt_rss.h"
40
41 #include <sys/param.h>
42 #include <sys/malloc.h>
43 #include <sys/mbuf.h>
44 #include <sys/smp.h>
45 #include <sys/socket.h>
46 #include <sys/sysctl.h>
47 #include <sys/syslog.h>
48 #include <sys/limits.h>
49 #include <sys/syslog.h>
50
51 #include <net/ethernet.h>
52 #include <net/if.h>
53 #include <net/if_vlan_var.h>
54
55 #include <netinet/in.h>
56 #include <netinet/ip.h>
57 #include <netinet/ip6.h>
58 #include <netinet/tcp.h>
59
60 #include <machine/in_cksum.h>
61
62 #ifdef RSS
63 #include <net/rss_config.h>
64 #endif
65
66 #include "common/efx.h"
67
68
69 #include "sfxge.h"
70 #include "sfxge_rx.h"
71
/*
 * Nine tenths of the usable descriptor limit of a receive queue with
 * _entries entries.
 * NOTE(review): presumably the source of rxq->refill_threshold, which
 * sfxge_rx_qcomplete() compares the fill level against -- confirm where
 * the queue is initialised.
 */
#define	RX_REFILL_THRESHOLD(_entries)	(EFX_RXQ_LIMIT(_entries) * 9 / 10)
73
74 #ifdef SFXGE_LRO
75
76 SYSCTL_NODE(_hw_sfxge, OID_AUTO, lro, CTLFLAG_RD, NULL,
77 "Large receive offload (LRO) parameters");
78
79 #define SFXGE_LRO_PARAM(_param) SFXGE_PARAM(lro._param)
80
81 /* Size of the LRO hash table. Must be a power of 2. A larger table
82 * means we can accelerate a larger number of streams.
83 */
84 static unsigned lro_table_size = 128;
85 TUNABLE_INT(SFXGE_LRO_PARAM(table_size), &lro_table_size);
86 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, table_size, CTLFLAG_RDTUN,
87 &lro_table_size, 0,
88 "Size of the LRO hash table (must be a power of 2)");
89
90 /* Maximum length of a hash chain. If chains get too long then the lookup
91 * time increases and may exceed the benefit of LRO.
92 */
93 static unsigned lro_chain_max = 20;
94 TUNABLE_INT(SFXGE_LRO_PARAM(chain_max), &lro_chain_max);
95 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, chain_max, CTLFLAG_RDTUN,
96 &lro_chain_max, 0,
97 "The maximum length of a hash chain");
98
99 /* Maximum time (in ticks) that a connection can be idle before it's LRO
100 * state is discarded.
101 */
102 static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */
103 TUNABLE_INT(SFXGE_LRO_PARAM(idle_ticks), &lro_idle_ticks);
104 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, idle_ticks, CTLFLAG_RDTUN,
105 &lro_idle_ticks, 0,
106 "The maximum time (in ticks) that a connection can be idle "
107 "before it's LRO state is discarded");
108
109 /* Number of packets with payload that must arrive in-order before a
110 * connection is eligible for LRO. The idea is we should avoid coalescing
111 * segments when the sender is in slow-start because reducing the ACK rate
112 * can damage performance.
113 */
114 static int lro_slow_start_packets = 2000;
115 TUNABLE_INT(SFXGE_LRO_PARAM(slow_start_packets), &lro_slow_start_packets);
116 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, slow_start_packets, CTLFLAG_RDTUN,
117 &lro_slow_start_packets, 0,
118 "Number of packets with payload that must arrive in-order before "
119 "a connection is eligible for LRO");
120
121 /* Number of packets with payload that must arrive in-order following loss
122 * before a connection is eligible for LRO. The idea is we should avoid
123 * coalescing segments when the sender is recovering from loss, because
124 * reducing the ACK rate can damage performance.
125 */
126 static int lro_loss_packets = 20;
127 TUNABLE_INT(SFXGE_LRO_PARAM(loss_packets), &lro_loss_packets);
128 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, loss_packets, CTLFLAG_RDTUN,
129 &lro_loss_packets, 0,
130 "Number of packets with payload that must arrive in-order "
131 "following loss before a connection is eligible for LRO");
132
133 /* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
134 #define SFXGE_LRO_L2_ID_VLAN 0x4000
135 #define SFXGE_LRO_L2_ID_IPV6 0x8000
136 #define SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
137 #define SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))
138
/*
 * Compare two IPv6 addresses for equality without conditional branches.
 *
 * Returns 0 when the two addresses are identical and 1 otherwise; the
 * result carries no ordering information, so callers may only test it
 * against zero (or OR several results together, as sfxge_lro() does).
 *
 * The address bytes are copied into local 64-bit words with memcpy()
 * instead of being read through a cast uint64_t pointer: the receive path
 * only arranges 32-bit alignment for IP headers, so an 8-byte load from an
 * address inside a packet may be misaligned, and the cast read is an
 * aliasing violation as well.  The copies have a fixed 16-byte size, which
 * is the size of struct in6_addr.
 */
static unsigned long ipv6_addr_cmp(const struct in6_addr *left,
				   const struct in6_addr *right)
{
	uint64_t l[2], r[2];

	memcpy(l, left, sizeof(l));
	memcpy(r, right, sizeof(r));

	/*
	 * Collapse to 0/1 so that no differing bit is lost when the 64-bit
	 * value is narrowed to a 32-bit unsigned long.
	 */
	return (((l[0] ^ r[0]) | (l[1] ^ r[1])) != 0);
}
154
155 #endif /* SFXGE_LRO */
156
157 void
sfxge_rx_qflush_done(struct sfxge_rxq * rxq)158 sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
159 {
160
161 rxq->flush_state = SFXGE_FLUSH_DONE;
162 }
163
164 void
sfxge_rx_qflush_failed(struct sfxge_rxq * rxq)165 sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
166 {
167
168 rxq->flush_state = SFXGE_FLUSH_FAILED;
169 }
170
/*
 * Hash key storage.
 * NOTE(review): presumably the Toeplitz RSS key programmed into the
 * adapter; the code that consumes it is outside this part of the file.
 *
 * With the RSS kernel option only RSS_KEYSIZE bytes of storage are
 * reserved here; without it a fixed 40-byte key is compiled in.
 */
#ifdef RSS
static uint8_t toep_key[RSS_KEYSIZE];
#else
static uint8_t toep_key[] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};
#endif
182
183 static void
sfxge_rx_post_refill(void * arg)184 sfxge_rx_post_refill(void *arg)
185 {
186 struct sfxge_rxq *rxq = arg;
187 struct sfxge_softc *sc;
188 unsigned int index;
189 struct sfxge_evq *evq;
190 uint16_t magic;
191
192 sc = rxq->sc;
193 index = rxq->index;
194 evq = sc->evq[index];
195 magic = sfxge_sw_ev_rxq_magic(SFXGE_SW_EV_RX_QREFILL, rxq);
196
197 /* This is guaranteed due to the start/stop order of rx and ev */
198 KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
199 ("evq not started"));
200 KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
201 ("rxq not started"));
202 efx_ev_qpost(evq->common, magic);
203 }
204
205 static void
sfxge_rx_schedule_refill(struct sfxge_rxq * rxq,boolean_t retrying)206 sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
207 {
208 /* Initially retry after 100 ms, but back off in case of
209 * repeated failures as we probably have to wait for the
210 * administrator to raise the pool limit. */
211 if (retrying)
212 rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
213 else
214 rxq->refill_delay = hz / 10;
215
216 callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
217 sfxge_rx_post_refill, rxq);
218 }
219
220 #define SFXGE_REFILL_BATCH 64
221
222 static void
sfxge_rx_qfill(struct sfxge_rxq * rxq,unsigned int target,boolean_t retrying)223 sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
224 {
225 struct sfxge_softc *sc;
226 unsigned int index;
227 struct sfxge_evq *evq;
228 unsigned int batch;
229 unsigned int rxfill;
230 unsigned int mblksize;
231 int ntodo;
232 efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];
233
234 sc = rxq->sc;
235 index = rxq->index;
236 evq = sc->evq[index];
237
238 prefetch_read_many(sc->enp);
239 prefetch_read_many(rxq->common);
240
241 SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
242
243 if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
244 return;
245
246 rxfill = rxq->added - rxq->completed;
247 KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries),
248 ("rxfill > EFX_RXQ_LIMIT(rxq->entries)"));
249 ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target);
250 KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries),
251 ("ntodo > EFX_RQX_LIMIT(rxq->entries)"));
252
253 if (ntodo == 0)
254 return;
255
256 batch = 0;
257 mblksize = sc->rx_buffer_size - sc->rx_buffer_align;
258 while (ntodo-- > 0) {
259 unsigned int id;
260 struct sfxge_rx_sw_desc *rx_desc;
261 bus_dma_segment_t seg;
262 struct mbuf *m;
263
264 id = (rxq->added + batch) & rxq->ptr_mask;
265 rx_desc = &rxq->queue[id];
266 KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));
267
268 rx_desc->flags = EFX_DISCARD;
269 m = rx_desc->mbuf = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR,
270 sc->rx_cluster_size);
271 if (m == NULL)
272 break;
273
274 /* m_len specifies length of area to be mapped for DMA */
275 m->m_len = mblksize;
276 m->m_data = (caddr_t)EFX_P2ROUNDUP(uintptr_t, m->m_data,
277 CACHE_LINE_SIZE);
278 m->m_data += sc->rx_buffer_align;
279
280 sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
281 addr[batch++] = seg.ds_addr;
282
283 if (batch == SFXGE_REFILL_BATCH) {
284 efx_rx_qpost(rxq->common, addr, mblksize, batch,
285 rxq->completed, rxq->added);
286 rxq->added += batch;
287 batch = 0;
288 }
289 }
290
291 if (ntodo != 0)
292 sfxge_rx_schedule_refill(rxq, retrying);
293
294 if (batch != 0) {
295 efx_rx_qpost(rxq->common, addr, mblksize, batch,
296 rxq->completed, rxq->added);
297 rxq->added += batch;
298 }
299
300 /* Make the descriptors visible to the hardware */
301 bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
302 BUS_DMASYNC_PREWRITE);
303
304 efx_rx_qpush(rxq->common, rxq->added, &rxq->pushed);
305
306 /* The queue could still be empty if no descriptors were actually
307 * pushed, in which case there will be no event to cause the next
308 * refill, so we must schedule a refill ourselves.
309 */
310 if(rxq->pushed == rxq->completed) {
311 sfxge_rx_schedule_refill(rxq, retrying);
312 }
313 }
314
315 void
sfxge_rx_qrefill(struct sfxge_rxq * rxq)316 sfxge_rx_qrefill(struct sfxge_rxq *rxq)
317 {
318
319 if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
320 return;
321
322 /* Make sure the queue is full */
323 sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE);
324 }
325
__sfxge_rx_deliver(struct sfxge_softc * sc,struct mbuf * m)326 static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
327 {
328 struct ifnet *ifp = sc->ifnet;
329
330 m->m_pkthdr.rcvif = ifp;
331 m->m_pkthdr.csum_data = 0xffff;
332 ifp->if_input(ifp, m);
333 }
334
335 static void
sfxge_rx_deliver(struct sfxge_rxq * rxq,struct sfxge_rx_sw_desc * rx_desc)336 sfxge_rx_deliver(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_desc)
337 {
338 struct sfxge_softc *sc = rxq->sc;
339 struct mbuf *m = rx_desc->mbuf;
340 int flags = rx_desc->flags;
341 int csum_flags;
342
343 /* Convert checksum flags */
344 csum_flags = (flags & EFX_CKSUM_IPV4) ?
345 (CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
346 if (flags & EFX_CKSUM_TCPUDP)
347 csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
348
349 if (flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
350 m->m_pkthdr.flowid =
351 efx_pseudo_hdr_hash_get(rxq->common,
352 EFX_RX_HASHALG_TOEPLITZ,
353 mtod(m, uint8_t *));
354 /* The hash covers a 4-tuple for TCP only */
355 M_HASHTYPE_SET(m,
356 (flags & EFX_PKT_IPV4) ?
357 ((flags & EFX_PKT_TCP) ?
358 M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_IPV4) :
359 ((flags & EFX_PKT_TCP) ?
360 M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_IPV6));
361 }
362 m->m_data += sc->rx_prefix_size;
363 m->m_len = rx_desc->size - sc->rx_prefix_size;
364 m->m_pkthdr.len = m->m_len;
365 m->m_pkthdr.csum_flags = csum_flags;
366 __sfxge_rx_deliver(sc, rx_desc->mbuf);
367
368 rx_desc->flags = EFX_DISCARD;
369 rx_desc->mbuf = NULL;
370 }
371
372 #ifdef SFXGE_LRO
373
374 static void
sfxge_lro_deliver(struct sfxge_lro_state * st,struct sfxge_lro_conn * c)375 sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
376 {
377 struct sfxge_softc *sc = st->sc;
378 struct mbuf *m = c->mbuf;
379 struct tcphdr *c_th;
380 int csum_flags;
381
382 KASSERT(m, ("no mbuf to deliver"));
383
384 ++st->n_bursts;
385
386 /* Finish off packet munging and recalculate IP header checksum. */
387 if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
388 struct ip *iph = c->nh;
389 iph->ip_len = htons(iph->ip_len);
390 iph->ip_sum = 0;
391 iph->ip_sum = in_cksum_hdr(iph);
392 c_th = (struct tcphdr *)(iph + 1);
393 csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
394 CSUM_IP_CHECKED | CSUM_IP_VALID);
395 } else {
396 struct ip6_hdr *iph = c->nh;
397 iph->ip6_plen = htons(iph->ip6_plen);
398 c_th = (struct tcphdr *)(iph + 1);
399 csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
400 }
401
402 c_th->th_win = c->th_last->th_win;
403 c_th->th_ack = c->th_last->th_ack;
404 if (c_th->th_off == c->th_last->th_off) {
405 /* Copy TCP options (take care to avoid going negative). */
406 int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
407 memcpy(c_th + 1, c->th_last + 1, optlen);
408 }
409
410 m->m_pkthdr.flowid = c->conn_hash;
411 M_HASHTYPE_SET(m,
412 SFXGE_LRO_CONN_IS_TCPIPV4(c) ?
413 M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_TCP_IPV6);
414
415 m->m_pkthdr.csum_flags = csum_flags;
416 __sfxge_rx_deliver(sc, m);
417
418 c->mbuf = NULL;
419 c->delivered = 1;
420 }
421
422 /* Drop the given connection, and add it to the free list. */
sfxge_lro_drop(struct sfxge_rxq * rxq,struct sfxge_lro_conn * c)423 static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
424 {
425 unsigned bucket;
426
427 KASSERT(!c->mbuf, ("found orphaned mbuf"));
428
429 if (c->next_buf.mbuf != NULL) {
430 sfxge_rx_deliver(rxq, &c->next_buf);
431 LIST_REMOVE(c, active_link);
432 }
433
434 bucket = c->conn_hash & rxq->lro.conns_mask;
435 KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
436 --rxq->lro.conns_n[bucket];
437 TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
438 TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
439 }
440
441 /* Stop tracking connections that have gone idle in order to keep hash
442 * chains short.
443 */
sfxge_lro_purge_idle(struct sfxge_rxq * rxq,unsigned now)444 static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
445 {
446 struct sfxge_lro_conn *c;
447 unsigned i;
448
449 KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
450 ("found active connections"));
451
452 rxq->lro.last_purge_ticks = now;
453 for (i = 0; i <= rxq->lro.conns_mask; ++i) {
454 if (TAILQ_EMPTY(&rxq->lro.conns[i]))
455 continue;
456
457 c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
458 if (now - c->last_pkt_ticks > lro_idle_ticks) {
459 ++rxq->lro.n_drop_idle;
460 sfxge_lro_drop(rxq, c);
461 }
462 }
463 }
464
465 static void
sfxge_lro_merge(struct sfxge_lro_state * st,struct sfxge_lro_conn * c,struct mbuf * mbuf,struct tcphdr * th)466 sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
467 struct mbuf *mbuf, struct tcphdr *th)
468 {
469 struct tcphdr *c_th;
470
471 /* Tack the new mbuf onto the chain. */
472 KASSERT(!mbuf->m_next, ("mbuf already chained"));
473 c->mbuf_tail->m_next = mbuf;
474 c->mbuf_tail = mbuf;
475
476 /* Increase length appropriately */
477 c->mbuf->m_pkthdr.len += mbuf->m_len;
478
479 /* Update the connection state flags */
480 if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
481 struct ip *iph = c->nh;
482 iph->ip_len += mbuf->m_len;
483 c_th = (struct tcphdr *)(iph + 1);
484 } else {
485 struct ip6_hdr *iph = c->nh;
486 iph->ip6_plen += mbuf->m_len;
487 c_th = (struct tcphdr *)(iph + 1);
488 }
489 c_th->th_flags |= (th->th_flags & TH_PUSH);
490 c->th_last = th;
491 ++st->n_merges;
492
493 /* Pass packet up now if another segment could overflow the IP
494 * length.
495 */
496 if (c->mbuf->m_pkthdr.len > 65536 - 9200)
497 sfxge_lro_deliver(st, c);
498 }
499
500 static void
sfxge_lro_start(struct sfxge_lro_state * st,struct sfxge_lro_conn * c,struct mbuf * mbuf,void * nh,struct tcphdr * th)501 sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
502 struct mbuf *mbuf, void *nh, struct tcphdr *th)
503 {
504 /* Start the chain */
505 c->mbuf = mbuf;
506 c->mbuf_tail = c->mbuf;
507 c->nh = nh;
508 c->th_last = th;
509
510 mbuf->m_pkthdr.len = mbuf->m_len;
511
512 /* Mangle header fields for later processing */
513 if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
514 struct ip *iph = nh;
515 iph->ip_len = ntohs(iph->ip_len);
516 } else {
517 struct ip6_hdr *iph = nh;
518 iph->ip6_plen = ntohs(iph->ip6_plen);
519 }
520 }
521
522 /* Try to merge or otherwise hold or deliver (as appropriate) the
523 * packet buffered for this connection (c->next_buf). Return a flag
524 * indicating whether the connection is still active for LRO purposes.
525 */
526 static int
sfxge_lro_try_merge(struct sfxge_rxq * rxq,struct sfxge_lro_conn * c)527 sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
528 {
529 struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
530 char *eh = c->next_eh;
531 int data_length, hdr_length, dont_merge;
532 unsigned th_seq, pkt_length;
533 struct tcphdr *th;
534 unsigned now;
535
536 if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
537 struct ip *iph = c->next_nh;
538 th = (struct tcphdr *)(iph + 1);
539 pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
540 } else {
541 struct ip6_hdr *iph = c->next_nh;
542 th = (struct tcphdr *)(iph + 1);
543 pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;
544 }
545
546 hdr_length = (char *) th + th->th_off * 4 - eh;
547 data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
548 hdr_length);
549 th_seq = ntohl(th->th_seq);
550 dont_merge = ((data_length <= 0)
551 | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN)));
552
553 /* Check for options other than aligned timestamp. */
554 if (th->th_off != 5) {
555 const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
556 if (th->th_off == 8 &&
557 opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
558 (TCPOPT_NOP << 16) |
559 (TCPOPT_TIMESTAMP << 8) |
560 TCPOLEN_TIMESTAMP)) {
561 /* timestamp option -- okay */
562 } else {
563 dont_merge = 1;
564 }
565 }
566
567 if (__predict_false(th_seq != c->next_seq)) {
568 /* Out-of-order, so start counting again. */
569 if (c->mbuf != NULL)
570 sfxge_lro_deliver(&rxq->lro, c);
571 c->n_in_order_pkts -= lro_loss_packets;
572 c->next_seq = th_seq + data_length;
573 ++rxq->lro.n_misorder;
574 goto deliver_buf_out;
575 }
576 c->next_seq = th_seq + data_length;
577
578 now = ticks;
579 if (now - c->last_pkt_ticks > lro_idle_ticks) {
580 ++rxq->lro.n_drop_idle;
581 if (c->mbuf != NULL)
582 sfxge_lro_deliver(&rxq->lro, c);
583 sfxge_lro_drop(rxq, c);
584 return (0);
585 }
586 c->last_pkt_ticks = ticks;
587
588 if (c->n_in_order_pkts < lro_slow_start_packets) {
589 /* May be in slow-start, so don't merge. */
590 ++rxq->lro.n_slow_start;
591 ++c->n_in_order_pkts;
592 goto deliver_buf_out;
593 }
594
595 if (__predict_false(dont_merge)) {
596 if (c->mbuf != NULL)
597 sfxge_lro_deliver(&rxq->lro, c);
598 if (th->th_flags & (TH_FIN | TH_RST)) {
599 ++rxq->lro.n_drop_closed;
600 sfxge_lro_drop(rxq, c);
601 return (0);
602 }
603 goto deliver_buf_out;
604 }
605
606 rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;
607
608 if (__predict_true(c->mbuf != NULL)) {
609 /* Remove headers and any padding */
610 rx_buf->mbuf->m_data += hdr_length;
611 rx_buf->mbuf->m_len = data_length;
612
613 sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
614 } else {
615 /* Remove any padding */
616 rx_buf->mbuf->m_len = pkt_length;
617
618 sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);
619 }
620
621 rx_buf->mbuf = NULL;
622 return (1);
623
624 deliver_buf_out:
625 sfxge_rx_deliver(rxq, rx_buf);
626 return (1);
627 }
628
sfxge_lro_new_conn(struct sfxge_lro_state * st,uint32_t conn_hash,uint16_t l2_id,void * nh,struct tcphdr * th)629 static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
630 uint16_t l2_id, void *nh, struct tcphdr *th)
631 {
632 unsigned bucket = conn_hash & st->conns_mask;
633 struct sfxge_lro_conn *c;
634
635 if (st->conns_n[bucket] >= lro_chain_max) {
636 ++st->n_too_many;
637 return;
638 }
639
640 if (!TAILQ_EMPTY(&st->free_conns)) {
641 c = TAILQ_FIRST(&st->free_conns);
642 TAILQ_REMOVE(&st->free_conns, c, link);
643 } else {
644 c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT);
645 if (c == NULL)
646 return;
647 c->mbuf = NULL;
648 c->next_buf.mbuf = NULL;
649 }
650
651 /* Create the connection tracking data */
652 ++st->conns_n[bucket];
653 TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
654 c->l2_id = l2_id;
655 c->conn_hash = conn_hash;
656 c->source = th->th_sport;
657 c->dest = th->th_dport;
658 c->n_in_order_pkts = 0;
659 c->last_pkt_ticks = *(volatile int *)&ticks;
660 c->delivered = 0;
661 ++st->n_new_stream;
662 /* NB. We don't initialise c->next_seq, and it doesn't matter what
663 * value it has. Most likely the next packet received for this
664 * connection will not match -- no harm done.
665 */
666 }
667
668 /* Process mbuf and decide whether to dispatch it to the stack now or
669 * later.
670 */
671 static void
sfxge_lro(struct sfxge_rxq * rxq,struct sfxge_rx_sw_desc * rx_buf)672 sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
673 {
674 struct sfxge_softc *sc = rxq->sc;
675 struct mbuf *m = rx_buf->mbuf;
676 struct ether_header *eh;
677 struct sfxge_lro_conn *c;
678 uint16_t l2_id;
679 uint16_t l3_proto;
680 void *nh;
681 struct tcphdr *th;
682 uint32_t conn_hash;
683 unsigned bucket;
684
685 /* Get the hardware hash */
686 conn_hash = efx_pseudo_hdr_hash_get(rxq->common,
687 EFX_RX_HASHALG_TOEPLITZ,
688 mtod(m, uint8_t *));
689
690 eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
691 if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
692 struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
693 l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
694 SFXGE_LRO_L2_ID_VLAN;
695 l3_proto = veh->evl_proto;
696 nh = veh + 1;
697 } else {
698 l2_id = 0;
699 l3_proto = eh->ether_type;
700 nh = eh + 1;
701 }
702
703 /* Check whether this is a suitable packet (unfragmented
704 * TCP/IPv4 or TCP/IPv6). If so, find the TCP header and
705 * length, and compute a hash if necessary. If not, return.
706 */
707 if (l3_proto == htons(ETHERTYPE_IP)) {
708 struct ip *iph = nh;
709
710 KASSERT(iph->ip_p == IPPROTO_TCP,
711 ("IPv4 protocol is not TCP, but packet marker is set"));
712 if ((iph->ip_hl - (sizeof(*iph) >> 2u)) |
713 (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
714 goto deliver_now;
715 th = (struct tcphdr *)(iph + 1);
716 } else if (l3_proto == htons(ETHERTYPE_IPV6)) {
717 struct ip6_hdr *iph = nh;
718
719 KASSERT(iph->ip6_nxt == IPPROTO_TCP,
720 ("IPv6 next header is not TCP, but packet marker is set"));
721 l2_id |= SFXGE_LRO_L2_ID_IPV6;
722 th = (struct tcphdr *)(iph + 1);
723 } else {
724 goto deliver_now;
725 }
726
727 bucket = conn_hash & rxq->lro.conns_mask;
728
729 TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
730 if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
731 continue;
732 if ((c->source - th->th_sport) | (c->dest - th->th_dport))
733 continue;
734 if (c->mbuf != NULL) {
735 if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
736 struct ip *c_iph, *iph = nh;
737 c_iph = c->nh;
738 if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
739 (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
740 continue;
741 } else {
742 struct ip6_hdr *c_iph, *iph = nh;
743 c_iph = c->nh;
744 if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
745 ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
746 continue;
747 }
748 }
749
750 /* Re-insert at head of list to reduce lookup time. */
751 TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
752 TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);
753
754 if (c->next_buf.mbuf != NULL) {
755 if (!sfxge_lro_try_merge(rxq, c))
756 goto deliver_now;
757 } else {
758 LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
759 active_link);
760 }
761 c->next_buf = *rx_buf;
762 c->next_eh = eh;
763 c->next_nh = nh;
764
765 rx_buf->mbuf = NULL;
766 rx_buf->flags = EFX_DISCARD;
767 return;
768 }
769
770 sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
771 deliver_now:
772 sfxge_rx_deliver(rxq, rx_buf);
773 }
774
sfxge_lro_end_of_burst(struct sfxge_rxq * rxq)775 static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
776 {
777 struct sfxge_lro_state *st = &rxq->lro;
778 struct sfxge_lro_conn *c;
779 unsigned t;
780
781 while (!LIST_EMPTY(&st->active_conns)) {
782 c = LIST_FIRST(&st->active_conns);
783 if (!c->delivered && c->mbuf != NULL)
784 sfxge_lro_deliver(st, c);
785 if (sfxge_lro_try_merge(rxq, c)) {
786 if (c->mbuf != NULL)
787 sfxge_lro_deliver(st, c);
788 LIST_REMOVE(c, active_link);
789 }
790 c->delivered = 0;
791 }
792
793 t = *(volatile int *)&ticks;
794 if (__predict_false(t != st->last_purge_ticks))
795 sfxge_lro_purge_idle(rxq, t);
796 }
797
798 #else /* !SFXGE_LRO */
799
/*
 * Stand-in used when the driver is built without SFXGE_LRO: does nothing
 * with the descriptor it is given.
 */
static void
sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
{
	/* Intentionally empty. */
}
804
/*
 * Stand-in used when the driver is built without SFXGE_LRO: there is no
 * LRO state to flush.
 */
static void
sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
{
	/* Intentionally empty. */
}
809
810 #endif /* SFXGE_LRO */
811
812 void
sfxge_rx_qcomplete(struct sfxge_rxq * rxq,boolean_t eop)813 sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
814 {
815 struct sfxge_softc *sc = rxq->sc;
816 int if_capenable = sc->ifnet->if_capenable;
817 int lro_enabled = if_capenable & IFCAP_LRO;
818 unsigned int index;
819 struct sfxge_evq *evq;
820 unsigned int completed;
821 unsigned int level;
822 struct mbuf *m;
823 struct sfxge_rx_sw_desc *prev = NULL;
824
825 index = rxq->index;
826 evq = sc->evq[index];
827
828 SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
829
830 completed = rxq->completed;
831 while (completed != rxq->pending) {
832 unsigned int id;
833 struct sfxge_rx_sw_desc *rx_desc;
834
835 id = completed++ & rxq->ptr_mask;
836 rx_desc = &rxq->queue[id];
837 m = rx_desc->mbuf;
838
839 if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
840 goto discard;
841
842 if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
843 goto discard;
844
845 /* Read the length from the pseudo header if required */
846 if (rx_desc->flags & EFX_PKT_PREFIX_LEN) {
847 uint16_t tmp_size;
848 int rc;
849 rc = efx_pseudo_hdr_pkt_length_get(rxq->common,
850 mtod(m, uint8_t *),
851 &tmp_size);
852 KASSERT(rc == 0, ("cannot get packet length: %d", rc));
853 rx_desc->size = (int)tmp_size + sc->rx_prefix_size;
854 }
855
856 prefetch_read_many(mtod(m, caddr_t));
857
858 switch (rx_desc->flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
859 case EFX_PKT_IPV4:
860 if (~if_capenable & IFCAP_RXCSUM)
861 rx_desc->flags &=
862 ~(EFX_CKSUM_IPV4 | EFX_CKSUM_TCPUDP);
863 break;
864 case EFX_PKT_IPV6:
865 if (~if_capenable & IFCAP_RXCSUM_IPV6)
866 rx_desc->flags &= ~EFX_CKSUM_TCPUDP;
867 break;
868 case 0:
869 /* Check for loopback packets */
870 {
871 struct ether_header *etherhp;
872
873 /*LINTED*/
874 etherhp = mtod(m, struct ether_header *);
875
876 if (etherhp->ether_type ==
877 htons(SFXGE_ETHERTYPE_LOOPBACK)) {
878 EFSYS_PROBE(loopback);
879
880 rxq->loopback++;
881 goto discard;
882 }
883 }
884 break;
885 default:
886 KASSERT(B_FALSE,
887 ("Rx descriptor with both IPv4 and IPv6 flags"));
888 goto discard;
889 }
890
891 /* Pass packet up the stack or into LRO (pipelined) */
892 if (prev != NULL) {
893 if (lro_enabled &&
894 ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
895 (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
896 sfxge_lro(rxq, prev);
897 else
898 sfxge_rx_deliver(rxq, prev);
899 }
900 prev = rx_desc;
901 continue;
902
903 discard:
904 /* Return the packet to the pool */
905 m_free(m);
906 rx_desc->mbuf = NULL;
907 }
908 rxq->completed = completed;
909
910 level = rxq->added - rxq->completed;
911
912 /* Pass last packet up the stack or into LRO */
913 if (prev != NULL) {
914 if (lro_enabled &&
915 ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
916 (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
917 sfxge_lro(rxq, prev);
918 else
919 sfxge_rx_deliver(rxq, prev);
920 }
921
922 /*
923 * If there are any pending flows and this is the end of the
924 * poll then they must be completed.
925 */
926 if (eop)
927 sfxge_lro_end_of_burst(rxq);
928
929 /* Top up the queue if necessary */
930 if (level < rxq->refill_threshold)
931 sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE);
932 }
933
934 static void
sfxge_rx_qstop(struct sfxge_softc * sc,unsigned int index)935 sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
936 {
937 struct sfxge_rxq *rxq;
938 struct sfxge_evq *evq;
939 unsigned int count;
940 unsigned int retry = 3;
941
942 SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);
943
944 rxq = sc->rxq[index];
945 evq = sc->evq[index];
946
947 SFXGE_EVQ_LOCK(evq);
948
949 KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
950 ("rxq not started"));
951
952 rxq->init_state = SFXGE_RXQ_INITIALIZED;
953
954 callout_stop(&rxq->refill_callout);
955
956 while (rxq->flush_state != SFXGE_FLUSH_DONE && retry != 0) {
957 rxq->flush_state = SFXGE_FLUSH_PENDING;
958
959 SFXGE_EVQ_UNLOCK(evq);
960
961 /* Flush the receive queue */
962 if (efx_rx_qflush(rxq->common) != 0) {
963 SFXGE_EVQ_LOCK(evq);
964 rxq->flush_state = SFXGE_FLUSH_FAILED;
965 break;
966 }
967
968 count = 0;
969 do {
970 /* Spin for 100 ms */
971 DELAY(100000);
972
973 if (rxq->flush_state != SFXGE_FLUSH_PENDING)
974 break;
975
976 } while (++count < 20);
977
978 SFXGE_EVQ_LOCK(evq);
979
980 if (rxq->flush_state == SFXGE_FLUSH_PENDING) {
981 /* Flush timeout - neither done nor failed */
982 log(LOG_ERR, "%s: Cannot flush Rx queue %u\n",
983 device_get_nameunit(sc->dev), index);
984 rxq->flush_state = SFXGE_FLUSH_DONE;
985 }
986 retry--;
987 }
988 if (rxq->flush_state == SFXGE_FLUSH_FAILED) {
989 log(LOG_ERR, "%s: Flushing Rx queue %u failed\n",
990 device_get_nameunit(sc->dev), index);
991 rxq->flush_state = SFXGE_FLUSH_DONE;
992 }
993
994 rxq->pending = rxq->added;
995 sfxge_rx_qcomplete(rxq, B_TRUE);
996
997 KASSERT(rxq->completed == rxq->pending,
998 ("rxq->completed != rxq->pending"));
999
1000 rxq->added = 0;
1001 rxq->pushed = 0;
1002 rxq->pending = 0;
1003 rxq->completed = 0;
1004 rxq->loopback = 0;
1005
1006 /* Destroy the common code receive queue. */
1007 efx_rx_qdestroy(rxq->common);
1008
1009 efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
1010 EFX_RXQ_NBUFS(sc->rxq_entries));
1011
1012 SFXGE_EVQ_UNLOCK(evq);
1013 }
1014
1015 static int
sfxge_rx_qstart(struct sfxge_softc * sc,unsigned int index)1016 sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
1017 {
1018 struct sfxge_rxq *rxq;
1019 efsys_mem_t *esmp;
1020 struct sfxge_evq *evq;
1021 int rc;
1022
1023 SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);
1024
1025 rxq = sc->rxq[index];
1026 esmp = &rxq->mem;
1027 evq = sc->evq[index];
1028
1029 KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
1030 ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
1031 KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
1032 ("evq->init_state != SFXGE_EVQ_STARTED"));
1033
1034 /* Program the buffer table. */
1035 if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
1036 EFX_RXQ_NBUFS(sc->rxq_entries))) != 0)
1037 return (rc);
1038
1039 /* Create the common code receive queue. */
1040 if ((rc = efx_rx_qcreate(sc->enp, index, 0, EFX_RXQ_TYPE_DEFAULT,
1041 esmp, sc->rxq_entries, rxq->buf_base_id, evq->common,
1042 &rxq->common)) != 0)
1043 goto fail;
1044
1045 SFXGE_EVQ_LOCK(evq);
1046
1047 /* Enable the receive queue. */
1048 efx_rx_qenable(rxq->common);
1049
1050 rxq->init_state = SFXGE_RXQ_STARTED;
1051 rxq->flush_state = SFXGE_FLUSH_REQUIRED;
1052
1053 /* Try to fill the queue from the pool. */
1054 sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE);
1055
1056 SFXGE_EVQ_UNLOCK(evq);
1057
1058 return (0);
1059
1060 fail:
1061 efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
1062 EFX_RXQ_NBUFS(sc->rxq_entries));
1063 return (rc);
1064 }
1065
1066 void
sfxge_rx_stop(struct sfxge_softc * sc)1067 sfxge_rx_stop(struct sfxge_softc *sc)
1068 {
1069 int index;
1070
1071 efx_mac_filter_default_rxq_clear(sc->enp);
1072
1073 /* Stop the receive queue(s) */
1074 index = sc->rxq_count;
1075 while (--index >= 0)
1076 sfxge_rx_qstop(sc, index);
1077
1078 sc->rx_prefix_size = 0;
1079 sc->rx_buffer_size = 0;
1080
1081 efx_rx_fini(sc->enp);
1082 }
1083
1084 int
sfxge_rx_start(struct sfxge_softc * sc)1085 sfxge_rx_start(struct sfxge_softc *sc)
1086 {
1087 struct sfxge_intr *intr;
1088 const efx_nic_cfg_t *encp;
1089 size_t hdrlen, align, reserved;
1090 int index;
1091 int rc;
1092
1093 intr = &sc->intr;
1094
1095 /* Initialize the common code receive module. */
1096 if ((rc = efx_rx_init(sc->enp)) != 0)
1097 return (rc);
1098
1099 encp = efx_nic_cfg_get(sc->enp);
1100 sc->rx_buffer_size = EFX_MAC_PDU(sc->ifnet->if_mtu);
1101
1102 /* Calculate the receive packet buffer size. */
1103 sc->rx_prefix_size = encp->enc_rx_prefix_size;
1104
1105 /* Ensure IP headers are 32bit aligned */
1106 hdrlen = sc->rx_prefix_size + sizeof (struct ether_header);
1107 sc->rx_buffer_align = EFX_P2ROUNDUP(size_t, hdrlen, 4) - hdrlen;
1108
1109 sc->rx_buffer_size += sc->rx_buffer_align;
1110
1111 /* Align end of packet buffer for RX DMA end padding */
1112 align = MAX(1, encp->enc_rx_buf_align_end);
1113 EFSYS_ASSERT(ISP2(align));
1114 sc->rx_buffer_size = EFX_P2ROUNDUP(size_t, sc->rx_buffer_size, align);
1115
1116 /*
1117 * Standard mbuf zones only guarantee pointer-size alignment;
1118 * we need extra space to align to the cache line
1119 */
1120 reserved = sc->rx_buffer_size + CACHE_LINE_SIZE;
1121
1122 /* Select zone for packet buffers */
1123 if (reserved <= MCLBYTES)
1124 sc->rx_cluster_size = MCLBYTES;
1125 else if (reserved <= MJUMPAGESIZE)
1126 sc->rx_cluster_size = MJUMPAGESIZE;
1127 else if (reserved <= MJUM9BYTES)
1128 sc->rx_cluster_size = MJUM9BYTES;
1129 else
1130 sc->rx_cluster_size = MJUM16BYTES;
1131
1132 /*
1133 * Set up the scale table. Enable all hash types and hash insertion.
1134 */
1135 for (index = 0; index < nitems(sc->rx_indir_table); index++)
1136 #ifdef RSS
1137 sc->rx_indir_table[index] =
1138 rss_get_indirection_to_bucket(index) % sc->rxq_count;
1139 #else
1140 sc->rx_indir_table[index] = index % sc->rxq_count;
1141 #endif
1142 if ((rc = efx_rx_scale_tbl_set(sc->enp, sc->rx_indir_table,
1143 nitems(sc->rx_indir_table))) != 0)
1144 goto fail;
1145 (void)efx_rx_scale_mode_set(sc->enp, EFX_RX_HASHALG_TOEPLITZ,
1146 EFX_RX_HASH_IPV4 | EFX_RX_HASH_TCPIPV4 |
1147 EFX_RX_HASH_IPV6 | EFX_RX_HASH_TCPIPV6, B_TRUE);
1148
1149 #ifdef RSS
1150 rss_getkey(toep_key);
1151 #endif
1152 if ((rc = efx_rx_scale_key_set(sc->enp, toep_key,
1153 sizeof(toep_key))) != 0)
1154 goto fail;
1155
1156 /* Start the receive queue(s). */
1157 for (index = 0; index < sc->rxq_count; index++) {
1158 if ((rc = sfxge_rx_qstart(sc, index)) != 0)
1159 goto fail2;
1160 }
1161
1162 rc = efx_mac_filter_default_rxq_set(sc->enp, sc->rxq[0]->common,
1163 sc->intr.n_alloc > 1);
1164 if (rc != 0)
1165 goto fail3;
1166
1167 return (0);
1168
1169 fail3:
1170 fail2:
1171 while (--index >= 0)
1172 sfxge_rx_qstop(sc, index);
1173
1174 fail:
1175 efx_rx_fini(sc->enp);
1176
1177 return (rc);
1178 }
1179
1180 #ifdef SFXGE_LRO
1181
sfxge_lro_init(struct sfxge_rxq * rxq)1182 static void sfxge_lro_init(struct sfxge_rxq *rxq)
1183 {
1184 struct sfxge_lro_state *st = &rxq->lro;
1185 unsigned i;
1186
1187 st->conns_mask = lro_table_size - 1;
1188 KASSERT(!((st->conns_mask + 1) & st->conns_mask),
1189 ("lro_table_size must be a power of 2"));
1190 st->sc = rxq->sc;
1191 st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
1192 M_SFXGE, M_WAITOK);
1193 st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
1194 M_SFXGE, M_WAITOK);
1195 for (i = 0; i <= st->conns_mask; ++i) {
1196 TAILQ_INIT(&st->conns[i]);
1197 st->conns_n[i] = 0;
1198 }
1199 LIST_INIT(&st->active_conns);
1200 TAILQ_INIT(&st->free_conns);
1201 }
1202
sfxge_lro_fini(struct sfxge_rxq * rxq)1203 static void sfxge_lro_fini(struct sfxge_rxq *rxq)
1204 {
1205 struct sfxge_lro_state *st = &rxq->lro;
1206 struct sfxge_lro_conn *c;
1207 unsigned i;
1208
1209 /* Return cleanly if sfxge_lro_init() has not been called. */
1210 if (st->conns == NULL)
1211 return;
1212
1213 KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));
1214
1215 for (i = 0; i <= st->conns_mask; ++i) {
1216 while (!TAILQ_EMPTY(&st->conns[i])) {
1217 c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
1218 sfxge_lro_drop(rxq, c);
1219 }
1220 }
1221
1222 while (!TAILQ_EMPTY(&st->free_conns)) {
1223 c = TAILQ_FIRST(&st->free_conns);
1224 TAILQ_REMOVE(&st->free_conns, c, link);
1225 KASSERT(!c->mbuf, ("found orphaned mbuf"));
1226 free(c, M_SFXGE);
1227 }
1228
1229 free(st->conns_n, M_SFXGE);
1230 free(st->conns, M_SFXGE);
1231 st->conns = NULL;
1232 }
1233
1234 #else
1235
/*
 * Stub used when SFXGE_LRO is not defined: there is no LRO state to
 * initialize, so this is a no-op that keeps callers free of #ifdefs.
 */
static void
sfxge_lro_init(struct sfxge_rxq *rxq)
{
}
1240
/*
 * Stub used when SFXGE_LRO is not defined: there is no LRO state to
 * release, so this is a no-op that keeps callers free of #ifdefs.
 */
static void
sfxge_lro_fini(struct sfxge_rxq *rxq)
{
}
1245
1246 #endif /* SFXGE_LRO */
1247
1248 static void
sfxge_rx_qfini(struct sfxge_softc * sc,unsigned int index)1249 sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
1250 {
1251 struct sfxge_rxq *rxq;
1252
1253 rxq = sc->rxq[index];
1254
1255 KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
1256 ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
1257
1258 /* Free the context array and the flow table. */
1259 free(rxq->queue, M_SFXGE);
1260 sfxge_lro_fini(rxq);
1261
1262 /* Release DMA memory. */
1263 sfxge_dma_free(&rxq->mem);
1264
1265 sc->rxq[index] = NULL;
1266
1267 free(rxq, M_SFXGE);
1268 }
1269
1270 static int
sfxge_rx_qinit(struct sfxge_softc * sc,unsigned int index)1271 sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
1272 {
1273 struct sfxge_rxq *rxq;
1274 struct sfxge_evq *evq;
1275 efsys_mem_t *esmp;
1276 int rc;
1277
1278 KASSERT(index < sc->rxq_count, ("index >= %d", sc->rxq_count));
1279
1280 rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
1281 rxq->sc = sc;
1282 rxq->index = index;
1283 rxq->entries = sc->rxq_entries;
1284 rxq->ptr_mask = rxq->entries - 1;
1285 rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries);
1286
1287 sc->rxq[index] = rxq;
1288 esmp = &rxq->mem;
1289
1290 evq = sc->evq[index];
1291
1292 /* Allocate and zero DMA space. */
1293 if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0)
1294 return (rc);
1295
1296 /* Allocate buffer table entries. */
1297 sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries),
1298 &rxq->buf_base_id);
1299
1300 /* Allocate the context array and the flow table. */
1301 rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries,
1302 M_SFXGE, M_WAITOK | M_ZERO);
1303 sfxge_lro_init(rxq);
1304
1305 callout_init(&rxq->refill_callout, 1);
1306
1307 rxq->init_state = SFXGE_RXQ_INITIALIZED;
1308
1309 return (0);
1310 }
1311
/*
 * Table of per-queue statistics exported via sysctl.  Each entry pairs the
 * sysctl leaf name with the byte offset of the corresponding counter
 * inside struct sfxge_rxq.  The table has entries only when SFXGE_LRO is
 * defined.
 */
static const struct {
	const char *name;
	size_t offset;
} sfxge_rx_stats[] = {
#define	SFXGE_RX_STAT(label, field) \
	{ .name = #label, .offset = offsetof(struct sfxge_rxq, field) }
#ifdef SFXGE_LRO
	SFXGE_RX_STAT(lro_merges, lro.n_merges),
	SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
	SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
	SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
	SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
	SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
	SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
	SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
#endif
};
1329
1330 static int
sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)1331 sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
1332 {
1333 struct sfxge_softc *sc = arg1;
1334 unsigned int id = arg2;
1335 unsigned int sum, index;
1336
1337 /* Sum across all RX queues */
1338 sum = 0;
1339 for (index = 0; index < sc->rxq_count; index++)
1340 sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
1341 sfxge_rx_stats[id].offset);
1342
1343 return (SYSCTL_OUT(req, &sum, sizeof(sum)));
1344 }
1345
1346 static void
sfxge_rx_stat_init(struct sfxge_softc * sc)1347 sfxge_rx_stat_init(struct sfxge_softc *sc)
1348 {
1349 struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
1350 struct sysctl_oid_list *stat_list;
1351 unsigned int id;
1352
1353 stat_list = SYSCTL_CHILDREN(sc->stats_node);
1354
1355 for (id = 0; id < nitems(sfxge_rx_stats); id++) {
1356 SYSCTL_ADD_PROC(
1357 ctx, stat_list,
1358 OID_AUTO, sfxge_rx_stats[id].name,
1359 CTLTYPE_UINT|CTLFLAG_RD,
1360 sc, id, sfxge_rx_stat_handler, "IU",
1361 "");
1362 }
1363 }
1364
1365 void
sfxge_rx_fini(struct sfxge_softc * sc)1366 sfxge_rx_fini(struct sfxge_softc *sc)
1367 {
1368 int index;
1369
1370 index = sc->rxq_count;
1371 while (--index >= 0)
1372 sfxge_rx_qfini(sc, index);
1373
1374 sc->rxq_count = 0;
1375 }
1376
1377 int
sfxge_rx_init(struct sfxge_softc * sc)1378 sfxge_rx_init(struct sfxge_softc *sc)
1379 {
1380 struct sfxge_intr *intr;
1381 int index;
1382 int rc;
1383
1384 #ifdef SFXGE_LRO
1385 if (!ISP2(lro_table_size)) {
1386 log(LOG_ERR, "%s=%u must be power of 2",
1387 SFXGE_LRO_PARAM(table_size), lro_table_size);
1388 rc = EINVAL;
1389 goto fail_lro_table_size;
1390 }
1391
1392 if (lro_idle_ticks == 0)
1393 lro_idle_ticks = hz / 10 + 1; /* 100 ms */
1394 #endif
1395
1396 intr = &sc->intr;
1397
1398 sc->rxq_count = intr->n_alloc;
1399
1400 KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
1401 ("intr->state != SFXGE_INTR_INITIALIZED"));
1402
1403 /* Initialize the receive queue(s) - one per interrupt. */
1404 for (index = 0; index < sc->rxq_count; index++) {
1405 if ((rc = sfxge_rx_qinit(sc, index)) != 0)
1406 goto fail;
1407 }
1408
1409 sfxge_rx_stat_init(sc);
1410
1411 return (0);
1412
1413 fail:
1414 /* Tear down the receive queue(s). */
1415 while (--index >= 0)
1416 sfxge_rx_qfini(sc, index);
1417
1418 sc->rxq_count = 0;
1419
1420 #ifdef SFXGE_LRO
1421 fail_lro_table_size:
1422 #endif
1423 return (rc);
1424 }
1425