1 /******************************************************************************
2 
3 Copyright (c) 2006-2013, Myricom Inc.
4 All rights reserved.
5 
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
8 
9  1. Redistributions of source code must retain the above copyright notice,
10     this list of conditions and the following disclaimer.
11 
12  2. Neither the name of the Myricom Inc, nor the names of its
13     contributors may be used to endorse or promote products derived from
14     this software without specific prior written permission.
15 
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
27 
28 ***************************************************************************/
29 
30 #include <sys/cdefs.h>
31 __FBSDID("$FreeBSD: stable/10/sys/dev/mxge/if_mxge.c 329834 2018-02-22 19:40:03Z rpokala $");
32 
33 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/linker.h>
36 #include <sys/firmware.h>
37 #include <sys/endian.h>
38 #include <sys/sockio.h>
39 #include <sys/mbuf.h>
40 #include <sys/malloc.h>
41 #include <sys/kdb.h>
42 #include <sys/kernel.h>
43 #include <sys/lock.h>
44 #include <sys/module.h>
45 #include <sys/socket.h>
46 #include <sys/sysctl.h>
47 #include <sys/sx.h>
48 #include <sys/taskqueue.h>
49 
50 #include <net/if.h>
51 #include <net/if_arp.h>
52 #include <net/ethernet.h>
53 #include <net/if_dl.h>
54 #include <net/if_media.h>
55 
56 #include <net/bpf.h>
57 
58 #include <net/if_types.h>
59 #include <net/if_vlan_var.h>
60 #include <net/zlib.h>
61 
62 #include <netinet/in_systm.h>
63 #include <netinet/in.h>
64 #include <netinet/ip.h>
65 #include <netinet/ip6.h>
66 #include <netinet/tcp.h>
67 #include <netinet/tcp_lro.h>
68 #include <netinet6/ip6_var.h>
69 
70 #include <machine/bus.h>
71 #include <machine/in_cksum.h>
72 #include <machine/resource.h>
73 #include <sys/bus.h>
74 #include <sys/rman.h>
75 #include <sys/smp.h>
76 
77 #include <dev/pci/pcireg.h>
78 #include <dev/pci/pcivar.h>
79 #include <dev/pci/pci_private.h> /* XXX for pci_cfg_restore */
80 
81 #include <vm/vm.h>		/* for pmap_mapdev() */
82 #include <vm/pmap.h>
83 
84 #if defined(__i386) || defined(__amd64)
85 #include <machine/specialreg.h>
86 #endif
87 
88 #include <dev/mxge/mxge_mcp.h>
89 #include <dev/mxge/mcp_gen_header.h>
90 /*#define MXGE_FAKE_IFP*/
91 #include <dev/mxge/if_mxge_var.h>
92 #ifdef IFNET_BUF_RING
93 #include <sys/buf_ring.h>
94 #endif
95 
96 #include "opt_inet.h"
97 #include "opt_inet6.h"
98 
99 /* tunable params */
100 static int mxge_nvidia_ecrc_enable = 1;
101 static int mxge_force_firmware = 0;
102 static int mxge_intr_coal_delay = 30;
103 static int mxge_deassert_wait = 1;
104 static int mxge_flow_control = 1;
105 static int mxge_verbose = 0;
106 static int mxge_ticks;
107 static int mxge_max_slices = 1;
108 static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
109 static int mxge_always_promisc = 0;
110 static int mxge_initial_mtu = ETHERMTU_JUMBO;
111 static int mxge_throttle = 0;
112 static char *mxge_fw_unaligned = "mxge_ethp_z8e";
113 static char *mxge_fw_aligned = "mxge_eth_z8e";
114 static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
115 static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";
116 
117 static int mxge_probe(device_t dev);
118 static int mxge_attach(device_t dev);
119 static int mxge_detach(device_t dev);
120 static int mxge_shutdown(device_t dev);
121 static void mxge_intr(void *arg);
122 
123 static device_method_t mxge_methods[] =
124 {
125   /* Device interface */
126   DEVMETHOD(device_probe, mxge_probe),
127   DEVMETHOD(device_attach, mxge_attach),
128   DEVMETHOD(device_detach, mxge_detach),
129   DEVMETHOD(device_shutdown, mxge_shutdown),
130 
131   DEVMETHOD_END
132 };
133 
134 static driver_t mxge_driver =
135 {
136   "mxge",
137   mxge_methods,
138   sizeof(mxge_softc_t),
139 };
140 
141 static devclass_t mxge_devclass;
142 
143 /* Declare ourselves to be a child of the PCI bus.*/
144 DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
145 MODULE_DEPEND(mxge, firmware, 1, 1, 1);
146 MODULE_DEPEND(mxge, zlib, 1, 1, 1);
147 
148 static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
149 static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
150 static int mxge_close(mxge_softc_t *sc, int down);
151 static int mxge_open(mxge_softc_t *sc);
152 static void mxge_tick(void *arg);
153 
154 static int
mxge_probe(device_t dev)155 mxge_probe(device_t dev)
156 {
157 	int rev;
158 
159 
160 	if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
161 	    ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
162 	     (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
163 		rev = pci_get_revid(dev);
164 		switch (rev) {
165 		case MXGE_PCI_REV_Z8E:
166 			device_set_desc(dev, "Myri10G-PCIE-8A");
167 			break;
168 		case MXGE_PCI_REV_Z8ES:
169 			device_set_desc(dev, "Myri10G-PCIE-8B");
170 			break;
171 		default:
172 			device_set_desc(dev, "Myri10G-PCIE-8??");
173 			device_printf(dev, "Unrecognized rev %d NIC\n",
174 				      rev);
175 			break;
176 		}
177 		return 0;
178 	}
179 	return ENXIO;
180 }
181 
182 static void
mxge_enable_wc(mxge_softc_t * sc)183 mxge_enable_wc(mxge_softc_t *sc)
184 {
185 #if defined(__i386) || defined(__amd64)
186 	vm_offset_t len;
187 	int err;
188 
189 	sc->wc = 1;
190 	len = rman_get_size(sc->mem_res);
191 	err = pmap_change_attr((vm_offset_t) sc->sram,
192 			       len, PAT_WRITE_COMBINING);
193 	if (err != 0) {
194 		device_printf(sc->dev, "pmap_change_attr failed, %d\n",
195 			      err);
196 		sc->wc = 0;
197 	}
198 #endif
199 }
200 
201 
202 /* callback to get our DMA address */
203 static void
mxge_dmamap_callback(void * arg,bus_dma_segment_t * segs,int nsegs,int error)204 mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
205 			 int error)
206 {
207 	if (error == 0) {
208 		*(bus_addr_t *) arg = segs->ds_addr;
209 	}
210 }
211 
212 static int
mxge_dma_alloc(mxge_softc_t * sc,mxge_dma_t * dma,size_t bytes,bus_size_t alignment)213 mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
214 		   bus_size_t alignment)
215 {
216 	int err;
217 	device_t dev = sc->dev;
218 	bus_size_t boundary, maxsegsize;
219 
220 	if (bytes > 4096 && alignment == 4096) {
221 		boundary = 0;
222 		maxsegsize = bytes;
223 	} else {
224 		boundary = 4096;
225 		maxsegsize = 4096;
226 	}
227 
228 	/* allocate DMAable memory tags */
229 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
230 				 alignment,		/* alignment */
231 				 boundary,		/* boundary */
232 				 BUS_SPACE_MAXADDR,	/* low */
233 				 BUS_SPACE_MAXADDR,	/* high */
234 				 NULL, NULL,		/* filter */
235 				 bytes,			/* maxsize */
236 				 1,			/* num segs */
237 				 maxsegsize,		/* maxsegsize */
238 				 BUS_DMA_COHERENT,	/* flags */
239 				 NULL, NULL,		/* lock */
240 				 &dma->dmat);		/* tag */
241 	if (err != 0) {
242 		device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
243 		return err;
244 	}
245 
246 	/* allocate DMAable memory & map */
247 	err = bus_dmamem_alloc(dma->dmat, &dma->addr,
248 			       (BUS_DMA_WAITOK | BUS_DMA_COHERENT
249 				| BUS_DMA_ZERO),  &dma->map);
250 	if (err != 0) {
251 		device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
252 		goto abort_with_dmat;
253 	}
254 
255 	/* load the memory */
256 	err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
257 			      mxge_dmamap_callback,
258 			      (void *)&dma->bus_addr, 0);
259 	if (err != 0) {
260 		device_printf(dev, "couldn't load map (err = %d)\n", err);
261 		goto abort_with_mem;
262 	}
263 	return 0;
264 
265 abort_with_mem:
266 	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
267 abort_with_dmat:
268 	(void)bus_dma_tag_destroy(dma->dmat);
269 	return err;
270 }
271 
272 
273 static void
mxge_dma_free(mxge_dma_t * dma)274 mxge_dma_free(mxge_dma_t *dma)
275 {
276 	bus_dmamap_unload(dma->dmat, dma->map);
277 	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
278 	(void)bus_dma_tag_destroy(dma->dmat);
279 }
280 
281 /*
282  * The eeprom strings on the lanaiX have the format
283  * SN=x\0
284  * MAC=x:x:x:x:x:x\0
285  * PC=text\0
286  */
287 
288 static int
mxge_parse_strings(mxge_softc_t * sc)289 mxge_parse_strings(mxge_softc_t *sc)
290 {
291 	char *ptr;
292 	int i, found_mac, found_sn2;
293 	char *endptr;
294 
295 	ptr = sc->eeprom_strings;
296 	found_mac = 0;
297 	found_sn2 = 0;
298 	while (*ptr != '\0') {
299 		if (strncmp(ptr, "MAC=", 4) == 0) {
300 			ptr += 4;
301 			for (i = 0;;) {
302 				sc->mac_addr[i] = strtoul(ptr, &endptr, 16);
303 				if (endptr - ptr != 2)
304 					goto abort;
305 				ptr = endptr;
306 				if (++i == 6)
307 					break;
308 				if (*ptr++ != ':')
309 					goto abort;
310 			}
311 			found_mac = 1;
312 		} else if (strncmp(ptr, "PC=", 3) == 0) {
313 			ptr += 3;
314 			strlcpy(sc->product_code_string, ptr,
315 			    sizeof(sc->product_code_string));
316 		} else if (!found_sn2 && (strncmp(ptr, "SN=", 3) == 0)) {
317 			ptr += 3;
318 			strlcpy(sc->serial_number_string, ptr,
319 			    sizeof(sc->serial_number_string));
320 		} else if (strncmp(ptr, "SN2=", 4) == 0) {
321 			/* SN2 takes precedence over SN */
322 			ptr += 4;
323 			found_sn2 = 1;
324 			strlcpy(sc->serial_number_string, ptr,
325 			    sizeof(sc->serial_number_string));
326 		}
327 		while (*ptr++ != '\0') {}
328 	}
329 
330 	if (found_mac)
331 		return 0;
332 
333  abort:
334 	device_printf(sc->dev, "failed to parse eeprom_strings\n");
335 
336 	return ENXIO;
337 }
338 
339 #if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
340 static void
mxge_enable_nvidia_ecrc(mxge_softc_t * sc)341 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
342 {
343 	uint32_t val;
344 	unsigned long base, off;
345 	char *va, *cfgptr;
346 	device_t pdev, mcp55;
347 	uint16_t vendor_id, device_id, word;
348 	uintptr_t bus, slot, func, ivend, idev;
349 	uint32_t *ptr32;
350 
351 
352 	if (!mxge_nvidia_ecrc_enable)
353 		return;
354 
355 	pdev = device_get_parent(device_get_parent(sc->dev));
356 	if (pdev == NULL) {
357 		device_printf(sc->dev, "could not find parent?\n");
358 		return;
359 	}
360 	vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
361 	device_id = pci_read_config(pdev, PCIR_DEVICE, 2);
362 
363 	if (vendor_id != 0x10de)
364 		return;
365 
366 	base = 0;
367 
368 	if (device_id == 0x005d) {
369 		/* ck804, base address is magic */
370 		base = 0xe0000000UL;
371 	} else if (device_id >= 0x0374 && device_id <= 0x378) {
372 		/* mcp55, base address stored in chipset */
373 		mcp55 = pci_find_bsf(0, 0, 0);
374 		if (mcp55 &&
375 		    0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
376 		    0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
377 			word = pci_read_config(mcp55, 0x90, 2);
378 			base = ((unsigned long)word & 0x7ffeU) << 25;
379 		}
380 	}
381 	if (!base)
382 		return;
383 
384 	/* XXXX
385 	   Test below is commented because it is believed that doing
386 	   config read/write beyond 0xff will access the config space
387 	   for the next larger function.  Uncomment this and remove
388 	   the hacky pmap_mapdev() way of accessing config space when
389 	   FreeBSD grows support for extended pcie config space access
390 	*/
391 #if 0
392 	/* See if we can, by some miracle, access the extended
393 	   config space */
394 	val = pci_read_config(pdev, 0x178, 4);
395 	if (val != 0xffffffff) {
396 		val |= 0x40;
397 		pci_write_config(pdev, 0x178, val, 4);
398 		return;
399 	}
400 #endif
401 	/* Rather than using normal pci config space writes, we must
402 	 * map the Nvidia config space ourselves.  This is because on
403 	 * opteron/nvidia class machine the 0xe000000 mapping is
404 	 * handled by the nvidia chipset, that means the internal PCI
405 	 * device (the on-chip northbridge), or the amd-8131 bridge
406 	 * and things behind them are not visible by this method.
407 	 */
408 
409 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
410 		      PCI_IVAR_BUS, &bus);
411 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
412 		      PCI_IVAR_SLOT, &slot);
413 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
414 		      PCI_IVAR_FUNCTION, &func);
415 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
416 		      PCI_IVAR_VENDOR, &ivend);
417 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
418 		      PCI_IVAR_DEVICE, &idev);
419 
420 	off =  base
421 		+ 0x00100000UL * (unsigned long)bus
422 		+ 0x00001000UL * (unsigned long)(func
423 						 + 8 * slot);
424 
425 	/* map it into the kernel */
426 	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);
427 
428 
429 	if (va == NULL) {
430 		device_printf(sc->dev, "pmap_kenter_temporary didn't\n");
431 		return;
432 	}
433 	/* get a pointer to the config space mapped into the kernel */
434 	cfgptr = va + (off & PAGE_MASK);
435 
436 	/* make sure that we can really access it */
437 	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
438 	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
439 	if (! (vendor_id == ivend && device_id == idev)) {
440 		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
441 			      vendor_id, device_id);
442 		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
443 		return;
444 	}
445 
446 	ptr32 = (uint32_t*)(cfgptr + 0x178);
447 	val = *ptr32;
448 
449 	if (val == 0xffffffff) {
450 		device_printf(sc->dev, "extended mapping failed\n");
451 		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
452 		return;
453 	}
454 	*ptr32 = val | 0x40;
455 	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
456 	if (mxge_verbose)
457 		device_printf(sc->dev,
458 			      "Enabled ECRC on upstream Nvidia bridge "
459 			      "at %d:%d:%d\n",
460 			      (int)bus, (int)slot, (int)func);
461 	return;
462 }
463 #else
464 static void
mxge_enable_nvidia_ecrc(mxge_softc_t * sc)465 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
466 {
467 	device_printf(sc->dev,
468 		      "Nforce 4 chipset on non-x86/amd64!?!?!\n");
469 	return;
470 }
471 #endif
472 
473 
474 static int
mxge_dma_test(mxge_softc_t * sc,int test_type)475 mxge_dma_test(mxge_softc_t *sc, int test_type)
476 {
477 	mxge_cmd_t cmd;
478 	bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
479 	int status;
480 	uint32_t len;
481 	char *test = " ";
482 
483 
484 	/* Run a small DMA test.
485 	 * The magic multipliers to the length tell the firmware
486 	 * to do DMA read, write, or read+write tests.  The
487 	 * results are returned in cmd.data0.  The upper 16
488 	 * bits of the return is the number of transfers completed.
489 	 * The lower 16 bits is the time in 0.5us ticks that the
490 	 * transfers took to complete.
491 	 */
492 
493 	len = sc->tx_boundary;
494 
495 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
496 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
497 	cmd.data2 = len * 0x10000;
498 	status = mxge_send_cmd(sc, test_type, &cmd);
499 	if (status != 0) {
500 		test = "read";
501 		goto abort;
502 	}
503 	sc->read_dma = ((cmd.data0>>16) * len * 2) /
504 		(cmd.data0 & 0xffff);
505 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
506 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
507 	cmd.data2 = len * 0x1;
508 	status = mxge_send_cmd(sc, test_type, &cmd);
509 	if (status != 0) {
510 		test = "write";
511 		goto abort;
512 	}
513 	sc->write_dma = ((cmd.data0>>16) * len * 2) /
514 		(cmd.data0 & 0xffff);
515 
516 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
517 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
518 	cmd.data2 = len * 0x10001;
519 	status = mxge_send_cmd(sc, test_type, &cmd);
520 	if (status != 0) {
521 		test = "read/write";
522 		goto abort;
523 	}
524 	sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
525 		(cmd.data0 & 0xffff);
526 
527 abort:
528 	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
529 		device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
530 			      test, status);
531 
532 	return status;
533 }
534 
535 /*
536  * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
537  * when the PCI-E Completion packets are aligned on an 8-byte
538  * boundary.  Some PCI-E chip sets always align Completion packets; on
539  * the ones that do not, the alignment can be enforced by enabling
540  * ECRC generation (if supported).
541  *
542  * When PCI-E Completion packets are not aligned, it is actually more
543  * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
544  *
545  * If the driver can neither enable ECRC nor verify that it has
546  * already been enabled, then it must use a firmware image which works
547  * around unaligned completion packets (ethp_z8e.dat), and it should
548  * also ensure that it never gives the device a Read-DMA which is
549  * larger than 2KB by setting the tx_boundary to 2KB.  If ECRC is
550  * enabled, then the driver should use the aligned (eth_z8e.dat)
551  * firmware image, and set tx_boundary to 4KB.
552  */
553 
554 static int
mxge_firmware_probe(mxge_softc_t * sc)555 mxge_firmware_probe(mxge_softc_t *sc)
556 {
557 	device_t dev = sc->dev;
558 	int reg, status;
559 	uint16_t pectl;
560 
561 	sc->tx_boundary = 4096;
562 	/*
563 	 * Verify the max read request size was set to 4KB
564 	 * before trying the test with 4KB.
565 	 */
566 	if (pci_find_cap(dev, PCIY_EXPRESS, &reg) == 0) {
567 		pectl = pci_read_config(dev, reg + 0x8, 2);
568 		if ((pectl & (5 << 12)) != (5 << 12)) {
569 			device_printf(dev, "Max Read Req. size != 4k (0x%x\n",
570 				      pectl);
571 			sc->tx_boundary = 2048;
572 		}
573 	}
574 
575 	/*
576 	 * load the optimized firmware (which assumes aligned PCIe
577 	 * completions) in order to see if it works on this host.
578 	 */
579 	sc->fw_name = mxge_fw_aligned;
580 	status = mxge_load_firmware(sc, 1);
581 	if (status != 0) {
582 		return status;
583 	}
584 
585 	/*
586 	 * Enable ECRC if possible
587 	 */
588 	mxge_enable_nvidia_ecrc(sc);
589 
590 	/*
591 	 * Run a DMA test which watches for unaligned completions and
592 	 * aborts on the first one seen.  Not required on Z8ES or newer.
593 	 */
594 	if (pci_get_revid(sc->dev) >= MXGE_PCI_REV_Z8ES)
595 		return 0;
596 	status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
597 	if (status == 0)
598 		return 0; /* keep the aligned firmware */
599 
600 	if (status != E2BIG)
601 		device_printf(dev, "DMA test failed: %d\n", status);
602 	if (status == ENOSYS)
603 		device_printf(dev, "Falling back to ethp! "
604 			      "Please install up to date fw\n");
605 	return status;
606 }
607 
608 static int
mxge_select_firmware(mxge_softc_t * sc)609 mxge_select_firmware(mxge_softc_t *sc)
610 {
611 	int aligned = 0;
612 	int force_firmware = mxge_force_firmware;
613 
614 	if (sc->throttle)
615 		force_firmware = sc->throttle;
616 
617 	if (force_firmware != 0) {
618 		if (force_firmware == 1)
619 			aligned = 1;
620 		else
621 			aligned = 0;
622 		if (mxge_verbose)
623 			device_printf(sc->dev,
624 				      "Assuming %s completions (forced)\n",
625 				      aligned ? "aligned" : "unaligned");
626 		goto abort;
627 	}
628 
629 	/* if the PCIe link width is 4 or less, we can use the aligned
630 	   firmware and skip any checks */
631 	if (sc->link_width != 0 && sc->link_width <= 4) {
632 		device_printf(sc->dev,
633 			      "PCIe x%d Link, expect reduced performance\n",
634 			      sc->link_width);
635 		aligned = 1;
636 		goto abort;
637 	}
638 
639 	if (0 == mxge_firmware_probe(sc))
640 		return 0;
641 
642 abort:
643 	if (aligned) {
644 		sc->fw_name = mxge_fw_aligned;
645 		sc->tx_boundary = 4096;
646 	} else {
647 		sc->fw_name = mxge_fw_unaligned;
648 		sc->tx_boundary = 2048;
649 	}
650 	return (mxge_load_firmware(sc, 0));
651 }
652 
653 static int
mxge_validate_firmware(mxge_softc_t * sc,const mcp_gen_header_t * hdr)654 mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
655 {
656 
657 
658 	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
659 		device_printf(sc->dev, "Bad firmware type: 0x%x\n",
660 			      be32toh(hdr->mcp_type));
661 		return EIO;
662 	}
663 
664 	/* save firmware version for sysctl */
665 	strlcpy(sc->fw_version, hdr->version, sizeof(sc->fw_version));
666 	if (mxge_verbose)
667 		device_printf(sc->dev, "firmware id: %s\n", hdr->version);
668 
669 	sscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
670 	       &sc->fw_ver_minor, &sc->fw_ver_tiny);
671 
672 	if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
673 	      && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
674 		device_printf(sc->dev, "Found firmware version %s\n",
675 			      sc->fw_version);
676 		device_printf(sc->dev, "Driver needs %d.%d\n",
677 			      MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
678 		return EINVAL;
679 	}
680 	return 0;
681 
682 }
683 
684 static void *
z_alloc(void * nil,u_int items,u_int size)685 z_alloc(void *nil, u_int items, u_int size)
686 {
687         void *ptr;
688 
689         ptr = malloc(items * size, M_TEMP, M_NOWAIT);
690         return ptr;
691 }
692 
693 static void
z_free(void * nil,void * ptr)694 z_free(void *nil, void *ptr)
695 {
696         free(ptr, M_TEMP);
697 }
698 
699 
700 static int
mxge_load_firmware_helper(mxge_softc_t * sc,uint32_t * limit)701 mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
702 {
703 	z_stream zs;
704 	char *inflate_buffer;
705 	const struct firmware *fw;
706 	const mcp_gen_header_t *hdr;
707 	unsigned hdr_offset;
708 	int status;
709 	unsigned int i;
710 	char dummy;
711 	size_t fw_len;
712 
713 	fw = firmware_get(sc->fw_name);
714 	if (fw == NULL) {
715 		device_printf(sc->dev, "Could not find firmware image %s\n",
716 			      sc->fw_name);
717 		return ENOENT;
718 	}
719 
720 
721 
722 	/* setup zlib and decompress f/w */
723 	bzero(&zs, sizeof (zs));
724 	zs.zalloc = z_alloc;
725 	zs.zfree = z_free;
726 	status = inflateInit(&zs);
727 	if (status != Z_OK) {
728 		status = EIO;
729 		goto abort_with_fw;
730 	}
731 
732 	/* the uncompressed size is stored as the firmware version,
733 	   which would otherwise go unused */
734 	fw_len = (size_t) fw->version;
735 	inflate_buffer = malloc(fw_len, M_TEMP, M_NOWAIT);
736 	if (inflate_buffer == NULL)
737 		goto abort_with_zs;
738 	zs.avail_in = fw->datasize;
739 	zs.next_in = __DECONST(char *, fw->data);
740 	zs.avail_out = fw_len;
741 	zs.next_out = inflate_buffer;
742 	status = inflate(&zs, Z_FINISH);
743 	if (status != Z_STREAM_END) {
744 		device_printf(sc->dev, "zlib %d\n", status);
745 		status = EIO;
746 		goto abort_with_buffer;
747 	}
748 
749 	/* check id */
750 	hdr_offset = htobe32(*(const uint32_t *)
751 			     (inflate_buffer + MCP_HEADER_PTR_OFFSET));
752 	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
753 		device_printf(sc->dev, "Bad firmware file");
754 		status = EIO;
755 		goto abort_with_buffer;
756 	}
757 	hdr = (const void*)(inflate_buffer + hdr_offset);
758 
759 	status = mxge_validate_firmware(sc, hdr);
760 	if (status != 0)
761 		goto abort_with_buffer;
762 
763 	/* Copy the inflated firmware to NIC SRAM. */
764 	for (i = 0; i < fw_len; i += 256) {
765 		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
766 			      inflate_buffer + i,
767 			      min(256U, (unsigned)(fw_len - i)));
768 		wmb();
769 		dummy = *sc->sram;
770 		wmb();
771 	}
772 
773 	*limit = fw_len;
774 	status = 0;
775 abort_with_buffer:
776 	free(inflate_buffer, M_TEMP);
777 abort_with_zs:
778 	inflateEnd(&zs);
779 abort_with_fw:
780 	firmware_put(fw, FIRMWARE_UNLOAD);
781 	return status;
782 }
783 
784 /*
785  * Enable or disable periodic RDMAs from the host to make certain
786  * chipsets resend dropped PCIe messages
787  */
788 
789 static void
mxge_dummy_rdma(mxge_softc_t * sc,int enable)790 mxge_dummy_rdma(mxge_softc_t *sc, int enable)
791 {
792 	char buf_bytes[72];
793 	volatile uint32_t *confirm;
794 	volatile char *submit;
795 	uint32_t *buf, dma_low, dma_high;
796 	int i;
797 
798 	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
799 
800 	/* clear confirmation addr */
801 	confirm = (volatile uint32_t *)sc->cmd;
802 	*confirm = 0;
803 	wmb();
804 
805 	/* send an rdma command to the PCIe engine, and wait for the
806 	   response in the confirmation address.  The firmware should
807 	   write a -1 there to indicate it is alive and well
808 	*/
809 
810 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
811 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
812 	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
813 	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
814 	buf[2] = htobe32(0xffffffff);		/* confirm data */
815 	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
816 	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
817 	buf[3] = htobe32(dma_high); 		/* dummy addr MSW */
818 	buf[4] = htobe32(dma_low); 		/* dummy addr LSW */
819 	buf[5] = htobe32(enable);			/* enable? */
820 
821 
822 	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);
823 
824 	mxge_pio_copy(submit, buf, 64);
825 	wmb();
826 	DELAY(1000);
827 	wmb();
828 	i = 0;
829 	while (*confirm != 0xffffffff && i < 20) {
830 		DELAY(1000);
831 		i++;
832 	}
833 	if (*confirm != 0xffffffff) {
834 		device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)",
835 			      (enable ? "enable" : "disable"), confirm,
836 			      *confirm);
837 	}
838 	return;
839 }
840 
841 static int
mxge_send_cmd(mxge_softc_t * sc,uint32_t cmd,mxge_cmd_t * data)842 mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
843 {
844 	mcp_cmd_t *buf;
845 	char buf_bytes[sizeof(*buf) + 8];
846 	volatile mcp_cmd_response_t *response = sc->cmd;
847 	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
848 	uint32_t dma_low, dma_high;
849 	int err, sleep_total = 0;
850 
851 	/* ensure buf is aligned to 8 bytes */
852 	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
853 
854 	buf->data0 = htobe32(data->data0);
855 	buf->data1 = htobe32(data->data1);
856 	buf->data2 = htobe32(data->data2);
857 	buf->cmd = htobe32(cmd);
858 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
859 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
860 
861 	buf->response_addr.low = htobe32(dma_low);
862 	buf->response_addr.high = htobe32(dma_high);
863 	mtx_lock(&sc->cmd_mtx);
864 	response->result = 0xffffffff;
865 	wmb();
866 	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
867 
868 	/* wait up to 20ms */
869 	err = EAGAIN;
870 	for (sleep_total = 0; sleep_total <  20; sleep_total++) {
871 		bus_dmamap_sync(sc->cmd_dma.dmat,
872 				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
873 		wmb();
874 		switch (be32toh(response->result)) {
875 		case 0:
876 			data->data0 = be32toh(response->data);
877 			err = 0;
878 			break;
879 		case 0xffffffff:
880 			DELAY(1000);
881 			break;
882 		case MXGEFW_CMD_UNKNOWN:
883 			err = ENOSYS;
884 			break;
885 		case MXGEFW_CMD_ERROR_UNALIGNED:
886 			err = E2BIG;
887 			break;
888 		case MXGEFW_CMD_ERROR_BUSY:
889 			err = EBUSY;
890 			break;
891 		case MXGEFW_CMD_ERROR_I2C_ABSENT:
892 			err = ENXIO;
893 			break;
894 		default:
895 			device_printf(sc->dev,
896 				      "mxge: command %d "
897 				      "failed, result = %d\n",
898 				      cmd, be32toh(response->result));
899 			err = ENXIO;
900 			break;
901 		}
902 		if (err != EAGAIN)
903 			break;
904 	}
905 	if (err == EAGAIN)
906 		device_printf(sc->dev, "mxge: command %d timed out"
907 			      "result = %d\n",
908 			      cmd, be32toh(response->result));
909 	mtx_unlock(&sc->cmd_mtx);
910 	return err;
911 }
912 
913 static int
mxge_adopt_running_firmware(mxge_softc_t * sc)914 mxge_adopt_running_firmware(mxge_softc_t *sc)
915 {
916 	struct mcp_gen_header *hdr;
917 	const size_t bytes = sizeof (struct mcp_gen_header);
918 	size_t hdr_offset;
919 	int status;
920 
921 	/* find running firmware header */
922 	hdr_offset = htobe32(*(volatile uint32_t *)
923 			     (sc->sram + MCP_HEADER_PTR_OFFSET));
924 
925 	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
926 		device_printf(sc->dev,
927 			      "Running firmware has bad header offset (%d)\n",
928 			      (int)hdr_offset);
929 		return EIO;
930 	}
931 
932 	/* copy header of running firmware from SRAM to host memory to
933 	 * validate firmware */
934 	hdr = malloc(bytes, M_DEVBUF, M_NOWAIT);
935 	if (hdr == NULL) {
936 		device_printf(sc->dev, "could not malloc firmware hdr\n");
937 		return ENOMEM;
938 	}
939 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
940 				rman_get_bushandle(sc->mem_res),
941 				hdr_offset, (char *)hdr, bytes);
942 	status = mxge_validate_firmware(sc, hdr);
943 	free(hdr, M_DEVBUF);
944 
945 	/*
946 	 * check to see if adopted firmware has bug where adopting
947 	 * it will cause broadcasts to be filtered unless the NIC
948 	 * is kept in ALLMULTI mode
949 	 */
950 	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
951 	    sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
952 		sc->adopted_rx_filter_bug = 1;
953 		device_printf(sc->dev, "Adopting fw %d.%d.%d: "
954 			      "working around rx filter bug\n",
955 			      sc->fw_ver_major, sc->fw_ver_minor,
956 			      sc->fw_ver_tiny);
957 	}
958 
959 	return status;
960 }
961 
962 
963 static int
mxge_load_firmware(mxge_softc_t * sc,int adopt)964 mxge_load_firmware(mxge_softc_t *sc, int adopt)
965 {
966 	volatile uint32_t *confirm;
967 	volatile char *submit;
968 	char buf_bytes[72];
969 	uint32_t *buf, size, dma_low, dma_high;
970 	int status, i;
971 
972 	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
973 
974 	size = sc->sram_size;
975 	status = mxge_load_firmware_helper(sc, &size);
976 	if (status) {
977 		if (!adopt)
978 			return status;
979 		/* Try to use the currently running firmware, if
980 		   it is new enough */
981 		status = mxge_adopt_running_firmware(sc);
982 		if (status) {
983 			device_printf(sc->dev,
984 				      "failed to adopt running firmware\n");
985 			return status;
986 		}
987 		device_printf(sc->dev,
988 			      "Successfully adopted running firmware\n");
989 		if (sc->tx_boundary == 4096) {
990 			device_printf(sc->dev,
991 				"Using firmware currently running on NIC"
992 				 ".  For optimal\n");
993 			device_printf(sc->dev,
994 				 "performance consider loading optimized "
995 				 "firmware\n");
996 		}
997 		sc->fw_name = mxge_fw_unaligned;
998 		sc->tx_boundary = 2048;
999 		return 0;
1000 	}
1001 	/* clear confirmation addr */
1002 	confirm = (volatile uint32_t *)sc->cmd;
1003 	*confirm = 0;
1004 	wmb();
1005 	/* send a reload command to the bootstrap MCP, and wait for the
1006 	   response in the confirmation address.  The firmware should
1007 	   write a -1 there to indicate it is alive and well
1008 	*/
1009 
1010 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
1011 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
1012 
1013 	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
1014 	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
1015 	buf[2] = htobe32(0xffffffff);	/* confirm data */
1016 
1017 	/* FIX: All newest firmware should un-protect the bottom of
1018 	   the sram before handoff. However, the very first interfaces
1019 	   do not. Therefore the handoff copy must skip the first 8 bytes
1020 	*/
1021 					/* where the code starts*/
1022 	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
1023 	buf[4] = htobe32(size - 8); 	/* length of code */
1024 	buf[5] = htobe32(8);		/* where to copy to */
1025 	buf[6] = htobe32(0);		/* where to jump to */
1026 
1027 	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
1028 	mxge_pio_copy(submit, buf, 64);
1029 	wmb();
1030 	DELAY(1000);
1031 	wmb();
1032 	i = 0;
1033 	while (*confirm != 0xffffffff && i < 20) {
1034 		DELAY(1000*10);
1035 		i++;
1036 		bus_dmamap_sync(sc->cmd_dma.dmat,
1037 				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
1038 	}
1039 	if (*confirm != 0xffffffff) {
1040 		device_printf(sc->dev,"handoff failed (%p = 0x%x)",
1041 			confirm, *confirm);
1042 
1043 		return ENXIO;
1044 	}
1045 	return 0;
1046 }
1047 
1048 static int
mxge_update_mac_address(mxge_softc_t * sc)1049 mxge_update_mac_address(mxge_softc_t *sc)
1050 {
1051 	mxge_cmd_t cmd;
1052 	uint8_t *addr = sc->mac_addr;
1053 	int status;
1054 
1055 
1056 	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
1057 		     | (addr[2] << 8) | addr[3]);
1058 
1059 	cmd.data1 = ((addr[4] << 8) | (addr[5]));
1060 
1061 	status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
1062 	return status;
1063 }
1064 
1065 static int
mxge_change_pause(mxge_softc_t * sc,int pause)1066 mxge_change_pause(mxge_softc_t *sc, int pause)
1067 {
1068 	mxge_cmd_t cmd;
1069 	int status;
1070 
1071 	if (pause)
1072 		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
1073 				       &cmd);
1074 	else
1075 		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
1076 				       &cmd);
1077 
1078 	if (status) {
1079 		device_printf(sc->dev, "Failed to set flow control mode\n");
1080 		return ENXIO;
1081 	}
1082 	sc->pause = pause;
1083 	return 0;
1084 }
1085 
1086 static void
mxge_change_promisc(mxge_softc_t * sc,int promisc)1087 mxge_change_promisc(mxge_softc_t *sc, int promisc)
1088 {
1089 	mxge_cmd_t cmd;
1090 	int status;
1091 
1092 	if (mxge_always_promisc)
1093 		promisc = 1;
1094 
1095 	if (promisc)
1096 		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
1097 				       &cmd);
1098 	else
1099 		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
1100 				       &cmd);
1101 
1102 	if (status) {
1103 		device_printf(sc->dev, "Failed to set promisc mode\n");
1104 	}
1105 }
1106 
1107 static void
mxge_set_multicast_list(mxge_softc_t * sc)1108 mxge_set_multicast_list(mxge_softc_t *sc)
1109 {
1110 	mxge_cmd_t cmd;
1111 	struct ifmultiaddr *ifma;
1112 	struct ifnet *ifp = sc->ifp;
1113 	int err;
1114 
1115 	/* This firmware is known to not support multicast */
1116 	if (!sc->fw_multicast_support)
1117 		return;
1118 
1119 	/* Disable multicast filtering while we play with the lists*/
1120 	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
1121 	if (err != 0) {
1122 		device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
1123 		       " error status: %d\n", err);
1124 		return;
1125 	}
1126 
1127 	if (sc->adopted_rx_filter_bug)
1128 		return;
1129 
1130 	if (ifp->if_flags & IFF_ALLMULTI)
1131 		/* request to disable multicast filtering, so quit here */
1132 		return;
1133 
1134 	/* Flush all the filters */
1135 
1136 	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
1137 	if (err != 0) {
1138 		device_printf(sc->dev,
1139 			      "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
1140 			      ", error status: %d\n", err);
1141 		return;
1142 	}
1143 
1144 	/* Walk the multicast list, and add each address */
1145 
1146 	if_maddr_rlock(ifp);
1147 	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
1148 		if (ifma->ifma_addr->sa_family != AF_LINK)
1149 			continue;
1150 		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
1151 		      &cmd.data0, 4);
1152 		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
1153 		      &cmd.data1, 2);
1154 		cmd.data0 = htonl(cmd.data0);
1155 		cmd.data1 = htonl(cmd.data1);
1156 		err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
1157 		if (err != 0) {
1158 			device_printf(sc->dev, "Failed "
1159 			       "MXGEFW_JOIN_MULTICAST_GROUP, error status:"
1160 			       "%d\t", err);
1161 			/* abort, leaving multicast filtering off */
1162 			if_maddr_runlock(ifp);
1163 			return;
1164 		}
1165 	}
1166 	if_maddr_runlock(ifp);
1167 	/* Enable multicast filtering */
1168 	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
1169 	if (err != 0) {
1170 		device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
1171 		       ", error status: %d\n", err);
1172 	}
1173 }
1174 
1175 static int
mxge_max_mtu(mxge_softc_t * sc)1176 mxge_max_mtu(mxge_softc_t *sc)
1177 {
1178 	mxge_cmd_t cmd;
1179 	int status;
1180 
1181 	if (MJUMPAGESIZE - MXGEFW_PAD >  MXGEFW_MAX_MTU)
1182 		return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1183 
1184 	/* try to set nbufs to see if it we can
1185 	   use virtually contiguous jumbos */
1186 	cmd.data0 = 0;
1187 	status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
1188 			       &cmd);
1189 	if (status == 0)
1190 		return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1191 
1192 	/* otherwise, we're limited to MJUMPAGESIZE */
1193 	return MJUMPAGESIZE - MXGEFW_PAD;
1194 }
1195 
1196 static int
mxge_reset(mxge_softc_t * sc,int interrupts_setup)1197 mxge_reset(mxge_softc_t *sc, int interrupts_setup)
1198 {
1199 	struct mxge_slice_state *ss;
1200 	mxge_rx_done_t *rx_done;
1201 	volatile uint32_t *irq_claim;
1202 	mxge_cmd_t cmd;
1203 	int slice, status;
1204 
1205 	/* try to send a reset command to the card to see if it
1206 	   is alive */
1207 	memset(&cmd, 0, sizeof (cmd));
1208 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
1209 	if (status != 0) {
1210 		device_printf(sc->dev, "failed reset\n");
1211 		return ENXIO;
1212 	}
1213 
1214 	mxge_dummy_rdma(sc, 1);
1215 
1216 
1217 	/* set the intrq size */
1218 	cmd.data0 = sc->rx_ring_size;
1219 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
1220 
1221 	/*
1222 	 * Even though we already know how many slices are supported
1223 	 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
1224 	 * has magic side effects, and must be called after a reset.
1225 	 * It must be called prior to calling any RSS related cmds,
1226 	 * including assigning an interrupt queue for anything but
1227 	 * slice 0.  It must also be called *after*
1228 	 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
1229 	 * the firmware to compute offsets.
1230 	 */
1231 
1232 	if (sc->num_slices > 1) {
1233 		/* ask the maximum number of slices it supports */
1234 		status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
1235 					   &cmd);
1236 		if (status != 0) {
1237 			device_printf(sc->dev,
1238 				      "failed to get number of slices\n");
1239 			return status;
1240 		}
1241 		/*
1242 		 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
1243 		 * to setting up the interrupt queue DMA
1244 		 */
1245 		cmd.data0 = sc->num_slices;
1246 		cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
1247 #ifdef IFNET_BUF_RING
1248 		cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
1249 #endif
1250 		status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
1251 					   &cmd);
1252 		if (status != 0) {
1253 			device_printf(sc->dev,
1254 				      "failed to set number of slices\n");
1255 			return status;
1256 		}
1257 	}
1258 
1259 
1260 	if (interrupts_setup) {
1261 		/* Now exchange information about interrupts  */
1262 		for (slice = 0; slice < sc->num_slices; slice++) {
1263 			rx_done = &sc->ss[slice].rx_done;
1264 			memset(rx_done->entry, 0, sc->rx_ring_size);
1265 			cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
1266 			cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
1267 			cmd.data2 = slice;
1268 			status |= mxge_send_cmd(sc,
1269 						MXGEFW_CMD_SET_INTRQ_DMA,
1270 						&cmd);
1271 		}
1272 	}
1273 
1274 	status |= mxge_send_cmd(sc,
1275 				MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);
1276 
1277 
1278 	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);
1279 
1280 	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
1281 	irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);
1282 
1283 
1284 	status |= mxge_send_cmd(sc,  MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
1285 				&cmd);
1286 	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
1287 	if (status != 0) {
1288 		device_printf(sc->dev, "failed set interrupt parameters\n");
1289 		return status;
1290 	}
1291 
1292 
1293 	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);
1294 
1295 
1296 	/* run a DMA benchmark */
1297 	(void) mxge_dma_test(sc, MXGEFW_DMA_TEST);
1298 
1299 	for (slice = 0; slice < sc->num_slices; slice++) {
1300 		ss = &sc->ss[slice];
1301 
1302 		ss->irq_claim = irq_claim + (2 * slice);
1303 		/* reset mcp/driver shared state back to 0 */
1304 		ss->rx_done.idx = 0;
1305 		ss->rx_done.cnt = 0;
1306 		ss->tx.req = 0;
1307 		ss->tx.done = 0;
1308 		ss->tx.pkt_done = 0;
1309 		ss->tx.queue_active = 0;
1310 		ss->tx.activate = 0;
1311 		ss->tx.deactivate = 0;
1312 		ss->tx.wake = 0;
1313 		ss->tx.defrag = 0;
1314 		ss->tx.stall = 0;
1315 		ss->rx_big.cnt = 0;
1316 		ss->rx_small.cnt = 0;
1317 		ss->lc.lro_bad_csum = 0;
1318 		ss->lc.lro_queued = 0;
1319 		ss->lc.lro_flushed = 0;
1320 		if (ss->fw_stats != NULL) {
1321 			bzero(ss->fw_stats, sizeof *ss->fw_stats);
1322 		}
1323 	}
1324 	sc->rdma_tags_available = 15;
1325 	status = mxge_update_mac_address(sc);
1326 	mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
1327 	mxge_change_pause(sc, sc->pause);
1328 	mxge_set_multicast_list(sc);
1329 	if (sc->throttle) {
1330 		cmd.data0 = sc->throttle;
1331 		if (mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR,
1332 				  &cmd)) {
1333 			device_printf(sc->dev,
1334 				      "can't enable throttle\n");
1335 		}
1336 	}
1337 	return status;
1338 }
1339 
1340 static int
mxge_change_throttle(SYSCTL_HANDLER_ARGS)1341 mxge_change_throttle(SYSCTL_HANDLER_ARGS)
1342 {
1343 	mxge_cmd_t cmd;
1344 	mxge_softc_t *sc;
1345 	int err;
1346 	unsigned int throttle;
1347 
1348 	sc = arg1;
1349 	throttle = sc->throttle;
1350 	err = sysctl_handle_int(oidp, &throttle, arg2, req);
1351         if (err != 0) {
1352                 return err;
1353         }
1354 
1355 	if (throttle == sc->throttle)
1356 		return 0;
1357 
1358         if (throttle < MXGE_MIN_THROTTLE || throttle > MXGE_MAX_THROTTLE)
1359                 return EINVAL;
1360 
1361 	mtx_lock(&sc->driver_mtx);
1362 	cmd.data0 = throttle;
1363 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR, &cmd);
1364 	if (err == 0)
1365 		sc->throttle = throttle;
1366 	mtx_unlock(&sc->driver_mtx);
1367 	return err;
1368 }
1369 
1370 static int
mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)1371 mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
1372 {
1373         mxge_softc_t *sc;
1374         unsigned int intr_coal_delay;
1375         int err;
1376 
1377         sc = arg1;
1378         intr_coal_delay = sc->intr_coal_delay;
1379         err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
1380         if (err != 0) {
1381                 return err;
1382         }
1383         if (intr_coal_delay == sc->intr_coal_delay)
1384                 return 0;
1385 
1386         if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
1387                 return EINVAL;
1388 
1389 	mtx_lock(&sc->driver_mtx);
1390 	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
1391 	sc->intr_coal_delay = intr_coal_delay;
1392 
1393 	mtx_unlock(&sc->driver_mtx);
1394         return err;
1395 }
1396 
1397 static int
mxge_change_flow_control(SYSCTL_HANDLER_ARGS)1398 mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
1399 {
1400         mxge_softc_t *sc;
1401         unsigned int enabled;
1402         int err;
1403 
1404         sc = arg1;
1405         enabled = sc->pause;
1406         err = sysctl_handle_int(oidp, &enabled, arg2, req);
1407         if (err != 0) {
1408                 return err;
1409         }
1410         if (enabled == sc->pause)
1411                 return 0;
1412 
1413 	mtx_lock(&sc->driver_mtx);
1414 	err = mxge_change_pause(sc, enabled);
1415 	mtx_unlock(&sc->driver_mtx);
1416         return err;
1417 }
1418 
1419 static int
mxge_handle_be32(SYSCTL_HANDLER_ARGS)1420 mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1421 {
1422         int err;
1423 
1424         if (arg1 == NULL)
1425                 return EFAULT;
1426         arg2 = be32toh(*(int *)arg1);
1427         arg1 = NULL;
1428         err = sysctl_handle_int(oidp, arg1, arg2, req);
1429 
1430         return err;
1431 }
1432 
1433 static void
mxge_rem_sysctls(mxge_softc_t * sc)1434 mxge_rem_sysctls(mxge_softc_t *sc)
1435 {
1436 	struct mxge_slice_state *ss;
1437 	int slice;
1438 
1439 	if (sc->slice_sysctl_tree == NULL)
1440 		return;
1441 
1442 	for (slice = 0; slice < sc->num_slices; slice++) {
1443 		ss = &sc->ss[slice];
1444 		if (ss == NULL || ss->sysctl_tree == NULL)
1445 			continue;
1446 		sysctl_ctx_free(&ss->sysctl_ctx);
1447 		ss->sysctl_tree = NULL;
1448 	}
1449 	sysctl_ctx_free(&sc->slice_sysctl_ctx);
1450 	sc->slice_sysctl_tree = NULL;
1451 }
1452 
1453 static void
mxge_add_sysctls(mxge_softc_t * sc)1454 mxge_add_sysctls(mxge_softc_t *sc)
1455 {
1456 	struct sysctl_ctx_list *ctx;
1457 	struct sysctl_oid_list *children;
1458 	mcp_irq_data_t *fw;
1459 	struct mxge_slice_state *ss;
1460 	int slice;
1461 	char slice_num[8];
1462 
1463 	ctx = device_get_sysctl_ctx(sc->dev);
1464 	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
1465 	fw = sc->ss[0].fw_stats;
1466 
1467 	/* random information */
1468 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1469 		       "firmware_version",
1470 		       CTLFLAG_RD, sc->fw_version,
1471 		       0, "firmware version");
1472 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1473 		       "serial_number",
1474 		       CTLFLAG_RD, sc->serial_number_string,
1475 		       0, "serial number");
1476 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1477 		       "product_code",
1478 		       CTLFLAG_RD, sc->product_code_string,
1479 		       0, "product_code");
1480 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1481 		       "pcie_link_width",
1482 		       CTLFLAG_RD, &sc->link_width,
1483 		       0, "tx_boundary");
1484 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1485 		       "tx_boundary",
1486 		       CTLFLAG_RD, &sc->tx_boundary,
1487 		       0, "tx_boundary");
1488 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1489 		       "write_combine",
1490 		       CTLFLAG_RD, &sc->wc,
1491 		       0, "write combining PIO?");
1492 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1493 		       "read_dma_MBs",
1494 		       CTLFLAG_RD, &sc->read_dma,
1495 		       0, "DMA Read speed in MB/s");
1496 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1497 		       "write_dma_MBs",
1498 		       CTLFLAG_RD, &sc->write_dma,
1499 		       0, "DMA Write speed in MB/s");
1500 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1501 		       "read_write_dma_MBs",
1502 		       CTLFLAG_RD, &sc->read_write_dma,
1503 		       0, "DMA concurrent Read/Write speed in MB/s");
1504 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1505 		       "watchdog_resets",
1506 		       CTLFLAG_RD, &sc->watchdog_resets,
1507 		       0, "Number of times NIC was reset");
1508 
1509 
1510 	/* performance related tunables */
1511 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1512 			"intr_coal_delay",
1513 			CTLTYPE_INT|CTLFLAG_RW, sc,
1514 			0, mxge_change_intr_coal,
1515 			"I", "interrupt coalescing delay in usecs");
1516 
1517 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1518 			"throttle",
1519 			CTLTYPE_INT|CTLFLAG_RW, sc,
1520 			0, mxge_change_throttle,
1521 			"I", "transmit throttling");
1522 
1523 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1524 			"flow_control_enabled",
1525 			CTLTYPE_INT|CTLFLAG_RW, sc,
1526 			0, mxge_change_flow_control,
1527 			"I", "interrupt coalescing delay in usecs");
1528 
1529 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1530 		       "deassert_wait",
1531 		       CTLFLAG_RW, &mxge_deassert_wait,
1532 		       0, "Wait for IRQ line to go low in ihandler");
1533 
1534 	/* stats block from firmware is in network byte order.
1535 	   Need to swap it */
1536 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1537 			"link_up",
1538 			CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
1539 			0, mxge_handle_be32,
1540 			"I", "link up");
1541 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1542 			"rdma_tags_available",
1543 			CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
1544 			0, mxge_handle_be32,
1545 			"I", "rdma_tags_available");
1546 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1547 			"dropped_bad_crc32",
1548 			CTLTYPE_INT|CTLFLAG_RD,
1549 			&fw->dropped_bad_crc32,
1550 			0, mxge_handle_be32,
1551 			"I", "dropped_bad_crc32");
1552 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1553 			"dropped_bad_phy",
1554 			CTLTYPE_INT|CTLFLAG_RD,
1555 			&fw->dropped_bad_phy,
1556 			0, mxge_handle_be32,
1557 			"I", "dropped_bad_phy");
1558 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1559 			"dropped_link_error_or_filtered",
1560 			CTLTYPE_INT|CTLFLAG_RD,
1561 			&fw->dropped_link_error_or_filtered,
1562 			0, mxge_handle_be32,
1563 			"I", "dropped_link_error_or_filtered");
1564 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1565 			"dropped_link_overflow",
1566 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
1567 			0, mxge_handle_be32,
1568 			"I", "dropped_link_overflow");
1569 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1570 			"dropped_multicast_filtered",
1571 			CTLTYPE_INT|CTLFLAG_RD,
1572 			&fw->dropped_multicast_filtered,
1573 			0, mxge_handle_be32,
1574 			"I", "dropped_multicast_filtered");
1575 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1576 			"dropped_no_big_buffer",
1577 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
1578 			0, mxge_handle_be32,
1579 			"I", "dropped_no_big_buffer");
1580 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1581 			"dropped_no_small_buffer",
1582 			CTLTYPE_INT|CTLFLAG_RD,
1583 			&fw->dropped_no_small_buffer,
1584 			0, mxge_handle_be32,
1585 			"I", "dropped_no_small_buffer");
1586 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1587 			"dropped_overrun",
1588 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
1589 			0, mxge_handle_be32,
1590 			"I", "dropped_overrun");
1591 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1592 			"dropped_pause",
1593 			CTLTYPE_INT|CTLFLAG_RD,
1594 			&fw->dropped_pause,
1595 			0, mxge_handle_be32,
1596 			"I", "dropped_pause");
1597 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1598 			"dropped_runt",
1599 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
1600 			0, mxge_handle_be32,
1601 			"I", "dropped_runt");
1602 
1603 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1604 			"dropped_unicast_filtered",
1605 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
1606 			0, mxge_handle_be32,
1607 			"I", "dropped_unicast_filtered");
1608 
1609 	/* verbose printing? */
1610 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1611 		       "verbose",
1612 		       CTLFLAG_RW, &mxge_verbose,
1613 		       0, "verbose printing");
1614 
1615 	/* add counters exported for debugging from all slices */
1616 	sysctl_ctx_init(&sc->slice_sysctl_ctx);
1617 	sc->slice_sysctl_tree =
1618 		SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
1619 				"slice", CTLFLAG_RD, 0, "");
1620 
1621 	for (slice = 0; slice < sc->num_slices; slice++) {
1622 		ss = &sc->ss[slice];
1623 		sysctl_ctx_init(&ss->sysctl_ctx);
1624 		ctx = &ss->sysctl_ctx;
1625 		children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
1626 		sprintf(slice_num, "%d", slice);
1627 		ss->sysctl_tree =
1628 			SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
1629 					CTLFLAG_RD, 0, "");
1630 		children = SYSCTL_CHILDREN(ss->sysctl_tree);
1631 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1632 			       "rx_small_cnt",
1633 			       CTLFLAG_RD, &ss->rx_small.cnt,
1634 			       0, "rx_small_cnt");
1635 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1636 			       "rx_big_cnt",
1637 			       CTLFLAG_RD, &ss->rx_big.cnt,
1638 			       0, "rx_small_cnt");
1639 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1640 			       "lro_flushed", CTLFLAG_RD, &ss->lc.lro_flushed,
1641 			       0, "number of lro merge queues flushed");
1642 
1643 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1644 			       "lro_bad_csum", CTLFLAG_RD, &ss->lc.lro_bad_csum,
1645 			       0, "number of bad csums preventing LRO");
1646 
1647 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1648 			       "lro_queued", CTLFLAG_RD, &ss->lc.lro_queued,
1649 			       0, "number of frames appended to lro merge"
1650 			       "queues");
1651 
1652 #ifndef IFNET_BUF_RING
1653 		/* only transmit from slice 0 for now */
1654 		if (slice > 0)
1655 			continue;
1656 #endif
1657 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1658 			       "tx_req",
1659 			       CTLFLAG_RD, &ss->tx.req,
1660 			       0, "tx_req");
1661 
1662 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1663 			       "tx_done",
1664 			       CTLFLAG_RD, &ss->tx.done,
1665 			       0, "tx_done");
1666 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1667 			       "tx_pkt_done",
1668 			       CTLFLAG_RD, &ss->tx.pkt_done,
1669 			       0, "tx_done");
1670 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1671 			       "tx_stall",
1672 			       CTLFLAG_RD, &ss->tx.stall,
1673 			       0, "tx_stall");
1674 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1675 			       "tx_wake",
1676 			       CTLFLAG_RD, &ss->tx.wake,
1677 			       0, "tx_wake");
1678 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1679 			       "tx_defrag",
1680 			       CTLFLAG_RD, &ss->tx.defrag,
1681 			       0, "tx_defrag");
1682 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1683 			       "tx_queue_active",
1684 			       CTLFLAG_RD, &ss->tx.queue_active,
1685 			       0, "tx_queue_active");
1686 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1687 			       "tx_activate",
1688 			       CTLFLAG_RD, &ss->tx.activate,
1689 			       0, "tx_activate");
1690 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1691 			       "tx_deactivate",
1692 			       CTLFLAG_RD, &ss->tx.deactivate,
1693 			       0, "tx_deactivate");
1694 	}
1695 }
1696 
1697 /* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1698    backwards one at a time and handle ring wraps */
1699 
1700 static inline void
mxge_submit_req_backwards(mxge_tx_ring_t * tx,mcp_kreq_ether_send_t * src,int cnt)1701 mxge_submit_req_backwards(mxge_tx_ring_t *tx,
1702 			    mcp_kreq_ether_send_t *src, int cnt)
1703 {
1704         int idx, starting_slot;
1705         starting_slot = tx->req;
1706         while (cnt > 1) {
1707                 cnt--;
1708                 idx = (starting_slot + cnt) & tx->mask;
1709                 mxge_pio_copy(&tx->lanai[idx],
1710 			      &src[cnt], sizeof(*src));
1711                 wmb();
1712         }
1713 }
1714 
1715 /*
1716  * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1717  * at most 32 bytes at a time, so as to avoid involving the software
1718  * pio handler in the nic.   We re-write the first segment's flags
1719  * to mark them valid only after writing the entire chain
1720  */
1721 
1722 static inline void
mxge_submit_req(mxge_tx_ring_t * tx,mcp_kreq_ether_send_t * src,int cnt)1723 mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
1724                   int cnt)
1725 {
1726         int idx, i;
1727         uint32_t *src_ints;
1728 	volatile uint32_t *dst_ints;
1729         mcp_kreq_ether_send_t *srcp;
1730 	volatile mcp_kreq_ether_send_t *dstp, *dst;
1731 	uint8_t last_flags;
1732 
1733         idx = tx->req & tx->mask;
1734 
1735 	last_flags = src->flags;
1736 	src->flags = 0;
1737         wmb();
1738         dst = dstp = &tx->lanai[idx];
1739         srcp = src;
1740 
1741         if ((idx + cnt) < tx->mask) {
1742                 for (i = 0; i < (cnt - 1); i += 2) {
1743                         mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
1744                         wmb(); /* force write every 32 bytes */
1745                         srcp += 2;
1746                         dstp += 2;
1747                 }
1748         } else {
1749                 /* submit all but the first request, and ensure
1750                    that it is submitted below */
1751                 mxge_submit_req_backwards(tx, src, cnt);
1752                 i = 0;
1753         }
1754         if (i < cnt) {
1755                 /* submit the first request */
1756                 mxge_pio_copy(dstp, srcp, sizeof(*src));
1757                 wmb(); /* barrier before setting valid flag */
1758         }
1759 
1760         /* re-write the last 32-bits with the valid flags */
1761         src->flags = last_flags;
1762         src_ints = (uint32_t *)src;
1763         src_ints+=3;
1764         dst_ints = (volatile uint32_t *)dst;
1765         dst_ints+=3;
1766         *dst_ints =  *src_ints;
1767         tx->req += cnt;
1768         wmb();
1769 }
1770 
1771 static int
mxge_parse_tx(struct mxge_slice_state * ss,struct mbuf * m,struct mxge_pkt_info * pi)1772 mxge_parse_tx(struct mxge_slice_state *ss, struct mbuf *m,
1773     struct mxge_pkt_info *pi)
1774 {
1775 	struct ether_vlan_header *eh;
1776 	uint16_t etype;
1777 	int tso = m->m_pkthdr.csum_flags & (CSUM_TSO);
1778 #if IFCAP_TSO6 && defined(INET6)
1779 	int nxt;
1780 #endif
1781 
1782 	eh = mtod(m, struct ether_vlan_header *);
1783 	if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
1784 		etype = ntohs(eh->evl_proto);
1785 		pi->ip_off = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
1786 	} else {
1787 		etype = ntohs(eh->evl_encap_proto);
1788 		pi->ip_off = ETHER_HDR_LEN;
1789 	}
1790 
1791 	switch (etype) {
1792 	case ETHERTYPE_IP:
1793 		/*
1794 		 * ensure ip header is in first mbuf, copy it to a
1795 		 * scratch buffer if not
1796 		 */
1797 		pi->ip = (struct ip *)(m->m_data + pi->ip_off);
1798 		pi->ip6 = NULL;
1799 		if (__predict_false(m->m_len < pi->ip_off + sizeof(*pi->ip))) {
1800 			m_copydata(m, 0, pi->ip_off + sizeof(*pi->ip),
1801 			    ss->scratch);
1802 			pi->ip = (struct ip *)(ss->scratch + pi->ip_off);
1803 		}
1804 		pi->ip_hlen = pi->ip->ip_hl << 2;
1805 		if (!tso)
1806 			return 0;
1807 
1808 		if (__predict_false(m->m_len < pi->ip_off + pi->ip_hlen +
1809 		    sizeof(struct tcphdr))) {
1810 			m_copydata(m, 0, pi->ip_off + pi->ip_hlen +
1811 			    sizeof(struct tcphdr), ss->scratch);
1812 			pi->ip = (struct ip *)(ss->scratch + pi->ip_off);
1813 		}
1814 		pi->tcp = (struct tcphdr *)((char *)pi->ip + pi->ip_hlen);
1815 		break;
1816 #if IFCAP_TSO6 && defined(INET6)
1817 	case ETHERTYPE_IPV6:
1818 		pi->ip6 = (struct ip6_hdr *)(m->m_data + pi->ip_off);
1819 		if (__predict_false(m->m_len < pi->ip_off + sizeof(*pi->ip6))) {
1820 			m_copydata(m, 0, pi->ip_off + sizeof(*pi->ip6),
1821 			    ss->scratch);
1822 			pi->ip6 = (struct ip6_hdr *)(ss->scratch + pi->ip_off);
1823 		}
1824 		nxt = 0;
1825 		pi->ip_hlen = ip6_lasthdr(m, pi->ip_off, IPPROTO_IPV6, &nxt);
1826 		pi->ip_hlen -= pi->ip_off;
1827 		if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP)
1828 			return EINVAL;
1829 
1830 		if (!tso)
1831 			return 0;
1832 
1833 		if (pi->ip_off + pi->ip_hlen > ss->sc->max_tso6_hlen)
1834 			return EINVAL;
1835 
1836 		if (__predict_false(m->m_len < pi->ip_off + pi->ip_hlen +
1837 		    sizeof(struct tcphdr))) {
1838 			m_copydata(m, 0, pi->ip_off + pi->ip_hlen +
1839 			    sizeof(struct tcphdr), ss->scratch);
1840 			pi->ip6 = (struct ip6_hdr *)(ss->scratch + pi->ip_off);
1841 		}
1842 		pi->tcp = (struct tcphdr *)((char *)pi->ip6 + pi->ip_hlen);
1843 		break;
1844 #endif
1845 	default:
1846 		return EINVAL;
1847 	}
1848 	return 0;
1849 }
1850 
1851 #if IFCAP_TSO4
1852 
1853 static void
mxge_encap_tso(struct mxge_slice_state * ss,struct mbuf * m,int busdma_seg_cnt,struct mxge_pkt_info * pi)1854 mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
1855 	       int busdma_seg_cnt, struct mxge_pkt_info *pi)
1856 {
1857 	mxge_tx_ring_t *tx;
1858 	mcp_kreq_ether_send_t *req;
1859 	bus_dma_segment_t *seg;
1860 	uint32_t low, high_swapped;
1861 	int len, seglen, cum_len, cum_len_next;
1862 	int next_is_first, chop, cnt, rdma_count, small;
1863 	uint16_t pseudo_hdr_offset, cksum_offset, mss, sum;
1864 	uint8_t flags, flags_next;
1865 	static int once;
1866 
1867 	mss = m->m_pkthdr.tso_segsz;
1868 
1869 	/* negative cum_len signifies to the
1870 	 * send loop that we are still in the
1871 	 * header portion of the TSO packet.
1872 	 */
1873 
1874 	cksum_offset = pi->ip_off + pi->ip_hlen;
1875 	cum_len = -(cksum_offset + (pi->tcp->th_off << 2));
1876 
1877 	/* TSO implies checksum offload on this hardware */
1878 	if (__predict_false((m->m_pkthdr.csum_flags & (CSUM_TCP|CSUM_TCP_IPV6)) == 0)) {
1879 		/*
1880 		 * If packet has full TCP csum, replace it with pseudo hdr
1881 		 * sum that the NIC expects, otherwise the NIC will emit
1882 		 * packets with bad TCP checksums.
1883 		 */
1884 		m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
1885 		if (pi->ip6) {
1886 #if (CSUM_TCP_IPV6 != 0) && defined(INET6)
1887 			m->m_pkthdr.csum_flags |= CSUM_TCP_IPV6;
1888 			sum = in6_cksum_pseudo(pi->ip6,
1889 			    m->m_pkthdr.len - cksum_offset,
1890 			    IPPROTO_TCP, 0);
1891 #endif
1892 		} else {
1893 #ifdef INET
1894 			m->m_pkthdr.csum_flags |= CSUM_TCP;
1895 			sum = in_pseudo(pi->ip->ip_src.s_addr,
1896 			    pi->ip->ip_dst.s_addr,
1897 			    htons(IPPROTO_TCP + (m->m_pkthdr.len -
1898 				    cksum_offset)));
1899 #endif
1900 		}
1901 		m_copyback(m, offsetof(struct tcphdr, th_sum) +
1902 		    cksum_offset, sizeof(sum), (caddr_t)&sum);
1903 	}
1904 	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1905 
1906 
1907 	/* for TSO, pseudo_hdr_offset holds mss.
1908 	 * The firmware figures out where to put
1909 	 * the checksum by parsing the header. */
1910 	pseudo_hdr_offset = htobe16(mss);
1911 
1912 	if (pi->ip6) {
1913 		/*
1914 		 * for IPv6 TSO, the "checksum offset" is re-purposed
1915 		 * to store the TCP header len
1916 		 */
1917 		cksum_offset = (pi->tcp->th_off << 2);
1918 	}
1919 
1920 	tx = &ss->tx;
1921 	req = tx->req_list;
1922 	seg = tx->seg_list;
1923 	cnt = 0;
1924 	rdma_count = 0;
1925 	/* "rdma_count" is the number of RDMAs belonging to the
1926 	 * current packet BEFORE the current send request. For
1927 	 * non-TSO packets, this is equal to "count".
1928 	 * For TSO packets, rdma_count needs to be reset
1929 	 * to 0 after a segment cut.
1930 	 *
1931 	 * The rdma_count field of the send request is
1932 	 * the number of RDMAs of the packet starting at
1933 	 * that request. For TSO send requests with one ore more cuts
1934 	 * in the middle, this is the number of RDMAs starting
1935 	 * after the last cut in the request. All previous
1936 	 * segments before the last cut implicitly have 1 RDMA.
1937 	 *
1938 	 * Since the number of RDMAs is not known beforehand,
1939 	 * it must be filled-in retroactively - after each
1940 	 * segmentation cut or at the end of the entire packet.
1941 	 */
1942 
1943 	while (busdma_seg_cnt) {
1944 		/* Break the busdma segment up into pieces*/
1945 		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1946 		high_swapped = 	htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1947 		len = seg->ds_len;
1948 
1949 		while (len) {
1950 			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1951 			seglen = len;
1952 			cum_len_next = cum_len + seglen;
1953 			(req-rdma_count)->rdma_count = rdma_count + 1;
1954 			if (__predict_true(cum_len >= 0)) {
1955 				/* payload */
1956 				chop = (cum_len_next > mss);
1957 				cum_len_next = cum_len_next % mss;
1958 				next_is_first = (cum_len_next == 0);
1959 				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1960 				flags_next |= next_is_first *
1961 					MXGEFW_FLAGS_FIRST;
1962 				rdma_count |= -(chop | next_is_first);
1963 				rdma_count += chop & !next_is_first;
1964 			} else if (cum_len_next >= 0) {
1965 				/* header ends */
1966 				rdma_count = -1;
1967 				cum_len_next = 0;
1968 				seglen = -cum_len;
1969 				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1970 				flags_next = MXGEFW_FLAGS_TSO_PLD |
1971 					MXGEFW_FLAGS_FIRST |
1972 					(small * MXGEFW_FLAGS_SMALL);
1973 			    }
1974 
1975 			req->addr_high = high_swapped;
1976 			req->addr_low = htobe32(low);
1977 			req->pseudo_hdr_offset = pseudo_hdr_offset;
1978 			req->pad = 0;
1979 			req->rdma_count = 1;
1980 			req->length = htobe16(seglen);
1981 			req->cksum_offset = cksum_offset;
1982 			req->flags = flags | ((cum_len & 1) *
1983 					      MXGEFW_FLAGS_ALIGN_ODD);
1984 			low += seglen;
1985 			len -= seglen;
1986 			cum_len = cum_len_next;
1987 			flags = flags_next;
1988 			req++;
1989 			cnt++;
1990 			rdma_count++;
1991 			if (cksum_offset != 0 && !pi->ip6) {
1992 				if (__predict_false(cksum_offset > seglen))
1993 					cksum_offset -= seglen;
1994 				else
1995 					cksum_offset = 0;
1996 			}
1997 			if (__predict_false(cnt > tx->max_desc))
1998 				goto drop;
1999 		}
2000 		busdma_seg_cnt--;
2001 		seg++;
2002 	}
2003 	(req-rdma_count)->rdma_count = rdma_count;
2004 
2005 	do {
2006 		req--;
2007 		req->flags |= MXGEFW_FLAGS_TSO_LAST;
2008 	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
2009 
2010 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2011 	mxge_submit_req(tx, tx->req_list, cnt);
2012 #ifdef IFNET_BUF_RING
2013 	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
2014 		/* tell the NIC to start polling this slice */
2015 		*tx->send_go = 1;
2016 		tx->queue_active = 1;
2017 		tx->activate++;
2018 		wmb();
2019 	}
2020 #endif
2021 	return;
2022 
2023 drop:
2024 	bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
2025 	m_freem(m);
2026 	ss->oerrors++;
2027 	if (!once) {
2028 		printf("tx->max_desc exceeded via TSO!\n");
2029 		printf("mss = %d, %ld, %d!\n", mss,
2030 		       (long)seg - (long)tx->seg_list, tx->max_desc);
2031 		once = 1;
2032 	}
2033 	return;
2034 
2035 }
2036 
2037 #endif /* IFCAP_TSO4 */
2038 
2039 #ifdef MXGE_NEW_VLAN_API
2040 /*
2041  * We reproduce the software vlan tag insertion from
2042  * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
2043  * vlan tag insertion. We need to advertise this in order to have the
2044  * vlan interface respect our csum offload flags.
2045  */
2046 static struct mbuf *
mxge_vlan_tag_insert(struct mbuf * m)2047 mxge_vlan_tag_insert(struct mbuf *m)
2048 {
2049 	struct ether_vlan_header *evl;
2050 
2051 	M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_NOWAIT);
2052 	if (__predict_false(m == NULL))
2053 		return NULL;
2054 	if (m->m_len < sizeof(*evl)) {
2055 		m = m_pullup(m, sizeof(*evl));
2056 		if (__predict_false(m == NULL))
2057 			return NULL;
2058 	}
2059 	/*
2060 	 * Transform the Ethernet header into an Ethernet header
2061 	 * with 802.1Q encapsulation.
2062 	 */
2063 	evl = mtod(m, struct ether_vlan_header *);
2064 	bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN,
2065 	      (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
2066 	evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
2067 	evl->evl_tag = htons(m->m_pkthdr.ether_vtag);
2068 	m->m_flags &= ~M_VLANTAG;
2069 	return m;
2070 }
2071 #endif /* MXGE_NEW_VLAN_API */
2072 
/*
 * Queue one outgoing frame on a slice's transmit ring.
 *
 * The mbuf chain is DMA-mapped (defragmenting once if the chain has
 * too many segments), converted into one send request per DMA segment
 * in tx->req_list, and handed to mxge_submit_req().  TSO frames are
 * passed to mxge_encap_tso() once they are mapped.  Frames shorter
 * than 60 bytes get one extra request pointing at sc->zeropad_dma.
 *
 * ss: slice whose tx ring receives the frame
 * m:  frame to send; on every failure path it is freed here (or has
 *     already been consumed by mxge_vlan_tag_insert()) and
 *     ss->oerrors is incremented.
 *
 * NOTE(review): the callers visible in this file invoke this from
 * *_locked routines, so tx->mtx is presumably held -- confirm.
 */
static void
mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
{
	struct mxge_pkt_info pi = {0,0,0,0};
	mxge_softc_t *sc;
	mcp_kreq_ether_send_t *req;
	bus_dma_segment_t *seg;
	struct mbuf *m_tmp;
	struct ifnet *ifp;
	mxge_tx_ring_t *tx;
	int cnt, cum_len, err, i, idx, odd_flag;
	uint16_t pseudo_hdr_offset;
        uint8_t flags, cksum_offset;


	sc = ss->sc;
	ifp = sc->ifp;
	tx = &ss->tx;

#ifdef MXGE_NEW_VLAN_API
	/* insert the vlan tag in software; a NULL return means the
	   mbuf is already gone, so skip the m_freem() in the drop path */
	if (m->m_flags & M_VLANTAG) {
		m = mxge_vlan_tag_insert(m);
		if (__predict_false(m == NULL))
			goto drop_without_m;
	}
#endif
	/* header offsets are only needed when some offload is requested */
	if (m->m_pkthdr.csum_flags &
	    (CSUM_TSO | CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) {
		if (mxge_parse_tx(ss, m, &pi))
			goto drop;
	}

	/* (try to) map the frame for DMA */
	idx = tx->req & tx->mask;
	err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map,
				      m, tx->seg_list, &cnt,
				      BUS_DMA_NOWAIT);
	if (__predict_false(err == EFBIG)) {
		/* Too many segments in the chain.  Try
		   to defrag */
		m_tmp = m_defrag(m, M_NOWAIT);
		if (m_tmp == NULL) {
			goto drop;
		}
		ss->tx.defrag++;
		m = m_tmp;
		err = bus_dmamap_load_mbuf_sg(tx->dmat,
					      tx->info[idx].map,
					      m, tx->seg_list, &cnt,
					      BUS_DMA_NOWAIT);
	}
	if (__predict_false(err != 0)) {
		device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d"
			      " packet len = %d\n", err, m->m_pkthdr.len);
		goto drop;
	}
	bus_dmamap_sync(tx->dmat, tx->info[idx].map,
			BUS_DMASYNC_PREWRITE);
	/* the mbuf is remembered only in the slot of its first request;
	   mxge_tx_done() frees it from there */
	tx->info[idx].m = m;

#if IFCAP_TSO4
	/* TSO is different enough, we handle it in another routine */
	if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
		mxge_encap_tso(ss, m, cnt, &pi);
		return;
	}
#endif

	req = tx->req_list;
	cksum_offset = 0;
	pseudo_hdr_offset = 0;
	flags = MXGEFW_FLAGS_NO_TSO;

	/* checksum offloading? */
	if (m->m_pkthdr.csum_flags &
	    (CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) {
		/* checksumming starts right after the IP header, whose
		   offset and length were filled in by mxge_parse_tx() */
		cksum_offset = pi.ip_off + pi.ip_hlen;
		pseudo_hdr_offset = cksum_offset +  m->m_pkthdr.csum_data;
		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
		req->cksum_offset = cksum_offset;
		flags |= MXGEFW_FLAGS_CKSUM;
		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
	} else {
		odd_flag = 0;
	}
	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
		flags |= MXGEFW_FLAGS_SMALL;

	/* convert segments into a request list */
	cum_len = 0;
	seg = tx->seg_list;
	/* only the first request carries FLAGS_FIRST; every later one
	   starts from the 0 written at the bottom of the loop */
	req->flags = MXGEFW_FLAGS_FIRST;
	for (i = 0; i < cnt; i++) {
		req->addr_low =
			htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
		req->addr_high =
			htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
		req->length = htobe16(seg->ds_len);
		req->cksum_offset = cksum_offset;
		/* the remaining checksum offset shrinks by each segment
		   consumed, and is 0 once the start has been passed */
		if (cksum_offset > seg->ds_len)
			cksum_offset -= seg->ds_len;
		else
			cksum_offset = 0;
		req->pseudo_hdr_offset = pseudo_hdr_offset;
		req->pad = 0; /* complete solid 16-byte block */
		req->rdma_count = 1;
		/* ALIGN_ODD is set when checksumming and this segment
		   starts at an odd byte offset within the frame */
		req->flags |= flags | ((cum_len & 1) * odd_flag);
		cum_len += seg->ds_len;
		seg++;
		req++;
		/* pre-clear the next request's flags; the runt pad
		   below relies on this as well */
		req->flags = 0;
	}
	req--;
	/* pad runts to 60 bytes */
	if (cum_len < 60) {
		req++;
		req->addr_low =
			htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
		req->addr_high =
			htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
		req->length = htobe16(60 - cum_len);
		req->cksum_offset = 0;
		req->pseudo_hdr_offset = pseudo_hdr_offset;
		req->pad = 0; /* complete solid 16-byte block */
		req->rdma_count = 1;
		req->flags |= flags | ((cum_len & 1) * odd_flag);
		cnt++;
	}

	/* the first request records the number of requests in the frame */
	tx->req_list[0].rdma_count = cnt;
#if 0
	/* print what the firmware will see */
	for (i = 0; i < cnt; i++) {
		printf("%d: addr: 0x%x 0x%x len:%d pso%d,"
		    "cso:%d, flags:0x%x, rdma:%d\n",
		    i, (int)ntohl(tx->req_list[i].addr_high),
		    (int)ntohl(tx->req_list[i].addr_low),
		    (int)ntohs(tx->req_list[i].length),
		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
		    tx->req_list[i].rdma_count);
	}
	printf("--------------\n");
#endif
	/* flag the slot of the frame's last request; mxge_tx_done()
	   counts one completed packet per flagged slot */
	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
	mxge_submit_req(tx, tx->req_list, cnt);
#ifdef IFNET_BUF_RING
	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
		/* tell the NIC to start polling this slice */
		*tx->send_go = 1;
		tx->queue_active = 1;
		tx->activate++;
		wmb();
	}
#endif
	return;

drop:
	m_freem(m);
drop_without_m:
	ss->oerrors++;
	return;
}
2238 
2239 #ifdef IFNET_BUF_RING
2240 static void
mxge_qflush(struct ifnet * ifp)2241 mxge_qflush(struct ifnet *ifp)
2242 {
2243 	mxge_softc_t *sc = ifp->if_softc;
2244 	mxge_tx_ring_t *tx;
2245 	struct mbuf *m;
2246 	int slice;
2247 
2248 	for (slice = 0; slice < sc->num_slices; slice++) {
2249 		tx = &sc->ss[slice].tx;
2250 		mtx_lock(&tx->mtx);
2251 		while ((m = buf_ring_dequeue_sc(tx->br)) != NULL)
2252 			m_freem(m);
2253 		mtx_unlock(&tx->mtx);
2254 	}
2255 	if_qflush(ifp);
2256 }
2257 
2258 static inline void
mxge_start_locked(struct mxge_slice_state * ss)2259 mxge_start_locked(struct mxge_slice_state *ss)
2260 {
2261 	mxge_softc_t *sc;
2262 	struct mbuf *m;
2263 	struct ifnet *ifp;
2264 	mxge_tx_ring_t *tx;
2265 
2266 	sc = ss->sc;
2267 	ifp = sc->ifp;
2268 	tx = &ss->tx;
2269 
2270 	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2271 		m = drbr_dequeue(ifp, tx->br);
2272 		if (m == NULL) {
2273 			return;
2274 		}
2275 		/* let BPF see it */
2276 		BPF_MTAP(ifp, m);
2277 
2278 		/* give it to the nic */
2279 		mxge_encap(ss, m);
2280 	}
2281 	/* ran out of transmit slots */
2282 	if (((ss->if_drv_flags & IFF_DRV_OACTIVE) == 0)
2283 	    && (!drbr_empty(ifp, tx->br))) {
2284 		ss->if_drv_flags |= IFF_DRV_OACTIVE;
2285 		tx->stall++;
2286 	}
2287 }
2288 
2289 static int
mxge_transmit_locked(struct mxge_slice_state * ss,struct mbuf * m)2290 mxge_transmit_locked(struct mxge_slice_state *ss, struct mbuf *m)
2291 {
2292 	mxge_softc_t *sc;
2293 	struct ifnet *ifp;
2294 	mxge_tx_ring_t *tx;
2295 	int err;
2296 
2297 	sc = ss->sc;
2298 	ifp = sc->ifp;
2299 	tx = &ss->tx;
2300 
2301 	if ((ss->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) !=
2302 	    IFF_DRV_RUNNING) {
2303 		err = drbr_enqueue(ifp, tx->br, m);
2304 		return (err);
2305 	}
2306 
2307 	if (!drbr_needs_enqueue(ifp, tx->br) &&
2308 	    ((tx->mask - (tx->req - tx->done)) > tx->max_desc)) {
2309 		/* let BPF see it */
2310 		BPF_MTAP(ifp, m);
2311 		/* give it to the nic */
2312 		mxge_encap(ss, m);
2313 	} else if ((err = drbr_enqueue(ifp, tx->br, m)) != 0) {
2314 		return (err);
2315 	}
2316 	if (!drbr_empty(ifp, tx->br))
2317 		mxge_start_locked(ss);
2318 	return (0);
2319 }
2320 
2321 static int
mxge_transmit(struct ifnet * ifp,struct mbuf * m)2322 mxge_transmit(struct ifnet *ifp, struct mbuf *m)
2323 {
2324 	mxge_softc_t *sc = ifp->if_softc;
2325 	struct mxge_slice_state *ss;
2326 	mxge_tx_ring_t *tx;
2327 	int err = 0;
2328 	int slice;
2329 
2330 	slice = m->m_pkthdr.flowid;
2331 	slice &= (sc->num_slices - 1);  /* num_slices always power of 2 */
2332 
2333 	ss = &sc->ss[slice];
2334 	tx = &ss->tx;
2335 
2336 	if (mtx_trylock(&tx->mtx)) {
2337 		err = mxge_transmit_locked(ss, m);
2338 		mtx_unlock(&tx->mtx);
2339 	} else {
2340 		err = drbr_enqueue(ifp, tx->br, m);
2341 	}
2342 
2343 	return (err);
2344 }
2345 
2346 #else
2347 
2348 static inline void
mxge_start_locked(struct mxge_slice_state * ss)2349 mxge_start_locked(struct mxge_slice_state *ss)
2350 {
2351 	mxge_softc_t *sc;
2352 	struct mbuf *m;
2353 	struct ifnet *ifp;
2354 	mxge_tx_ring_t *tx;
2355 
2356 	sc = ss->sc;
2357 	ifp = sc->ifp;
2358 	tx = &ss->tx;
2359 	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2360 		IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
2361 		if (m == NULL) {
2362 			return;
2363 		}
2364 		/* let BPF see it */
2365 		BPF_MTAP(ifp, m);
2366 
2367 		/* give it to the nic */
2368 		mxge_encap(ss, m);
2369 	}
2370 	/* ran out of transmit slots */
2371 	if ((sc->ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) {
2372 		sc->ifp->if_drv_flags |= IFF_DRV_OACTIVE;
2373 		tx->stall++;
2374 	}
2375 }
2376 #endif
2377 static void
mxge_start(struct ifnet * ifp)2378 mxge_start(struct ifnet *ifp)
2379 {
2380 	mxge_softc_t *sc = ifp->if_softc;
2381 	struct mxge_slice_state *ss;
2382 
2383 	/* only use the first slice for now */
2384 	ss = &sc->ss[0];
2385 	mtx_lock(&ss->tx.mtx);
2386 	mxge_start_locked(ss);
2387 	mtx_unlock(&ss->tx.mtx);
2388 }
2389 
/*
 * Copy an array of eight mcp_kreq_ether_recv_t's to the mcp.  The
 * copy is done in two bursts of four entries each (at most 32 bytes
 * at a time), so as to avoid involving the software pio handler in
 * the nic.  The first entry's low DMA address is written as
 * 0xffffffff during the bursts and is re-written with its real value
 * only after the entire chunk has been copied, which marks the chunk
 * valid.
 *
 * dst: destination of the eight entries on the NIC
 * src: host copy of the same eight entries; src[0].addr_low is
 *      modified temporarily and restored before returning
 */
static inline void
mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
		mcp_kreq_ether_recv_t *src)
{
	uint32_t low;

	/* stash the real address and substitute the placeholder */
	low = src->addr_low;
	src->addr_low = 0xffffffff;
	/* first four entries, then a barrier before the next burst */
	mxge_pio_copy(dst, src, 4 * sizeof (*src));
	wmb();
	/* last four entries */
	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
	wmb();
	/* restore the host copy, then publish the real address last */
	src->addr_low = low;
	dst->addr_low = low;
	wmb();
}
2413 
2414 static int
mxge_get_buf_small(struct mxge_slice_state * ss,bus_dmamap_t map,int idx)2415 mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2416 {
2417 	bus_dma_segment_t seg;
2418 	struct mbuf *m;
2419 	mxge_rx_ring_t *rx = &ss->rx_small;
2420 	int cnt, err;
2421 
2422 	m = m_gethdr(M_NOWAIT, MT_DATA);
2423 	if (m == NULL) {
2424 		rx->alloc_fail++;
2425 		err = ENOBUFS;
2426 		goto done;
2427 	}
2428 	m->m_len = MHLEN;
2429 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2430 				      &seg, &cnt, BUS_DMA_NOWAIT);
2431 	if (err != 0) {
2432 		m_free(m);
2433 		goto done;
2434 	}
2435 	rx->info[idx].m = m;
2436 	rx->shadow[idx].addr_low =
2437 		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2438 	rx->shadow[idx].addr_high =
2439 		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2440 
2441 done:
2442 	if ((idx & 7) == 7)
2443 		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2444 	return err;
2445 }
2446 
2447 static int
mxge_get_buf_big(struct mxge_slice_state * ss,bus_dmamap_t map,int idx)2448 mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2449 {
2450 	bus_dma_segment_t seg[3];
2451 	struct mbuf *m;
2452 	mxge_rx_ring_t *rx = &ss->rx_big;
2453 	int cnt, err, i;
2454 
2455 	m = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
2456 	if (m == NULL) {
2457 		rx->alloc_fail++;
2458 		err = ENOBUFS;
2459 		goto done;
2460 	}
2461 	m->m_len = rx->mlen;
2462 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2463 				      seg, &cnt, BUS_DMA_NOWAIT);
2464 	if (err != 0) {
2465 		m_free(m);
2466 		goto done;
2467 	}
2468 	rx->info[idx].m = m;
2469 	rx->shadow[idx].addr_low =
2470 		htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2471 	rx->shadow[idx].addr_high =
2472 		htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2473 
2474 #if MXGE_VIRT_JUMBOS
2475 	for (i = 1; i < cnt; i++) {
2476 		rx->shadow[idx + i].addr_low =
2477 			htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
2478 		rx->shadow[idx + i].addr_high =
2479 			htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
2480        }
2481 #endif
2482 
2483 done:
2484        for (i = 0; i < rx->nbufs; i++) {
2485 		if ((idx & 7) == 7) {
2486 			mxge_submit_8rx(&rx->lanai[idx - 7],
2487 					&rx->shadow[idx - 7]);
2488 		}
2489 		idx++;
2490 	}
2491 	return err;
2492 }
2493 
2494 #ifdef INET6
2495 
/*
 * 16-bit one's complement sum over len bytes starting at raw.
 *
 * The words are summed as they sit in memory (no byte swapping) and
 * the 32-bit total is folded back into 16 bits; the result is NOT
 * complemented.  A trailing odd byte is treated as if it were
 * followed by a zero byte, so nothing beyond raw[0 .. len-1] is ever
 * read.  A len of zero or less yields 0.
 */
static uint16_t
mxge_csum_generic(uint16_t *raw, int len)
{
	uint32_t csum;
	uint16_t tail;

	csum = 0;
	while (len > 1) {
		csum += *raw;
		raw++;
		len -= 2;
	}
	if (len == 1) {
		/* pad the last byte with zero instead of reading a
		   full word past the end of the buffer */
		tail = 0;
		*(uint8_t *)&tail = *(uint8_t *)raw;
		csum += tail;
	}
	/* two folds are enough to absorb every carry */
	csum = (csum >> 16) + (csum & 0xffff);
	csum = (csum >> 16) + (csum & 0xffff);
	return (uint16_t)csum;
}
2512 
2513 static inline uint16_t
mxge_rx_csum6(void * p,struct mbuf * m,uint32_t csum)2514 mxge_rx_csum6(void *p, struct mbuf *m, uint32_t csum)
2515 {
2516 	uint32_t partial;
2517 	int nxt, cksum_offset;
2518 	struct ip6_hdr *ip6 = p;
2519 	uint16_t c;
2520 
2521 	nxt = ip6->ip6_nxt;
2522 	cksum_offset = sizeof (*ip6) + ETHER_HDR_LEN;
2523 	if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP) {
2524 		cksum_offset = ip6_lasthdr(m, ETHER_HDR_LEN,
2525 					   IPPROTO_IPV6, &nxt);
2526 		if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP)
2527 			return (1);
2528 	}
2529 
2530 	/*
2531 	 * IPv6 headers do not contain a checksum, and hence
2532 	 * do not checksum to zero, so they don't "fall out"
2533 	 * of the partial checksum calculation like IPv4
2534 	 * headers do.  We need to fix the partial checksum by
2535 	 * subtracting the checksum of the IPv6 header.
2536 	 */
2537 
2538 	partial = mxge_csum_generic((uint16_t *)ip6, cksum_offset -
2539 				    ETHER_HDR_LEN);
2540 	csum += ~partial;
2541 	csum +=	 (csum < ~partial);
2542 	csum = (csum >> 16) + (csum & 0xFFFF);
2543 	csum = (csum >> 16) + (csum & 0xFFFF);
2544 	c = in6_cksum_pseudo(ip6, m->m_pkthdr.len - cksum_offset, nxt,
2545 			     csum);
2546 	c ^= 0xffff;
2547 	return (c);
2548 }
2549 #endif /* INET6 */
2550 /*
2551  *  Myri10GE hardware checksums are not valid if the sender
2552  *  padded the frame with non-zero padding.  This is because
2553  *  the firmware just does a simple 16-bit 1s complement
2554  *  checksum across the entire frame, excluding the first 14
2555  *  bytes.  It is best to simply to check the checksum and
2556  *  tell the stack about it only if the checksum is good
2557  */
2558 
2559 static inline uint16_t
mxge_rx_csum(struct mbuf * m,int csum)2560 mxge_rx_csum(struct mbuf *m, int csum)
2561 {
2562 	struct ether_header *eh;
2563 #ifdef INET
2564 	struct ip *ip;
2565 #endif
2566 #if defined(INET) || defined(INET6)
2567 	int cap = m->m_pkthdr.rcvif->if_capenable;
2568 #endif
2569 	uint16_t c, etype;
2570 
2571 
2572 	eh = mtod(m, struct ether_header *);
2573 	etype = ntohs(eh->ether_type);
2574 	switch (etype) {
2575 #ifdef INET
2576 	case ETHERTYPE_IP:
2577 		if ((cap & IFCAP_RXCSUM) == 0)
2578 			return (1);
2579 		ip = (struct ip *)(eh + 1);
2580 		if (ip->ip_p != IPPROTO_TCP && ip->ip_p != IPPROTO_UDP)
2581 			return (1);
2582 		c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2583 			      htonl(ntohs(csum) + ntohs(ip->ip_len) -
2584 				    (ip->ip_hl << 2) + ip->ip_p));
2585 		c ^= 0xffff;
2586 		break;
2587 #endif
2588 #ifdef INET6
2589 	case ETHERTYPE_IPV6:
2590 		if ((cap & IFCAP_RXCSUM_IPV6) == 0)
2591 			return (1);
2592 		c = mxge_rx_csum6((eh + 1), m, csum);
2593 		break;
2594 #endif
2595 	default:
2596 		c = 1;
2597 	}
2598 	return (c);
2599 }
2600 
2601 static void
mxge_vlan_tag_remove(struct mbuf * m,uint32_t * csum)2602 mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2603 {
2604 	struct ether_vlan_header *evl;
2605 	struct ether_header *eh;
2606 	uint32_t partial;
2607 
2608 	evl = mtod(m, struct ether_vlan_header *);
2609 	eh = mtod(m, struct ether_header *);
2610 
2611 	/*
2612 	 * fix checksum by subtracting ETHER_VLAN_ENCAP_LEN bytes
2613 	 * after what the firmware thought was the end of the ethernet
2614 	 * header.
2615 	 */
2616 
2617 	/* put checksum into host byte order */
2618 	*csum = ntohs(*csum);
2619 	partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
2620 	(*csum) += ~partial;
2621 	(*csum) +=  ((*csum) < ~partial);
2622 	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2623 	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2624 
2625 	/* restore checksum to network byte order;
2626 	   later consumers expect this */
2627 	*csum = htons(*csum);
2628 
2629 	/* save the tag */
2630 #ifdef MXGE_NEW_VLAN_API
2631 	m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
2632 #else
2633 	{
2634 		struct m_tag *mtag;
2635 		mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
2636 				   M_NOWAIT);
2637 		if (mtag == NULL)
2638 			return;
2639 		VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag);
2640 		m_tag_prepend(m, mtag);
2641 	}
2642 
2643 #endif
2644 	m->m_flags |= M_VLANTAG;
2645 
2646 	/*
2647 	 * Remove the 802.1q header by copying the Ethernet
2648 	 * addresses over it and adjusting the beginning of
2649 	 * the data in the mbuf.  The encapsulated Ethernet
2650 	 * type field is already in place.
2651 	 */
2652 	bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
2653 	      ETHER_HDR_LEN - ETHER_TYPE_LEN);
2654 	m_adj(m, ETHER_VLAN_ENCAP_LEN);
2655 }
2656 
2657 
2658 static inline void
mxge_rx_done_big(struct mxge_slice_state * ss,uint32_t len,uint32_t csum,int lro)2659 mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len,
2660 		 uint32_t csum, int lro)
2661 {
2662 	mxge_softc_t *sc;
2663 	struct ifnet *ifp;
2664 	struct mbuf *m;
2665 	struct ether_header *eh;
2666 	mxge_rx_ring_t *rx;
2667 	bus_dmamap_t old_map;
2668 	int idx;
2669 
2670 	sc = ss->sc;
2671 	ifp = sc->ifp;
2672 	rx = &ss->rx_big;
2673 	idx = rx->cnt & rx->mask;
2674 	rx->cnt += rx->nbufs;
2675 	/* save a pointer to the received mbuf */
2676 	m = rx->info[idx].m;
2677 	/* try to replace the received mbuf */
2678 	if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
2679 		/* drop the frame -- the old mbuf is re-cycled */
2680 		ifp->if_ierrors++;
2681 		return;
2682 	}
2683 
2684 	/* unmap the received buffer */
2685 	old_map = rx->info[idx].map;
2686 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2687 	bus_dmamap_unload(rx->dmat, old_map);
2688 
2689 	/* swap the bus_dmamap_t's */
2690 	rx->info[idx].map = rx->extra_map;
2691 	rx->extra_map = old_map;
2692 
2693 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2694 	 * aligned */
2695 	m->m_data += MXGEFW_PAD;
2696 
2697 	m->m_pkthdr.rcvif = ifp;
2698 	m->m_len = m->m_pkthdr.len = len;
2699 	ss->ipackets++;
2700 	eh = mtod(m, struct ether_header *);
2701 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2702 		mxge_vlan_tag_remove(m, &csum);
2703 	}
2704 	/* flowid only valid if RSS hashing is enabled */
2705 	if (sc->num_slices > 1) {
2706 		m->m_pkthdr.flowid = (ss - sc->ss);
2707 		M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
2708 	}
2709 	/* if the checksum is valid, mark it in the mbuf header */
2710 	if ((ifp->if_capenable & (IFCAP_RXCSUM_IPV6 | IFCAP_RXCSUM)) &&
2711 	    (0 == mxge_rx_csum(m, csum))) {
2712 		/* Tell the stack that the  checksum is good */
2713 		m->m_pkthdr.csum_data = 0xffff;
2714 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2715 			CSUM_DATA_VALID;
2716 
2717 #if defined(INET) || defined (INET6)
2718 		if (lro && (0 == tcp_lro_rx(&ss->lc, m, 0)))
2719 			return;
2720 #endif
2721 	}
2722 	/* pass the frame up the stack */
2723 	(*ifp->if_input)(ifp, m);
2724 }
2725 
2726 static inline void
mxge_rx_done_small(struct mxge_slice_state * ss,uint32_t len,uint32_t csum,int lro)2727 mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len,
2728 		   uint32_t csum, int lro)
2729 {
2730 	mxge_softc_t *sc;
2731 	struct ifnet *ifp;
2732 	struct ether_header *eh;
2733 	struct mbuf *m;
2734 	mxge_rx_ring_t *rx;
2735 	bus_dmamap_t old_map;
2736 	int idx;
2737 
2738 	sc = ss->sc;
2739 	ifp = sc->ifp;
2740 	rx = &ss->rx_small;
2741 	idx = rx->cnt & rx->mask;
2742 	rx->cnt++;
2743 	/* save a pointer to the received mbuf */
2744 	m = rx->info[idx].m;
2745 	/* try to replace the received mbuf */
2746 	if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
2747 		/* drop the frame -- the old mbuf is re-cycled */
2748 		ifp->if_ierrors++;
2749 		return;
2750 	}
2751 
2752 	/* unmap the received buffer */
2753 	old_map = rx->info[idx].map;
2754 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2755 	bus_dmamap_unload(rx->dmat, old_map);
2756 
2757 	/* swap the bus_dmamap_t's */
2758 	rx->info[idx].map = rx->extra_map;
2759 	rx->extra_map = old_map;
2760 
2761 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2762 	 * aligned */
2763 	m->m_data += MXGEFW_PAD;
2764 
2765 	m->m_pkthdr.rcvif = ifp;
2766 	m->m_len = m->m_pkthdr.len = len;
2767 	ss->ipackets++;
2768 	eh = mtod(m, struct ether_header *);
2769 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2770 		mxge_vlan_tag_remove(m, &csum);
2771 	}
2772 	/* flowid only valid if RSS hashing is enabled */
2773 	if (sc->num_slices > 1) {
2774 		m->m_pkthdr.flowid = (ss - sc->ss);
2775 		M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
2776 	}
2777 	/* if the checksum is valid, mark it in the mbuf header */
2778 	if ((ifp->if_capenable & (IFCAP_RXCSUM_IPV6 | IFCAP_RXCSUM)) &&
2779 	    (0 == mxge_rx_csum(m, csum))) {
2780 		/* Tell the stack that the  checksum is good */
2781 		m->m_pkthdr.csum_data = 0xffff;
2782 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2783 			CSUM_DATA_VALID;
2784 
2785 #if defined(INET) || defined (INET6)
2786 		if (lro && (0 == tcp_lro_rx(&ss->lc, m, csum)))
2787 			return;
2788 #endif
2789 	}
2790 	/* pass the frame up the stack */
2791 	(*ifp->if_input)(ifp, m);
2792 }
2793 
2794 static inline void
mxge_clean_rx_done(struct mxge_slice_state * ss)2795 mxge_clean_rx_done(struct mxge_slice_state *ss)
2796 {
2797 	mxge_rx_done_t *rx_done = &ss->rx_done;
2798 	int limit = 0;
2799 	uint16_t length;
2800 	uint16_t checksum;
2801 	int lro;
2802 
2803 	lro = ss->sc->ifp->if_capenable & IFCAP_LRO;
2804 	while (rx_done->entry[rx_done->idx].length != 0) {
2805 		length = ntohs(rx_done->entry[rx_done->idx].length);
2806 		rx_done->entry[rx_done->idx].length = 0;
2807 		checksum = rx_done->entry[rx_done->idx].checksum;
2808 		if (length <= (MHLEN - MXGEFW_PAD))
2809 			mxge_rx_done_small(ss, length, checksum, lro);
2810 		else
2811 			mxge_rx_done_big(ss, length, checksum, lro);
2812 		rx_done->cnt++;
2813 		rx_done->idx = rx_done->cnt & rx_done->mask;
2814 
2815 		/* limit potential for livelock */
2816 		if (__predict_false(++limit > rx_done->mask / 2))
2817 			break;
2818 	}
2819 #if defined(INET)  || defined (INET6)
2820 	while (!SLIST_EMPTY(&ss->lc.lro_active)) {
2821 		struct lro_entry *lro = SLIST_FIRST(&ss->lc.lro_active);
2822 		SLIST_REMOVE_HEAD(&ss->lc.lro_active, next);
2823 		tcp_lro_flush(&ss->lc, lro);
2824 	}
2825 #endif
2826 }
2827 
2828 
2829 static inline void
mxge_tx_done(struct mxge_slice_state * ss,uint32_t mcp_idx)2830 mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2831 {
2832 	struct ifnet *ifp;
2833 	mxge_tx_ring_t *tx;
2834 	struct mbuf *m;
2835 	bus_dmamap_t map;
2836 	int idx;
2837 	int *flags;
2838 
2839 	tx = &ss->tx;
2840 	ifp = ss->sc->ifp;
2841 	while (tx->pkt_done != mcp_idx) {
2842 		idx = tx->done & tx->mask;
2843 		tx->done++;
2844 		m = tx->info[idx].m;
2845 		/* mbuf and DMA map only attached to the first
2846 		   segment per-mbuf */
2847 		if (m != NULL) {
2848 			ss->obytes += m->m_pkthdr.len;
2849 			if (m->m_flags & M_MCAST)
2850 				ss->omcasts++;
2851 			ss->opackets++;
2852 			tx->info[idx].m = NULL;
2853 			map = tx->info[idx].map;
2854 			bus_dmamap_unload(tx->dmat, map);
2855 			m_freem(m);
2856 		}
2857 		if (tx->info[idx].flag) {
2858 			tx->info[idx].flag = 0;
2859 			tx->pkt_done++;
2860 		}
2861 	}
2862 
2863 	/* If we have space, clear IFF_OACTIVE to tell the stack that
2864            its OK to send packets */
2865 #ifdef IFNET_BUF_RING
2866 	flags = &ss->if_drv_flags;
2867 #else
2868 	flags = &ifp->if_drv_flags;
2869 #endif
2870 	mtx_lock(&ss->tx.mtx);
2871 	if ((*flags) & IFF_DRV_OACTIVE &&
2872 	    tx->req - tx->done < (tx->mask + 1)/4) {
2873 		*(flags) &= ~IFF_DRV_OACTIVE;
2874 		ss->tx.wake++;
2875 		mxge_start_locked(ss);
2876 	}
2877 #ifdef IFNET_BUF_RING
2878 	if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
2879 		/* let the NIC stop polling this queue, since there
2880 		 * are no more transmits pending */
2881 		if (tx->req == tx->done) {
2882 			*tx->send_stop = 1;
2883 			tx->queue_active = 0;
2884 			tx->deactivate++;
2885 			wmb();
2886 		}
2887 	}
2888 #endif
2889 	mtx_unlock(&ss->tx.mtx);
2890 
2891 }
2892 
2893 static struct mxge_media_type mxge_xfp_media_types[] =
2894 {
2895 	{IFM_10G_CX4,	0x7f, 		"10GBASE-CX4 (module)"},
2896 	{IFM_10G_SR, 	(1 << 7),	"10GBASE-SR"},
2897 	{IFM_10G_LR, 	(1 << 6),	"10GBASE-LR"},
2898 	{0,		(1 << 5),	"10GBASE-ER"},
2899 	{IFM_10G_LRM,	(1 << 4),	"10GBASE-LRM"},
2900 	{0,		(1 << 3),	"10GBASE-SW"},
2901 	{0,		(1 << 2),	"10GBASE-LW"},
2902 	{0,		(1 << 1),	"10GBASE-EW"},
2903 	{0,		(1 << 0),	"Reserved"}
2904 };
2905 static struct mxge_media_type mxge_sfp_media_types[] =
2906 {
2907 	{IFM_10G_TWINAX,      0,	"10GBASE-Twinax"},
2908 	{0,		(1 << 7),	"Reserved"},
2909 	{IFM_10G_LRM,	(1 << 6),	"10GBASE-LRM"},
2910 	{IFM_10G_LR, 	(1 << 5),	"10GBASE-LR"},
2911 	{IFM_10G_SR,	(1 << 4),	"10GBASE-SR"},
2912 	{IFM_10G_TWINAX,(1 << 0),	"10GBASE-Twinax"}
2913 };
2914 
2915 static void
mxge_media_set(mxge_softc_t * sc,int media_type)2916 mxge_media_set(mxge_softc_t *sc, int media_type)
2917 {
2918 
2919 
2920 	ifmedia_add(&sc->media, IFM_ETHER | IFM_FDX | media_type,
2921 		    0, NULL);
2922 	ifmedia_set(&sc->media, IFM_ETHER | IFM_FDX | media_type);
2923 	sc->current_media = media_type;
2924 	sc->media.ifm_media = sc->media.ifm_cur->ifm_media;
2925 }
2926 
2927 static void
mxge_media_init(mxge_softc_t * sc)2928 mxge_media_init(mxge_softc_t *sc)
2929 {
2930 	char *ptr;
2931 	int i;
2932 
2933 	ifmedia_removeall(&sc->media);
2934 	mxge_media_set(sc, IFM_AUTO);
2935 
2936 	/*
2937 	 * parse the product code to deterimine the interface type
2938 	 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2939 	 * after the 3rd dash in the driver's cached copy of the
2940 	 * EEPROM's product code string.
2941 	 */
2942 	ptr = sc->product_code_string;
2943 	if (ptr == NULL) {
2944 		device_printf(sc->dev, "Missing product code\n");
2945 		return;
2946 	}
2947 
2948 	for (i = 0; i < 3; i++, ptr++) {
2949 		ptr = strchr(ptr, '-');
2950 		if (ptr == NULL) {
2951 			device_printf(sc->dev,
2952 				      "only %d dashes in PC?!?\n", i);
2953 			return;
2954 		}
2955 	}
2956 	if (*ptr == 'C' || *(ptr +1) == 'C') {
2957 		/* -C is CX4 */
2958 		sc->connector = MXGE_CX4;
2959 		mxge_media_set(sc, IFM_10G_CX4);
2960 	} else if (*ptr == 'Q') {
2961 		/* -Q is Quad Ribbon Fiber */
2962 		sc->connector = MXGE_QRF;
2963 		device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2964 		/* FreeBSD has no media type for Quad ribbon fiber */
2965 	} else if (*ptr == 'R') {
2966 		/* -R is XFP */
2967 		sc->connector = MXGE_XFP;
2968 	} else if (*ptr == 'S' || *(ptr +1) == 'S') {
2969 		/* -S or -2S is SFP+ */
2970 		sc->connector = MXGE_SFP;
2971 	} else {
2972 		device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2973 	}
2974 }
2975 
2976 /*
2977  * Determine the media type for a NIC.  Some XFPs will identify
2978  * themselves only when their link is up, so this is initiated via a
2979  * link up interrupt.  However, this can potentially take up to
2980  * several milliseconds, so it is run via the watchdog routine, rather
2981  * than in the interrupt handler itself.
2982  */
2983 static void
mxge_media_probe(mxge_softc_t * sc)2984 mxge_media_probe(mxge_softc_t *sc)
2985 {
2986 	mxge_cmd_t cmd;
2987 	char *cage_type;
2988 
2989 	struct mxge_media_type *mxge_media_types = NULL;
2990 	int i, err, ms, mxge_media_type_entries;
2991 	uint32_t byte;
2992 
2993 	sc->need_media_probe = 0;
2994 
2995 	if (sc->connector == MXGE_XFP) {
2996 		/* -R is XFP */
2997 		mxge_media_types = mxge_xfp_media_types;
2998 		mxge_media_type_entries =
2999 			sizeof (mxge_xfp_media_types) /
3000 			sizeof (mxge_xfp_media_types[0]);
3001 		byte = MXGE_XFP_COMPLIANCE_BYTE;
3002 		cage_type = "XFP";
3003 	} else 	if (sc->connector == MXGE_SFP) {
3004 		/* -S or -2S is SFP+ */
3005 		mxge_media_types = mxge_sfp_media_types;
3006 		mxge_media_type_entries =
3007 			sizeof (mxge_sfp_media_types) /
3008 			sizeof (mxge_sfp_media_types[0]);
3009 		cage_type = "SFP+";
3010 		byte = 3;
3011 	} else {
3012 		/* nothing to do; media type cannot change */
3013 		return;
3014 	}
3015 
3016 	/*
3017 	 * At this point we know the NIC has an XFP cage, so now we
3018 	 * try to determine what is in the cage by using the
3019 	 * firmware's XFP I2C commands to read the XFP 10GbE compilance
3020 	 * register.  We read just one byte, which may take over
3021 	 * a millisecond
3022 	 */
3023 
3024 	cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
3025 	cmd.data1 = byte;
3026 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
3027 	if (err == MXGEFW_CMD_ERROR_I2C_FAILURE) {
3028 		device_printf(sc->dev, "failed to read XFP\n");
3029 	}
3030 	if (err == MXGEFW_CMD_ERROR_I2C_ABSENT) {
3031 		device_printf(sc->dev, "Type R/S with no XFP!?!?\n");
3032 	}
3033 	if (err != MXGEFW_CMD_OK) {
3034 		return;
3035 	}
3036 
3037 	/* now we wait for the data to be cached */
3038 	cmd.data0 = byte;
3039 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
3040 	for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
3041 		DELAY(1000);
3042 		cmd.data0 = byte;
3043 		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
3044 	}
3045 	if (err != MXGEFW_CMD_OK) {
3046 		device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
3047 			      cage_type, err, ms);
3048 		return;
3049 	}
3050 
3051 	if (cmd.data0 == mxge_media_types[0].bitmask) {
3052 		if (mxge_verbose)
3053 			device_printf(sc->dev, "%s:%s\n", cage_type,
3054 				      mxge_media_types[0].name);
3055 		if (sc->current_media != mxge_media_types[0].flag) {
3056 			mxge_media_init(sc);
3057 			mxge_media_set(sc, mxge_media_types[0].flag);
3058 		}
3059 		return;
3060 	}
3061 	for (i = 1; i < mxge_media_type_entries; i++) {
3062 		if (cmd.data0 & mxge_media_types[i].bitmask) {
3063 			if (mxge_verbose)
3064 				device_printf(sc->dev, "%s:%s\n",
3065 					      cage_type,
3066 					      mxge_media_types[i].name);
3067 
3068 			if (sc->current_media != mxge_media_types[i].flag) {
3069 				mxge_media_init(sc);
3070 				mxge_media_set(sc, mxge_media_types[i].flag);
3071 			}
3072 			return;
3073 		}
3074 	}
3075 	if (mxge_verbose)
3076 		device_printf(sc->dev, "%s media 0x%x unknown\n",
3077 			      cage_type, cmd.data0);
3078 
3079 	return;
3080 }
3081 
/*
 * Interrupt handler for one slice.
 *
 * arg is the struct mxge_slice_state of the slice that interrupted.
 * The handler drains transmit completions and received frames for
 * that slice, processes firmware link/error statistics (first slice
 * only), and finally writes to the slice's irq_claim words.
 */
static void
mxge_intr(void *arg)
{
	struct mxge_slice_state *ss = arg;
	mxge_softc_t *sc = ss->sc;
	mcp_irq_data_t *stats = ss->fw_stats;
	mxge_tx_ring_t *tx = &ss->tx;
	mxge_rx_done_t *rx_done = &ss->rx_done;
	uint32_t send_done_count;
	uint8_t valid;


#ifndef IFNET_BUF_RING
	/* an interrupt on a non-zero slice is implicitly valid
	   since MSI-X irqs are not shared */
	if (ss != sc->ss) {
		/* such slices only receive here: clean rx and claim */
		mxge_clean_rx_done(ss);
		*ss->irq_claim = be32toh(3);
		return;
	}
#endif

	/* make sure the DMA has finished */
	if (!stats->valid) {
		return;
	}
	/* remember the flags; stats->valid is cleared below */
	valid = stats->valid;

	if (sc->legacy_irq) {
		/* lower legacy IRQ  */
		*sc->irq_deassert = 0;
		if (!mxge_deassert_wait)
			/* don't wait for conf. that irq is low */
			stats->valid = 0;
		/*
		 * otherwise stats->valid is left untouched here and the
		 * loop below keeps polling it until it reads as zero
		 */
	} else {
		stats->valid = 0;
	}

	/* loop while waiting for legacy irq deassertion */
	do {
		/* check for transmit completes and receives */
		send_done_count = be32toh(stats->send_done_count);
		/*
		 * keep going while the firmware's completion count is
		 * ahead of ours or the next rx_done entry is non-empty
		 */
		while ((send_done_count != tx->pkt_done) ||
		       (rx_done->entry[rx_done->idx].length != 0)) {
			if (send_done_count != tx->pkt_done)
				mxge_tx_done(ss, (int)send_done_count);
			mxge_clean_rx_done(ss);
			/* re-read: more may have completed meanwhile */
			send_done_count = be32toh(stats->send_done_count);
		}
		if (sc->legacy_irq && mxge_deassert_wait)
			wmb();
		/* volatile access forces a fresh read of the valid byte */
	} while (*((volatile uint8_t *) &stats->valid));

	/* fw link & error stats meaningful only on the first slice */
	if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
		if (sc->link_state != stats->link_up) {
			sc->link_state = stats->link_up;
			if (sc->link_state) {
				if_link_state_change(sc->ifp, LINK_STATE_UP);
				if_initbaudrate(sc->ifp, IF_Gbps(10));
				if (mxge_verbose)
					device_printf(sc->dev, "link up\n");
			} else {
				if_link_state_change(sc->ifp, LINK_STATE_DOWN);
				sc->ifp->if_baudrate = 0;
				if (mxge_verbose)
					device_printf(sc->dev, "link down\n");
			}
			/* any link transition requests a media re-probe */
			sc->need_media_probe = 1;
		}
		/* report whenever the firmware's tag count differs from ours */
		if (sc->rdma_tags_available !=
		    be32toh(stats->rdma_tags_available)) {
			sc->rdma_tags_available =
				be32toh(stats->rdma_tags_available);
			device_printf(sc->dev, "RDMA timed out! %d tags "
				      "left\n", sc->rdma_tags_available);
		}

		/*
		 * accumulate link-down events into sc->down_cnt, which
		 * mxge_close() compares before and after taking the
		 * link down
		 */
		if (stats->link_down) {
			sc->down_cnt += stats->link_down;
			sc->link_state = 0;
			if_link_state_change(sc->ifp, LINK_STATE_DOWN);
		}
	}

	/* check to see if we have rx token to pass back */
	if (valid & 0x1)
	    *ss->irq_claim = be32toh(3);
	/*
	 * the second claim word is always written
	 * NOTE(review): its exact meaning is not visible in this file
	 * section -- confirm against the firmware interface
	 */
	*(ss->irq_claim + 1) = be32toh(3);
}
3172 
3173 static void
mxge_init(void * arg)3174 mxge_init(void *arg)
3175 {
3176 	mxge_softc_t *sc = arg;
3177 	struct ifnet *ifp = sc->ifp;
3178 
3179 
3180 	mtx_lock(&sc->driver_mtx);
3181 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
3182 		(void) mxge_open(sc);
3183 	mtx_unlock(&sc->driver_mtx);
3184 }
3185 
3186 
3187 
3188 static void
mxge_free_slice_mbufs(struct mxge_slice_state * ss)3189 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
3190 {
3191 	int i;
3192 
3193 #if defined(INET) || defined(INET6)
3194 	tcp_lro_free(&ss->lc);
3195 #endif
3196 	for (i = 0; i <= ss->rx_big.mask; i++) {
3197 		if (ss->rx_big.info[i].m == NULL)
3198 			continue;
3199 		bus_dmamap_unload(ss->rx_big.dmat,
3200 				  ss->rx_big.info[i].map);
3201 		m_freem(ss->rx_big.info[i].m);
3202 		ss->rx_big.info[i].m = NULL;
3203 	}
3204 
3205 	for (i = 0; i <= ss->rx_small.mask; i++) {
3206 		if (ss->rx_small.info[i].m == NULL)
3207 			continue;
3208 		bus_dmamap_unload(ss->rx_small.dmat,
3209 				  ss->rx_small.info[i].map);
3210 		m_freem(ss->rx_small.info[i].m);
3211 		ss->rx_small.info[i].m = NULL;
3212 	}
3213 
3214 	/* transmit ring used only on the first slice */
3215 	if (ss->tx.info == NULL)
3216 		return;
3217 
3218 	for (i = 0; i <= ss->tx.mask; i++) {
3219 		ss->tx.info[i].flag = 0;
3220 		if (ss->tx.info[i].m == NULL)
3221 			continue;
3222 		bus_dmamap_unload(ss->tx.dmat,
3223 				  ss->tx.info[i].map);
3224 		m_freem(ss->tx.info[i].m);
3225 		ss->tx.info[i].m = NULL;
3226 	}
3227 }
3228 
3229 static void
mxge_free_mbufs(mxge_softc_t * sc)3230 mxge_free_mbufs(mxge_softc_t *sc)
3231 {
3232 	int slice;
3233 
3234 	for (slice = 0; slice < sc->num_slices; slice++)
3235 		mxge_free_slice_mbufs(&sc->ss[slice]);
3236 }
3237 
3238 static void
mxge_free_slice_rings(struct mxge_slice_state * ss)3239 mxge_free_slice_rings(struct mxge_slice_state *ss)
3240 {
3241 	int i;
3242 
3243 
3244 	if (ss->rx_done.entry != NULL)
3245 		mxge_dma_free(&ss->rx_done.dma);
3246 	ss->rx_done.entry = NULL;
3247 
3248 	if (ss->tx.req_bytes != NULL)
3249 		free(ss->tx.req_bytes, M_DEVBUF);
3250 	ss->tx.req_bytes = NULL;
3251 
3252 	if (ss->tx.seg_list != NULL)
3253 		free(ss->tx.seg_list, M_DEVBUF);
3254 	ss->tx.seg_list = NULL;
3255 
3256 	if (ss->rx_small.shadow != NULL)
3257 		free(ss->rx_small.shadow, M_DEVBUF);
3258 	ss->rx_small.shadow = NULL;
3259 
3260 	if (ss->rx_big.shadow != NULL)
3261 		free(ss->rx_big.shadow, M_DEVBUF);
3262 	ss->rx_big.shadow = NULL;
3263 
3264 	if (ss->tx.info != NULL) {
3265 		if (ss->tx.dmat != NULL) {
3266 			for (i = 0; i <= ss->tx.mask; i++) {
3267 				bus_dmamap_destroy(ss->tx.dmat,
3268 						   ss->tx.info[i].map);
3269 			}
3270 			bus_dma_tag_destroy(ss->tx.dmat);
3271 		}
3272 		free(ss->tx.info, M_DEVBUF);
3273 	}
3274 	ss->tx.info = NULL;
3275 
3276 	if (ss->rx_small.info != NULL) {
3277 		if (ss->rx_small.dmat != NULL) {
3278 			for (i = 0; i <= ss->rx_small.mask; i++) {
3279 				bus_dmamap_destroy(ss->rx_small.dmat,
3280 						   ss->rx_small.info[i].map);
3281 			}
3282 			bus_dmamap_destroy(ss->rx_small.dmat,
3283 					   ss->rx_small.extra_map);
3284 			bus_dma_tag_destroy(ss->rx_small.dmat);
3285 		}
3286 		free(ss->rx_small.info, M_DEVBUF);
3287 	}
3288 	ss->rx_small.info = NULL;
3289 
3290 	if (ss->rx_big.info != NULL) {
3291 		if (ss->rx_big.dmat != NULL) {
3292 			for (i = 0; i <= ss->rx_big.mask; i++) {
3293 				bus_dmamap_destroy(ss->rx_big.dmat,
3294 						   ss->rx_big.info[i].map);
3295 			}
3296 			bus_dmamap_destroy(ss->rx_big.dmat,
3297 					   ss->rx_big.extra_map);
3298 			bus_dma_tag_destroy(ss->rx_big.dmat);
3299 		}
3300 		free(ss->rx_big.info, M_DEVBUF);
3301 	}
3302 	ss->rx_big.info = NULL;
3303 }
3304 
3305 static void
mxge_free_rings(mxge_softc_t * sc)3306 mxge_free_rings(mxge_softc_t *sc)
3307 {
3308 	int slice;
3309 
3310 	for (slice = 0; slice < sc->num_slices; slice++)
3311 		mxge_free_slice_rings(&sc->ss[slice]);
3312 }
3313 
/*
 * Allocate the host-side ring state for one slice.
 *
 * ss               slice to set up
 * rx_ring_entries  number of slots in each receive ring
 * tx_ring_entries  number of slots in the transmit ring
 *
 * Returns 0 on success or the error from the failing busdma call.
 * Nothing is released here on failure; the caller visible in this
 * file, mxge_alloc_rings(), calls mxge_free_rings() when this
 * returns non-zero.
 */
static int
mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
		       int tx_ring_entries)
{
	mxge_softc_t *sc = ss->sc;
	size_t bytes;
	int err, i;

	/* allocate per-slice receive resources */

	/*
	 * masks are "entries - 1"; the rx_done ring is twice the size of
	 * one receive ring
	 * NOTE(review): this presumes the entry counts are powers of
	 * two -- confirm against the firmware-reported ring sizes
	 */
	ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
	ss->rx_done.mask = (2 * rx_ring_entries) - 1;

	/* allocate the rx shadow rings */
	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
	ss->rx_small.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);

	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
	ss->rx_big.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);

	/* allocate the rx host info rings */
	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
	ss->rx_small.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);

	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
	ss->rx_big.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);

	/* allocate the rx busdma resources */
	/* small buffers: one segment of at most MHLEN bytes */
	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 1,			/* alignment */
				 4096,			/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 MHLEN,			/* maxsize */
				 1,			/* num segs */
				 MHLEN,			/* maxsegsize */
				 BUS_DMA_ALLOCNOW,	/* flags */
				 NULL, NULL,		/* lock */
				 &ss->rx_small.dmat);	/* tag */
	if (err != 0) {
		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
			      err);
		return err;
	}

	/*
	 * big buffers: with MXGE_VIRT_JUMBOS up to three 4KB segments,
	 * otherwise a single segment of up to MJUM9BYTES
	 */
	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 1,			/* alignment */
#if MXGE_VIRT_JUMBOS
				 4096,			/* boundary */
#else
				 0,			/* boundary */
#endif
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 3*4096,		/* maxsize */
#if MXGE_VIRT_JUMBOS
				 3,			/* num segs */
				 4096,			/* maxsegsize*/
#else
				 1,			/* num segs */
				 MJUM9BYTES,		/* maxsegsize*/
#endif
				 BUS_DMA_ALLOCNOW,	/* flags */
				 NULL, NULL,		/* lock */
				 &ss->rx_big.dmat);	/* tag */
	if (err != 0) {
		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
			      err);
		return err;
	}
	/* one DMA map per small-ring slot, plus one spare map */
	for (i = 0; i <= ss->rx_small.mask; i++) {
		err = bus_dmamap_create(ss->rx_small.dmat, 0,
					&ss->rx_small.info[i].map);
		if (err != 0) {
			device_printf(sc->dev, "Err %d  rx_small dmamap\n",
				      err);
			return err;
		}
	}
	err = bus_dmamap_create(ss->rx_small.dmat, 0,
				&ss->rx_small.extra_map);
	if (err != 0) {
		device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
			      err);
		return err;
	}

	/* likewise for the big ring */
	for (i = 0; i <= ss->rx_big.mask; i++) {
		err = bus_dmamap_create(ss->rx_big.dmat, 0,
					&ss->rx_big.info[i].map);
		if (err != 0) {
			device_printf(sc->dev, "Err %d  rx_big dmamap\n",
				      err);
			return err;
		}
	}
	err = bus_dmamap_create(ss->rx_big.dmat, 0,
				&ss->rx_big.extra_map);
	if (err != 0) {
		device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
			      err);
		return err;
	}

	/* now allocate TX resources */

#ifndef IFNET_BUF_RING
	/* only use a single TX ring for now */
	if (ss != ss->sc->ss)
		return 0;
#endif

	ss->tx.mask = tx_ring_entries - 1;
	/* a single send uses at most a quarter of the ring */
	ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);


	/* allocate the tx request copy block */
	/* the extra 8 bytes leave room to round the start up to 8 */
	bytes = 8 +
		sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
	ss->tx.req_bytes = malloc(bytes, M_DEVBUF, M_WAITOK);
	/* ensure req_list entries are aligned to 8 bytes */
	ss->tx.req_list = (mcp_kreq_ether_send_t *)
		((unsigned long)(ss->tx.req_bytes + 7) & ~7UL);

	/* allocate the tx busdma segment list */
	bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
	ss->tx.seg_list = (bus_dma_segment_t *)
		malloc(bytes, M_DEVBUF, M_WAITOK);

	/* allocate the tx host info ring */
	bytes = tx_ring_entries * sizeof (*ss->tx.info);
	ss->tx.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);

	/* allocate the tx busdma resources */
	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 1,			/* alignment */
				 sc->tx_boundary,	/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 65536 + 256,		/* maxsize */
				 ss->tx.max_desc - 2,	/* num segs */
				 sc->tx_boundary,	/* maxsegsz */
				 BUS_DMA_ALLOCNOW,	/* flags */
				 NULL, NULL,		/* lock */
				 &ss->tx.dmat);		/* tag */

	if (err != 0) {
		device_printf(sc->dev, "Err %d allocating tx dmat\n",
			      err);
		return err;
	}

	/* now use these tags to setup dmamaps for each slot
	   in the ring */
	for (i = 0; i <= ss->tx.mask; i++) {
		err = bus_dmamap_create(ss->tx.dmat, 0,
					&ss->tx.info[i].map);
		if (err != 0) {
			device_printf(sc->dev, "Err %d  tx dmamap\n",
				      err);
			return err;
		}
	}
	return 0;

}
3483 
3484 static int
mxge_alloc_rings(mxge_softc_t * sc)3485 mxge_alloc_rings(mxge_softc_t *sc)
3486 {
3487 	mxge_cmd_t cmd;
3488 	int tx_ring_size;
3489 	int tx_ring_entries, rx_ring_entries;
3490 	int err, slice;
3491 
3492 	/* get ring sizes */
3493 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3494 	tx_ring_size = cmd.data0;
3495 	if (err != 0) {
3496 		device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3497 		goto abort;
3498 	}
3499 
3500 	tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
3501 	rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
3502 	IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1);
3503 	sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen;
3504 	IFQ_SET_READY(&sc->ifp->if_snd);
3505 
3506 	for (slice = 0; slice < sc->num_slices; slice++) {
3507 		err = mxge_alloc_slice_rings(&sc->ss[slice],
3508 					     rx_ring_entries,
3509 					     tx_ring_entries);
3510 		if (err != 0)
3511 			goto abort;
3512 	}
3513 	return 0;
3514 
3515 abort:
3516 	mxge_free_rings(sc);
3517 	return err;
3518 
3519 }
3520 
3521 
3522 static void
mxge_choose_params(int mtu,int * big_buf_size,int * cl_size,int * nbufs)3523 mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
3524 {
3525 	int bufsize = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3526 
3527 	if (bufsize < MCLBYTES) {
3528 		/* easy, everything fits in a single buffer */
3529 		*big_buf_size = MCLBYTES;
3530 		*cl_size = MCLBYTES;
3531 		*nbufs = 1;
3532 		return;
3533 	}
3534 
3535 	if (bufsize < MJUMPAGESIZE) {
3536 		/* still easy, everything still fits in a single buffer */
3537 		*big_buf_size = MJUMPAGESIZE;
3538 		*cl_size = MJUMPAGESIZE;
3539 		*nbufs = 1;
3540 		return;
3541 	}
3542 #if MXGE_VIRT_JUMBOS
3543 	/* now we need to use virtually contiguous buffers */
3544 	*cl_size = MJUM9BYTES;
3545 	*big_buf_size = 4096;
3546 	*nbufs = mtu / 4096 + 1;
3547 	/* needs to be a power of two, so round up */
3548 	if (*nbufs == 3)
3549 		*nbufs = 4;
3550 #else
3551 	*cl_size = MJUM9BYTES;
3552 	*big_buf_size = MJUM9BYTES;
3553 	*nbufs = 1;
3554 #endif
3555 }
3556 
3557 static int
mxge_slice_open(struct mxge_slice_state * ss,int nbufs,int cl_size)3558 mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
3559 {
3560 	mxge_softc_t *sc;
3561 	mxge_cmd_t cmd;
3562 	bus_dmamap_t map;
3563 	int err, i, slice;
3564 
3565 
3566 	sc = ss->sc;
3567 	slice = ss - sc->ss;
3568 
3569 #if defined(INET) || defined(INET6)
3570 	(void)tcp_lro_init(&ss->lc);
3571 #endif
3572 	ss->lc.ifp = sc->ifp;
3573 
3574 	/* get the lanai pointers to the send and receive rings */
3575 
3576 	err = 0;
3577 #ifndef IFNET_BUF_RING
3578 	/* We currently only send from the first slice */
3579 	if (slice == 0) {
3580 #endif
3581 		cmd.data0 = slice;
3582 		err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3583 		ss->tx.lanai =
3584 			(volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
3585 		ss->tx.send_go = (volatile uint32_t *)
3586 			(sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
3587 		ss->tx.send_stop = (volatile uint32_t *)
3588 		(sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
3589 #ifndef IFNET_BUF_RING
3590 	}
3591 #endif
3592 	cmd.data0 = slice;
3593 	err |= mxge_send_cmd(sc,
3594 			     MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3595 	ss->rx_small.lanai =
3596 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3597 	cmd.data0 = slice;
3598 	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3599 	ss->rx_big.lanai =
3600 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3601 
3602 	if (err != 0) {
3603 		device_printf(sc->dev,
3604 			      "failed to get ring sizes or locations\n");
3605 		return EIO;
3606 	}
3607 
3608 	/* stock receive rings */
3609 	for (i = 0; i <= ss->rx_small.mask; i++) {
3610 		map = ss->rx_small.info[i].map;
3611 		err = mxge_get_buf_small(ss, map, i);
3612 		if (err) {
3613 			device_printf(sc->dev, "alloced %d/%d smalls\n",
3614 				      i, ss->rx_small.mask + 1);
3615 			return ENOMEM;
3616 		}
3617 	}
3618 	for (i = 0; i <= ss->rx_big.mask; i++) {
3619 		ss->rx_big.shadow[i].addr_low = 0xffffffff;
3620 		ss->rx_big.shadow[i].addr_high = 0xffffffff;
3621 	}
3622 	ss->rx_big.nbufs = nbufs;
3623 	ss->rx_big.cl_size = cl_size;
3624 	ss->rx_big.mlen = ss->sc->ifp->if_mtu + ETHER_HDR_LEN +
3625 		ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3626 	for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
3627 		map = ss->rx_big.info[i].map;
3628 		err = mxge_get_buf_big(ss, map, i);
3629 		if (err) {
3630 			device_printf(sc->dev, "alloced %d/%d bigs\n",
3631 				      i, ss->rx_big.mask + 1);
3632 			return ENOMEM;
3633 		}
3634 	}
3635 	return 0;
3636 }
3637 
3638 static int
mxge_open(mxge_softc_t * sc)3639 mxge_open(mxge_softc_t *sc)
3640 {
3641 	mxge_cmd_t cmd;
3642 	int err, big_bytes, nbufs, slice, cl_size, i;
3643 	bus_addr_t bus;
3644 	volatile uint8_t *itable;
3645 	struct mxge_slice_state *ss;
3646 
3647 	/* Copy the MAC address in case it was overridden */
3648 	bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
3649 
3650 	err = mxge_reset(sc, 1);
3651 	if (err != 0) {
3652 		device_printf(sc->dev, "failed to reset\n");
3653 		return EIO;
3654 	}
3655 
3656 	if (sc->num_slices > 1) {
3657 		/* setup the indirection table */
3658 		cmd.data0 = sc->num_slices;
3659 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
3660 				    &cmd);
3661 
3662 		err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
3663 				     &cmd);
3664 		if (err != 0) {
3665 			device_printf(sc->dev,
3666 				      "failed to setup rss tables\n");
3667 			return err;
3668 		}
3669 
3670 		/* just enable an identity mapping */
3671 		itable = sc->sram + cmd.data0;
3672 		for (i = 0; i < sc->num_slices; i++)
3673 			itable[i] = (uint8_t)i;
3674 
3675 		cmd.data0 = 1;
3676 		cmd.data1 = mxge_rss_hash_type;
3677 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3678 		if (err != 0) {
3679 			device_printf(sc->dev, "failed to enable slices\n");
3680 			return err;
3681 		}
3682 	}
3683 
3684 
3685 	mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs);
3686 
3687 	cmd.data0 = nbufs;
3688 	err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
3689 			    &cmd);
3690 	/* error is only meaningful if we're trying to set
3691 	   MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
3692 	if (err && nbufs > 1) {
3693 		device_printf(sc->dev,
3694 			      "Failed to set alway-use-n to %d\n",
3695 			      nbufs);
3696 		return EIO;
3697 	}
3698 	/* Give the firmware the mtu and the big and small buffer
3699 	   sizes.  The firmware wants the big buf size to be a power
3700 	   of two. Luckily, FreeBSD's clusters are powers of two */
3701 	cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3702 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3703 	cmd.data0 = MHLEN - MXGEFW_PAD;
3704 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
3705 			     &cmd);
3706 	cmd.data0 = big_bytes;
3707 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3708 
3709 	if (err != 0) {
3710 		device_printf(sc->dev, "failed to setup params\n");
3711 		goto abort;
3712 	}
3713 
3714 	/* Now give him the pointer to the stats block */
3715 	for (slice = 0;
3716 #ifdef IFNET_BUF_RING
3717 	     slice < sc->num_slices;
3718 #else
3719 	     slice < 1;
3720 #endif
3721 	     slice++) {
3722 		ss = &sc->ss[slice];
3723 		cmd.data0 =
3724 			MXGE_LOWPART_TO_U32(ss->fw_stats_dma.bus_addr);
3725 		cmd.data1 =
3726 			MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.bus_addr);
3727 		cmd.data2 = sizeof(struct mcp_irq_data);
3728 		cmd.data2 |= (slice << 16);
3729 		err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3730 	}
3731 
3732 	if (err != 0) {
3733 		bus = sc->ss->fw_stats_dma.bus_addr;
3734 		bus += offsetof(struct mcp_irq_data, send_done_count);
3735 		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3736 		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3737 		err = mxge_send_cmd(sc,
3738 				    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3739 				    &cmd);
3740 		/* Firmware cannot support multicast without STATS_DMA_V2 */
3741 		sc->fw_multicast_support = 0;
3742 	} else {
3743 		sc->fw_multicast_support = 1;
3744 	}
3745 
3746 	if (err != 0) {
3747 		device_printf(sc->dev, "failed to setup params\n");
3748 		goto abort;
3749 	}
3750 
3751 	for (slice = 0; slice < sc->num_slices; slice++) {
3752 		err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
3753 		if (err != 0) {
3754 			device_printf(sc->dev, "couldn't open slice %d\n",
3755 				      slice);
3756 			goto abort;
3757 		}
3758 	}
3759 
3760 	/* Finally, start the firmware running */
3761 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3762 	if (err) {
3763 		device_printf(sc->dev, "Couldn't bring up link\n");
3764 		goto abort;
3765 	}
3766 #ifdef IFNET_BUF_RING
3767 	for (slice = 0; slice < sc->num_slices; slice++) {
3768 		ss = &sc->ss[slice];
3769 		ss->if_drv_flags |= IFF_DRV_RUNNING;
3770 		ss->if_drv_flags &= ~IFF_DRV_OACTIVE;
3771 	}
3772 #endif
3773 	sc->ifp->if_drv_flags |= IFF_DRV_RUNNING;
3774 	sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
3775 
3776 	return 0;
3777 
3778 
3779 abort:
3780 	mxge_free_mbufs(sc);
3781 
3782 	return err;
3783 }
3784 
3785 static int
mxge_close(mxge_softc_t * sc,int down)3786 mxge_close(mxge_softc_t *sc, int down)
3787 {
3788 	mxge_cmd_t cmd;
3789 	int err, old_down_cnt;
3790 #ifdef IFNET_BUF_RING
3791 	struct mxge_slice_state *ss;
3792 	int slice;
3793 #endif
3794 
3795 #ifdef IFNET_BUF_RING
3796 	for (slice = 0; slice < sc->num_slices; slice++) {
3797 		ss = &sc->ss[slice];
3798 		ss->if_drv_flags &= ~IFF_DRV_RUNNING;
3799 	}
3800 #endif
3801 	sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
3802 	if (!down) {
3803 		old_down_cnt = sc->down_cnt;
3804 		wmb();
3805 		err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3806 		if (err) {
3807 			device_printf(sc->dev,
3808 				      "Couldn't bring down link\n");
3809 		}
3810 		if (old_down_cnt == sc->down_cnt) {
3811 			/* wait for down irq */
3812 			DELAY(10 * sc->intr_coal_delay);
3813 		}
3814 		wmb();
3815 		if (old_down_cnt == sc->down_cnt) {
3816 			device_printf(sc->dev, "never got down irq\n");
3817 		}
3818 	}
3819 	mxge_free_mbufs(sc);
3820 
3821 	return 0;
3822 }
3823 
3824 static void
mxge_setup_cfg_space(mxge_softc_t * sc)3825 mxge_setup_cfg_space(mxge_softc_t *sc)
3826 {
3827 	device_t dev = sc->dev;
3828 	int reg;
3829 	uint16_t lnk, pectl;
3830 
3831 	/* find the PCIe link width and set max read request to 4KB*/
3832 	if (pci_find_cap(dev, PCIY_EXPRESS, &reg) == 0) {
3833 		lnk = pci_read_config(dev, reg + 0x12, 2);
3834 		sc->link_width = (lnk >> 4) & 0x3f;
3835 
3836 		if (sc->pectl == 0) {
3837 			pectl = pci_read_config(dev, reg + 0x8, 2);
3838 			pectl = (pectl & ~0x7000) | (5 << 12);
3839 			pci_write_config(dev, reg + 0x8, pectl, 2);
3840 			sc->pectl = pectl;
3841 		} else {
3842 			/* restore saved pectl after watchdog reset */
3843 			pci_write_config(dev, reg + 0x8, sc->pectl, 2);
3844 		}
3845 	}
3846 
3847 	/* Enable DMA and Memory space access */
3848 	pci_enable_busmaster(dev);
3849 }
3850 
3851 static uint32_t
mxge_read_reboot(mxge_softc_t * sc)3852 mxge_read_reboot(mxge_softc_t *sc)
3853 {
3854 	device_t dev = sc->dev;
3855 	uint32_t vs;
3856 
3857 	/* find the vendor specific offset */
3858 	if (pci_find_cap(dev, PCIY_VENDOR, &vs) != 0) {
3859 		device_printf(sc->dev,
3860 			      "could not find vendor specific offset\n");
3861 		return (uint32_t)-1;
3862 	}
3863 	/* enable read32 mode */
3864 	pci_write_config(dev, vs + 0x10, 0x3, 1);
3865 	/* tell NIC which register to read */
3866 	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3867 	return (pci_read_config(dev, vs + 0x14, 4));
3868 }
3869 
/*
 * Recover the NIC after the watchdog fired.
 *
 * If the PCI command register shows bus mastering disabled, the NIC
 * is taken to have rebooted: the interface is quiesced (if it was
 * running), PCI config space is restored, the firmware is reloaded
 * and the interface is reopened.  Otherwise nothing is reset.  The
 * periodic tick is re-armed unless the recovery failed.
 *
 * mxge_watchdog_task() calls this with sc->driver_mtx held.
 */
static void
mxge_watchdog_reset(mxge_softc_t *sc)
{
	struct pci_devinfo *dinfo;
	struct mxge_slice_state *ss;
	int err, running, s, num_tx_slices = 1;
	uint32_t reboot;
	uint16_t cmd;

	/* assume failure until a branch below says otherwise */
	err = ENXIO;

	device_printf(sc->dev, "Watchdog reset!\n");

	/*
	 * check to see if the NIC rebooted.  If it did, then all of
	 * PCI config space has been reset, and things like the
	 * busmaster bit will be zero.  If this is the case, then we
	 * must restore PCI config space before the NIC can be used
	 * again
	 */
	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
	if (cmd == 0xffff) {
		/*
		 * maybe the watchdog caught the NIC rebooting; wait
		 * up to 100ms for it to finish.  If it does not come
		 * back, then give up
		 */
		DELAY(1000*100);
		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
		if (cmd == 0xffff) {
			device_printf(sc->dev, "NIC disappeared!\n");
			/*
			 * NOTE(review): execution continues with
			 * cmd == 0xffff; presumably the busmaster bit
			 * then reads as set and the "did not reboot"
			 * branch below runs -- confirm this is intended
			 */
		}
	}
	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
		/* print the reboot status */
		reboot = mxge_read_reboot(sc);
		device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
			      reboot);
		running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
		if (running) {

			/*
			 * quiesce NIC so that TX routines will not try to
			 * xmit after restoration of BAR
			 */

			/* Mark the link as down */
			if (sc->link_state) {
				sc->link_state = 0;
				if_link_state_change(sc->ifp,
						     LINK_STATE_DOWN);
			}
#ifdef IFNET_BUF_RING
			num_tx_slices = sc->num_slices;
#endif
			/* grab all TX locks to ensure no tx  */
			for (s = 0; s < num_tx_slices; s++) {
				ss = &sc->ss[s];
				mtx_lock(&ss->tx.mtx);
			}
			/* down=1: do not send ETHERNET_DOWN to the firmware */
			mxge_close(sc, 1);
		}
		/* restore PCI configuration space */
		dinfo = device_get_ivars(sc->dev);
		pci_cfg_restore(sc->dev, dinfo);

		/* and redo any changes we made to our config space */
		mxge_setup_cfg_space(sc);

		/* reload f/w */
		err = mxge_load_firmware(sc, 0);
		if (err) {
			device_printf(sc->dev,
				      "Unable to re-load f/w\n");
		}
		if (running) {
			/* reopen only if the firmware load succeeded */
			if (!err)
				err = mxge_open(sc);
			/* release all TX locks */
			/* they are dropped even when the reopen failed */
			for (s = 0; s < num_tx_slices; s++) {
				ss = &sc->ss[s];
#ifdef IFNET_BUF_RING
				mxge_start_locked(ss);
#endif
				mtx_unlock(&ss->tx.mtx);
			}
		}
		sc->watchdog_resets++;
	} else {
		device_printf(sc->dev,
			      "NIC did not reboot, not resetting\n");
		err = 0;
	}
	if (err) {
		/* the tick callout is not re-armed in this case */
		device_printf(sc->dev, "watchdog reset failed\n");
	} else {
		/* mxge_tick() sets dying to 2 when it finds busmaster off */
		if (sc->dying == 2)
			sc->dying = 0;
		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
	}
}
3971 
3972 static void
mxge_watchdog_task(void * arg,int pending)3973 mxge_watchdog_task(void *arg, int pending)
3974 {
3975 	mxge_softc_t *sc = arg;
3976 
3977 
3978 	mtx_lock(&sc->driver_mtx);
3979 	mxge_watchdog_reset(sc);
3980 	mtx_unlock(&sc->driver_mtx);
3981 }
3982 
3983 static void
mxge_warn_stuck(mxge_softc_t * sc,mxge_tx_ring_t * tx,int slice)3984 mxge_warn_stuck(mxge_softc_t *sc, mxge_tx_ring_t *tx, int slice)
3985 {
3986 	tx = &sc->ss[slice].tx;
3987 	device_printf(sc->dev, "slice %d struck? ring state:\n", slice);
3988 	device_printf(sc->dev,
3989 		      "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
3990 		      tx->req, tx->done, tx->queue_active);
3991 	device_printf(sc->dev, "tx.activate=%d tx.deactivate=%d\n",
3992 			      tx->activate, tx->deactivate);
3993 	device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3994 		      tx->pkt_done,
3995 		      be32toh(sc->ss->fw_stats->send_done_count));
3996 }
3997 
3998 static int
mxge_watchdog(mxge_softc_t * sc)3999 mxge_watchdog(mxge_softc_t *sc)
4000 {
4001 	mxge_tx_ring_t *tx;
4002 	uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
4003 	int i, err = 0;
4004 
4005 	/* see if we have outstanding transmits, which
4006 	   have been pending for more than mxge_ticks */
4007 	for (i = 0;
4008 #ifdef IFNET_BUF_RING
4009 	     (i < sc->num_slices) && (err == 0);
4010 #else
4011 	     (i < 1) && (err == 0);
4012 #endif
4013 	     i++) {
4014 		tx = &sc->ss[i].tx;
4015 		if (tx->req != tx->done &&
4016 		    tx->watchdog_req != tx->watchdog_done &&
4017 		    tx->done == tx->watchdog_done) {
4018 			/* check for pause blocking before resetting */
4019 			if (tx->watchdog_rx_pause == rx_pause) {
4020 				mxge_warn_stuck(sc, tx, i);
4021 				taskqueue_enqueue(sc->tq, &sc->watchdog_task);
4022 				return (ENXIO);
4023 			}
4024 			else
4025 				device_printf(sc->dev, "Flow control blocking "
4026 					      "xmits, check link partner\n");
4027 		}
4028 
4029 		tx->watchdog_req = tx->req;
4030 		tx->watchdog_done = tx->done;
4031 		tx->watchdog_rx_pause = rx_pause;
4032 	}
4033 
4034 	if (sc->need_media_probe)
4035 		mxge_media_probe(sc);
4036 	return (err);
4037 }
4038 
4039 static u_long
mxge_update_stats(mxge_softc_t * sc)4040 mxge_update_stats(mxge_softc_t *sc)
4041 {
4042 	struct mxge_slice_state *ss;
4043 	u_long pkts = 0;
4044 	u_long ipackets = 0;
4045 	u_long opackets = 0;
4046 #ifdef IFNET_BUF_RING
4047 	u_long obytes = 0;
4048 	u_long omcasts = 0;
4049 	u_long odrops = 0;
4050 #endif
4051 	u_long oerrors = 0;
4052 	int slice;
4053 
4054 	for (slice = 0; slice < sc->num_slices; slice++) {
4055 		ss = &sc->ss[slice];
4056 		ipackets += ss->ipackets;
4057 		opackets += ss->opackets;
4058 #ifdef IFNET_BUF_RING
4059 		obytes += ss->obytes;
4060 		omcasts += ss->omcasts;
4061 		odrops += ss->tx.br->br_drops;
4062 #endif
4063 		oerrors += ss->oerrors;
4064 	}
4065 	pkts = (ipackets - sc->ifp->if_ipackets);
4066 	pkts += (opackets - sc->ifp->if_opackets);
4067 	sc->ifp->if_ipackets = ipackets;
4068 	sc->ifp->if_opackets = opackets;
4069 #ifdef IFNET_BUF_RING
4070 	sc->ifp->if_obytes = obytes;
4071 	sc->ifp->if_omcasts = omcasts;
4072 	sc->ifp->if_snd.ifq_drops = odrops;
4073 #endif
4074 	sc->ifp->if_oerrors = oerrors;
4075 	return pkts;
4076 }
4077 
4078 static void
mxge_tick(void * arg)4079 mxge_tick(void *arg)
4080 {
4081 	mxge_softc_t *sc = arg;
4082 	u_long pkts = 0;
4083 	int err = 0;
4084 	int running, ticks;
4085 	uint16_t cmd;
4086 
4087 	ticks = mxge_ticks;
4088 	running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
4089 	if (running) {
4090 		/* aggregate stats from different slices */
4091 		pkts = mxge_update_stats(sc);
4092 		if (!sc->watchdog_countdown) {
4093 			err = mxge_watchdog(sc);
4094 			sc->watchdog_countdown = 4;
4095 		}
4096 		sc->watchdog_countdown--;
4097 	}
4098 	if (pkts == 0) {
4099 		/* ensure NIC did not suffer h/w fault while idle */
4100 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
4101 		if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
4102 			sc->dying = 2;
4103 			taskqueue_enqueue(sc->tq, &sc->watchdog_task);
4104 			err = ENXIO;
4105 		}
4106 		/* look less often if NIC is idle */
4107 		ticks *= 4;
4108 	}
4109 
4110 	if (err == 0)
4111 		callout_reset(&sc->co_hdl, ticks, mxge_tick, sc);
4112 
4113 }
4114 
/*
 * ifmedia change callback: every request is rejected with EINVAL.
 */
static int
mxge_media_change(struct ifnet *ifp)
{
	(void)ifp;

	return (EINVAL);
}
4120 
4121 static int
mxge_change_mtu(mxge_softc_t * sc,int mtu)4122 mxge_change_mtu(mxge_softc_t *sc, int mtu)
4123 {
4124 	struct ifnet *ifp = sc->ifp;
4125 	int real_mtu, old_mtu;
4126 	int err = 0;
4127 
4128 
4129 	real_mtu = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
4130 	if ((real_mtu > sc->max_mtu) || real_mtu < 60)
4131 		return EINVAL;
4132 	mtx_lock(&sc->driver_mtx);
4133 	old_mtu = ifp->if_mtu;
4134 	ifp->if_mtu = mtu;
4135 	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
4136 		mxge_close(sc, 0);
4137 		err = mxge_open(sc);
4138 		if (err != 0) {
4139 			ifp->if_mtu = old_mtu;
4140 			mxge_close(sc, 0);
4141 			(void) mxge_open(sc);
4142 		}
4143 	}
4144 	mtx_unlock(&sc->driver_mtx);
4145 	return err;
4146 }
4147 
4148 static void
mxge_media_status(struct ifnet * ifp,struct ifmediareq * ifmr)4149 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
4150 {
4151 	mxge_softc_t *sc = ifp->if_softc;
4152 
4153 
4154 	if (sc == NULL)
4155 		return;
4156 	ifmr->ifm_status = IFM_AVALID;
4157 	ifmr->ifm_active = IFM_ETHER | IFM_FDX;
4158 	ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
4159 	ifmr->ifm_active |= sc->current_media;
4160 }
4161 
4162 static int
mxge_ioctl(struct ifnet * ifp,u_long command,caddr_t data)4163 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
4164 {
4165 	mxge_softc_t *sc = ifp->if_softc;
4166 	struct ifreq *ifr = (struct ifreq *)data;
4167 	int err, mask;
4168 
4169 	err = 0;
4170 	switch (command) {
4171 	case SIOCSIFMTU:
4172 		err = mxge_change_mtu(sc, ifr->ifr_mtu);
4173 		break;
4174 
4175 	case SIOCSIFFLAGS:
4176 		mtx_lock(&sc->driver_mtx);
4177 		if (sc->dying) {
4178 			mtx_unlock(&sc->driver_mtx);
4179 			return EINVAL;
4180 		}
4181 		if (ifp->if_flags & IFF_UP) {
4182 			if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
4183 				err = mxge_open(sc);
4184 			} else {
4185 				/* take care of promis can allmulti
4186 				   flag chages */
4187 				mxge_change_promisc(sc,
4188 						    ifp->if_flags & IFF_PROMISC);
4189 				mxge_set_multicast_list(sc);
4190 			}
4191 		} else {
4192 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
4193 				mxge_close(sc, 0);
4194 			}
4195 		}
4196 		mtx_unlock(&sc->driver_mtx);
4197 		break;
4198 
4199 	case SIOCADDMULTI:
4200 	case SIOCDELMULTI:
4201 		mtx_lock(&sc->driver_mtx);
4202 		mxge_set_multicast_list(sc);
4203 		mtx_unlock(&sc->driver_mtx);
4204 		break;
4205 
4206 	case SIOCSIFCAP:
4207 		mtx_lock(&sc->driver_mtx);
4208 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
4209 		if (mask & IFCAP_TXCSUM) {
4210 			if (IFCAP_TXCSUM & ifp->if_capenable) {
4211 				ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
4212 				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP);
4213 			} else {
4214 				ifp->if_capenable |= IFCAP_TXCSUM;
4215 				ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
4216 			}
4217 		} else if (mask & IFCAP_RXCSUM) {
4218 			if (IFCAP_RXCSUM & ifp->if_capenable) {
4219 				ifp->if_capenable &= ~IFCAP_RXCSUM;
4220 			} else {
4221 				ifp->if_capenable |= IFCAP_RXCSUM;
4222 			}
4223 		}
4224 		if (mask & IFCAP_TSO4) {
4225 			if (IFCAP_TSO4 & ifp->if_capenable) {
4226 				ifp->if_capenable &= ~IFCAP_TSO4;
4227 			} else if (IFCAP_TXCSUM & ifp->if_capenable) {
4228 				ifp->if_capenable |= IFCAP_TSO4;
4229 				ifp->if_hwassist |= CSUM_TSO;
4230 			} else {
4231 				printf("mxge requires tx checksum offload"
4232 				       " be enabled to use TSO\n");
4233 				err = EINVAL;
4234 			}
4235 		}
4236 #if IFCAP_TSO6
4237 		if (mask & IFCAP_TXCSUM_IPV6) {
4238 			if (IFCAP_TXCSUM_IPV6 & ifp->if_capenable) {
4239 				ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6
4240 						       | IFCAP_TSO6);
4241 				ifp->if_hwassist &= ~(CSUM_TCP_IPV6
4242 						      | CSUM_UDP);
4243 			} else {
4244 				ifp->if_capenable |= IFCAP_TXCSUM_IPV6;
4245 				ifp->if_hwassist |= (CSUM_TCP_IPV6
4246 						     | CSUM_UDP_IPV6);
4247 			}
4248 		} else if (mask & IFCAP_RXCSUM_IPV6) {
4249 			if (IFCAP_RXCSUM_IPV6 & ifp->if_capenable) {
4250 				ifp->if_capenable &= ~IFCAP_RXCSUM_IPV6;
4251 			} else {
4252 				ifp->if_capenable |= IFCAP_RXCSUM_IPV6;
4253 			}
4254 		}
4255 		if (mask & IFCAP_TSO6) {
4256 			if (IFCAP_TSO6 & ifp->if_capenable) {
4257 				ifp->if_capenable &= ~IFCAP_TSO6;
4258 			} else if (IFCAP_TXCSUM_IPV6 & ifp->if_capenable) {
4259 				ifp->if_capenable |= IFCAP_TSO6;
4260 				ifp->if_hwassist |= CSUM_TSO;
4261 			} else {
4262 				printf("mxge requires tx checksum offload"
4263 				       " be enabled to use TSO\n");
4264 				err = EINVAL;
4265 			}
4266 		}
4267 #endif /*IFCAP_TSO6 */
4268 
4269 		if (mask & IFCAP_LRO)
4270 			ifp->if_capenable ^= IFCAP_LRO;
4271 		if (mask & IFCAP_VLAN_HWTAGGING)
4272 			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
4273 		if (mask & IFCAP_VLAN_HWTSO)
4274 			ifp->if_capenable ^= IFCAP_VLAN_HWTSO;
4275 
4276 		if (!(ifp->if_capabilities & IFCAP_VLAN_HWTSO) ||
4277 		    !(ifp->if_capenable & IFCAP_VLAN_HWTAGGING))
4278 			ifp->if_capenable &= ~IFCAP_VLAN_HWTSO;
4279 
4280 		mtx_unlock(&sc->driver_mtx);
4281 		VLAN_CAPABILITIES(ifp);
4282 
4283 		break;
4284 
4285 	case SIOCGIFMEDIA:
4286 		mtx_lock(&sc->driver_mtx);
4287 		mxge_media_probe(sc);
4288 		mtx_unlock(&sc->driver_mtx);
4289 		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
4290 				    &sc->media, command);
4291                 break;
4292 
4293 	default:
4294 		err = ether_ioctl(ifp, command, data);
4295 		break;
4296         }
4297 	return err;
4298 }
4299 
4300 static void
mxge_fetch_tunables(mxge_softc_t * sc)4301 mxge_fetch_tunables(mxge_softc_t *sc)
4302 {
4303 
4304 	TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
4305 	TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
4306 			  &mxge_flow_control);
4307 	TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
4308 			  &mxge_intr_coal_delay);
4309 	TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
4310 			  &mxge_nvidia_ecrc_enable);
4311 	TUNABLE_INT_FETCH("hw.mxge.force_firmware",
4312 			  &mxge_force_firmware);
4313 	TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
4314 			  &mxge_deassert_wait);
4315 	TUNABLE_INT_FETCH("hw.mxge.verbose",
4316 			  &mxge_verbose);
4317 	TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
4318 	TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
4319 	TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
4320 	TUNABLE_INT_FETCH("hw.mxge.rss_hashtype", &mxge_rss_hash_type);
4321 	TUNABLE_INT_FETCH("hw.mxge.initial_mtu", &mxge_initial_mtu);
4322 	TUNABLE_INT_FETCH("hw.mxge.throttle", &mxge_throttle);
4323 
4324 	if (bootverbose)
4325 		mxge_verbose = 1;
4326 	if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
4327 		mxge_intr_coal_delay = 30;
4328 	if (mxge_ticks == 0)
4329 		mxge_ticks = hz / 2;
4330 	sc->pause = mxge_flow_control;
4331 	if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4
4332 	    || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_MAX) {
4333 		mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
4334 	}
4335 	if (mxge_initial_mtu > ETHERMTU_JUMBO ||
4336 	    mxge_initial_mtu < ETHER_MIN_LEN)
4337 		mxge_initial_mtu = ETHERMTU_JUMBO;
4338 
4339 	if (mxge_throttle && mxge_throttle > MXGE_MAX_THROTTLE)
4340 		mxge_throttle = MXGE_MAX_THROTTLE;
4341 	if (mxge_throttle && mxge_throttle < MXGE_MIN_THROTTLE)
4342 		mxge_throttle = MXGE_MIN_THROTTLE;
4343 	sc->throttle = mxge_throttle;
4344 }
4345 
4346 
4347 static void
mxge_free_slices(mxge_softc_t * sc)4348 mxge_free_slices(mxge_softc_t *sc)
4349 {
4350 	struct mxge_slice_state *ss;
4351 	int i;
4352 
4353 
4354 	if (sc->ss == NULL)
4355 		return;
4356 
4357 	for (i = 0; i < sc->num_slices; i++) {
4358 		ss = &sc->ss[i];
4359 		if (ss->fw_stats != NULL) {
4360 			mxge_dma_free(&ss->fw_stats_dma);
4361 			ss->fw_stats = NULL;
4362 #ifdef IFNET_BUF_RING
4363 			if (ss->tx.br != NULL) {
4364 				drbr_free(ss->tx.br, M_DEVBUF);
4365 				ss->tx.br = NULL;
4366 			}
4367 #endif
4368 			mtx_destroy(&ss->tx.mtx);
4369 		}
4370 		if (ss->rx_done.entry != NULL) {
4371 			mxge_dma_free(&ss->rx_done.dma);
4372 			ss->rx_done.entry = NULL;
4373 		}
4374 	}
4375 	free(sc->ss, M_DEVBUF);
4376 	sc->ss = NULL;
4377 }
4378 
4379 static int
mxge_alloc_slices(mxge_softc_t * sc)4380 mxge_alloc_slices(mxge_softc_t *sc)
4381 {
4382 	mxge_cmd_t cmd;
4383 	struct mxge_slice_state *ss;
4384 	size_t bytes;
4385 	int err, i, max_intr_slots;
4386 
4387 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4388 	if (err != 0) {
4389 		device_printf(sc->dev, "Cannot determine rx ring size\n");
4390 		return err;
4391 	}
4392 	sc->rx_ring_size = cmd.data0;
4393 	max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
4394 
4395 	bytes = sizeof (*sc->ss) * sc->num_slices;
4396 	sc->ss = malloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
4397 	if (sc->ss == NULL)
4398 		return (ENOMEM);
4399 	for (i = 0; i < sc->num_slices; i++) {
4400 		ss = &sc->ss[i];
4401 
4402 		ss->sc = sc;
4403 
4404 		/* allocate per-slice rx interrupt queues */
4405 
4406 		bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
4407 		err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
4408 		if (err != 0)
4409 			goto abort;
4410 		ss->rx_done.entry = ss->rx_done.dma.addr;
4411 		bzero(ss->rx_done.entry, bytes);
4412 
4413 		/*
4414 		 * allocate the per-slice firmware stats; stats
4415 		 * (including tx) are used used only on the first
4416 		 * slice for now
4417 		 */
4418 #ifndef IFNET_BUF_RING
4419 		if (i > 0)
4420 			continue;
4421 #endif
4422 
4423 		bytes = sizeof (*ss->fw_stats);
4424 		err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
4425 				     sizeof (*ss->fw_stats), 64);
4426 		if (err != 0)
4427 			goto abort;
4428 		ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
4429 		snprintf(ss->tx.mtx_name, sizeof(ss->tx.mtx_name),
4430 			 "%s:tx(%d)", device_get_nameunit(sc->dev), i);
4431 		mtx_init(&ss->tx.mtx, ss->tx.mtx_name, NULL, MTX_DEF);
4432 #ifdef IFNET_BUF_RING
4433 		ss->tx.br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK,
4434 					   &ss->tx.mtx);
4435 #endif
4436 	}
4437 
4438 	return (0);
4439 
4440 abort:
4441 	mxge_free_slices(sc);
4442 	return (ENOMEM);
4443 }
4444 
4445 static void
mxge_slice_probe(mxge_softc_t * sc)4446 mxge_slice_probe(mxge_softc_t *sc)
4447 {
4448 	mxge_cmd_t cmd;
4449 	char *old_fw;
4450 	int msix_cnt, status, max_intr_slots;
4451 
4452 	sc->num_slices = 1;
4453 	/*
4454 	 *  don't enable multiple slices if they are not enabled,
4455 	 *  or if this is not an SMP system
4456 	 */
4457 
4458 	if (mxge_max_slices == 0 || mxge_max_slices == 1 || mp_ncpus < 2)
4459 		return;
4460 
4461 	/* see how many MSI-X interrupts are available */
4462 	msix_cnt = pci_msix_count(sc->dev);
4463 	if (msix_cnt < 2)
4464 		return;
4465 
4466 	/* now load the slice aware firmware see what it supports */
4467 	old_fw = sc->fw_name;
4468 	if (old_fw == mxge_fw_aligned)
4469 		sc->fw_name = mxge_fw_rss_aligned;
4470 	else
4471 		sc->fw_name = mxge_fw_rss_unaligned;
4472 	status = mxge_load_firmware(sc, 0);
4473 	if (status != 0) {
4474 		device_printf(sc->dev, "Falling back to a single slice\n");
4475 		return;
4476 	}
4477 
4478 	/* try to send a reset command to the card to see if it
4479 	   is alive */
4480 	memset(&cmd, 0, sizeof (cmd));
4481 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
4482 	if (status != 0) {
4483 		device_printf(sc->dev, "failed reset\n");
4484 		goto abort_with_fw;
4485 	}
4486 
4487 	/* get rx ring size */
4488 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4489 	if (status != 0) {
4490 		device_printf(sc->dev, "Cannot determine rx ring size\n");
4491 		goto abort_with_fw;
4492 	}
4493 	max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
4494 
4495 	/* tell it the size of the interrupt queues */
4496 	cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
4497 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
4498 	if (status != 0) {
4499 		device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
4500 		goto abort_with_fw;
4501 	}
4502 
4503 	/* ask the maximum number of slices it supports */
4504 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
4505 	if (status != 0) {
4506 		device_printf(sc->dev,
4507 			      "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
4508 		goto abort_with_fw;
4509 	}
4510 	sc->num_slices = cmd.data0;
4511 	if (sc->num_slices > msix_cnt)
4512 		sc->num_slices = msix_cnt;
4513 
4514 	if (mxge_max_slices == -1) {
4515 		/* cap to number of CPUs in system */
4516 		if (sc->num_slices > mp_ncpus)
4517 			sc->num_slices = mp_ncpus;
4518 	} else {
4519 		if (sc->num_slices > mxge_max_slices)
4520 			sc->num_slices = mxge_max_slices;
4521 	}
4522 	/* make sure it is a power of two */
4523 	while (sc->num_slices & (sc->num_slices - 1))
4524 		sc->num_slices--;
4525 
4526 	if (mxge_verbose)
4527 		device_printf(sc->dev, "using %d slices\n",
4528 			      sc->num_slices);
4529 
4530 	return;
4531 
4532 abort_with_fw:
4533 	sc->fw_name = old_fw;
4534 	(void) mxge_load_firmware(sc, 0);
4535 }
4536 
4537 static int
mxge_add_msix_irqs(mxge_softc_t * sc)4538 mxge_add_msix_irqs(mxge_softc_t *sc)
4539 {
4540 	size_t bytes;
4541 	int count, err, i, rid;
4542 
4543 	rid = PCIR_BAR(2);
4544 	sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4545 						    &rid, RF_ACTIVE);
4546 
4547 	if (sc->msix_table_res == NULL) {
4548 		device_printf(sc->dev, "couldn't alloc MSIX table res\n");
4549 		return ENXIO;
4550 	}
4551 
4552 	count = sc->num_slices;
4553 	err = pci_alloc_msix(sc->dev, &count);
4554 	if (err != 0) {
4555 		device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d"
4556 			      "err = %d \n", sc->num_slices, err);
4557 		goto abort_with_msix_table;
4558 	}
4559 	if (count < sc->num_slices) {
4560 		device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
4561 			      count, sc->num_slices);
4562 		device_printf(sc->dev,
4563 			      "Try setting hw.mxge.max_slices to %d\n",
4564 			      count);
4565 		err = ENOSPC;
4566 		goto abort_with_msix;
4567 	}
4568 	bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
4569 	sc->msix_irq_res = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4570 	if (sc->msix_irq_res == NULL) {
4571 		err = ENOMEM;
4572 		goto abort_with_msix;
4573 	}
4574 
4575 	for (i = 0; i < sc->num_slices; i++) {
4576 		rid = i + 1;
4577 		sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
4578 							  SYS_RES_IRQ,
4579 							  &rid, RF_ACTIVE);
4580 		if (sc->msix_irq_res[i] == NULL) {
4581 			device_printf(sc->dev, "couldn't allocate IRQ res"
4582 				      " for message %d\n", i);
4583 			err = ENXIO;
4584 			goto abort_with_res;
4585 		}
4586 	}
4587 
4588 	bytes = sizeof (*sc->msix_ih) * sc->num_slices;
4589 	sc->msix_ih =  malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4590 
4591 	for (i = 0; i < sc->num_slices; i++) {
4592 		err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
4593 				     INTR_TYPE_NET | INTR_MPSAFE,
4594 #if __FreeBSD_version > 700030
4595 				     NULL,
4596 #endif
4597 				     mxge_intr, &sc->ss[i], &sc->msix_ih[i]);
4598 		if (err != 0) {
4599 			device_printf(sc->dev, "couldn't setup intr for "
4600 				      "message %d\n", i);
4601 			goto abort_with_intr;
4602 		}
4603 		bus_describe_intr(sc->dev, sc->msix_irq_res[i],
4604 				  sc->msix_ih[i], "s%d", i);
4605 	}
4606 
4607 	if (mxge_verbose) {
4608 		device_printf(sc->dev, "using %d msix IRQs:",
4609 			      sc->num_slices);
4610 		for (i = 0; i < sc->num_slices; i++)
4611 			printf(" %ld",  rman_get_start(sc->msix_irq_res[i]));
4612 		printf("\n");
4613 	}
4614 	return (0);
4615 
4616 abort_with_intr:
4617 	for (i = 0; i < sc->num_slices; i++) {
4618 		if (sc->msix_ih[i] != NULL) {
4619 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4620 					  sc->msix_ih[i]);
4621 			sc->msix_ih[i] = NULL;
4622 		}
4623 	}
4624 	free(sc->msix_ih, M_DEVBUF);
4625 
4626 
4627 abort_with_res:
4628 	for (i = 0; i < sc->num_slices; i++) {
4629 		rid = i + 1;
4630 		if (sc->msix_irq_res[i] != NULL)
4631 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4632 					     sc->msix_irq_res[i]);
4633 		sc->msix_irq_res[i] = NULL;
4634 	}
4635 	free(sc->msix_irq_res, M_DEVBUF);
4636 
4637 
4638 abort_with_msix:
4639 	pci_release_msi(sc->dev);
4640 
4641 abort_with_msix_table:
4642 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4643 			     sc->msix_table_res);
4644 
4645 	return err;
4646 }
4647 
4648 static int
mxge_add_single_irq(mxge_softc_t * sc)4649 mxge_add_single_irq(mxge_softc_t *sc)
4650 {
4651 	int count, err, rid;
4652 
4653 	count = pci_msi_count(sc->dev);
4654 	if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
4655 		rid = 1;
4656 	} else {
4657 		rid = 0;
4658 		sc->legacy_irq = 1;
4659 	}
4660 	sc->irq_res = bus_alloc_resource(sc->dev, SYS_RES_IRQ, &rid, 0, ~0,
4661 					 1, RF_SHAREABLE | RF_ACTIVE);
4662 	if (sc->irq_res == NULL) {
4663 		device_printf(sc->dev, "could not alloc interrupt\n");
4664 		return ENXIO;
4665 	}
4666 	if (mxge_verbose)
4667 		device_printf(sc->dev, "using %s irq %ld\n",
4668 			      sc->legacy_irq ? "INTx" : "MSI",
4669 			      rman_get_start(sc->irq_res));
4670 	err = bus_setup_intr(sc->dev, sc->irq_res,
4671 			     INTR_TYPE_NET | INTR_MPSAFE,
4672 #if __FreeBSD_version > 700030
4673 			     NULL,
4674 #endif
4675 			     mxge_intr, &sc->ss[0], &sc->ih);
4676 	if (err != 0) {
4677 		bus_release_resource(sc->dev, SYS_RES_IRQ,
4678 				     sc->legacy_irq ? 0 : 1, sc->irq_res);
4679 		if (!sc->legacy_irq)
4680 			pci_release_msi(sc->dev);
4681 	}
4682 	return err;
4683 }
4684 
4685 static void
mxge_rem_msix_irqs(mxge_softc_t * sc)4686 mxge_rem_msix_irqs(mxge_softc_t *sc)
4687 {
4688 	int i, rid;
4689 
4690 	for (i = 0; i < sc->num_slices; i++) {
4691 		if (sc->msix_ih[i] != NULL) {
4692 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4693 					  sc->msix_ih[i]);
4694 			sc->msix_ih[i] = NULL;
4695 		}
4696 	}
4697 	free(sc->msix_ih, M_DEVBUF);
4698 
4699 	for (i = 0; i < sc->num_slices; i++) {
4700 		rid = i + 1;
4701 		if (sc->msix_irq_res[i] != NULL)
4702 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4703 					     sc->msix_irq_res[i]);
4704 		sc->msix_irq_res[i] = NULL;
4705 	}
4706 	free(sc->msix_irq_res, M_DEVBUF);
4707 
4708 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4709 			     sc->msix_table_res);
4710 
4711 	pci_release_msi(sc->dev);
4712 	return;
4713 }
4714 
4715 static void
mxge_rem_single_irq(mxge_softc_t * sc)4716 mxge_rem_single_irq(mxge_softc_t *sc)
4717 {
4718 	bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4719 	bus_release_resource(sc->dev, SYS_RES_IRQ,
4720 			     sc->legacy_irq ? 0 : 1, sc->irq_res);
4721 	if (!sc->legacy_irq)
4722 		pci_release_msi(sc->dev);
4723 }
4724 
4725 static void
mxge_rem_irq(mxge_softc_t * sc)4726 mxge_rem_irq(mxge_softc_t *sc)
4727 {
4728 	if (sc->num_slices > 1)
4729 		mxge_rem_msix_irqs(sc);
4730 	else
4731 		mxge_rem_single_irq(sc);
4732 }
4733 
4734 static int
mxge_add_irq(mxge_softc_t * sc)4735 mxge_add_irq(mxge_softc_t *sc)
4736 {
4737 	int err;
4738 
4739 	if (sc->num_slices > 1)
4740 		err = mxge_add_msix_irqs(sc);
4741 	else
4742 		err = mxge_add_single_irq(sc);
4743 
4744 	if (0 && err == 0 && sc->num_slices > 1) {
4745 		mxge_rem_msix_irqs(sc);
4746 		err = mxge_add_msix_irqs(sc);
4747 	}
4748 	return err;
4749 }
4750 
4751 
/*
 * Device attach method.
 *
 * Sets up, in order: the tunables, the watchdog task and its taskqueue,
 * the parent DMA tag, the ifnet, the command and driver mutexes, the
 * tick callout, the mapped board memory and the EEPROM strings, the
 * out-of-band DMA buffers, the firmware, the slices and their rings,
 * and the interrupt(s).  It then fills in the interface capabilities
 * and callbacks, attaches the Ethernet interface, applies the initial
 * MTU, and starts the taskqueue thread and the mxge_tick() callout.
 *
 * Returns 0 on success.  On failure control jumps to the abort_with_*
 * label matching the last resource acquired; the labels fall through
 * so that earlier resources are released as well, and the error is
 * returned.
 */
static int
mxge_attach(device_t dev)
{
	mxge_cmd_t cmd;
	mxge_softc_t *sc = device_get_softc(dev);
	struct ifnet *ifp;
	int err, rid;

	sc->dev = dev;
	mxge_fetch_tunables(sc);

	/* the watchdog task runs on a driver-private taskqueue */
	TASK_INIT(&sc->watchdog_task, 1, mxge_watchdog_task, sc);
	sc->tq = taskqueue_create("mxge_taskq", M_WAITOK,
				  taskqueue_thread_enqueue, &sc->tq);
	if (sc->tq == NULL) {
		err = ENOMEM;
		goto abort_with_nothing;
	}

	err = bus_dma_tag_create(bus_get_dma_tag(dev),	/* parent */
				 1,			/* alignment */
				 0,			/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 65536 + 256,		/* maxsize */
				 MXGE_MAX_SEND_DESC, 	/* num segs */
				 65536,			/* maxsegsize */
				 0,			/* flags */
				 NULL, NULL,		/* lock */
				 &sc->parent_dmat);	/* tag */

	if (err != 0) {
		device_printf(sc->dev, "Err %d allocating parent dmat\n",
			      err);
		goto abort_with_tq;
	}

	ifp = sc->ifp = if_alloc(IFT_ETHER);
	if (ifp == NULL) {
		device_printf(dev, "can not if_alloc()\n");
		err = ENOSPC;
		goto abort_with_parent_dmat;
	}
	if_initname(ifp, device_get_name(dev), device_get_unit(dev));

	/* mutex names are built from the device name and unit */
	snprintf(sc->cmd_mtx_name, sizeof(sc->cmd_mtx_name), "%s:cmd",
		 device_get_nameunit(dev));
	mtx_init(&sc->cmd_mtx, sc->cmd_mtx_name, NULL, MTX_DEF);
	snprintf(sc->driver_mtx_name, sizeof(sc->driver_mtx_name),
		 "%s:drv", device_get_nameunit(dev));
	mtx_init(&sc->driver_mtx, sc->driver_mtx_name,
		 MTX_NETWORK_LOCK, MTX_DEF);

	/* the tick callout is associated with the driver mutex */
	callout_init_mtx(&sc->co_hdl, &sc->driver_mtx, 0);

	mxge_setup_cfg_space(sc);

	/* Map the board into the kernel */
	rid = PCIR_BARS;
	sc->mem_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0,
					 ~0, 1, RF_ACTIVE);
	if (sc->mem_res == NULL) {
		device_printf(dev, "could not map memory\n");
		err = ENXIO;
		goto abort_with_lock;
	}
	sc->sram = rman_get_virtual(sc->mem_res);
	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
	/* the mapped region must be at least as large as the SRAM size */
	if (sc->sram_size > rman_get_size(sc->mem_res)) {
		device_printf(dev, "impossible memory region size %ld\n",
			      rman_get_size(sc->mem_res));
		err = ENXIO;
		goto abort_with_mem_res;
	}

	/* make NULL terminated copy of the EEPROM strings section of
	   lanai SRAM */
	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
				rman_get_bushandle(sc->mem_res),
				sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
				sc->eeprom_strings,
				MXGE_EEPROM_STRINGS_SIZE - 2);
	err = mxge_parse_strings(sc);
	if (err != 0)
		goto abort_with_mem_res;

	/* Enable write combining for efficient use of PCIe bus */
	mxge_enable_wc(sc);

	/* Allocate the out of band dma memory */
	err = mxge_dma_alloc(sc, &sc->cmd_dma,
			     sizeof (mxge_cmd_t), 64);
	if (err != 0)
		goto abort_with_mem_res;
	sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
	if (err != 0)
		goto abort_with_cmd_dma;

	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
	if (err != 0)
		goto abort_with_zeropad_dma;

	/* select & load the firmware */
	err = mxge_select_firmware(sc);
	if (err != 0)
		goto abort_with_dmabench;
	sc->intr_coal_delay = mxge_intr_coal_delay;

	/* decide on the slice count, then allocate per-slice state */
	mxge_slice_probe(sc);
	err = mxge_alloc_slices(sc);
	if (err != 0)
		goto abort_with_dmabench;

	err = mxge_reset(sc, 0);
	if (err != 0)
		goto abort_with_slices;

	err = mxge_alloc_rings(sc);
	if (err != 0) {
		device_printf(sc->dev, "failed to allocate rings\n");
		goto abort_with_slices;
	}

	err = mxge_add_irq(sc);
	if (err != 0) {
		device_printf(sc->dev, "failed to add irq\n");
		goto abort_with_rings;
	}

	/* nothing below jumps to an abort label */
	if_initbaudrate(ifp, IF_Gbps(10));
	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
		IFCAP_VLAN_MTU | IFCAP_LINKSTATE | IFCAP_TXCSUM_IPV6 |
		IFCAP_RXCSUM_IPV6;
#if defined(INET) || defined(INET6)
	ifp->if_capabilities |= IFCAP_LRO;
#endif

#ifdef MXGE_NEW_VLAN_API
	ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;

	/* Only FW 1.4.32 and newer can do TSO over vlans */
	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
	    sc->fw_ver_tiny >= 32)
		ifp->if_capabilities |= IFCAP_VLAN_HWTSO;
#endif
	sc->max_mtu = mxge_max_mtu(sc);
	if (sc->max_mtu >= 9000)
		ifp->if_capabilities |= IFCAP_JUMBO_MTU;
	else
		device_printf(dev, "MTU limited to %d.  Install "
			      "latest firmware for 9000 byte jumbo support\n",
			      sc->max_mtu - ETHER_HDR_LEN);
	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
	ifp->if_hwassist |= CSUM_TCP_IPV6 | CSUM_UDP_IPV6;
	/* check to see if f/w supports TSO for IPv6 */
	if (!mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_TSO6_HDR_SIZE, &cmd)) {
		if (CSUM_TCP_IPV6)
			ifp->if_capabilities |= IFCAP_TSO6;
		/* header length is limited by the per-slice scratch buffer */
		sc->max_tso6_hlen = min(cmd.data0,
					sizeof (sc->ss[0].scratch));
	}
	/* enable everything supported, except LRO when lro_cnt is zero */
	ifp->if_capenable = ifp->if_capabilities;
	if (sc->lro_cnt == 0)
		ifp->if_capenable &= ~IFCAP_LRO;
        ifp->if_init = mxge_init;
        ifp->if_softc = sc;
        ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
        ifp->if_ioctl = mxge_ioctl;
        ifp->if_start = mxge_start;
	/* Initialise the ifmedia structure */
	ifmedia_init(&sc->media, 0, mxge_media_change,
		     mxge_media_status);
	mxge_media_init(sc);
	mxge_media_probe(sc);
	sc->dying = 0;
	ether_ifattach(ifp, sc->mac_addr);
	/* ether_ifattach sets mtu to ETHERMTU */
	if (mxge_initial_mtu != ETHERMTU)
		mxge_change_mtu(sc, mxge_initial_mtu);

	mxge_add_sysctls(sc);
#ifdef IFNET_BUF_RING
	ifp->if_transmit = mxge_transmit;
	ifp->if_qflush = mxge_qflush;
#endif
	taskqueue_start_threads(&sc->tq, 1, PI_NET, "%s taskq",
				device_get_nameunit(sc->dev));
	/* start the periodic tick */
	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
	return 0;

	/* error unwind: each label falls through to the ones below it */
abort_with_rings:
	mxge_free_rings(sc);
abort_with_slices:
	mxge_free_slices(sc);
abort_with_dmabench:
	mxge_dma_free(&sc->dmabench_dma);
abort_with_zeropad_dma:
	mxge_dma_free(&sc->zeropad_dma);
abort_with_cmd_dma:
	mxge_dma_free(&sc->cmd_dma);
abort_with_mem_res:
	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
abort_with_lock:
	pci_disable_busmaster(dev);
	mtx_destroy(&sc->cmd_mtx);
	mtx_destroy(&sc->driver_mtx);
	if_free(ifp);
abort_with_parent_dmat:
	bus_dma_tag_destroy(sc->parent_dmat);
abort_with_tq:
	if (sc->tq != NULL) {
		taskqueue_drain(sc->tq, &sc->watchdog_task);
		taskqueue_free(sc->tq);
		sc->tq = NULL;
	}
abort_with_nothing:
	return err;
}
4973 
/*
 * Device detach method.
 *
 * Refuses with EBUSY while mxge_vlans_active() reports vlans on the
 * interface.  Otherwise marks the softc as dying, closes the interface
 * if it is running, detaches it from the network stack, stops the
 * watchdog taskqueue and the tick callout, and releases the resources
 * acquired by mxge_attach().  Returns 0 on success.
 */
static int
mxge_detach(device_t dev)
{
	mxge_softc_t *sc = device_get_softc(dev);

	if (mxge_vlans_active(sc)) {
		device_printf(sc->dev,
			      "Detach vlans before removing module\n");
		return EBUSY;
	}
	/* flag the device as going away and stop it under the driver lock */
	mtx_lock(&sc->driver_mtx);
	sc->dying = 1;
	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING)
		mxge_close(sc, 0);
	mtx_unlock(&sc->driver_mtx);
	ether_ifdetach(sc->ifp);
	/* wait for any queued watchdog task, then destroy the taskqueue */
	if (sc->tq != NULL) {
		taskqueue_drain(sc->tq, &sc->watchdog_task);
		taskqueue_free(sc->tq);
		sc->tq = NULL;
	}
	callout_drain(&sc->co_hdl);
	ifmedia_removeall(&sc->media);
	mxge_dummy_rdma(sc, 0);
	mxge_rem_sysctls(sc);
	/* release interrupts, rings, slices and DMA buffers */
	mxge_rem_irq(sc);
	mxge_free_rings(sc);
	mxge_free_slices(sc);
	mxge_dma_free(&sc->dmabench_dma);
	mxge_dma_free(&sc->zeropad_dma);
	mxge_dma_free(&sc->cmd_dma);
	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
	pci_disable_busmaster(dev);
	mtx_destroy(&sc->cmd_mtx);
	mtx_destroy(&sc->driver_mtx);
	if_free(sc->ifp);
	bus_dma_tag_destroy(sc->parent_dmat);
	return 0;
}
5013 
5014 static int
mxge_shutdown(device_t dev)5015 mxge_shutdown(device_t dev)
5016 {
5017 	return 0;
5018 }
5019 
5020 /*
5021   This file uses Myri10GE driver indentation.
5022 
5023   Local Variables:
5024   c-file-style:"linux"
5025   tab-width:8
5026   End:
5027 */
5028