1 /*-
2 * Copyright (c) 2010-2012 Citrix Inc.
3 * Copyright (c) 2009-2012,2016-2017 Microsoft Corp.
4 * Copyright (c) 2012 NetApp Inc.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice unmodified, this list of conditions, and the following
12 * disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29 /*-
30 * Copyright (c) 2004-2006 Kip Macy
31 * All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 *
42 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52 * SUCH DAMAGE.
53 */
54
55 #include <sys/cdefs.h>
56 __FBSDID("$FreeBSD$");
57
58 #include "opt_hn.h"
59 #include "opt_inet6.h"
60 #include "opt_inet.h"
61 #include "opt_rss.h"
62
63 #include <sys/param.h>
64 #include <sys/systm.h>
65 #include <sys/bus.h>
66 #include <sys/counter.h>
67 #include <sys/kernel.h>
68 #include <sys/limits.h>
69 #include <sys/malloc.h>
70 #include <sys/mbuf.h>
71 #include <sys/module.h>
72 #include <sys/queue.h>
73 #include <sys/lock.h>
74 #include <sys/rmlock.h>
75 #include <sys/sbuf.h>
76 #include <sys/smp.h>
77 #include <sys/socket.h>
78 #include <sys/sockio.h>
79 #include <sys/sx.h>
80 #include <sys/sysctl.h>
81 #include <sys/taskqueue.h>
82 #include <sys/buf_ring.h>
83 #include <sys/eventhandler.h>
84
85 #include <machine/atomic.h>
86 #include <machine/in_cksum.h>
87
88 #include <net/bpf.h>
89 #include <net/ethernet.h>
90 #include <net/if.h>
91 #include <net/if_dl.h>
92 #include <net/if_media.h>
93 #include <net/if_types.h>
94 #include <net/if_var.h>
95 #include <net/rndis.h>
96 #ifdef RSS
97 #include <net/rss_config.h>
98 #endif
99
100 #include <netinet/in_systm.h>
101 #include <netinet/in.h>
102 #include <netinet/ip.h>
103 #include <netinet/ip6.h>
104 #include <netinet/tcp.h>
105 #include <netinet/tcp_lro.h>
106 #include <netinet/udp.h>
107
108 #include <dev/hyperv/include/hyperv.h>
109 #include <dev/hyperv/include/hyperv_busdma.h>
110 #include <dev/hyperv/include/vmbus.h>
111 #include <dev/hyperv/include/vmbus_xact.h>
112
113 #include <dev/hyperv/netvsc/ndis.h>
114 #include <dev/hyperv/netvsc/if_hnreg.h>
115 #include <dev/hyperv/netvsc/if_hnvar.h>
116 #include <dev/hyperv/netvsc/hn_nvs.h>
117 #include <dev/hyperv/netvsc/hn_rndis.h>
118
119 #include "vmbus_if.h"
120
121 #define HN_IFSTART_SUPPORT
122
123 #define HN_RING_CNT_DEF_MAX 8
124
125 #define HN_VFMAP_SIZE_DEF 8
126
127 #define HN_XPNT_VF_ATTWAIT_MIN 2 /* seconds */
128
129 /* YYY should get it from the underlying channel */
130 #define HN_TX_DESC_CNT 512
131
132 #define HN_RNDIS_PKT_LEN \
133 (sizeof(struct rndis_packet_msg) + \
134 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) + \
135 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) + \
136 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) + \
137 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
138 #define HN_RNDIS_PKT_BOUNDARY PAGE_SIZE
139 #define HN_RNDIS_PKT_ALIGN CACHE_LINE_SIZE
140
141 #define HN_TX_DATA_BOUNDARY PAGE_SIZE
142 #define HN_TX_DATA_MAXSIZE IP_MAXPACKET
143 #define HN_TX_DATA_SEGSIZE PAGE_SIZE
144 /* -1 for RNDIS packet message */
145 #define HN_TX_DATA_SEGCNT_MAX (HN_GPACNT_MAX - 1)
146
147 #define HN_DIRECT_TX_SIZE_DEF 128
148
149 #define HN_EARLY_TXEOF_THRESH 8
150
151 #define HN_PKTBUF_LEN_DEF (16 * 1024)
152
153 #define HN_LROENT_CNT_DEF 128
154
155 #define HN_LRO_LENLIM_MULTIRX_DEF (12 * ETHERMTU)
156 #define HN_LRO_LENLIM_DEF (25 * ETHERMTU)
157 /* YYY 2*MTU is a bit rough, but should be good enough. */
158 #define HN_LRO_LENLIM_MIN(ifp) (2 * (ifp)->if_mtu)
159
160 #define HN_LRO_ACKCNT_DEF 1
161
162 #define HN_LOCK_INIT(sc) \
163 sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
164 #define HN_LOCK_DESTROY(sc) sx_destroy(&(sc)->hn_lock)
165 #define HN_LOCK_ASSERT(sc) sx_assert(&(sc)->hn_lock, SA_XLOCKED)
166 #define HN_LOCK(sc) \
167 do { \
168 while (sx_try_xlock(&(sc)->hn_lock) == 0) \
169 DELAY(1000); \
170 } while (0)
171 #define HN_UNLOCK(sc) sx_xunlock(&(sc)->hn_lock)
172
173 #define HN_CSUM_IP_MASK (CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
174 #define HN_CSUM_IP6_MASK (CSUM_IP6_TCP | CSUM_IP6_UDP)
175 #define HN_CSUM_IP_HWASSIST(sc) \
176 ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
177 #define HN_CSUM_IP6_HWASSIST(sc) \
178 ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)
179
180 #define HN_PKTSIZE_MIN(align) \
181 roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
182 HN_RNDIS_PKT_LEN, (align))
183 #define HN_PKTSIZE(m, align) \
184 roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))
185
186 #ifdef RSS
187 #define HN_RING_IDX2CPU(sc, idx) rss_getcpu((idx) % rss_getnumbuckets())
188 #else
189 #define HN_RING_IDX2CPU(sc, idx) (((sc)->hn_cpu + (idx)) % mp_ncpus)
190 #endif
191
/*
 * Transmit descriptor (txdesc).
 *
 * NOTE(review): field roles not spelled out below are not visible from
 * this part of the file; confirm against the TX path before relying on
 * them.
 */
struct hn_txdesc {
#ifndef HN_USE_TXDESC_BUFRING
	/* List linkage; compiled in only when buf_ring is not used. */
	SLIST_ENTRY(hn_txdesc)		link;
#endif
	/* Presumably the linkage used while on another txdesc's agg_list. */
	STAILQ_ENTRY(hn_txdesc)		agg_link;

	/* Aggregated txdescs, in sending order. */
	STAILQ_HEAD(, hn_txdesc)	agg_list;

	/* The oldest packet, if transmission aggregation happens. */
	struct mbuf			*m;
	struct hn_tx_ring		*txr;
	int				refs;
	uint32_t			flags;	/* HN_TXD_FLAG_ */
	/* Handed to the NVS send routines by hn_txpkt_{sglist,chim}(). */
	struct hn_nvs_sendctx		send_ctx;
	/*
	 * Chimney buffer index and size.  hn_txpkt_sglist() asserts
	 * HN_NVS_CHIM_IDX_INVALID / 0; hn_txpkt_chim() asserts a valid
	 * index and a size greater than zero.
	 */
	uint32_t			chim_index;
	int				chim_size;

	bus_dmamap_t			data_dmap;

	bus_addr_t			rndis_pkt_paddr;
	struct rndis_packet_msg		*rndis_pkt;
	bus_dmamap_t			rndis_pkt_dmap;
};
216
217 #define HN_TXD_FLAG_ONLIST 0x0001
218 #define HN_TXD_FLAG_DMAMAP 0x0002
219 #define HN_TXD_FLAG_ONAGG 0x0004
220
/*
 * Per-packet information for a received packet; all fields are raw
 * 32-bit values.
 *
 * NOTE(review): by name, the fields appear to line up one-to-one with
 * the HN_RXINFO_* bits, and hn_rndis_rxinfo() takes a pointer to this
 * structure -- confirm how and where it is filled in.
 */
struct hn_rxinfo {
	uint32_t			vlan_info;
	uint32_t			csum_info;
	uint32_t			hash_info;
	uint32_t			hash_value;
};
227
/*
 * Pairs one RX ring with a VF ifnet.
 *
 * NOTE(review): judging by the name this is the argument passed to
 * hn_rxvf_set_task(); confirm against that function.
 */
struct hn_rxvf_setarg {
	struct hn_rx_ring		*rxr;
	struct ifnet			*vf_ifp;
};
232
233 #define HN_RXINFO_VLAN 0x0001
234 #define HN_RXINFO_CSUM 0x0002
235 #define HN_RXINFO_HASHINF 0x0004
236 #define HN_RXINFO_HASHVAL 0x0008
237 #define HN_RXINFO_ALL \
238 (HN_RXINFO_VLAN | \
239 HN_RXINFO_CSUM | \
240 HN_RXINFO_HASHINF | \
241 HN_RXINFO_HASHVAL)
242
243 #define HN_NDIS_VLAN_INFO_INVALID 0xffffffff
244 #define HN_NDIS_RXCSUM_INFO_INVALID 0
245 #define HN_NDIS_HASH_INFO_INVALID 0
246
247 static int hn_probe(device_t);
248 static int hn_attach(device_t);
249 static int hn_detach(device_t);
250 static int hn_shutdown(device_t);
251 static void hn_chan_callback(struct vmbus_channel *,
252 void *);
253
254 static void hn_init(void *);
255 static int hn_ioctl(struct ifnet *, u_long, caddr_t);
256 #ifdef HN_IFSTART_SUPPORT
257 static void hn_start(struct ifnet *);
258 #endif
259 static int hn_transmit(struct ifnet *, struct mbuf *);
260 static void hn_xmit_qflush(struct ifnet *);
261 static int hn_ifmedia_upd(struct ifnet *);
262 static void hn_ifmedia_sts(struct ifnet *,
263 struct ifmediareq *);
264
265 static void hn_ifnet_event(void *, struct ifnet *, int);
266 static void hn_ifaddr_event(void *, struct ifnet *);
267 static void hn_ifnet_attevent(void *, struct ifnet *);
268 static void hn_ifnet_detevent(void *, struct ifnet *);
269 static void hn_ifnet_lnkevent(void *, struct ifnet *, int);
270
271 static bool hn_ismyvf(const struct hn_softc *,
272 const struct ifnet *);
273 static void hn_rxvf_change(struct hn_softc *,
274 struct ifnet *, bool);
275 static void hn_rxvf_set(struct hn_softc *, struct ifnet *);
276 static void hn_rxvf_set_task(void *, int);
277 static void hn_xpnt_vf_input(struct ifnet *, struct mbuf *);
278 static int hn_xpnt_vf_iocsetflags(struct hn_softc *);
279 static int hn_xpnt_vf_iocsetcaps(struct hn_softc *,
280 struct ifreq *);
281 static void hn_xpnt_vf_saveifflags(struct hn_softc *);
282 static bool hn_xpnt_vf_isready(struct hn_softc *);
283 static void hn_xpnt_vf_setready(struct hn_softc *);
284 static void hn_xpnt_vf_init_taskfunc(void *, int);
285 static void hn_xpnt_vf_init(struct hn_softc *);
286 static void hn_xpnt_vf_setenable(struct hn_softc *);
287 static void hn_xpnt_vf_setdisable(struct hn_softc *, bool);
288 static void hn_vf_rss_fixup(struct hn_softc *, bool);
289 static void hn_vf_rss_restore(struct hn_softc *);
290
291 static int hn_rndis_rxinfo(const void *, int,
292 struct hn_rxinfo *);
293 static void hn_rndis_rx_data(struct hn_rx_ring *,
294 const void *, int);
295 static void hn_rndis_rx_status(struct hn_softc *,
296 const void *, int);
297 static void hn_rndis_init_fixat(struct hn_softc *, int);
298
299 static void hn_nvs_handle_notify(struct hn_softc *,
300 const struct vmbus_chanpkt_hdr *);
301 static void hn_nvs_handle_comp(struct hn_softc *,
302 struct vmbus_channel *,
303 const struct vmbus_chanpkt_hdr *);
304 static void hn_nvs_handle_rxbuf(struct hn_rx_ring *,
305 struct vmbus_channel *,
306 const struct vmbus_chanpkt_hdr *);
307 static void hn_nvs_ack_rxbuf(struct hn_rx_ring *,
308 struct vmbus_channel *, uint64_t);
309
310 #if __FreeBSD_version >= 1100099
311 static int hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
312 static int hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
313 #endif
314 static int hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
315 static int hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
316 #if __FreeBSD_version < 1100095
317 static int hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
318 #else
319 static int hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
320 #endif
321 static int hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
322 static int hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
323 static int hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
324 static int hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
325 static int hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
326 static int hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
327 static int hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
328 #ifndef RSS
329 static int hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
330 static int hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
331 #endif
332 static int hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
333 static int hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS);
334 static int hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS);
335 static int hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
336 static int hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
337 static int hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
338 static int hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);
339 static int hn_polling_sysctl(SYSCTL_HANDLER_ARGS);
340 static int hn_vf_sysctl(SYSCTL_HANDLER_ARGS);
341 static int hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS);
342 static int hn_vflist_sysctl(SYSCTL_HANDLER_ARGS);
343 static int hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS);
344 static int hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS);
345 static int hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS);
346
347 static void hn_stop(struct hn_softc *, bool);
348 static void hn_init_locked(struct hn_softc *);
349 static int hn_chan_attach(struct hn_softc *,
350 struct vmbus_channel *);
351 static void hn_chan_detach(struct hn_softc *,
352 struct vmbus_channel *);
353 static int hn_attach_subchans(struct hn_softc *);
354 static void hn_detach_allchans(struct hn_softc *);
355 static void hn_chan_rollup(struct hn_rx_ring *,
356 struct hn_tx_ring *);
357 static void hn_set_ring_inuse(struct hn_softc *, int);
358 static int hn_synth_attach(struct hn_softc *, int);
359 static void hn_synth_detach(struct hn_softc *);
360 static int hn_synth_alloc_subchans(struct hn_softc *,
361 int *);
362 static bool hn_synth_attachable(const struct hn_softc *);
363 static void hn_suspend(struct hn_softc *);
364 static void hn_suspend_data(struct hn_softc *);
365 static void hn_suspend_mgmt(struct hn_softc *);
366 static void hn_resume(struct hn_softc *);
367 static void hn_resume_data(struct hn_softc *);
368 static void hn_resume_mgmt(struct hn_softc *);
369 static void hn_suspend_mgmt_taskfunc(void *, int);
370 static void hn_chan_drain(struct hn_softc *,
371 struct vmbus_channel *);
372 static void hn_disable_rx(struct hn_softc *);
373 static void hn_drain_rxtx(struct hn_softc *, int);
374 static void hn_polling(struct hn_softc *, u_int);
375 static void hn_chan_polling(struct vmbus_channel *, u_int);
376 static void hn_mtu_change_fixup(struct hn_softc *);
377
378 static void hn_update_link_status(struct hn_softc *);
379 static void hn_change_network(struct hn_softc *);
380 static void hn_link_taskfunc(void *, int);
381 static void hn_netchg_init_taskfunc(void *, int);
382 static void hn_netchg_status_taskfunc(void *, int);
383 static void hn_link_status(struct hn_softc *);
384
385 static int hn_create_rx_data(struct hn_softc *, int);
386 static void hn_destroy_rx_data(struct hn_softc *);
387 static int hn_check_iplen(const struct mbuf *, int);
388 static void hn_rxpkt_proto(const struct mbuf *, int *, int *);
389 static int hn_set_rxfilter(struct hn_softc *, uint32_t);
390 static int hn_rxfilter_config(struct hn_softc *);
391 static int hn_rss_reconfig(struct hn_softc *);
392 static void hn_rss_ind_fixup(struct hn_softc *);
393 static void hn_rss_mbuf_hash(struct hn_softc *, uint32_t);
394 static int hn_rxpkt(struct hn_rx_ring *, const void *,
395 int, const struct hn_rxinfo *);
396 static uint32_t hn_rss_type_fromndis(uint32_t);
397 static uint32_t hn_rss_type_tondis(uint32_t);
398
399 static int hn_tx_ring_create(struct hn_softc *, int);
400 static void hn_tx_ring_destroy(struct hn_tx_ring *);
401 static int hn_create_tx_data(struct hn_softc *, int);
402 static void hn_fixup_tx_data(struct hn_softc *);
403 static void hn_fixup_rx_data(struct hn_softc *);
404 static void hn_destroy_tx_data(struct hn_softc *);
405 static void hn_txdesc_dmamap_destroy(struct hn_txdesc *);
406 static void hn_txdesc_gc(struct hn_tx_ring *,
407 struct hn_txdesc *);
408 static int hn_encap(struct ifnet *, struct hn_tx_ring *,
409 struct hn_txdesc *, struct mbuf **);
410 static int hn_txpkt(struct ifnet *, struct hn_tx_ring *,
411 struct hn_txdesc *);
412 static void hn_set_chim_size(struct hn_softc *, int);
413 static void hn_set_tso_maxsize(struct hn_softc *, int, int);
414 static bool hn_tx_ring_pending(struct hn_tx_ring *);
415 static void hn_tx_ring_qflush(struct hn_tx_ring *);
416 static void hn_resume_tx(struct hn_softc *, int);
417 static void hn_set_txagg(struct hn_softc *);
418 static void *hn_try_txagg(struct ifnet *,
419 struct hn_tx_ring *, struct hn_txdesc *,
420 int);
421 static int hn_get_txswq_depth(const struct hn_tx_ring *);
422 static void hn_txpkt_done(struct hn_nvs_sendctx *,
423 struct hn_softc *, struct vmbus_channel *,
424 const void *, int);
425 static int hn_txpkt_sglist(struct hn_tx_ring *,
426 struct hn_txdesc *);
427 static int hn_txpkt_chim(struct hn_tx_ring *,
428 struct hn_txdesc *);
429 static int hn_xmit(struct hn_tx_ring *, int);
430 static void hn_xmit_taskfunc(void *, int);
431 static void hn_xmit_txeof(struct hn_tx_ring *);
432 static void hn_xmit_txeof_taskfunc(void *, int);
433 #ifdef HN_IFSTART_SUPPORT
434 static int hn_start_locked(struct hn_tx_ring *, int);
435 static void hn_start_taskfunc(void *, int);
436 static void hn_start_txeof(struct hn_tx_ring *);
437 static void hn_start_txeof_taskfunc(void *, int);
438 #endif
439
440 SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
441 "Hyper-V network interface");
442
/* Trust tcp segment verification on host side. */
444 static int hn_trust_hosttcp = 1;
445 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
446 &hn_trust_hosttcp, 0,
447 "Trust tcp segement verification on host side, "
448 "when csum info is missing (global setting)");
449
450 /* Trust udp datagrams verification on host side. */
451 static int hn_trust_hostudp = 1;
452 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
453 &hn_trust_hostudp, 0,
454 "Trust udp datagram verification on host side, "
455 "when csum info is missing (global setting)");
456
457 /* Trust ip packets verification on host side. */
458 static int hn_trust_hostip = 1;
459 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
460 &hn_trust_hostip, 0,
461 "Trust ip packet verification on host side, "
462 "when csum info is missing (global setting)");
463
464 /*
465 * Offload UDP/IPv4 checksum.
466 */
467 static int hn_enable_udp4cs = 1;
468 SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp4cs, CTLFLAG_RDTUN,
469 &hn_enable_udp4cs, 0, "Offload UDP/IPv4 checksum");
470
471 /*
472 * Offload UDP/IPv6 checksum.
473 */
474 static int hn_enable_udp6cs = 1;
475 SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp6cs, CTLFLAG_RDTUN,
476 &hn_enable_udp6cs, 0, "Offload UDP/IPv6 checksum");
477
478 /* Stats. */
479 static counter_u64_t hn_udpcs_fixup;
480 SYSCTL_COUNTER_U64(_hw_hn, OID_AUTO, udpcs_fixup, CTLFLAG_RW,
481 &hn_udpcs_fixup, "# of UDP checksum fixup");
482
483 /*
484 * See hn_set_hlen().
485 *
486 * This value is for Azure. For Hyper-V, set this above
487 * 65536 to disable UDP datagram checksum fixup.
488 */
489 static int hn_udpcs_fixup_mtu = 1420;
490 SYSCTL_INT(_hw_hn, OID_AUTO, udpcs_fixup_mtu, CTLFLAG_RWTUN,
491 &hn_udpcs_fixup_mtu, 0, "UDP checksum fixup MTU threshold");
492
493 /* Limit TSO burst size */
494 static int hn_tso_maxlen = IP_MAXPACKET;
495 SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
496 &hn_tso_maxlen, 0, "TSO burst limit");
497
498 /* Limit chimney send size */
499 static int hn_tx_chimney_size = 0;
500 SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
501 &hn_tx_chimney_size, 0, "Chimney send packet size limit");
502
503 /* Limit the size of packet for direct transmission */
504 static int hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
505 SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
506 &hn_direct_tx_size, 0, "Size of the packet for direct transmission");
507
508 /* # of LRO entries per RX ring */
509 #if defined(INET) || defined(INET6)
510 #if __FreeBSD_version >= 1100095
511 static int hn_lro_entry_count = HN_LROENT_CNT_DEF;
512 SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
513 &hn_lro_entry_count, 0, "LRO entry count");
514 #endif
515 #endif
516
517 static int hn_tx_taskq_cnt = 1;
518 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
519 &hn_tx_taskq_cnt, 0, "# of TX taskqueues");
520
521 #define HN_TX_TASKQ_M_INDEP 0
522 #define HN_TX_TASKQ_M_GLOBAL 1
523 #define HN_TX_TASKQ_M_EVTTQ 2
524
525 static int hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
526 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
527 &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
528 "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");
529
530 #ifndef HN_USE_TXDESC_BUFRING
531 static int hn_use_txdesc_bufring = 0;
532 #else
533 static int hn_use_txdesc_bufring = 1;
534 #endif
535 SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
536 &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");
537
538 #ifdef HN_IFSTART_SUPPORT
539 /* Use ifnet.if_start instead of ifnet.if_transmit */
540 static int hn_use_if_start = 0;
541 SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
542 &hn_use_if_start, 0, "Use if_start TX method");
543 #endif
544
545 /* # of channels to use */
546 static int hn_chan_cnt = 0;
547 SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
548 &hn_chan_cnt, 0,
549 "# of channels to use; each channel has one RX ring and one TX ring");
550
551 /* # of transmit rings to use */
552 static int hn_tx_ring_cnt = 0;
553 SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
554 &hn_tx_ring_cnt, 0, "# of TX rings to use");
555
/* Software TX ring depth */
557 static int hn_tx_swq_depth = 0;
558 SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
559 &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");
560
561 /* Enable sorted LRO, and the depth of the per-channel mbuf queue */
562 #if __FreeBSD_version >= 1100095
563 static u_int hn_lro_mbufq_depth = 512;
564 SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
565 &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
566 #endif
567
568 /* Packet transmission aggregation size limit */
569 static int hn_tx_agg_size = -1;
570 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
571 &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");
572
573 /* Packet transmission aggregation count limit */
574 static int hn_tx_agg_pkts = -1;
575 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
576 &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");
577
578 /* VF list */
579 SYSCTL_PROC(_hw_hn, OID_AUTO, vflist, CTLFLAG_RD | CTLTYPE_STRING,
580 0, 0, hn_vflist_sysctl, "A", "VF list");
581
582 /* VF mapping */
583 SYSCTL_PROC(_hw_hn, OID_AUTO, vfmap, CTLFLAG_RD | CTLTYPE_STRING,
584 0, 0, hn_vfmap_sysctl, "A", "VF mapping");
585
586 /* Transparent VF */
587 static int hn_xpnt_vf = 1;
588 SYSCTL_INT(_hw_hn, OID_AUTO, vf_transparent, CTLFLAG_RDTUN,
589 &hn_xpnt_vf, 0, "Transparent VF mod");
590
591 /* Accurate BPF support for Transparent VF */
592 static int hn_xpnt_vf_accbpf = 0;
593 SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_accbpf, CTLFLAG_RDTUN,
594 &hn_xpnt_vf_accbpf, 0, "Accurate BPF for transparent VF");
595
596 /* Extra wait for transparent VF attach routing; unit seconds. */
597 static int hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
598 SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_attwait, CTLFLAG_RWTUN,
599 &hn_xpnt_vf_attwait, 0,
600 "Extra wait for transparent VF attach routing; unit: seconds");
601
602 static u_int hn_cpu_index; /* next CPU for channel */
603 static struct taskqueue **hn_tx_taskque;/* shared TX taskqueues */
604
605 static struct rmlock hn_vfmap_lock;
606 static int hn_vfmap_size;
607 static struct ifnet **hn_vfmap;
608
609 #ifndef RSS
610 static const uint8_t
611 hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
612 0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
613 0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
614 0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
615 0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
616 0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
617 };
618 #endif /* !RSS */
619
620 static const struct hyperv_guid hn_guid = {
621 .hv_guid = {
622 0x63, 0x51, 0x61, 0xf8, 0x3e, 0xdf, 0xc5, 0x46,
623 0x91, 0x3f, 0xf2, 0xd2, 0xf9, 0x65, 0xed, 0x0e }
624 };
625
626 static device_method_t hn_methods[] = {
627 /* Device interface */
628 DEVMETHOD(device_probe, hn_probe),
629 DEVMETHOD(device_attach, hn_attach),
630 DEVMETHOD(device_detach, hn_detach),
631 DEVMETHOD(device_shutdown, hn_shutdown),
632 DEVMETHOD_END
633 };
634
635 static driver_t hn_driver = {
636 "hn",
637 hn_methods,
638 sizeof(struct hn_softc)
639 };
640
641 static devclass_t hn_devclass;
642
643 DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0);
644 MODULE_VERSION(hn, 1);
645 MODULE_DEPEND(hn, vmbus, 1, 1, 1);
646
647 #if __FreeBSD_version >= 1100099
648 static void
hn_set_lro_lenlim(struct hn_softc * sc,int lenlim)649 hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
650 {
651 int i;
652
653 for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
654 sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
655 }
656 #endif
657
658 static int
hn_txpkt_sglist(struct hn_tx_ring * txr,struct hn_txdesc * txd)659 hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
660 {
661
662 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
663 txd->chim_size == 0, ("invalid rndis sglist txd"));
664 return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
665 &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
666 }
667
668 static int
hn_txpkt_chim(struct hn_tx_ring * txr,struct hn_txdesc * txd)669 hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
670 {
671 struct hn_nvs_rndis rndis;
672
673 KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
674 txd->chim_size > 0, ("invalid rndis chim txd"));
675
676 rndis.nvs_type = HN_NVS_TYPE_RNDIS;
677 rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
678 rndis.nvs_chim_idx = txd->chim_index;
679 rndis.nvs_chim_sz = txd->chim_size;
680
681 return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
682 &rndis, sizeof(rndis), &txd->send_ctx));
683 }
684
685 static __inline uint32_t
hn_chim_alloc(struct hn_softc * sc)686 hn_chim_alloc(struct hn_softc *sc)
687 {
688 int i, bmap_cnt = sc->hn_chim_bmap_cnt;
689 u_long *bmap = sc->hn_chim_bmap;
690 uint32_t ret = HN_NVS_CHIM_IDX_INVALID;
691
692 for (i = 0; i < bmap_cnt; ++i) {
693 int idx;
694
695 idx = ffsl(~bmap[i]);
696 if (idx == 0)
697 continue;
698
699 --idx; /* ffsl is 1-based */
700 KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
701 ("invalid i %d and idx %d", i, idx));
702
703 if (atomic_testandset_long(&bmap[i], idx))
704 continue;
705
706 ret = i * LONG_BIT + idx;
707 break;
708 }
709 return (ret);
710 }
711
712 static __inline void
hn_chim_free(struct hn_softc * sc,uint32_t chim_idx)713 hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
714 {
715 u_long mask;
716 uint32_t idx;
717
718 idx = chim_idx / LONG_BIT;
719 KASSERT(idx < sc->hn_chim_bmap_cnt,
720 ("invalid chimney index 0x%x", chim_idx));
721
722 mask = 1UL << (chim_idx % LONG_BIT);
723 KASSERT(sc->hn_chim_bmap[idx] & mask,
724 ("index bitmap 0x%lx, chimney index %u, "
725 "bitmap idx %d, bitmask 0x%lx",
726 sc->hn_chim_bmap[idx], chim_idx, idx, mask));
727
728 atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
729 }
730
731 #if defined(INET6) || defined(INET)
732
/*
 * If the first mbuf of 'm' holds fewer than 'len' bytes (m_len < len),
 * replace 'm' with the result of m_pullup(m, len).
 *
 * NOTE: this macro has hidden control flow -- when m_pullup() returns
 * NULL the *enclosing function* returns NULL.  It also reassigns 'm', so
 * pointers obtained from the old 'm' should be re-derived afterwards.
 */
#define PULLUP_HDR(m, len)				\
do {							\
	if (__predict_false((m)->m_len < (len))) {	\
		(m) = m_pullup((m), (len));		\
		if ((m) == NULL)			\
			return (NULL);			\
	}						\
} while (0)
741
742 /*
743 * NOTE: If this function failed, the m_head would be freed.
744 */
745 static __inline struct mbuf *
hn_tso_fixup(struct mbuf * m_head)746 hn_tso_fixup(struct mbuf *m_head)
747 {
748 struct ether_vlan_header *evl;
749 struct tcphdr *th;
750 int ehlen;
751
752 KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));
753
754 PULLUP_HDR(m_head, sizeof(*evl));
755 evl = mtod(m_head, struct ether_vlan_header *);
756 if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
757 ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
758 else
759 ehlen = ETHER_HDR_LEN;
760 m_head->m_pkthdr.l2hlen = ehlen;
761
762 #ifdef INET
763 if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
764 struct ip *ip;
765 int iphlen;
766
767 PULLUP_HDR(m_head, ehlen + sizeof(*ip));
768 ip = mtodo(m_head, ehlen);
769 iphlen = ip->ip_hl << 2;
770 m_head->m_pkthdr.l3hlen = iphlen;
771
772 PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
773 th = mtodo(m_head, ehlen + iphlen);
774
775 ip->ip_len = 0;
776 ip->ip_sum = 0;
777 th->th_sum = in_pseudo(ip->ip_src.s_addr,
778 ip->ip_dst.s_addr, htons(IPPROTO_TCP));
779 }
780 #endif
781 #if defined(INET6) && defined(INET)
782 else
783 #endif
784 #ifdef INET6
785 {
786 struct ip6_hdr *ip6;
787
788 PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
789 ip6 = mtodo(m_head, ehlen);
790 if (ip6->ip6_nxt != IPPROTO_TCP) {
791 m_freem(m_head);
792 return (NULL);
793 }
794 m_head->m_pkthdr.l3hlen = sizeof(*ip6);
795
796 PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
797 th = mtodo(m_head, ehlen + sizeof(*ip6));
798
799 ip6->ip6_plen = 0;
800 th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
801 }
802 #endif
803 return (m_head);
804 }
805
806 /*
807 * NOTE: If this function failed, the m_head would be freed.
808 */
809 static __inline struct mbuf *
hn_set_hlen(struct mbuf * m_head)810 hn_set_hlen(struct mbuf *m_head)
811 {
812 const struct ether_vlan_header *evl;
813 int ehlen;
814
815 PULLUP_HDR(m_head, sizeof(*evl));
816 evl = mtod(m_head, const struct ether_vlan_header *);
817 if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
818 ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
819 else
820 ehlen = ETHER_HDR_LEN;
821 m_head->m_pkthdr.l2hlen = ehlen;
822
823 #ifdef INET
824 if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP_UDP)) {
825 const struct ip *ip;
826 int iphlen;
827
828 PULLUP_HDR(m_head, ehlen + sizeof(*ip));
829 ip = mtodo(m_head, ehlen);
830 iphlen = ip->ip_hl << 2;
831 m_head->m_pkthdr.l3hlen = iphlen;
832
833 /*
834 * UDP checksum offload does not work in Azure, if the
835 * following conditions meet:
836 * - sizeof(IP hdr + UDP hdr + payload) > 1420.
837 * - IP_DF is not set in the IP hdr.
838 *
839 * Fallback to software checksum for these UDP datagrams.
840 */
841 if ((m_head->m_pkthdr.csum_flags & CSUM_IP_UDP) &&
842 m_head->m_pkthdr.len > hn_udpcs_fixup_mtu + ehlen &&
843 (ntohs(ip->ip_off) & IP_DF) == 0) {
844 uint16_t off = ehlen + iphlen;
845
846 counter_u64_add(hn_udpcs_fixup, 1);
847 PULLUP_HDR(m_head, off + sizeof(struct udphdr));
848 *(uint16_t *)(m_head->m_data + off +
849 m_head->m_pkthdr.csum_data) = in_cksum_skip(
850 m_head, m_head->m_pkthdr.len, off);
851 m_head->m_pkthdr.csum_flags &= ~CSUM_IP_UDP;
852 }
853 }
854 #endif
855 #if defined(INET6) && defined(INET)
856 else
857 #endif
858 #ifdef INET6
859 {
860 const struct ip6_hdr *ip6;
861
862 PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
863 ip6 = mtodo(m_head, ehlen);
864 if (ip6->ip6_nxt != IPPROTO_TCP &&
865 ip6->ip6_nxt != IPPROTO_UDP) {
866 m_freem(m_head);
867 return (NULL);
868 }
869 m_head->m_pkthdr.l3hlen = sizeof(*ip6);
870 }
871 #endif
872 return (m_head);
873 }
874
875 /*
876 * NOTE: If this function failed, the m_head would be freed.
877 */
878 static __inline struct mbuf *
hn_check_tcpsyn(struct mbuf * m_head,int * tcpsyn)879 hn_check_tcpsyn(struct mbuf *m_head, int *tcpsyn)
880 {
881 const struct tcphdr *th;
882 int ehlen, iphlen;
883
884 *tcpsyn = 0;
885 ehlen = m_head->m_pkthdr.l2hlen;
886 iphlen = m_head->m_pkthdr.l3hlen;
887
888 PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
889 th = mtodo(m_head, ehlen + iphlen);
890 if (th->th_flags & TH_SYN)
891 *tcpsyn = 1;
892 return (m_head);
893 }
894
895 #undef PULLUP_HDR
896
897 #endif /* INET6 || INET */
898
899 static int
hn_set_rxfilter(struct hn_softc * sc,uint32_t filter)900 hn_set_rxfilter(struct hn_softc *sc, uint32_t filter)
901 {
902 int error = 0;
903
904 HN_LOCK_ASSERT(sc);
905
906 if (sc->hn_rx_filter != filter) {
907 error = hn_rndis_set_rxfilter(sc, filter);
908 if (!error)
909 sc->hn_rx_filter = filter;
910 }
911 return (error);
912 }
913
914 static int
hn_rxfilter_config(struct hn_softc * sc)915 hn_rxfilter_config(struct hn_softc *sc)
916 {
917 struct ifnet *ifp = sc->hn_ifp;
918 uint32_t filter;
919
920 HN_LOCK_ASSERT(sc);
921
922 /*
923 * If the non-transparent mode VF is activated, we don't know how
924 * its RX filter is configured, so stick the synthetic device in
925 * the promiscous mode.
926 */
927 if ((ifp->if_flags & IFF_PROMISC) || (sc->hn_flags & HN_FLAG_RXVF)) {
928 filter = NDIS_PACKET_TYPE_PROMISCUOUS;
929 } else {
930 filter = NDIS_PACKET_TYPE_DIRECTED;
931 if (ifp->if_flags & IFF_BROADCAST)
932 filter |= NDIS_PACKET_TYPE_BROADCAST;
933 /* TODO: support multicast list */
934 if ((ifp->if_flags & IFF_ALLMULTI) ||
935 !TAILQ_EMPTY(&ifp->if_multiaddrs))
936 filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
937 }
938 return (hn_set_rxfilter(sc, filter));
939 }
940
941 static void
hn_set_txagg(struct hn_softc * sc)942 hn_set_txagg(struct hn_softc *sc)
943 {
944 uint32_t size, pkts;
945 int i;
946
947 /*
948 * Setup aggregation size.
949 */
950 if (sc->hn_agg_size < 0)
951 size = UINT32_MAX;
952 else
953 size = sc->hn_agg_size;
954
955 if (sc->hn_rndis_agg_size < size)
956 size = sc->hn_rndis_agg_size;
957
958 /* NOTE: We only aggregate packets using chimney sending buffers. */
959 if (size > (uint32_t)sc->hn_chim_szmax)
960 size = sc->hn_chim_szmax;
961
962 if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
963 /* Disable */
964 size = 0;
965 pkts = 0;
966 goto done;
967 }
968
969 /* NOTE: Type of the per TX ring setting is 'int'. */
970 if (size > INT_MAX)
971 size = INT_MAX;
972
973 /*
974 * Setup aggregation packet count.
975 */
976 if (sc->hn_agg_pkts < 0)
977 pkts = UINT32_MAX;
978 else
979 pkts = sc->hn_agg_pkts;
980
981 if (sc->hn_rndis_agg_pkts < pkts)
982 pkts = sc->hn_rndis_agg_pkts;
983
984 if (pkts <= 1) {
985 /* Disable */
986 size = 0;
987 pkts = 0;
988 goto done;
989 }
990
991 /* NOTE: Type of the per TX ring setting is 'short'. */
992 if (pkts > SHRT_MAX)
993 pkts = SHRT_MAX;
994
995 done:
996 /* NOTE: Type of the per TX ring setting is 'short'. */
997 if (sc->hn_rndis_agg_align > SHRT_MAX) {
998 /* Disable */
999 size = 0;
1000 pkts = 0;
1001 }
1002
1003 if (bootverbose) {
1004 if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
1005 size, pkts, sc->hn_rndis_agg_align);
1006 }
1007
1008 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
1009 struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
1010
1011 mtx_lock(&txr->hn_tx_lock);
1012 txr->hn_agg_szmax = size;
1013 txr->hn_agg_pktmax = pkts;
1014 txr->hn_agg_align = sc->hn_rndis_agg_align;
1015 mtx_unlock(&txr->hn_tx_lock);
1016 }
1017 }
1018
1019 static int
hn_get_txswq_depth(const struct hn_tx_ring * txr)1020 hn_get_txswq_depth(const struct hn_tx_ring *txr)
1021 {
1022
1023 KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
1024 if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
1025 return txr->hn_txdesc_cnt;
1026 return hn_tx_swq_depth;
1027 }
1028
1029 static int
hn_rss_reconfig(struct hn_softc * sc)1030 hn_rss_reconfig(struct hn_softc *sc)
1031 {
1032 int error;
1033
1034 HN_LOCK_ASSERT(sc);
1035
1036 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
1037 return (ENXIO);
1038
1039 /*
1040 * Disable RSS first.
1041 *
1042 * NOTE:
1043 * Direct reconfiguration by setting the UNCHG flags does
1044 * _not_ work properly.
1045 */
1046 if (bootverbose)
1047 if_printf(sc->hn_ifp, "disable RSS\n");
1048 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
1049 if (error) {
1050 if_printf(sc->hn_ifp, "RSS disable failed\n");
1051 return (error);
1052 }
1053
1054 /*
1055 * Reenable the RSS w/ the updated RSS key or indirect
1056 * table.
1057 */
1058 if (bootverbose)
1059 if_printf(sc->hn_ifp, "reconfig RSS\n");
1060 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
1061 if (error) {
1062 if_printf(sc->hn_ifp, "RSS reconfig failed\n");
1063 return (error);
1064 }
1065 return (0);
1066 }
1067
1068 static void
hn_rss_ind_fixup(struct hn_softc * sc)1069 hn_rss_ind_fixup(struct hn_softc *sc)
1070 {
1071 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
1072 int i, nchan;
1073
1074 nchan = sc->hn_rx_ring_inuse;
1075 KASSERT(nchan > 1, ("invalid # of channels %d", nchan));
1076
1077 /*
1078 * Check indirect table to make sure that all channels in it
1079 * can be used.
1080 */
1081 for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
1082 if (rss->rss_ind[i] >= nchan) {
1083 if_printf(sc->hn_ifp,
1084 "RSS indirect table %d fixup: %u -> %d\n",
1085 i, rss->rss_ind[i], nchan - 1);
1086 rss->rss_ind[i] = nchan - 1;
1087 }
1088 }
1089 }
1090
1091 static int
hn_ifmedia_upd(struct ifnet * ifp __unused)1092 hn_ifmedia_upd(struct ifnet *ifp __unused)
1093 {
1094
1095 return EOPNOTSUPP;
1096 }
1097
1098 static void
hn_ifmedia_sts(struct ifnet * ifp,struct ifmediareq * ifmr)1099 hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
1100 {
1101 struct hn_softc *sc = ifp->if_softc;
1102
1103 ifmr->ifm_status = IFM_AVALID;
1104 ifmr->ifm_active = IFM_ETHER;
1105
1106 if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
1107 ifmr->ifm_active |= IFM_NONE;
1108 return;
1109 }
1110 ifmr->ifm_status |= IFM_ACTIVE;
1111 ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
1112 }
1113
1114 static void
hn_rxvf_set_task(void * xarg,int pending __unused)1115 hn_rxvf_set_task(void *xarg, int pending __unused)
1116 {
1117 struct hn_rxvf_setarg *arg = xarg;
1118
1119 arg->rxr->hn_rxvf_ifp = arg->vf_ifp;
1120 }
1121
1122 static void
hn_rxvf_set(struct hn_softc * sc,struct ifnet * vf_ifp)1123 hn_rxvf_set(struct hn_softc *sc, struct ifnet *vf_ifp)
1124 {
1125 struct hn_rx_ring *rxr;
1126 struct hn_rxvf_setarg arg;
1127 struct task task;
1128 int i;
1129
1130 HN_LOCK_ASSERT(sc);
1131
1132 TASK_INIT(&task, 0, hn_rxvf_set_task, &arg);
1133
1134 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
1135 rxr = &sc->hn_rx_ring[i];
1136
1137 if (i < sc->hn_rx_ring_inuse) {
1138 arg.rxr = rxr;
1139 arg.vf_ifp = vf_ifp;
1140 vmbus_chan_run_task(rxr->hn_chan, &task);
1141 } else {
1142 rxr->hn_rxvf_ifp = vf_ifp;
1143 }
1144 }
1145 }
1146
1147 static bool
hn_ismyvf(const struct hn_softc * sc,const struct ifnet * ifp)1148 hn_ismyvf(const struct hn_softc *sc, const struct ifnet *ifp)
1149 {
1150 const struct ifnet *hn_ifp;
1151
1152 hn_ifp = sc->hn_ifp;
1153
1154 if (ifp == hn_ifp)
1155 return (false);
1156
1157 if (ifp->if_alloctype != IFT_ETHER)
1158 return (false);
1159
1160 /* Ignore lagg/vlan interfaces */
1161 if (strcmp(ifp->if_dname, "lagg") == 0 ||
1162 strcmp(ifp->if_dname, "vlan") == 0)
1163 return (false);
1164
1165 /*
1166 * During detach events ifp->if_addr might be NULL.
1167 * Make sure the bcmp() below doesn't panic on that:
1168 */
1169 if (ifp->if_addr == NULL || hn_ifp->if_addr == NULL)
1170 return (false);
1171
1172 if (bcmp(IF_LLADDR(ifp), IF_LLADDR(hn_ifp), ETHER_ADDR_LEN) != 0)
1173 return (false);
1174
1175 return (true);
1176 }
1177
1178 static void
hn_rxvf_change(struct hn_softc * sc,struct ifnet * ifp,bool rxvf)1179 hn_rxvf_change(struct hn_softc *sc, struct ifnet *ifp, bool rxvf)
1180 {
1181 struct ifnet *hn_ifp;
1182
1183 HN_LOCK(sc);
1184
1185 if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
1186 goto out;
1187
1188 if (!hn_ismyvf(sc, ifp))
1189 goto out;
1190 hn_ifp = sc->hn_ifp;
1191
1192 if (rxvf) {
1193 if (sc->hn_flags & HN_FLAG_RXVF)
1194 goto out;
1195
1196 sc->hn_flags |= HN_FLAG_RXVF;
1197 hn_rxfilter_config(sc);
1198 } else {
1199 if (!(sc->hn_flags & HN_FLAG_RXVF))
1200 goto out;
1201
1202 sc->hn_flags &= ~HN_FLAG_RXVF;
1203 if (hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
1204 hn_rxfilter_config(sc);
1205 else
1206 hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE);
1207 }
1208
1209 hn_nvs_set_datapath(sc,
1210 rxvf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTH);
1211
1212 hn_rxvf_set(sc, rxvf ? ifp : NULL);
1213
1214 if (rxvf) {
1215 hn_vf_rss_fixup(sc, true);
1216 hn_suspend_mgmt(sc);
1217 sc->hn_link_flags &=
1218 ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG);
1219 if_link_state_change(hn_ifp, LINK_STATE_DOWN);
1220 } else {
1221 hn_vf_rss_restore(sc);
1222 hn_resume_mgmt(sc);
1223 }
1224
1225 devctl_notify("HYPERV_NIC_VF", hn_ifp->if_xname,
1226 rxvf ? "VF_UP" : "VF_DOWN", NULL);
1227
1228 if (bootverbose) {
1229 if_printf(hn_ifp, "datapath is switched %s %s\n",
1230 rxvf ? "to" : "from", ifp->if_xname);
1231 }
1232 out:
1233 HN_UNLOCK(sc);
1234 }
1235
1236 static void
hn_ifnet_event(void * arg,struct ifnet * ifp,int event)1237 hn_ifnet_event(void *arg, struct ifnet *ifp, int event)
1238 {
1239
1240 if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN)
1241 return;
1242 hn_rxvf_change(arg, ifp, event == IFNET_EVENT_UP);
1243 }
1244
1245 static void
hn_ifaddr_event(void * arg,struct ifnet * ifp)1246 hn_ifaddr_event(void *arg, struct ifnet *ifp)
1247 {
1248
1249 hn_rxvf_change(arg, ifp, ifp->if_flags & IFF_UP);
1250 }
1251
1252 static int
hn_xpnt_vf_iocsetcaps(struct hn_softc * sc,struct ifreq * ifr)1253 hn_xpnt_vf_iocsetcaps(struct hn_softc *sc, struct ifreq *ifr)
1254 {
1255 struct ifnet *ifp, *vf_ifp;
1256 uint64_t tmp;
1257 int error;
1258
1259 HN_LOCK_ASSERT(sc);
1260 ifp = sc->hn_ifp;
1261 vf_ifp = sc->hn_vf_ifp;
1262
1263 /*
1264 * Fix up requested capabilities w/ supported capabilities,
1265 * since the supported capabilities could have been changed.
1266 */
1267 ifr->ifr_reqcap &= ifp->if_capabilities;
1268 /* Pass SIOCSIFCAP to VF. */
1269 error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFCAP, (caddr_t)ifr);
1270
1271 /*
1272 * NOTE:
1273 * The error will be propagated to the callers, however, it
1274 * is _not_ useful here.
1275 */
1276
1277 /*
1278 * Merge VF's enabled capabilities.
1279 */
1280 ifp->if_capenable = vf_ifp->if_capenable & ifp->if_capabilities;
1281
1282 tmp = vf_ifp->if_hwassist & HN_CSUM_IP_HWASSIST(sc);
1283 if (ifp->if_capenable & IFCAP_TXCSUM)
1284 ifp->if_hwassist |= tmp;
1285 else
1286 ifp->if_hwassist &= ~tmp;
1287
1288 tmp = vf_ifp->if_hwassist & HN_CSUM_IP6_HWASSIST(sc);
1289 if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
1290 ifp->if_hwassist |= tmp;
1291 else
1292 ifp->if_hwassist &= ~tmp;
1293
1294 tmp = vf_ifp->if_hwassist & CSUM_IP_TSO;
1295 if (ifp->if_capenable & IFCAP_TSO4)
1296 ifp->if_hwassist |= tmp;
1297 else
1298 ifp->if_hwassist &= ~tmp;
1299
1300 tmp = vf_ifp->if_hwassist & CSUM_IP6_TSO;
1301 if (ifp->if_capenable & IFCAP_TSO6)
1302 ifp->if_hwassist |= tmp;
1303 else
1304 ifp->if_hwassist &= ~tmp;
1305
1306 return (error);
1307 }
1308
1309 static int
hn_xpnt_vf_iocsetflags(struct hn_softc * sc)1310 hn_xpnt_vf_iocsetflags(struct hn_softc *sc)
1311 {
1312 struct ifnet *vf_ifp;
1313 struct ifreq ifr;
1314
1315 HN_LOCK_ASSERT(sc);
1316 vf_ifp = sc->hn_vf_ifp;
1317
1318 memset(&ifr, 0, sizeof(ifr));
1319 strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
1320 ifr.ifr_flags = vf_ifp->if_flags & 0xffff;
1321 ifr.ifr_flagshigh = vf_ifp->if_flags >> 16;
1322 return (vf_ifp->if_ioctl(vf_ifp, SIOCSIFFLAGS, (caddr_t)&ifr));
1323 }
1324
1325 static void
hn_xpnt_vf_saveifflags(struct hn_softc * sc)1326 hn_xpnt_vf_saveifflags(struct hn_softc *sc)
1327 {
1328 struct ifnet *ifp = sc->hn_ifp;
1329 int allmulti = 0;
1330
1331 HN_LOCK_ASSERT(sc);
1332
1333 /* XXX vlan(4) style mcast addr maintenance */
1334 if (!TAILQ_EMPTY(&ifp->if_multiaddrs))
1335 allmulti = IFF_ALLMULTI;
1336
1337 /* Always set the VF's if_flags */
1338 sc->hn_vf_ifp->if_flags = ifp->if_flags | allmulti;
1339 }
1340
1341 static void
hn_xpnt_vf_input(struct ifnet * vf_ifp,struct mbuf * m)1342 hn_xpnt_vf_input(struct ifnet *vf_ifp, struct mbuf *m)
1343 {
1344 struct rm_priotracker pt;
1345 struct ifnet *hn_ifp = NULL;
1346 struct mbuf *mn;
1347
1348 /*
1349 * XXX racy, if hn(4) ever detached.
1350 */
1351 rm_rlock(&hn_vfmap_lock, &pt);
1352 if (vf_ifp->if_index < hn_vfmap_size)
1353 hn_ifp = hn_vfmap[vf_ifp->if_index];
1354 rm_runlock(&hn_vfmap_lock, &pt);
1355
1356 if (hn_ifp != NULL) {
1357 for (mn = m; mn != NULL; mn = mn->m_nextpkt) {
1358 /*
1359 * Allow tapping on the VF.
1360 */
1361 ETHER_BPF_MTAP(vf_ifp, mn);
1362
1363 /*
1364 * Update VF stats.
1365 */
1366 if ((vf_ifp->if_capenable & IFCAP_HWSTATS) == 0) {
1367 if_inc_counter(vf_ifp, IFCOUNTER_IBYTES,
1368 mn->m_pkthdr.len);
1369 }
1370 /*
1371 * XXX IFCOUNTER_IMCAST
1372 * This stat updating is kinda invasive, since it
1373 * requires two checks on the mbuf: the length check
1374 * and the ethernet header check. As of this write,
1375 * all multicast packets go directly to hn(4), which
1376 * makes imcast stat updating in the VF a try in vian.
1377 */
1378
1379 /*
1380 * Fix up rcvif and increase hn(4)'s ipackets.
1381 */
1382 mn->m_pkthdr.rcvif = hn_ifp;
1383 if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
1384 }
1385 /*
1386 * Go through hn(4)'s if_input.
1387 */
1388 hn_ifp->if_input(hn_ifp, m);
1389 } else {
1390 /*
1391 * In the middle of the transition; free this
1392 * mbuf chain.
1393 */
1394 while (m != NULL) {
1395 mn = m->m_nextpkt;
1396 m->m_nextpkt = NULL;
1397 m_freem(m);
1398 m = mn;
1399 }
1400 }
1401 }
1402
1403 static void
hn_mtu_change_fixup(struct hn_softc * sc)1404 hn_mtu_change_fixup(struct hn_softc *sc)
1405 {
1406 struct ifnet *ifp;
1407
1408 HN_LOCK_ASSERT(sc);
1409 ifp = sc->hn_ifp;
1410
1411 hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu);
1412 #if __FreeBSD_version >= 1100099
1413 if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < HN_LRO_LENLIM_MIN(ifp))
1414 hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
1415 #endif
1416 }
1417
1418 static uint32_t
hn_rss_type_fromndis(uint32_t rss_hash)1419 hn_rss_type_fromndis(uint32_t rss_hash)
1420 {
1421 uint32_t types = 0;
1422
1423 if (rss_hash & NDIS_HASH_IPV4)
1424 types |= RSS_TYPE_IPV4;
1425 if (rss_hash & NDIS_HASH_TCP_IPV4)
1426 types |= RSS_TYPE_TCP_IPV4;
1427 if (rss_hash & NDIS_HASH_IPV6)
1428 types |= RSS_TYPE_IPV6;
1429 if (rss_hash & NDIS_HASH_IPV6_EX)
1430 types |= RSS_TYPE_IPV6_EX;
1431 if (rss_hash & NDIS_HASH_TCP_IPV6)
1432 types |= RSS_TYPE_TCP_IPV6;
1433 if (rss_hash & NDIS_HASH_TCP_IPV6_EX)
1434 types |= RSS_TYPE_TCP_IPV6_EX;
1435 if (rss_hash & NDIS_HASH_UDP_IPV4_X)
1436 types |= RSS_TYPE_UDP_IPV4;
1437 return (types);
1438 }
1439
1440 static uint32_t
hn_rss_type_tondis(uint32_t types)1441 hn_rss_type_tondis(uint32_t types)
1442 {
1443 uint32_t rss_hash = 0;
1444
1445 KASSERT((types & (RSS_TYPE_UDP_IPV6 | RSS_TYPE_UDP_IPV6_EX)) == 0,
1446 ("UDP6 and UDP6EX are not supported"));
1447
1448 if (types & RSS_TYPE_IPV4)
1449 rss_hash |= NDIS_HASH_IPV4;
1450 if (types & RSS_TYPE_TCP_IPV4)
1451 rss_hash |= NDIS_HASH_TCP_IPV4;
1452 if (types & RSS_TYPE_IPV6)
1453 rss_hash |= NDIS_HASH_IPV6;
1454 if (types & RSS_TYPE_IPV6_EX)
1455 rss_hash |= NDIS_HASH_IPV6_EX;
1456 if (types & RSS_TYPE_TCP_IPV6)
1457 rss_hash |= NDIS_HASH_TCP_IPV6;
1458 if (types & RSS_TYPE_TCP_IPV6_EX)
1459 rss_hash |= NDIS_HASH_TCP_IPV6_EX;
1460 if (types & RSS_TYPE_UDP_IPV4)
1461 rss_hash |= NDIS_HASH_UDP_IPV4_X;
1462 return (rss_hash);
1463 }
1464
1465 static void
hn_rss_mbuf_hash(struct hn_softc * sc,uint32_t mbuf_hash)1466 hn_rss_mbuf_hash(struct hn_softc *sc, uint32_t mbuf_hash)
1467 {
1468 int i;
1469
1470 HN_LOCK_ASSERT(sc);
1471
1472 for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1473 sc->hn_rx_ring[i].hn_mbuf_hash = mbuf_hash;
1474 }
1475
1476 static void
hn_vf_rss_fixup(struct hn_softc * sc,bool reconf)1477 hn_vf_rss_fixup(struct hn_softc *sc, bool reconf)
1478 {
1479 struct ifnet *ifp, *vf_ifp;
1480 struct ifrsshash ifrh;
1481 struct ifrsskey ifrk;
1482 int error;
1483 uint32_t my_types, diff_types, mbuf_types = 0;
1484
1485 HN_LOCK_ASSERT(sc);
1486 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
1487 ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname));
1488
1489 if (sc->hn_rx_ring_inuse == 1) {
1490 /* No RSS on synthetic parts; done. */
1491 return;
1492 }
1493 if ((sc->hn_rss_hcap & NDIS_HASH_FUNCTION_TOEPLITZ) == 0) {
1494 /* Synthetic parts do not support Toeplitz; done. */
1495 return;
1496 }
1497
1498 ifp = sc->hn_ifp;
1499 vf_ifp = sc->hn_vf_ifp;
1500
1501 /*
1502 * Extract VF's RSS key. Only 40 bytes key for Toeplitz is
1503 * supported.
1504 */
1505 memset(&ifrk, 0, sizeof(ifrk));
1506 strlcpy(ifrk.ifrk_name, vf_ifp->if_xname, sizeof(ifrk.ifrk_name));
1507 error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSKEY, (caddr_t)&ifrk);
1508 if (error) {
1509 if_printf(ifp, "%s SIOCGIFRSSKEY failed: %d\n",
1510 vf_ifp->if_xname, error);
1511 goto done;
1512 }
1513 if (ifrk.ifrk_func != RSS_FUNC_TOEPLITZ) {
1514 if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
1515 vf_ifp->if_xname, ifrk.ifrk_func);
1516 goto done;
1517 }
1518 if (ifrk.ifrk_keylen != NDIS_HASH_KEYSIZE_TOEPLITZ) {
1519 if_printf(ifp, "%s invalid RSS Toeplitz key length %d\n",
1520 vf_ifp->if_xname, ifrk.ifrk_keylen);
1521 goto done;
1522 }
1523
1524 /*
1525 * Extract VF's RSS hash. Only Toeplitz is supported.
1526 */
1527 memset(&ifrh, 0, sizeof(ifrh));
1528 strlcpy(ifrh.ifrh_name, vf_ifp->if_xname, sizeof(ifrh.ifrh_name));
1529 error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSHASH, (caddr_t)&ifrh);
1530 if (error) {
1531 if_printf(ifp, "%s SIOCGRSSHASH failed: %d\n",
1532 vf_ifp->if_xname, error);
1533 goto done;
1534 }
1535 if (ifrh.ifrh_func != RSS_FUNC_TOEPLITZ) {
1536 if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
1537 vf_ifp->if_xname, ifrh.ifrh_func);
1538 goto done;
1539 }
1540
1541 my_types = hn_rss_type_fromndis(sc->hn_rss_hcap);
1542 if ((ifrh.ifrh_types & my_types) == 0) {
1543 /* This disables RSS; ignore it then */
1544 if_printf(ifp, "%s intersection of RSS types failed. "
1545 "VF %#x, mine %#x\n", vf_ifp->if_xname,
1546 ifrh.ifrh_types, my_types);
1547 goto done;
1548 }
1549
1550 diff_types = my_types ^ ifrh.ifrh_types;
1551 my_types &= ifrh.ifrh_types;
1552 mbuf_types = my_types;
1553
1554 /*
1555 * Detect RSS hash value/type confliction.
1556 *
1557 * NOTE:
1558 * We don't disable the hash type, but stop delivery the hash
1559 * value/type through mbufs on RX path.
1560 *
1561 * XXX If HN_CAP_UDPHASH is set in hn_caps, then UDP 4-tuple
1562 * hash is delivered with type of TCP_IPV4. This means if
1563 * UDP_IPV4 is enabled, then TCP_IPV4 should be forced, at
1564 * least to hn_mbuf_hash. However, given that _all_ of the
1565 * NICs implement TCP_IPV4, this will _not_ impose any issues
1566 * here.
1567 */
1568 if ((my_types & RSS_TYPE_IPV4) &&
1569 (diff_types & ifrh.ifrh_types &
1570 (RSS_TYPE_TCP_IPV4 | RSS_TYPE_UDP_IPV4))) {
1571 /* Conflict; disable IPV4 hash type/value delivery. */
1572 if_printf(ifp, "disable IPV4 mbuf hash delivery\n");
1573 mbuf_types &= ~RSS_TYPE_IPV4;
1574 }
1575 if ((my_types & RSS_TYPE_IPV6) &&
1576 (diff_types & ifrh.ifrh_types &
1577 (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
1578 RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
1579 RSS_TYPE_IPV6_EX))) {
1580 /* Conflict; disable IPV6 hash type/value delivery. */
1581 if_printf(ifp, "disable IPV6 mbuf hash delivery\n");
1582 mbuf_types &= ~RSS_TYPE_IPV6;
1583 }
1584 if ((my_types & RSS_TYPE_IPV6_EX) &&
1585 (diff_types & ifrh.ifrh_types &
1586 (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
1587 RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
1588 RSS_TYPE_IPV6))) {
1589 /* Conflict; disable IPV6_EX hash type/value delivery. */
1590 if_printf(ifp, "disable IPV6_EX mbuf hash delivery\n");
1591 mbuf_types &= ~RSS_TYPE_IPV6_EX;
1592 }
1593 if ((my_types & RSS_TYPE_TCP_IPV6) &&
1594 (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6_EX)) {
1595 /* Conflict; disable TCP_IPV6 hash type/value delivery. */
1596 if_printf(ifp, "disable TCP_IPV6 mbuf hash delivery\n");
1597 mbuf_types &= ~RSS_TYPE_TCP_IPV6;
1598 }
1599 if ((my_types & RSS_TYPE_TCP_IPV6_EX) &&
1600 (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6)) {
1601 /* Conflict; disable TCP_IPV6_EX hash type/value delivery. */
1602 if_printf(ifp, "disable TCP_IPV6_EX mbuf hash delivery\n");
1603 mbuf_types &= ~RSS_TYPE_TCP_IPV6_EX;
1604 }
1605 if ((my_types & RSS_TYPE_UDP_IPV6) &&
1606 (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6_EX)) {
1607 /* Conflict; disable UDP_IPV6 hash type/value delivery. */
1608 if_printf(ifp, "disable UDP_IPV6 mbuf hash delivery\n");
1609 mbuf_types &= ~RSS_TYPE_UDP_IPV6;
1610 }
1611 if ((my_types & RSS_TYPE_UDP_IPV6_EX) &&
1612 (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6)) {
1613 /* Conflict; disable UDP_IPV6_EX hash type/value delivery. */
1614 if_printf(ifp, "disable UDP_IPV6_EX mbuf hash delivery\n");
1615 mbuf_types &= ~RSS_TYPE_UDP_IPV6_EX;
1616 }
1617
1618 /*
1619 * Indirect table does not matter.
1620 */
1621
1622 sc->hn_rss_hash = (sc->hn_rss_hcap & NDIS_HASH_FUNCTION_MASK) |
1623 hn_rss_type_tondis(my_types);
1624 memcpy(sc->hn_rss.rss_key, ifrk.ifrk_key, sizeof(sc->hn_rss.rss_key));
1625 sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
1626
1627 if (reconf) {
1628 error = hn_rss_reconfig(sc);
1629 if (error) {
1630 /* XXX roll-back? */
1631 if_printf(ifp, "hn_rss_reconfig failed: %d\n", error);
1632 /* XXX keep going. */
1633 }
1634 }
1635 done:
1636 /* Hash deliverability for mbufs. */
1637 hn_rss_mbuf_hash(sc, hn_rss_type_tondis(mbuf_types));
1638 }
1639
1640 static void
hn_vf_rss_restore(struct hn_softc * sc)1641 hn_vf_rss_restore(struct hn_softc *sc)
1642 {
1643
1644 HN_LOCK_ASSERT(sc);
1645 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
1646 ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname));
1647
1648 if (sc->hn_rx_ring_inuse == 1)
1649 goto done;
1650
1651 /*
1652 * Restore hash types. Key does _not_ matter.
1653 */
1654 if (sc->hn_rss_hash != sc->hn_rss_hcap) {
1655 int error;
1656
1657 sc->hn_rss_hash = sc->hn_rss_hcap;
1658 error = hn_rss_reconfig(sc);
1659 if (error) {
1660 if_printf(sc->hn_ifp, "hn_rss_reconfig failed: %d\n",
1661 error);
1662 /* XXX keep going. */
1663 }
1664 }
1665 done:
1666 /* Hash deliverability for mbufs. */
1667 hn_rss_mbuf_hash(sc, NDIS_HASH_ALL);
1668 }
1669
1670 static void
hn_xpnt_vf_setready(struct hn_softc * sc)1671 hn_xpnt_vf_setready(struct hn_softc *sc)
1672 {
1673 struct ifnet *ifp, *vf_ifp;
1674 struct ifreq ifr;
1675
1676 HN_LOCK_ASSERT(sc);
1677 ifp = sc->hn_ifp;
1678 vf_ifp = sc->hn_vf_ifp;
1679
1680 /*
1681 * Mark the VF ready.
1682 */
1683 sc->hn_vf_rdytick = 0;
1684
1685 /*
1686 * Save information for restoration.
1687 */
1688 sc->hn_saved_caps = ifp->if_capabilities;
1689 sc->hn_saved_tsomax = ifp->if_hw_tsomax;
1690 sc->hn_saved_tsosegcnt = ifp->if_hw_tsomaxsegcount;
1691 sc->hn_saved_tsosegsz = ifp->if_hw_tsomaxsegsize;
1692
1693 /*
1694 * Intersect supported/enabled capabilities.
1695 *
1696 * NOTE:
1697 * if_hwassist is not changed here.
1698 */
1699 ifp->if_capabilities &= vf_ifp->if_capabilities;
1700 ifp->if_capenable &= ifp->if_capabilities;
1701
1702 /*
1703 * Fix TSO settings.
1704 */
1705 if (ifp->if_hw_tsomax > vf_ifp->if_hw_tsomax)
1706 ifp->if_hw_tsomax = vf_ifp->if_hw_tsomax;
1707 if (ifp->if_hw_tsomaxsegcount > vf_ifp->if_hw_tsomaxsegcount)
1708 ifp->if_hw_tsomaxsegcount = vf_ifp->if_hw_tsomaxsegcount;
1709 if (ifp->if_hw_tsomaxsegsize > vf_ifp->if_hw_tsomaxsegsize)
1710 ifp->if_hw_tsomaxsegsize = vf_ifp->if_hw_tsomaxsegsize;
1711
1712 /*
1713 * Change VF's enabled capabilities.
1714 */
1715 memset(&ifr, 0, sizeof(ifr));
1716 strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
1717 ifr.ifr_reqcap = ifp->if_capenable;
1718 hn_xpnt_vf_iocsetcaps(sc, &ifr);
1719
1720 if (ifp->if_mtu != ETHERMTU) {
1721 int error;
1722
1723 /*
1724 * Change VF's MTU.
1725 */
1726 memset(&ifr, 0, sizeof(ifr));
1727 strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
1728 ifr.ifr_mtu = ifp->if_mtu;
1729 error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU, (caddr_t)&ifr);
1730 if (error) {
1731 if_printf(ifp, "%s SIOCSIFMTU %u failed\n",
1732 vf_ifp->if_xname, ifp->if_mtu);
1733 if (ifp->if_mtu > ETHERMTU) {
1734 if_printf(ifp, "change MTU to %d\n", ETHERMTU);
1735
1736 /*
1737 * XXX
1738 * No need to adjust the synthetic parts' MTU;
1739 * failure of the adjustment will cause us
1740 * infinite headache.
1741 */
1742 ifp->if_mtu = ETHERMTU;
1743 hn_mtu_change_fixup(sc);
1744 }
1745 }
1746 }
1747 }
1748
1749 static bool
hn_xpnt_vf_isready(struct hn_softc * sc)1750 hn_xpnt_vf_isready(struct hn_softc *sc)
1751 {
1752
1753 HN_LOCK_ASSERT(sc);
1754
1755 if (!hn_xpnt_vf || sc->hn_vf_ifp == NULL)
1756 return (false);
1757
1758 if (sc->hn_vf_rdytick == 0)
1759 return (true);
1760
1761 if (sc->hn_vf_rdytick > ticks)
1762 return (false);
1763
1764 /* Mark VF as ready. */
1765 hn_xpnt_vf_setready(sc);
1766 return (true);
1767 }
1768
1769 static void
hn_xpnt_vf_setenable(struct hn_softc * sc)1770 hn_xpnt_vf_setenable(struct hn_softc *sc)
1771 {
1772 int i;
1773
1774 HN_LOCK_ASSERT(sc);
1775
1776 /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1777 rm_wlock(&sc->hn_vf_lock);
1778 sc->hn_xvf_flags |= HN_XVFFLAG_ENABLED;
1779 rm_wunlock(&sc->hn_vf_lock);
1780
1781 for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1782 sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_XPNT_VF;
1783 }
1784
1785 static void
hn_xpnt_vf_setdisable(struct hn_softc * sc,bool clear_vf)1786 hn_xpnt_vf_setdisable(struct hn_softc *sc, bool clear_vf)
1787 {
1788 int i;
1789
1790 HN_LOCK_ASSERT(sc);
1791
1792 /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1793 rm_wlock(&sc->hn_vf_lock);
1794 sc->hn_xvf_flags &= ~HN_XVFFLAG_ENABLED;
1795 if (clear_vf)
1796 sc->hn_vf_ifp = NULL;
1797 rm_wunlock(&sc->hn_vf_lock);
1798
1799 for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1800 sc->hn_rx_ring[i].hn_rx_flags &= ~HN_RX_FLAG_XPNT_VF;
1801 }
1802
1803 static void
hn_xpnt_vf_init(struct hn_softc * sc)1804 hn_xpnt_vf_init(struct hn_softc *sc)
1805 {
1806 int error;
1807
1808 HN_LOCK_ASSERT(sc);
1809
1810 KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
1811 ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));
1812
1813 if (bootverbose) {
1814 if_printf(sc->hn_ifp, "try bringing up %s\n",
1815 sc->hn_vf_ifp->if_xname);
1816 }
1817
1818 /*
1819 * Bring the VF up.
1820 */
1821 hn_xpnt_vf_saveifflags(sc);
1822 sc->hn_vf_ifp->if_flags |= IFF_UP;
1823 error = hn_xpnt_vf_iocsetflags(sc);
1824 if (error) {
1825 if_printf(sc->hn_ifp, "bringing up %s failed: %d\n",
1826 sc->hn_vf_ifp->if_xname, error);
1827 return;
1828 }
1829
1830 /*
1831 * NOTE:
1832 * Datapath setting must happen _after_ bringing the VF up.
1833 */
1834 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
1835
1836 /*
1837 * NOTE:
1838 * Fixup RSS related bits _after_ the VF is brought up, since
1839 * many VFs generate RSS key during it's initialization.
1840 */
1841 hn_vf_rss_fixup(sc, true);
1842
1843 /* Mark transparent mode VF as enabled. */
1844 hn_xpnt_vf_setenable(sc);
1845 }
1846
1847 static void
hn_xpnt_vf_init_taskfunc(void * xsc,int pending __unused)1848 hn_xpnt_vf_init_taskfunc(void *xsc, int pending __unused)
1849 {
1850 struct hn_softc *sc = xsc;
1851
1852 HN_LOCK(sc);
1853
1854 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
1855 goto done;
1856 if (sc->hn_vf_ifp == NULL)
1857 goto done;
1858 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
1859 goto done;
1860
1861 if (sc->hn_vf_rdytick != 0) {
1862 /* Mark VF as ready. */
1863 hn_xpnt_vf_setready(sc);
1864 }
1865
1866 if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) {
1867 /*
1868 * Delayed VF initialization.
1869 */
1870 if (bootverbose) {
1871 if_printf(sc->hn_ifp, "delayed initialize %s\n",
1872 sc->hn_vf_ifp->if_xname);
1873 }
1874 hn_xpnt_vf_init(sc);
1875 }
1876 done:
1877 HN_UNLOCK(sc);
1878 }
1879
1880 static void
hn_ifnet_attevent(void * xsc,struct ifnet * ifp)1881 hn_ifnet_attevent(void *xsc, struct ifnet *ifp)
1882 {
1883 struct hn_softc *sc = xsc;
1884
1885 HN_LOCK(sc);
1886
1887 if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
1888 goto done;
1889
1890 if (!hn_ismyvf(sc, ifp))
1891 goto done;
1892
1893 if (sc->hn_vf_ifp != NULL) {
1894 if_printf(sc->hn_ifp, "%s was attached as VF\n",
1895 sc->hn_vf_ifp->if_xname);
1896 goto done;
1897 }
1898
1899 if (hn_xpnt_vf && ifp->if_start != NULL) {
1900 /*
1901 * ifnet.if_start is _not_ supported by transparent
1902 * mode VF; mainly due to the IFF_DRV_OACTIVE flag.
1903 */
1904 if_printf(sc->hn_ifp, "%s uses if_start, which is unsupported "
1905 "in transparent VF mode.\n", ifp->if_xname);
1906 goto done;
1907 }
1908
1909 rm_wlock(&hn_vfmap_lock);
1910
1911 if (ifp->if_index >= hn_vfmap_size) {
1912 struct ifnet **newmap;
1913 int newsize;
1914
1915 newsize = ifp->if_index + HN_VFMAP_SIZE_DEF;
1916 newmap = malloc(sizeof(struct ifnet *) * newsize, M_DEVBUF,
1917 M_WAITOK | M_ZERO);
1918
1919 memcpy(newmap, hn_vfmap,
1920 sizeof(struct ifnet *) * hn_vfmap_size);
1921 free(hn_vfmap, M_DEVBUF);
1922 hn_vfmap = newmap;
1923 hn_vfmap_size = newsize;
1924 }
1925 KASSERT(hn_vfmap[ifp->if_index] == NULL,
1926 ("%s: ifindex %d was mapped to %s",
1927 ifp->if_xname, ifp->if_index, hn_vfmap[ifp->if_index]->if_xname));
1928 hn_vfmap[ifp->if_index] = sc->hn_ifp;
1929
1930 rm_wunlock(&hn_vfmap_lock);
1931
1932 /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1933 rm_wlock(&sc->hn_vf_lock);
1934 KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
1935 ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));
1936 sc->hn_vf_ifp = ifp;
1937 rm_wunlock(&sc->hn_vf_lock);
1938
1939 if (hn_xpnt_vf) {
1940 int wait_ticks;
1941
1942 /*
1943 * Install if_input for vf_ifp, which does vf_ifp -> hn_ifp.
1944 * Save vf_ifp's current if_input for later restoration.
1945 */
1946 sc->hn_vf_input = ifp->if_input;
1947 ifp->if_input = hn_xpnt_vf_input;
1948
1949 /*
1950 * Stop link status management; use the VF's.
1951 */
1952 hn_suspend_mgmt(sc);
1953
1954 /*
1955 * Give VF sometime to complete its attach routing.
1956 */
1957 wait_ticks = hn_xpnt_vf_attwait * hz;
1958 sc->hn_vf_rdytick = ticks + wait_ticks;
1959
1960 taskqueue_enqueue_timeout(sc->hn_vf_taskq, &sc->hn_vf_init,
1961 wait_ticks);
1962 }
1963 done:
1964 HN_UNLOCK(sc);
1965 }
1966
1967 static void
hn_ifnet_detevent(void * xsc,struct ifnet * ifp)1968 hn_ifnet_detevent(void *xsc, struct ifnet *ifp)
1969 {
1970 struct hn_softc *sc = xsc;
1971
1972 HN_LOCK(sc);
1973
1974 if (sc->hn_vf_ifp == NULL)
1975 goto done;
1976
1977 if (!hn_ismyvf(sc, ifp))
1978 goto done;
1979
1980 if (hn_xpnt_vf) {
1981 /*
1982 * Make sure that the delayed initialization is not running.
1983 *
1984 * NOTE:
1985 * - This lock _must_ be released, since the hn_vf_init task
1986 * will try holding this lock.
1987 * - It is safe to release this lock here, since the
1988 * hn_ifnet_attevent() is interlocked by the hn_vf_ifp.
1989 *
1990 * XXX racy, if hn(4) ever detached.
1991 */
1992 HN_UNLOCK(sc);
1993 taskqueue_drain_timeout(sc->hn_vf_taskq, &sc->hn_vf_init);
1994 HN_LOCK(sc);
1995
1996 KASSERT(sc->hn_vf_input != NULL, ("%s VF input is not saved",
1997 sc->hn_ifp->if_xname));
1998 ifp->if_input = sc->hn_vf_input;
1999 sc->hn_vf_input = NULL;
2000
2001 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) &&
2002 (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED))
2003 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
2004
2005 if (sc->hn_vf_rdytick == 0) {
2006 /*
2007 * The VF was ready; restore some settings.
2008 */
2009 sc->hn_ifp->if_capabilities = sc->hn_saved_caps;
2010 /*
2011 * NOTE:
2012 * There is _no_ need to fixup if_capenable and
2013 * if_hwassist, since the if_capabilities before
2014 * restoration was an intersection of the VF's
2015 * if_capabilites and the synthetic device's
2016 * if_capabilites.
2017 */
2018 sc->hn_ifp->if_hw_tsomax = sc->hn_saved_tsomax;
2019 sc->hn_ifp->if_hw_tsomaxsegcount =
2020 sc->hn_saved_tsosegcnt;
2021 sc->hn_ifp->if_hw_tsomaxsegsize = sc->hn_saved_tsosegsz;
2022 }
2023
2024 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
2025 /*
2026 * Restore RSS settings.
2027 */
2028 hn_vf_rss_restore(sc);
2029
2030 /*
2031 * Resume link status management, which was suspended
2032 * by hn_ifnet_attevent().
2033 */
2034 hn_resume_mgmt(sc);
2035 }
2036 }
2037
2038 /* Mark transparent mode VF as disabled. */
2039 hn_xpnt_vf_setdisable(sc, true /* clear hn_vf_ifp */);
2040
2041 rm_wlock(&hn_vfmap_lock);
2042
2043 KASSERT(ifp->if_index < hn_vfmap_size,
2044 ("ifindex %d, vfmapsize %d", ifp->if_index, hn_vfmap_size));
2045 if (hn_vfmap[ifp->if_index] != NULL) {
2046 KASSERT(hn_vfmap[ifp->if_index] == sc->hn_ifp,
2047 ("%s: ifindex %d was mapped to %s",
2048 ifp->if_xname, ifp->if_index,
2049 hn_vfmap[ifp->if_index]->if_xname));
2050 hn_vfmap[ifp->if_index] = NULL;
2051 }
2052
2053 rm_wunlock(&hn_vfmap_lock);
2054 done:
2055 HN_UNLOCK(sc);
2056 }
2057
2058 static void
hn_ifnet_lnkevent(void * xsc,struct ifnet * ifp,int link_state)2059 hn_ifnet_lnkevent(void *xsc, struct ifnet *ifp, int link_state)
2060 {
2061 struct hn_softc *sc = xsc;
2062
2063 if (sc->hn_vf_ifp == ifp)
2064 if_link_state_change(sc->hn_ifp, link_state);
2065 }
2066
2067 static int
hn_probe(device_t dev)2068 hn_probe(device_t dev)
2069 {
2070
2071 if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &hn_guid) == 0) {
2072 device_set_desc(dev, "Hyper-V Network Interface");
2073 return BUS_PROBE_DEFAULT;
2074 }
2075 return ENXIO;
2076 }
2077
/*
 * Bring up one hn(4) instance for 'dev'.
 *
 * In order, this routine:
 *  - initializes the softc locks and per-device tunables;
 *  - creates the TX, management and (in transparent VF mode) VF
 *    taskqueues;
 *  - allocates and names the ifnet and initializes ifmedia;
 *  - picks the RX/TX ring counts and creates the TX/RX ring data;
 *  - creates the NVS/RNDIS transaction context and installs the orphan
 *    handler for the primary channel;
 *  - attaches the synthetic parts and queries the MAC address and MTU;
 *  - registers the per-device sysctl nodes;
 *  - fills in the ifnet methods and capabilities and calls
 *    ether_ifattach();
 *  - kicks off the first link status check and registers the event
 *    handlers.
 *
 * Returns 0 on success.  On any failure the synthetic parts are detached
 * (if they were attached), hn_detach() is called to release whatever was
 * set up so far, and the error code is returned.
 */
static int
hn_attach(device_t dev)
{
	struct hn_softc *sc = device_get_softc(dev);
	struct sysctl_oid_list *child;
	struct sysctl_ctx_list *ctx;
	uint8_t eaddr[ETHER_ADDR_LEN];
	struct ifnet *ifp = NULL;
	int error, ring_cnt, tx_ring_cnt;
	uint32_t mtu;

	sc->hn_dev = dev;
	sc->hn_prichan = vmbus_get_channel(dev);
	HN_LOCK_INIT(sc);
	rm_init(&sc->hn_vf_lock, "hnvf");
	if (hn_xpnt_vf && hn_xpnt_vf_accbpf)
		sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;

	/*
	 * Initialize these tunables once.
	 */
	sc->hn_agg_size = hn_tx_agg_size;
	sc->hn_agg_pkts = hn_tx_agg_pkts;

	/*
	 * Setup taskqueue for transmission.
	 *
	 * In independent mode this device gets its own array of
	 * hn_tx_taskq_cnt taskqueues; in global mode it borrows the
	 * shared hn_tx_taskque array.
	 */
	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) {
		int i;

		sc->hn_tx_taskqs =
		    malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
		    M_DEVBUF, M_WAITOK);
		for (i = 0; i < hn_tx_taskq_cnt; ++i) {
			sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx",
			    M_WAITOK, taskqueue_thread_enqueue,
			    &sc->hn_tx_taskqs[i]);
			taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET,
			    "%s tx%d", device_get_nameunit(dev), i);
		}
	} else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) {
		sc->hn_tx_taskqs = hn_tx_taskque;
	}

	/*
	 * Setup taskqueue for management tasks, e.g. link status.
	 */
	sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
	    taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
	taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
	    device_get_nameunit(dev));
	TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
	TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
	TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
	    hn_netchg_status_taskfunc, sc);

	if (hn_xpnt_vf) {
		/*
		 * Setup taskqueue for VF tasks, e.g. delayed VF bringing up.
		 */
		sc->hn_vf_taskq = taskqueue_create("hn_vf", M_WAITOK,
		    taskqueue_thread_enqueue, &sc->hn_vf_taskq);
		taskqueue_start_threads(&sc->hn_vf_taskq, 1, PI_NET, "%s vf",
		    device_get_nameunit(dev));
		TIMEOUT_TASK_INIT(sc->hn_vf_taskq, &sc->hn_vf_init, 0,
		    hn_xpnt_vf_init_taskfunc, sc);
	}

	/*
	 * Allocate ifnet and setup its name earlier, so that if_printf
	 * can be used by functions, which will be called after
	 * ether_ifattach().
	 */
	ifp = sc->hn_ifp = if_alloc(IFT_ETHER);
	ifp->if_softc = sc;
	if_initname(ifp, device_get_name(dev), device_get_unit(dev));

	/*
	 * Initialize ifmedia earlier so that it can be unconditionally
	 * destroyed, if error happened later on.
	 */
	ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);

	/*
	 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
	 * to use (tx_ring_cnt).
	 *
	 * NOTE:
	 * The # of RX rings to use is same as the # of channels to use.
	 */
	ring_cnt = hn_chan_cnt;
	if (ring_cnt <= 0) {
		/* Default */
		ring_cnt = mp_ncpus;
		if (ring_cnt > HN_RING_CNT_DEF_MAX)
			ring_cnt = HN_RING_CNT_DEF_MAX;
	} else if (ring_cnt > mp_ncpus) {
		ring_cnt = mp_ncpus;
	}
#ifdef RSS
	if (ring_cnt > rss_getnumbuckets())
		ring_cnt = rss_getnumbuckets();
#endif

	/* An unset or too large TX ring count falls back to ring_cnt. */
	tx_ring_cnt = hn_tx_ring_cnt;
	if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
		tx_ring_cnt = ring_cnt;
#ifdef HN_IFSTART_SUPPORT
	if (hn_use_if_start) {
		/* ifnet.if_start only needs one TX ring. */
		tx_ring_cnt = 1;
	}
#endif

	/*
	 * Set the leader CPU for channels.
	 *
	 * The shared hn_cpu_index is advanced by ring_cnt; this device
	 * uses its previous value, wrapped to the number of CPUs.
	 */
	sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;

	/*
	 * Create enough TX/RX rings, even if only limited number of
	 * channels can be allocated.
	 */
	error = hn_create_tx_data(sc, tx_ring_cnt);
	if (error)
		goto failed;
	error = hn_create_rx_data(sc, ring_cnt);
	if (error)
		goto failed;

	/*
	 * Create transaction context for NVS and RNDIS transactions.
	 */
	sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
	    HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
	if (sc->hn_xact == NULL) {
		error = ENXIO;
		goto failed;
	}

	/*
	 * Install orphan handler for the revocation of this device's
	 * primary channel.
	 *
	 * NOTE:
	 * The processing order is critical here:
	 * Install the orphan handler, _before_ testing whether this
	 * device's primary channel has been revoked or not.
	 */
	vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact);
	if (vmbus_chan_is_revoked(sc->hn_prichan)) {
		error = ENXIO;
		goto failed;
	}

	/*
	 * Attach the synthetic parts, i.e. NVS and RNDIS.
	 */
	error = hn_synth_attach(sc, ETHERMTU);
	if (error)
		goto failed;

	error = hn_rndis_get_eaddr(sc, eaddr);
	if (error)
		goto failed;

	/*
	 * Failing to query the MTU is not fatal; ETHERMTU is assumed
	 * in that case.
	 */
	error = hn_rndis_get_mtu(sc, &mtu);
	if (error)
		mtu = ETHERMTU;
	else if (bootverbose)
		device_printf(dev, "RNDIS mtu %u\n", mtu);

#if __FreeBSD_version >= 1100099
	if (sc->hn_rx_ring_inuse > 1) {
		/*
		 * Reduce TCP segment aggregation limit for multiple
		 * RX rings to increase ACK timeliness.
		 */
		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
	}
#endif

	/*
	 * Fixup TX/RX stuffs after synthetic parts are attached.
	 */
	hn_fixup_tx_data(sc);
	hn_fixup_rx_data(sc);

	/*
	 * Register the per-device sysctl nodes under this device's
	 * sysctl tree.
	 */
	ctx = device_get_sysctl_ctx(dev);
	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
	    &sc->hn_nvs_ver, 0, "NVS version");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_ndis_version_sysctl, "A", "NDIS version");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_caps_sysctl, "A", "capabilities");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_hwassist_sysctl, "A", "hwassist");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_max",
	    CTLFLAG_RD, &ifp->if_hw_tsomax, 0, "max TSO size");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegcnt",
	    CTLFLAG_RD, &ifp->if_hw_tsomaxsegcount, 0,
	    "max # of TSO segments");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegsz",
	    CTLFLAG_RD, &ifp->if_hw_tsomaxsegsize, 0,
	    "max size of TSO segment");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_rxfilter_sysctl, "A", "rxfilter");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_hash_sysctl, "A", "RSS hash");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hashcap",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_hcap_sysctl, "A", "RSS hash capabilities");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "mbuf_hash",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_mbuf_sysctl, "A", "RSS hash for mbufs");
	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
	    CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
#ifndef RSS
	/*
	 * Don't allow RSS key/indirect table changes, if RSS is defined.
	 */
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_key_sysctl, "IU", "RSS key");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_ind_sysctl, "IU", "RSS indirect table");
#endif
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
	    CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
	    "RNDIS offered packet transmission aggregation size limit");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
	    CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
	    "RNDIS offered packet transmission aggregation count limit");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
	    CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
	    "RNDIS packet transmission aggregation alignment");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_txagg_size_sysctl, "I",
	    "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_txagg_pkts_sysctl, "I",
	    "Packet transmission aggregation packets, "
	    "0 -- disable, -1 -- auto");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling",
	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_polling_sysctl, "I",
	    "Polling frequency: [100,1000000], 0 disable polling");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_vf_sysctl, "A", "Virtual Function's name");
	/* The remaining VF nodes depend on the transparent VF mode. */
	if (!hn_xpnt_vf) {
		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxvf",
		    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
		    hn_rxvf_sysctl, "A", "activated Virtual Function's name");
	} else {
		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_enabled",
		    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
		    hn_xpnt_vf_enabled_sysctl, "I",
		    "Transparent VF enabled");
		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_accbpf",
		    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
		    hn_xpnt_vf_accbpf_sysctl, "I",
		    "Accurate BPF for transparent VF");
	}

	/*
	 * Setup the ifmedia, which has been initialized earlier.
	 */
	ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
	ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
	/* XXX ifmedia_set really should do this for us */
	sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;

	/*
	 * Setup the ifnet for this interface.
	 */

	ifp->if_baudrate = IF_Gbps(10);
	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
	ifp->if_ioctl = hn_ioctl;
	ifp->if_init = hn_init;
#ifdef HN_IFSTART_SUPPORT
	if (hn_use_if_start) {
		int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);

		ifp->if_start = hn_start;
		IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
		ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
		IFQ_SET_READY(&ifp->if_snd);
	} else
#endif
	{
		ifp->if_transmit = hn_transmit;
		ifp->if_qflush = hn_xmit_qflush;
	}

	ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO | IFCAP_LINKSTATE;
#ifdef foo
	/* We can't diff IPv6 packets from IPv4 packets on RX path. */
	ifp->if_capabilities |= IFCAP_RXCSUM_IPV6;
#endif
	if (sc->hn_caps & HN_CAP_VLAN) {
		/* XXX not sure about VLAN_MTU. */
		ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
	}

	/*
	 * TX checksum and TSO capabilities are derived from the first
	 * TX ring's checksum assist bits and from hn_caps.
	 */
	ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist;
	if (ifp->if_hwassist & HN_CSUM_IP_MASK)
		ifp->if_capabilities |= IFCAP_TXCSUM;
	if (ifp->if_hwassist & HN_CSUM_IP6_MASK)
		ifp->if_capabilities |= IFCAP_TXCSUM_IPV6;
	if (sc->hn_caps & HN_CAP_TSO4) {
		ifp->if_capabilities |= IFCAP_TSO4;
		ifp->if_hwassist |= CSUM_IP_TSO;
	}
	if (sc->hn_caps & HN_CAP_TSO6) {
		ifp->if_capabilities |= IFCAP_TSO6;
		ifp->if_hwassist |= CSUM_IP6_TSO;
	}

	/* Enable all available capabilities by default. */
	ifp->if_capenable = ifp->if_capabilities;

	/*
	 * Disable IPv6 TSO and TXCSUM by default, they still can
	 * be enabled through SIOCSIFCAP.
	 */
	ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6);
	ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO);

	if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) {
		/*
		 * Lock hn_set_tso_maxsize() to simplify its
		 * internal logic.
		 */
		HN_LOCK(sc);
		hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
		HN_UNLOCK(sc);
		ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
		ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
	}

	ether_ifattach(ifp, eaddr);

	if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
		if_printf(ifp, "TSO segcnt %u segsz %u\n",
		    ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
	}
	/* Only an MTU smaller than ETHERMTU overrides the interface's MTU. */
	if (mtu < ETHERMTU) {
		if_printf(ifp, "fixup mtu %u -> %u\n", ifp->if_mtu, mtu);
		ifp->if_mtu = mtu;
	}

	/* Inform the upper layer about the long frame support. */
	ifp->if_hdrlen = sizeof(struct ether_vlan_header);

	/*
	 * Kick off link status check.
	 *
	 * hn_update_link_status() only enqueues work once hn_mgmt_taskq
	 * is set, so the taskqueue pointer is published first.
	 */
	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
	hn_update_link_status(sc);

	/*
	 * Non-transparent mode listens for ifnet/ifaddr events;
	 * transparent VF mode listens for link events instead.
	 */
	if (!hn_xpnt_vf) {
		sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event,
		    hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY);
		sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event,
		    hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY);
	} else {
		sc->hn_ifnet_lnkhand = EVENTHANDLER_REGISTER(ifnet_link_event,
		    hn_ifnet_lnkevent, sc, EVENTHANDLER_PRI_ANY);
	}

	/*
	 * NOTE:
	 * Subscribe ether_ifattach event, instead of ifnet_arrival event,
	 * since interface's LLADDR is needed; interface LLADDR is not
	 * available when ifnet_arrival event is triggered.
	 */
	sc->hn_ifnet_atthand = EVENTHANDLER_REGISTER(ether_ifattach_event,
	    hn_ifnet_attevent, sc, EVENTHANDLER_PRI_ANY);
	sc->hn_ifnet_dethand = EVENTHANDLER_REGISTER(ifnet_departure_event,
	    hn_ifnet_detevent, sc, EVENTHANDLER_PRI_ANY);

	return (0);
failed:
	/*
	 * Undo a partial attach: detach the synthetic parts if they got
	 * attached, then let hn_detach() release the rest.
	 */
	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
		hn_synth_detach(sc);
	hn_detach(dev);
	return (error);
}
2477
2478 static int
hn_detach(device_t dev)2479 hn_detach(device_t dev)
2480 {
2481 struct hn_softc *sc = device_get_softc(dev);
2482 struct ifnet *ifp = sc->hn_ifp, *vf_ifp;
2483
2484 if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) {
2485 /*
2486 * In case that the vmbus missed the orphan handler
2487 * installation.
2488 */
2489 vmbus_xact_ctx_orphan(sc->hn_xact);
2490 }
2491
2492 if (sc->hn_ifaddr_evthand != NULL)
2493 EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand);
2494 if (sc->hn_ifnet_evthand != NULL)
2495 EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand);
2496 if (sc->hn_ifnet_atthand != NULL) {
2497 EVENTHANDLER_DEREGISTER(ether_ifattach_event,
2498 sc->hn_ifnet_atthand);
2499 }
2500 if (sc->hn_ifnet_dethand != NULL) {
2501 EVENTHANDLER_DEREGISTER(ifnet_departure_event,
2502 sc->hn_ifnet_dethand);
2503 }
2504 if (sc->hn_ifnet_lnkhand != NULL)
2505 EVENTHANDLER_DEREGISTER(ifnet_link_event, sc->hn_ifnet_lnkhand);
2506
2507 vf_ifp = sc->hn_vf_ifp;
2508 __compiler_membar();
2509 if (vf_ifp != NULL)
2510 hn_ifnet_detevent(sc, vf_ifp);
2511
2512 if (device_is_attached(dev)) {
2513 HN_LOCK(sc);
2514 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
2515 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2516 hn_stop(sc, true);
2517 /*
2518 * NOTE:
2519 * hn_stop() only suspends data, so managment
2520 * stuffs have to be suspended manually here.
2521 */
2522 hn_suspend_mgmt(sc);
2523 hn_synth_detach(sc);
2524 }
2525 HN_UNLOCK(sc);
2526 ether_ifdetach(ifp);
2527 }
2528
2529 ifmedia_removeall(&sc->hn_media);
2530 hn_destroy_rx_data(sc);
2531 hn_destroy_tx_data(sc);
2532
2533 if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) {
2534 int i;
2535
2536 for (i = 0; i < hn_tx_taskq_cnt; ++i)
2537 taskqueue_free(sc->hn_tx_taskqs[i]);
2538 free(sc->hn_tx_taskqs, M_DEVBUF);
2539 }
2540 taskqueue_free(sc->hn_mgmt_taskq0);
2541 if (sc->hn_vf_taskq != NULL)
2542 taskqueue_free(sc->hn_vf_taskq);
2543
2544 if (sc->hn_xact != NULL) {
2545 /*
2546 * Uninstall the orphan handler _before_ the xact is
2547 * destructed.
2548 */
2549 vmbus_chan_unset_orphan(sc->hn_prichan);
2550 vmbus_xact_ctx_destroy(sc->hn_xact);
2551 }
2552
2553 if_free(ifp);
2554
2555 HN_LOCK_DESTROY(sc);
2556 rm_destroy(&sc->hn_vf_lock);
2557 return (0);
2558 }
2559
/*
 * Shutdown hook for the device.
 *
 * No work is done here: 'dev' is not touched and 0 is returned
 * unconditionally.
 */
static int
hn_shutdown(device_t dev)
{

	return (0);
}
2566
2567 static void
hn_link_status(struct hn_softc * sc)2568 hn_link_status(struct hn_softc *sc)
2569 {
2570 uint32_t link_status;
2571 int error;
2572
2573 error = hn_rndis_get_linkstatus(sc, &link_status);
2574 if (error) {
2575 /* XXX what to do? */
2576 return;
2577 }
2578
2579 if (link_status == NDIS_MEDIA_STATE_CONNECTED)
2580 sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
2581 else
2582 sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2583 if_link_state_change(sc->hn_ifp,
2584 (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
2585 LINK_STATE_UP : LINK_STATE_DOWN);
2586 }
2587
2588 static void
hn_link_taskfunc(void * xsc,int pending __unused)2589 hn_link_taskfunc(void *xsc, int pending __unused)
2590 {
2591 struct hn_softc *sc = xsc;
2592
2593 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
2594 return;
2595 hn_link_status(sc);
2596 }
2597
2598 static void
hn_netchg_init_taskfunc(void * xsc,int pending __unused)2599 hn_netchg_init_taskfunc(void *xsc, int pending __unused)
2600 {
2601 struct hn_softc *sc = xsc;
2602
2603 /* Prevent any link status checks from running. */
2604 sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;
2605
2606 /*
2607 * Fake up a [link down --> link up] state change; 5 seconds
2608 * delay is used, which closely simulates miibus reaction
2609 * upon link down event.
2610 */
2611 sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2612 if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
2613 taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
2614 &sc->hn_netchg_status, 5 * hz);
2615 }
2616
2617 static void
hn_netchg_status_taskfunc(void * xsc,int pending __unused)2618 hn_netchg_status_taskfunc(void *xsc, int pending __unused)
2619 {
2620 struct hn_softc *sc = xsc;
2621
2622 /* Re-allow link status checks. */
2623 sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
2624 hn_link_status(sc);
2625 }
2626
2627 static void
hn_update_link_status(struct hn_softc * sc)2628 hn_update_link_status(struct hn_softc *sc)
2629 {
2630
2631 if (sc->hn_mgmt_taskq != NULL)
2632 taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
2633 }
2634
2635 static void
hn_change_network(struct hn_softc * sc)2636 hn_change_network(struct hn_softc *sc)
2637 {
2638
2639 if (sc->hn_mgmt_taskq != NULL)
2640 taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
2641 }
2642
2643 static __inline int
hn_txdesc_dmamap_load(struct hn_tx_ring * txr,struct hn_txdesc * txd,struct mbuf ** m_head,bus_dma_segment_t * segs,int * nsegs)2644 hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
2645 struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
2646 {
2647 struct mbuf *m = *m_head;
2648 int error;
2649
2650 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));
2651
2652 error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
2653 m, segs, nsegs, BUS_DMA_NOWAIT);
2654 if (error == EFBIG) {
2655 struct mbuf *m_new;
2656
2657 m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
2658 if (m_new == NULL)
2659 return ENOBUFS;
2660 else
2661 *m_head = m = m_new;
2662 txr->hn_tx_collapsed++;
2663
2664 error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
2665 txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
2666 }
2667 if (!error) {
2668 bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
2669 BUS_DMASYNC_PREWRITE);
2670 txd->flags |= HN_TXD_FLAG_DMAMAP;
2671 }
2672 return error;
2673 }
2674
2675 static __inline int
hn_txdesc_put(struct hn_tx_ring * txr,struct hn_txdesc * txd)2676 hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
2677 {
2678
2679 KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
2680 ("put an onlist txd %#x", txd->flags));
2681 KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2682 ("put an onagg txd %#x", txd->flags));
2683
2684 KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2685 if (atomic_fetchadd_int(&txd->refs, -1) != 1)
2686 return 0;
2687
2688 if (!STAILQ_EMPTY(&txd->agg_list)) {
2689 struct hn_txdesc *tmp_txd;
2690
2691 while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
2692 int freed;
2693
2694 KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
2695 ("resursive aggregation on aggregated txdesc"));
2696 KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
2697 ("not aggregated txdesc"));
2698 KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2699 ("aggregated txdesc uses dmamap"));
2700 KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
2701 ("aggregated txdesc consumes "
2702 "chimney sending buffer"));
2703 KASSERT(tmp_txd->chim_size == 0,
2704 ("aggregated txdesc has non-zero "
2705 "chimney sending size"));
2706
2707 STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
2708 tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
2709 freed = hn_txdesc_put(txr, tmp_txd);
2710 KASSERT(freed, ("failed to free aggregated txdesc"));
2711 }
2712 }
2713
2714 if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
2715 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2716 ("chim txd uses dmamap"));
2717 hn_chim_free(txr->hn_sc, txd->chim_index);
2718 txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
2719 txd->chim_size = 0;
2720 } else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
2721 bus_dmamap_sync(txr->hn_tx_data_dtag,
2722 txd->data_dmap, BUS_DMASYNC_POSTWRITE);
2723 bus_dmamap_unload(txr->hn_tx_data_dtag,
2724 txd->data_dmap);
2725 txd->flags &= ~HN_TXD_FLAG_DMAMAP;
2726 }
2727
2728 if (txd->m != NULL) {
2729 m_freem(txd->m);
2730 txd->m = NULL;
2731 }
2732
2733 txd->flags |= HN_TXD_FLAG_ONLIST;
2734 #ifndef HN_USE_TXDESC_BUFRING
2735 mtx_lock_spin(&txr->hn_txlist_spin);
2736 KASSERT(txr->hn_txdesc_avail >= 0 &&
2737 txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
2738 ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
2739 txr->hn_txdesc_avail++;
2740 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
2741 mtx_unlock_spin(&txr->hn_txlist_spin);
2742 #else /* HN_USE_TXDESC_BUFRING */
2743 #ifdef HN_DEBUG
2744 atomic_add_int(&txr->hn_txdesc_avail, 1);
2745 #endif
2746 buf_ring_enqueue(txr->hn_txdesc_br, txd);
2747 #endif /* !HN_USE_TXDESC_BUFRING */
2748
2749 return 1;
2750 }
2751
2752 static __inline struct hn_txdesc *
hn_txdesc_get(struct hn_tx_ring * txr)2753 hn_txdesc_get(struct hn_tx_ring *txr)
2754 {
2755 struct hn_txdesc *txd;
2756
2757 #ifndef HN_USE_TXDESC_BUFRING
2758 mtx_lock_spin(&txr->hn_txlist_spin);
2759 txd = SLIST_FIRST(&txr->hn_txlist);
2760 if (txd != NULL) {
2761 KASSERT(txr->hn_txdesc_avail > 0,
2762 ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
2763 txr->hn_txdesc_avail--;
2764 SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
2765 }
2766 mtx_unlock_spin(&txr->hn_txlist_spin);
2767 #else
2768 txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
2769 #endif
2770
2771 if (txd != NULL) {
2772 #ifdef HN_USE_TXDESC_BUFRING
2773 #ifdef HN_DEBUG
2774 atomic_subtract_int(&txr->hn_txdesc_avail, 1);
2775 #endif
2776 #endif /* HN_USE_TXDESC_BUFRING */
2777 KASSERT(txd->m == NULL && txd->refs == 0 &&
2778 STAILQ_EMPTY(&txd->agg_list) &&
2779 txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
2780 txd->chim_size == 0 &&
2781 (txd->flags & HN_TXD_FLAG_ONLIST) &&
2782 (txd->flags & HN_TXD_FLAG_ONAGG) == 0 &&
2783 (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
2784 txd->flags &= ~HN_TXD_FLAG_ONLIST;
2785 txd->refs = 1;
2786 }
2787 return txd;
2788 }
2789
2790 static __inline void
hn_txdesc_hold(struct hn_txdesc * txd)2791 hn_txdesc_hold(struct hn_txdesc *txd)
2792 {
2793
2794 /* 0->1 transition will never work */
2795 KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2796 atomic_add_int(&txd->refs, 1);
2797 }
2798
2799 static __inline void
hn_txdesc_agg(struct hn_txdesc * agg_txd,struct hn_txdesc * txd)2800 hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd)
2801 {
2802
2803 KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2804 ("recursive aggregation on aggregating txdesc"));
2805
2806 KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2807 ("already aggregated"));
2808 KASSERT(STAILQ_EMPTY(&txd->agg_list),
2809 ("recursive aggregation on to-be-aggregated txdesc"));
2810
2811 txd->flags |= HN_TXD_FLAG_ONAGG;
2812 STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link);
2813 }
2814
2815 static bool
hn_tx_ring_pending(struct hn_tx_ring * txr)2816 hn_tx_ring_pending(struct hn_tx_ring *txr)
2817 {
2818 bool pending = false;
2819
2820 #ifndef HN_USE_TXDESC_BUFRING
2821 mtx_lock_spin(&txr->hn_txlist_spin);
2822 if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
2823 pending = true;
2824 mtx_unlock_spin(&txr->hn_txlist_spin);
2825 #else
2826 if (!buf_ring_full(txr->hn_txdesc_br))
2827 pending = true;
2828 #endif
2829 return (pending);
2830 }
2831
2832 static __inline void
hn_txeof(struct hn_tx_ring * txr)2833 hn_txeof(struct hn_tx_ring *txr)
2834 {
2835 txr->hn_has_txeof = 0;
2836 txr->hn_txeof(txr);
2837 }
2838
2839 static void
hn_txpkt_done(struct hn_nvs_sendctx * sndc,struct hn_softc * sc,struct vmbus_channel * chan,const void * data __unused,int dlen __unused)2840 hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
2841 struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
2842 {
2843 struct hn_txdesc *txd = sndc->hn_cbarg;
2844 struct hn_tx_ring *txr;
2845
2846 txr = txd->txr;
2847 KASSERT(txr->hn_chan == chan,
2848 ("channel mismatch, on chan%u, should be chan%u",
2849 vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan)));
2850
2851 txr->hn_has_txeof = 1;
2852 hn_txdesc_put(txr, txd);
2853
2854 ++txr->hn_txdone_cnt;
2855 if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
2856 txr->hn_txdone_cnt = 0;
2857 if (txr->hn_oactive)
2858 hn_txeof(txr);
2859 }
2860 }
2861
2862 static void
hn_chan_rollup(struct hn_rx_ring * rxr,struct hn_tx_ring * txr)2863 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
2864 {
2865 #if defined(INET) || defined(INET6)
2866 tcp_lro_flush_all(&rxr->hn_lro);
2867 #endif
2868
2869 /*
2870 * NOTE:
2871 * 'txr' could be NULL, if multiple channels and
2872 * ifnet.if_start method are enabled.
2873 */
2874 if (txr == NULL || !txr->hn_has_txeof)
2875 return;
2876
2877 txr->hn_txdone_cnt = 0;
2878 hn_txeof(txr);
2879 }
2880
2881 static __inline uint32_t
hn_rndis_pktmsg_offset(uint32_t ofs)2882 hn_rndis_pktmsg_offset(uint32_t ofs)
2883 {
2884
2885 KASSERT(ofs >= sizeof(struct rndis_packet_msg),
2886 ("invalid RNDIS packet msg offset %u", ofs));
2887 return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
2888 }
2889
2890 static __inline void *
hn_rndis_pktinfo_append(struct rndis_packet_msg * pkt,size_t pktsize,size_t pi_dlen,uint32_t pi_type)2891 hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
2892 size_t pi_dlen, uint32_t pi_type)
2893 {
2894 const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
2895 struct rndis_pktinfo *pi;
2896
2897 KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
2898 ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));
2899
2900 /*
2901 * Per-packet-info does not move; it only grows.
2902 *
2903 * NOTE:
2904 * rm_pktinfooffset in this phase counts from the beginning
2905 * of rndis_packet_msg.
2906 */
2907 KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
2908 ("%u pktinfo overflows RNDIS packet msg", pi_type));
2909 pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
2910 pkt->rm_pktinfolen);
2911 pkt->rm_pktinfolen += pi_size;
2912
2913 pi->rm_size = pi_size;
2914 pi->rm_type = pi_type;
2915 pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;
2916
2917 return (pi->rm_data);
2918 }
2919
2920 static __inline int
hn_flush_txagg(struct ifnet * ifp,struct hn_tx_ring * txr)2921 hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr)
2922 {
2923 struct hn_txdesc *txd;
2924 struct mbuf *m;
2925 int error, pkts;
2926
2927 txd = txr->hn_agg_txd;
2928 KASSERT(txd != NULL, ("no aggregate txdesc"));
2929
2930 /*
2931 * Since hn_txpkt() will reset this temporary stat, save
2932 * it now, so that oerrors can be updated properly, if
2933 * hn_txpkt() ever fails.
2934 */
2935 pkts = txr->hn_stat_pkts;
2936
2937 /*
2938 * Since txd's mbuf will _not_ be freed upon hn_txpkt()
2939 * failure, save it for later freeing, if hn_txpkt() ever
2940 * fails.
2941 */
2942 m = txd->m;
2943 error = hn_txpkt(ifp, txr, txd);
2944 if (__predict_false(error)) {
2945 /* txd is freed, but m is not. */
2946 m_freem(m);
2947
2948 txr->hn_flush_failed++;
2949 if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts);
2950 }
2951
2952 /* Reset all aggregation states. */
2953 txr->hn_agg_txd = NULL;
2954 txr->hn_agg_szleft = 0;
2955 txr->hn_agg_pktleft = 0;
2956 txr->hn_agg_prevpkt = NULL;
2957
2958 return (error);
2959 }
2960
2961 static void *
hn_try_txagg(struct ifnet * ifp,struct hn_tx_ring * txr,struct hn_txdesc * txd,int pktsize)2962 hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
2963 int pktsize)
2964 {
2965 void *chim;
2966
2967 if (txr->hn_agg_txd != NULL) {
2968 if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) {
2969 struct hn_txdesc *agg_txd = txr->hn_agg_txd;
2970 struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt;
2971 int olen;
2972
2973 /*
2974 * Update the previous RNDIS packet's total length,
2975 * it can be increased due to the mandatory alignment
2976 * padding for this RNDIS packet. And update the
2977 * aggregating txdesc's chimney sending buffer size
2978 * accordingly.
2979 *
2980 * XXX
2981 * Zero-out the padding, as required by the RNDIS spec.
2982 */
2983 olen = pkt->rm_len;
2984 pkt->rm_len = roundup2(olen, txr->hn_agg_align);
2985 agg_txd->chim_size += pkt->rm_len - olen;
2986
2987 /* Link this txdesc to the parent. */
2988 hn_txdesc_agg(agg_txd, txd);
2989
2990 chim = (uint8_t *)pkt + pkt->rm_len;
2991 /* Save the current packet for later fixup. */
2992 txr->hn_agg_prevpkt = chim;
2993
2994 txr->hn_agg_pktleft--;
2995 txr->hn_agg_szleft -= pktsize;
2996 if (txr->hn_agg_szleft <=
2997 HN_PKTSIZE_MIN(txr->hn_agg_align)) {
2998 /*
2999 * Probably can't aggregate more packets,
3000 * flush this aggregating txdesc proactively.
3001 */
3002 txr->hn_agg_pktleft = 0;
3003 }
3004 /* Done! */
3005 return (chim);
3006 }
3007 hn_flush_txagg(ifp, txr);
3008 }
3009 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
3010
3011 txr->hn_tx_chimney_tried++;
3012 txd->chim_index = hn_chim_alloc(txr->hn_sc);
3013 if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID)
3014 return (NULL);
3015 txr->hn_tx_chimney++;
3016
3017 chim = txr->hn_sc->hn_chim +
3018 (txd->chim_index * txr->hn_sc->hn_chim_szmax);
3019
3020 if (txr->hn_agg_pktmax > 1 &&
3021 txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) {
3022 txr->hn_agg_txd = txd;
3023 txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1;
3024 txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize;
3025 txr->hn_agg_prevpkt = chim;
3026 }
3027 return (chim);
3028 }
3029
3030 /*
3031 * NOTE:
3032 * If this function fails, then both txd and m_head0 will be freed.
3033 */
3034 static int
hn_encap(struct ifnet * ifp,struct hn_tx_ring * txr,struct hn_txdesc * txd,struct mbuf ** m_head0)3035 hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
3036 struct mbuf **m_head0)
3037 {
3038 bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
3039 int error, nsegs, i;
3040 struct mbuf *m_head = *m_head0;
3041 struct rndis_packet_msg *pkt;
3042 uint32_t *pi_data;
3043 void *chim = NULL;
3044 int pkt_hlen, pkt_size;
3045
3046 pkt = txd->rndis_pkt;
3047 pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align);
3048 if (pkt_size < txr->hn_chim_size) {
3049 chim = hn_try_txagg(ifp, txr, txd, pkt_size);
3050 if (chim != NULL)
3051 pkt = chim;
3052 } else {
3053 if (txr->hn_agg_txd != NULL)
3054 hn_flush_txagg(ifp, txr);
3055 }
3056
3057 pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
3058 pkt->rm_len = m_head->m_pkthdr.len;
3059 pkt->rm_dataoffset = 0;
3060 pkt->rm_datalen = m_head->m_pkthdr.len;
3061 pkt->rm_oobdataoffset = 0;
3062 pkt->rm_oobdatalen = 0;
3063 pkt->rm_oobdataelements = 0;
3064 pkt->rm_pktinfooffset = sizeof(*pkt);
3065 pkt->rm_pktinfolen = 0;
3066 pkt->rm_vchandle = 0;
3067 pkt->rm_reserved = 0;
3068
3069 if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
3070 /*
3071 * Set the hash value for this packet, so that the host could
3072 * dispatch the TX done event for this packet back to this TX
3073 * ring's channel.
3074 */
3075 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3076 HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
3077 *pi_data = txr->hn_tx_idx;
3078 }
3079
3080 if (m_head->m_flags & M_VLANTAG) {
3081 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3082 NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
3083 *pi_data = NDIS_VLAN_INFO_MAKE(
3084 EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
3085 EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
3086 EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
3087 }
3088
3089 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
3090 #if defined(INET6) || defined(INET)
3091 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3092 NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
3093 #ifdef INET
3094 if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
3095 *pi_data = NDIS_LSO2_INFO_MAKEIPV4(
3096 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen,
3097 m_head->m_pkthdr.tso_segsz);
3098 }
3099 #endif
3100 #if defined(INET6) && defined(INET)
3101 else
3102 #endif
3103 #ifdef INET6
3104 {
3105 *pi_data = NDIS_LSO2_INFO_MAKEIPV6(
3106 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen,
3107 m_head->m_pkthdr.tso_segsz);
3108 }
3109 #endif
3110 #endif /* INET6 || INET */
3111 } else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
3112 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3113 NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
3114 if (m_head->m_pkthdr.csum_flags &
3115 (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
3116 *pi_data = NDIS_TXCSUM_INFO_IPV6;
3117 } else {
3118 *pi_data = NDIS_TXCSUM_INFO_IPV4;
3119 if (m_head->m_pkthdr.csum_flags & CSUM_IP)
3120 *pi_data |= NDIS_TXCSUM_INFO_IPCS;
3121 }
3122
3123 if (m_head->m_pkthdr.csum_flags &
3124 (CSUM_IP_TCP | CSUM_IP6_TCP)) {
3125 *pi_data |= NDIS_TXCSUM_INFO_MKTCPCS(
3126 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen);
3127 } else if (m_head->m_pkthdr.csum_flags &
3128 (CSUM_IP_UDP | CSUM_IP6_UDP)) {
3129 *pi_data |= NDIS_TXCSUM_INFO_MKUDPCS(
3130 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen);
3131 }
3132 }
3133
3134 pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
3135 /* Fixup RNDIS packet message total length */
3136 pkt->rm_len += pkt_hlen;
3137 /* Convert RNDIS packet message offsets */
3138 pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen);
3139 pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);
3140
3141 /*
3142 * Fast path: Chimney sending.
3143 */
3144 if (chim != NULL) {
3145 struct hn_txdesc *tgt_txd = txd;
3146
3147 if (txr->hn_agg_txd != NULL) {
3148 tgt_txd = txr->hn_agg_txd;
3149 #ifdef INVARIANTS
3150 *m_head0 = NULL;
3151 #endif
3152 }
3153
3154 KASSERT(pkt == chim,
3155 ("RNDIS pkt not in chimney sending buffer"));
3156 KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
3157 ("chimney sending buffer is not used"));
3158 tgt_txd->chim_size += pkt->rm_len;
3159
3160 m_copydata(m_head, 0, m_head->m_pkthdr.len,
3161 ((uint8_t *)chim) + pkt_hlen);
3162
3163 txr->hn_gpa_cnt = 0;
3164 txr->hn_sendpkt = hn_txpkt_chim;
3165 goto done;
3166 }
3167
3168 KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc"));
3169 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
3170 ("chimney buffer is used"));
3171 KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc"));
3172
3173 error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
3174 if (__predict_false(error)) {
3175 int freed;
3176
3177 /*
3178 * This mbuf is not linked w/ the txd yet, so free it now.
3179 */
3180 m_freem(m_head);
3181 *m_head0 = NULL;
3182
3183 freed = hn_txdesc_put(txr, txd);
3184 KASSERT(freed != 0,
3185 ("fail to free txd upon txdma error"));
3186
3187 txr->hn_txdma_failed++;
3188 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
3189 return error;
3190 }
3191 *m_head0 = m_head;
3192
3193 /* +1 RNDIS packet message */
3194 txr->hn_gpa_cnt = nsegs + 1;
3195
3196 /* send packet with page buffer */
3197 txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
3198 txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
3199 txr->hn_gpa[0].gpa_len = pkt_hlen;
3200
3201 /*
3202 * Fill the page buffers with mbuf info after the page
3203 * buffer for RNDIS packet message.
3204 */
3205 for (i = 0; i < nsegs; ++i) {
3206 struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];
3207
3208 gpa->gpa_page = atop(segs[i].ds_addr);
3209 gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
3210 gpa->gpa_len = segs[i].ds_len;
3211 }
3212
3213 txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
3214 txd->chim_size = 0;
3215 txr->hn_sendpkt = hn_txpkt_sglist;
3216 done:
3217 txd->m = m_head;
3218
3219 /* Set the completion routine */
3220 hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd);
3221
3222 /* Update temporary stats for later use. */
3223 txr->hn_stat_pkts++;
3224 txr->hn_stat_size += m_head->m_pkthdr.len;
3225 if (m_head->m_flags & M_MCAST)
3226 txr->hn_stat_mcasts++;
3227
3228 return 0;
3229 }
3230
3231 /*
3232 * NOTE:
3233 * If this function fails, then txd will be freed, but the mbuf
3234 * associated w/ the txd will _not_ be freed.
3235 */
3236 static int
hn_txpkt(struct ifnet * ifp,struct hn_tx_ring * txr,struct hn_txdesc * txd)3237 hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
3238 {
3239 int error, send_failed = 0, has_bpf;
3240
3241 again:
3242 has_bpf = bpf_peers_present(ifp->if_bpf);
3243 if (has_bpf) {
3244 /*
3245 * Make sure that this txd and any aggregated txds are not
3246 * freed before ETHER_BPF_MTAP.
3247 */
3248 hn_txdesc_hold(txd);
3249 }
3250 error = txr->hn_sendpkt(txr, txd);
3251 if (!error) {
3252 if (has_bpf) {
3253 const struct hn_txdesc *tmp_txd;
3254
3255 ETHER_BPF_MTAP(ifp, txd->m);
3256 STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link)
3257 ETHER_BPF_MTAP(ifp, tmp_txd->m);
3258 }
3259
3260 if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts);
3261 #ifdef HN_IFSTART_SUPPORT
3262 if (!hn_use_if_start)
3263 #endif
3264 {
3265 if_inc_counter(ifp, IFCOUNTER_OBYTES,
3266 txr->hn_stat_size);
3267 if (txr->hn_stat_mcasts != 0) {
3268 if_inc_counter(ifp, IFCOUNTER_OMCASTS,
3269 txr->hn_stat_mcasts);
3270 }
3271 }
3272 txr->hn_pkts += txr->hn_stat_pkts;
3273 txr->hn_sends++;
3274 }
3275 if (has_bpf)
3276 hn_txdesc_put(txr, txd);
3277
3278 if (__predict_false(error)) {
3279 int freed;
3280
3281 /*
3282 * This should "really rarely" happen.
3283 *
3284 * XXX Too many RX to be acked or too many sideband
3285 * commands to run? Ask netvsc_channel_rollup()
3286 * to kick start later.
3287 */
3288 txr->hn_has_txeof = 1;
3289 if (!send_failed) {
3290 txr->hn_send_failed++;
3291 send_failed = 1;
3292 /*
3293 * Try sending again after set hn_has_txeof;
3294 * in case that we missed the last
3295 * netvsc_channel_rollup().
3296 */
3297 goto again;
3298 }
3299 if_printf(ifp, "send failed\n");
3300
3301 /*
3302 * Caller will perform further processing on the
3303 * associated mbuf, so don't free it in hn_txdesc_put();
3304 * only unload it from the DMA map in hn_txdesc_put(),
3305 * if it was loaded.
3306 */
3307 txd->m = NULL;
3308 freed = hn_txdesc_put(txr, txd);
3309 KASSERT(freed != 0,
3310 ("fail to free txd upon send error"));
3311
3312 txr->hn_send_failed++;
3313 }
3314
3315 /* Reset temporary stats, after this sending is done. */
3316 txr->hn_stat_size = 0;
3317 txr->hn_stat_pkts = 0;
3318 txr->hn_stat_mcasts = 0;
3319
3320 return (error);
3321 }
3322
3323 /*
3324 * Append the specified data to the indicated mbuf chain,
3325 * Extend the mbuf chain if the new data does not fit in
3326 * existing space.
3327 *
3328 * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
3329 * There should be an equivalent in the kernel mbuf code,
3330 * but there does not appear to be one yet.
3331 *
3332 * Differs from m_append() in that additional mbufs are
3333 * allocated with cluster size MJUMPAGESIZE, and filled
3334 * accordingly.
3335 *
3336 * Return 1 if able to complete the job; otherwise 0.
3337 */
3338 static int
hv_m_append(struct mbuf * m0,int len,c_caddr_t cp)3339 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
3340 {
3341 struct mbuf *m, *n;
3342 int remainder, space;
3343
3344 for (m = m0; m->m_next != NULL; m = m->m_next)
3345 ;
3346 remainder = len;
3347 space = M_TRAILINGSPACE(m);
3348 if (space > 0) {
3349 /*
3350 * Copy into available space.
3351 */
3352 if (space > remainder)
3353 space = remainder;
3354 bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
3355 m->m_len += space;
3356 cp += space;
3357 remainder -= space;
3358 }
3359 while (remainder > 0) {
3360 /*
3361 * Allocate a new mbuf; could check space
3362 * and allocate a cluster instead.
3363 */
3364 n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE);
3365 if (n == NULL)
3366 break;
3367 n->m_len = min(MJUMPAGESIZE, remainder);
3368 bcopy(cp, mtod(n, caddr_t), n->m_len);
3369 cp += n->m_len;
3370 remainder -= n->m_len;
3371 m->m_next = n;
3372 m = n;
3373 }
3374 if (m0->m_flags & M_PKTHDR)
3375 m0->m_pkthdr.len += len - remainder;
3376
3377 return (remainder == 0);
3378 }
3379
3380 #if defined(INET) || defined(INET6)
/*
 * Hand a received mbuf to LRO.  When hn_lro_mbufq_depth is non-zero (and
 * the kernel is new enough) the mbuf is queued on the LRO control block
 * and 0 is returned; otherwise the result of tcp_lro_rx() is returned.
 */
static __inline int
hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
{
#if __FreeBSD_version >= 1100095
	if (hn_lro_mbufq_depth != 0) {
		tcp_lro_queue_mbuf(lc, m);
		return (0);
	}
#endif
	return (tcp_lro_rx(lc, m, 0));
}
3392 #endif
3393
3394 static int
hn_rxpkt(struct hn_rx_ring * rxr,const void * data,int dlen,const struct hn_rxinfo * info)3395 hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen,
3396 const struct hn_rxinfo *info)
3397 {
3398 struct ifnet *ifp, *hn_ifp = rxr->hn_ifp;
3399 struct mbuf *m_new;
3400 int size, do_lro = 0, do_csum = 1, is_vf = 0;
3401 int hash_type = M_HASHTYPE_NONE;
3402 int l3proto = ETHERTYPE_MAX, l4proto = IPPROTO_DONE;
3403
3404 ifp = hn_ifp;
3405 if (rxr->hn_rxvf_ifp != NULL) {
3406 /*
3407 * Non-transparent mode VF; pretend this packet is from
3408 * the VF.
3409 */
3410 ifp = rxr->hn_rxvf_ifp;
3411 is_vf = 1;
3412 } else if (rxr->hn_rx_flags & HN_RX_FLAG_XPNT_VF) {
3413 /* Transparent mode VF. */
3414 is_vf = 1;
3415 }
3416
3417 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
3418 /*
3419 * NOTE:
3420 * See the NOTE of hn_rndis_init_fixat(). This
3421 * function can be reached, immediately after the
3422 * RNDIS is initialized but before the ifnet is
3423 * setup on the hn_attach() path; drop the unexpected
3424 * packets.
3425 */
3426 return (0);
3427 }
3428
3429 if (__predict_false(dlen < ETHER_HDR_LEN)) {
3430 if_inc_counter(hn_ifp, IFCOUNTER_IERRORS, 1);
3431 return (0);
3432 }
3433
3434 if (dlen <= MHLEN) {
3435 m_new = m_gethdr(M_NOWAIT, MT_DATA);
3436 if (m_new == NULL) {
3437 if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3438 return (0);
3439 }
3440 memcpy(mtod(m_new, void *), data, dlen);
3441 m_new->m_pkthdr.len = m_new->m_len = dlen;
3442 rxr->hn_small_pkts++;
3443 } else {
3444 /*
3445 * Get an mbuf with a cluster. For packets 2K or less,
3446 * get a standard 2K cluster. For anything larger, get a
3447 * 4K cluster. Any buffers larger than 4K can cause problems
3448 * if looped around to the Hyper-V TX channel, so avoid them.
3449 */
3450 size = MCLBYTES;
3451 if (dlen > MCLBYTES) {
3452 /* 4096 */
3453 size = MJUMPAGESIZE;
3454 }
3455
3456 m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
3457 if (m_new == NULL) {
3458 if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3459 return (0);
3460 }
3461
3462 hv_m_append(m_new, dlen, data);
3463 }
3464 m_new->m_pkthdr.rcvif = ifp;
3465
3466 if (__predict_false((hn_ifp->if_capenable & IFCAP_RXCSUM) == 0))
3467 do_csum = 0;
3468
3469 /* receive side checksum offload */
3470 if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) {
3471 /* IP csum offload */
3472 if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
3473 m_new->m_pkthdr.csum_flags |=
3474 (CSUM_IP_CHECKED | CSUM_IP_VALID);
3475 rxr->hn_csum_ip++;
3476 }
3477
3478 /* TCP/UDP csum offload */
3479 if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK |
3480 NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
3481 m_new->m_pkthdr.csum_flags |=
3482 (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3483 m_new->m_pkthdr.csum_data = 0xffff;
3484 if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK)
3485 rxr->hn_csum_tcp++;
3486 else
3487 rxr->hn_csum_udp++;
3488 }
3489
3490 /*
3491 * XXX
3492 * As of this write (Oct 28th, 2016), host side will turn
3493 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so
3494 * the do_lro setting here is actually _not_ accurate. We
3495 * depend on the RSS hash type check to reset do_lro.
3496 */
3497 if ((info->csum_info &
3498 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
3499 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
3500 do_lro = 1;
3501 } else {
3502 hn_rxpkt_proto(m_new, &l3proto, &l4proto);
3503 if (l3proto == ETHERTYPE_IP) {
3504 if (l4proto == IPPROTO_TCP) {
3505 if (do_csum &&
3506 (rxr->hn_trust_hcsum &
3507 HN_TRUST_HCSUM_TCP)) {
3508 rxr->hn_csum_trusted++;
3509 m_new->m_pkthdr.csum_flags |=
3510 (CSUM_IP_CHECKED | CSUM_IP_VALID |
3511 CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3512 m_new->m_pkthdr.csum_data = 0xffff;
3513 }
3514 do_lro = 1;
3515 } else if (l4proto == IPPROTO_UDP) {
3516 if (do_csum &&
3517 (rxr->hn_trust_hcsum &
3518 HN_TRUST_HCSUM_UDP)) {
3519 rxr->hn_csum_trusted++;
3520 m_new->m_pkthdr.csum_flags |=
3521 (CSUM_IP_CHECKED | CSUM_IP_VALID |
3522 CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3523 m_new->m_pkthdr.csum_data = 0xffff;
3524 }
3525 } else if (l4proto != IPPROTO_DONE && do_csum &&
3526 (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
3527 rxr->hn_csum_trusted++;
3528 m_new->m_pkthdr.csum_flags |=
3529 (CSUM_IP_CHECKED | CSUM_IP_VALID);
3530 }
3531 }
3532 }
3533
3534 if (info->vlan_info != HN_NDIS_VLAN_INFO_INVALID) {
3535 m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
3536 NDIS_VLAN_INFO_ID(info->vlan_info),
3537 NDIS_VLAN_INFO_PRI(info->vlan_info),
3538 NDIS_VLAN_INFO_CFI(info->vlan_info));
3539 m_new->m_flags |= M_VLANTAG;
3540 }
3541
3542 /*
3543 * If VF is activated (tranparent/non-transparent mode does not
3544 * matter here).
3545 *
3546 * - Disable LRO
3547 *
3548 * hn(4) will only receive broadcast packets, multicast packets,
3549 * TCP SYN and SYN|ACK (in Azure), LRO is useless for these
3550 * packet types.
3551 *
3552 * For non-transparent, we definitely _cannot_ enable LRO at
3553 * all, since the LRO flush will use hn(4) as the receiving
3554 * interface; i.e. hn_ifp->if_input(hn_ifp, m).
3555 */
3556 if (is_vf)
3557 do_lro = 0;
3558
3559 /*
3560 * If VF is activated (tranparent/non-transparent mode does not
3561 * matter here), do _not_ mess with unsupported hash types or
3562 * functions.
3563 */
3564 if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) {
3565 rxr->hn_rss_pkts++;
3566 m_new->m_pkthdr.flowid = info->hash_value;
3567 if (!is_vf)
3568 hash_type = M_HASHTYPE_OPAQUE_HASH;
3569 if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) ==
3570 NDIS_HASH_FUNCTION_TOEPLITZ) {
3571 uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK &
3572 rxr->hn_mbuf_hash);
3573
3574 /*
3575 * NOTE:
3576 * do_lro is resetted, if the hash types are not TCP
3577 * related. See the comment in the above csum_flags
3578 * setup section.
3579 */
3580 switch (type) {
3581 case NDIS_HASH_IPV4:
3582 hash_type = M_HASHTYPE_RSS_IPV4;
3583 do_lro = 0;
3584 break;
3585
3586 case NDIS_HASH_TCP_IPV4:
3587 hash_type = M_HASHTYPE_RSS_TCP_IPV4;
3588 if (rxr->hn_rx_flags & HN_RX_FLAG_UDP_HASH) {
3589 int def_htype = M_HASHTYPE_OPAQUE_HASH;
3590
3591 if (is_vf)
3592 def_htype = M_HASHTYPE_NONE;
3593
3594 /*
3595 * UDP 4-tuple hash is delivered as
3596 * TCP 4-tuple hash.
3597 */
3598 if (l3proto == ETHERTYPE_MAX) {
3599 hn_rxpkt_proto(m_new,
3600 &l3proto, &l4proto);
3601 }
3602 if (l3proto == ETHERTYPE_IP) {
3603 if (l4proto == IPPROTO_UDP &&
3604 (rxr->hn_mbuf_hash &
3605 NDIS_HASH_UDP_IPV4_X)) {
3606 hash_type =
3607 M_HASHTYPE_RSS_UDP_IPV4;
3608 do_lro = 0;
3609 } else if (l4proto !=
3610 IPPROTO_TCP) {
3611 hash_type = def_htype;
3612 do_lro = 0;
3613 }
3614 } else {
3615 hash_type = def_htype;
3616 do_lro = 0;
3617 }
3618 }
3619 break;
3620
3621 case NDIS_HASH_IPV6:
3622 hash_type = M_HASHTYPE_RSS_IPV6;
3623 do_lro = 0;
3624 break;
3625
3626 case NDIS_HASH_IPV6_EX:
3627 hash_type = M_HASHTYPE_RSS_IPV6_EX;
3628 do_lro = 0;
3629 break;
3630
3631 case NDIS_HASH_TCP_IPV6:
3632 hash_type = M_HASHTYPE_RSS_TCP_IPV6;
3633 break;
3634
3635 case NDIS_HASH_TCP_IPV6_EX:
3636 hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
3637 break;
3638 }
3639 }
3640 } else if (!is_vf) {
3641 m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
3642 hash_type = M_HASHTYPE_OPAQUE;
3643 }
3644 M_HASHTYPE_SET(m_new, hash_type);
3645
3646 if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
3647 if (hn_ifp != ifp) {
3648 const struct ether_header *eh;
3649
3650 /*
3651 * Non-transparent mode VF is activated.
3652 */
3653
3654 /*
3655 * Allow tapping on hn(4).
3656 */
3657 ETHER_BPF_MTAP(hn_ifp, m_new);
3658
3659 /*
3660 * Update hn(4)'s stats.
3661 */
3662 if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
3663 if_inc_counter(hn_ifp, IFCOUNTER_IBYTES, m_new->m_pkthdr.len);
3664 /* Checked at the beginning of this function. */
3665 KASSERT(m_new->m_len >= ETHER_HDR_LEN, ("not ethernet frame"));
3666 eh = mtod(m_new, struct ether_header *);
3667 if (ETHER_IS_MULTICAST(eh->ether_dhost))
3668 if_inc_counter(hn_ifp, IFCOUNTER_IMCASTS, 1);
3669 }
3670 rxr->hn_pkts++;
3671
3672 if ((hn_ifp->if_capenable & IFCAP_LRO) && do_lro) {
3673 #if defined(INET) || defined(INET6)
3674 struct lro_ctrl *lro = &rxr->hn_lro;
3675
3676 if (lro->lro_cnt) {
3677 rxr->hn_lro_tried++;
3678 if (hn_lro_rx(lro, m_new) == 0) {
3679 /* DONE! */
3680 return 0;
3681 }
3682 }
3683 #endif
3684 }
3685 ifp->if_input(ifp, m_new);
3686
3687 return (0);
3688 }
3689
3690 static int
hn_ioctl(struct ifnet * ifp,u_long cmd,caddr_t data)3691 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
3692 {
3693 struct hn_softc *sc = ifp->if_softc;
3694 struct ifreq *ifr = (struct ifreq *)data, ifr_vf;
3695 struct ifnet *vf_ifp;
3696 int mask, error = 0;
3697 struct ifrsskey *ifrk;
3698 struct ifrsshash *ifrh;
3699 uint32_t mtu;
3700
3701 switch (cmd) {
3702 case SIOCSIFMTU:
3703 if (ifr->ifr_mtu > HN_MTU_MAX) {
3704 error = EINVAL;
3705 break;
3706 }
3707
3708 HN_LOCK(sc);
3709
3710 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3711 HN_UNLOCK(sc);
3712 break;
3713 }
3714
3715 if ((sc->hn_caps & HN_CAP_MTU) == 0) {
3716 /* Can't change MTU */
3717 HN_UNLOCK(sc);
3718 error = EOPNOTSUPP;
3719 break;
3720 }
3721
3722 if (ifp->if_mtu == ifr->ifr_mtu) {
3723 HN_UNLOCK(sc);
3724 break;
3725 }
3726
3727 if (hn_xpnt_vf_isready(sc)) {
3728 vf_ifp = sc->hn_vf_ifp;
3729 ifr_vf = *ifr;
3730 strlcpy(ifr_vf.ifr_name, vf_ifp->if_xname,
3731 sizeof(ifr_vf.ifr_name));
3732 error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU,
3733 (caddr_t)&ifr_vf);
3734 if (error) {
3735 HN_UNLOCK(sc);
3736 if_printf(ifp, "%s SIOCSIFMTU %d failed: %d\n",
3737 vf_ifp->if_xname, ifr->ifr_mtu, error);
3738 break;
3739 }
3740 }
3741
3742 /*
3743 * Suspend this interface before the synthetic parts
3744 * are ripped.
3745 */
3746 hn_suspend(sc);
3747
3748 /*
3749 * Detach the synthetics parts, i.e. NVS and RNDIS.
3750 */
3751 hn_synth_detach(sc);
3752
3753 /*
3754 * Reattach the synthetic parts, i.e. NVS and RNDIS,
3755 * with the new MTU setting.
3756 */
3757 error = hn_synth_attach(sc, ifr->ifr_mtu);
3758 if (error) {
3759 HN_UNLOCK(sc);
3760 break;
3761 }
3762
3763 error = hn_rndis_get_mtu(sc, &mtu);
3764 if (error)
3765 mtu = ifr->ifr_mtu;
3766 else if (bootverbose)
3767 if_printf(ifp, "RNDIS mtu %u\n", mtu);
3768
3769 /*
3770 * Commit the requested MTU, after the synthetic parts
3771 * have been successfully attached.
3772 */
3773 if (mtu >= ifr->ifr_mtu) {
3774 mtu = ifr->ifr_mtu;
3775 } else {
3776 if_printf(ifp, "fixup mtu %d -> %u\n",
3777 ifr->ifr_mtu, mtu);
3778 }
3779 ifp->if_mtu = mtu;
3780
3781 /*
3782 * Synthetic parts' reattach may change the chimney
3783 * sending size; update it.
3784 */
3785 if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
3786 hn_set_chim_size(sc, sc->hn_chim_szmax);
3787
3788 /*
3789 * Make sure that various parameters based on MTU are
3790 * still valid, after the MTU change.
3791 */
3792 hn_mtu_change_fixup(sc);
3793
3794 /*
3795 * All done! Resume the interface now.
3796 */
3797 hn_resume(sc);
3798
3799 if ((sc->hn_flags & HN_FLAG_RXVF) ||
3800 (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
3801 /*
3802 * Since we have reattached the NVS part,
3803 * change the datapath to VF again; in case
3804 * that it is lost, after the NVS was detached.
3805 */
3806 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
3807 }
3808
3809 HN_UNLOCK(sc);
3810 break;
3811
3812 case SIOCSIFFLAGS:
3813 HN_LOCK(sc);
3814
3815 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3816 HN_UNLOCK(sc);
3817 break;
3818 }
3819
3820 if (hn_xpnt_vf_isready(sc))
3821 hn_xpnt_vf_saveifflags(sc);
3822
3823 if (ifp->if_flags & IFF_UP) {
3824 if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3825 /*
3826 * Caller meight hold mutex, e.g.
3827 * bpf; use busy-wait for the RNDIS
3828 * reply.
3829 */
3830 HN_NO_SLEEPING(sc);
3831 hn_rxfilter_config(sc);
3832 HN_SLEEPING_OK(sc);
3833
3834 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
3835 error = hn_xpnt_vf_iocsetflags(sc);
3836 } else {
3837 hn_init_locked(sc);
3838 }
3839 } else {
3840 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
3841 hn_stop(sc, false);
3842 }
3843 sc->hn_if_flags = ifp->if_flags;
3844
3845 HN_UNLOCK(sc);
3846 break;
3847
3848 case SIOCSIFCAP:
3849 HN_LOCK(sc);
3850
3851 if (hn_xpnt_vf_isready(sc)) {
3852 ifr_vf = *ifr;
3853 strlcpy(ifr_vf.ifr_name, sc->hn_vf_ifp->if_xname,
3854 sizeof(ifr_vf.ifr_name));
3855 error = hn_xpnt_vf_iocsetcaps(sc, &ifr_vf);
3856 HN_UNLOCK(sc);
3857 break;
3858 }
3859
3860 /*
3861 * Fix up requested capabilities w/ supported capabilities,
3862 * since the supported capabilities could have been changed.
3863 */
3864 mask = (ifr->ifr_reqcap & ifp->if_capabilities) ^
3865 ifp->if_capenable;
3866
3867 if (mask & IFCAP_TXCSUM) {
3868 ifp->if_capenable ^= IFCAP_TXCSUM;
3869 if (ifp->if_capenable & IFCAP_TXCSUM)
3870 ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc);
3871 else
3872 ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc);
3873 }
3874 if (mask & IFCAP_TXCSUM_IPV6) {
3875 ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
3876 if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
3877 ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc);
3878 else
3879 ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc);
3880 }
3881
3882 /* TODO: flip RNDIS offload parameters for RXCSUM. */
3883 if (mask & IFCAP_RXCSUM)
3884 ifp->if_capenable ^= IFCAP_RXCSUM;
3885 #ifdef foo
3886 /* We can't diff IPv6 packets from IPv4 packets on RX path. */
3887 if (mask & IFCAP_RXCSUM_IPV6)
3888 ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
3889 #endif
3890
3891 if (mask & IFCAP_LRO)
3892 ifp->if_capenable ^= IFCAP_LRO;
3893
3894 if (mask & IFCAP_TSO4) {
3895 ifp->if_capenable ^= IFCAP_TSO4;
3896 if (ifp->if_capenable & IFCAP_TSO4)
3897 ifp->if_hwassist |= CSUM_IP_TSO;
3898 else
3899 ifp->if_hwassist &= ~CSUM_IP_TSO;
3900 }
3901 if (mask & IFCAP_TSO6) {
3902 ifp->if_capenable ^= IFCAP_TSO6;
3903 if (ifp->if_capenable & IFCAP_TSO6)
3904 ifp->if_hwassist |= CSUM_IP6_TSO;
3905 else
3906 ifp->if_hwassist &= ~CSUM_IP6_TSO;
3907 }
3908
3909 HN_UNLOCK(sc);
3910 break;
3911
3912 case SIOCADDMULTI:
3913 case SIOCDELMULTI:
3914 HN_LOCK(sc);
3915
3916 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3917 HN_UNLOCK(sc);
3918 break;
3919 }
3920 if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3921 /*
3922 * Multicast uses mutex; use busy-wait for
3923 * the RNDIS reply.
3924 */
3925 HN_NO_SLEEPING(sc);
3926 hn_rxfilter_config(sc);
3927 HN_SLEEPING_OK(sc);
3928 }
3929
3930 /* XXX vlan(4) style mcast addr maintenance */
3931 if (hn_xpnt_vf_isready(sc)) {
3932 int old_if_flags;
3933
3934 old_if_flags = sc->hn_vf_ifp->if_flags;
3935 hn_xpnt_vf_saveifflags(sc);
3936
3937 if ((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) &&
3938 ((old_if_flags ^ sc->hn_vf_ifp->if_flags) &
3939 IFF_ALLMULTI))
3940 error = hn_xpnt_vf_iocsetflags(sc);
3941 }
3942
3943 HN_UNLOCK(sc);
3944 break;
3945
3946 case SIOCSIFMEDIA:
3947 case SIOCGIFMEDIA:
3948 HN_LOCK(sc);
3949 if (hn_xpnt_vf_isready(sc)) {
3950 /*
3951 * SIOCGIFMEDIA expects ifmediareq, so don't
3952 * create and pass ifr_vf to the VF here; just
3953 * replace the ifr_name.
3954 */
3955 vf_ifp = sc->hn_vf_ifp;
3956 strlcpy(ifr->ifr_name, vf_ifp->if_xname,
3957 sizeof(ifr->ifr_name));
3958 error = vf_ifp->if_ioctl(vf_ifp, cmd, data);
3959 /* Restore the ifr_name. */
3960 strlcpy(ifr->ifr_name, ifp->if_xname,
3961 sizeof(ifr->ifr_name));
3962 HN_UNLOCK(sc);
3963 break;
3964 }
3965 HN_UNLOCK(sc);
3966 error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
3967 break;
3968
3969 case SIOCGIFRSSHASH:
3970 ifrh = (struct ifrsshash *)data;
3971 HN_LOCK(sc);
3972 if (sc->hn_rx_ring_inuse == 1) {
3973 HN_UNLOCK(sc);
3974 ifrh->ifrh_func = RSS_FUNC_NONE;
3975 ifrh->ifrh_types = 0;
3976 break;
3977 }
3978
3979 if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
3980 ifrh->ifrh_func = RSS_FUNC_TOEPLITZ;
3981 else
3982 ifrh->ifrh_func = RSS_FUNC_PRIVATE;
3983 ifrh->ifrh_types = hn_rss_type_fromndis(sc->hn_rss_hash);
3984 HN_UNLOCK(sc);
3985 break;
3986
3987 case SIOCGIFRSSKEY:
3988 ifrk = (struct ifrsskey *)data;
3989 HN_LOCK(sc);
3990 if (sc->hn_rx_ring_inuse == 1) {
3991 HN_UNLOCK(sc);
3992 ifrk->ifrk_func = RSS_FUNC_NONE;
3993 ifrk->ifrk_keylen = 0;
3994 break;
3995 }
3996 if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
3997 ifrk->ifrk_func = RSS_FUNC_TOEPLITZ;
3998 else
3999 ifrk->ifrk_func = RSS_FUNC_PRIVATE;
4000 ifrk->ifrk_keylen = NDIS_HASH_KEYSIZE_TOEPLITZ;
4001 memcpy(ifrk->ifrk_key, sc->hn_rss.rss_key,
4002 NDIS_HASH_KEYSIZE_TOEPLITZ);
4003 HN_UNLOCK(sc);
4004 break;
4005
4006 default:
4007 error = ether_ioctl(ifp, cmd, data);
4008 break;
4009 }
4010 return (error);
4011 }
4012
4013 static void
hn_stop(struct hn_softc * sc,bool detaching)4014 hn_stop(struct hn_softc *sc, bool detaching)
4015 {
4016 struct ifnet *ifp = sc->hn_ifp;
4017 int i;
4018
4019 HN_LOCK_ASSERT(sc);
4020
4021 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
4022 ("synthetic parts were not attached"));
4023
4024 /* Clear RUNNING bit ASAP. */
4025 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
4026
4027 /* Disable polling. */
4028 hn_polling(sc, 0);
4029
4030 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
4031 KASSERT(sc->hn_vf_ifp != NULL,
4032 ("%s: VF is not attached", ifp->if_xname));
4033
4034 /* Mark transparent mode VF as disabled. */
4035 hn_xpnt_vf_setdisable(sc, false /* keep hn_vf_ifp */);
4036
4037 /*
4038 * NOTE:
4039 * Datapath setting must happen _before_ bringing
4040 * the VF down.
4041 */
4042 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
4043
4044 /*
4045 * Bring the VF down.
4046 */
4047 hn_xpnt_vf_saveifflags(sc);
4048 sc->hn_vf_ifp->if_flags &= ~IFF_UP;
4049 hn_xpnt_vf_iocsetflags(sc);
4050 }
4051
4052 /* Suspend data transfers. */
4053 hn_suspend_data(sc);
4054
4055 /* Clear OACTIVE bit. */
4056 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4057 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4058 sc->hn_tx_ring[i].hn_oactive = 0;
4059
4060 /*
4061 * If the non-transparent mode VF is active, make sure
4062 * that the RX filter still allows packet reception.
4063 */
4064 if (!detaching && (sc->hn_flags & HN_FLAG_RXVF))
4065 hn_rxfilter_config(sc);
4066 }
4067
4068 static void
hn_init_locked(struct hn_softc * sc)4069 hn_init_locked(struct hn_softc *sc)
4070 {
4071 struct ifnet *ifp = sc->hn_ifp;
4072 int i;
4073
4074 HN_LOCK_ASSERT(sc);
4075
4076 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
4077 return;
4078
4079 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
4080 return;
4081
4082 /* Configure RX filter */
4083 hn_rxfilter_config(sc);
4084
4085 /* Clear OACTIVE bit. */
4086 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4087 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4088 sc->hn_tx_ring[i].hn_oactive = 0;
4089
4090 /* Clear TX 'suspended' bit. */
4091 hn_resume_tx(sc, sc->hn_tx_ring_inuse);
4092
4093 if (hn_xpnt_vf_isready(sc)) {
4094 /* Initialize transparent VF. */
4095 hn_xpnt_vf_init(sc);
4096 }
4097
4098 /* Everything is ready; unleash! */
4099 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
4100
4101 /* Re-enable polling if requested. */
4102 if (sc->hn_pollhz > 0)
4103 hn_polling(sc, sc->hn_pollhz);
4104 }
4105
/*
 * Locked wrapper around hn_init_locked(); xsc is the hn_softc.
 */
static void
hn_init(void *xsc)
{
	struct hn_softc *softc = xsc;

	HN_LOCK(softc);
	hn_init_locked(softc);
	HN_UNLOCK(softc);
}
4115
4116 #if __FreeBSD_version >= 1100099
4117
4118 static int
hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)4119 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
4120 {
4121 struct hn_softc *sc = arg1;
4122 unsigned int lenlim;
4123 int error;
4124
4125 lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
4126 error = sysctl_handle_int(oidp, &lenlim, 0, req);
4127 if (error || req->newptr == NULL)
4128 return error;
4129
4130 HN_LOCK(sc);
4131 if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
4132 lenlim > TCP_LRO_LENGTH_MAX) {
4133 HN_UNLOCK(sc);
4134 return EINVAL;
4135 }
4136 hn_set_lro_lenlim(sc, lenlim);
4137 HN_UNLOCK(sc);
4138
4139 return 0;
4140 }
4141
4142 static int
hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)4143 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
4144 {
4145 struct hn_softc *sc = arg1;
4146 int ackcnt, error, i;
4147
4148 /*
4149 * lro_ackcnt_lim is append count limit,
4150 * +1 to turn it into aggregation limit.
4151 */
4152 ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
4153 error = sysctl_handle_int(oidp, &ackcnt, 0, req);
4154 if (error || req->newptr == NULL)
4155 return error;
4156
4157 if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
4158 return EINVAL;
4159
4160 /*
4161 * Convert aggregation limit back to append
4162 * count limit.
4163 */
4164 --ackcnt;
4165 HN_LOCK(sc);
4166 for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
4167 sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
4168 HN_UNLOCK(sc);
4169 return 0;
4170 }
4171
4172 #endif
4173
4174 static int
hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)4175 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
4176 {
4177 struct hn_softc *sc = arg1;
4178 int hcsum = arg2;
4179 int on, error, i;
4180
4181 on = 0;
4182 if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
4183 on = 1;
4184
4185 error = sysctl_handle_int(oidp, &on, 0, req);
4186 if (error || req->newptr == NULL)
4187 return error;
4188
4189 HN_LOCK(sc);
4190 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4191 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4192
4193 if (on)
4194 rxr->hn_trust_hcsum |= hcsum;
4195 else
4196 rxr->hn_trust_hcsum &= ~hcsum;
4197 }
4198 HN_UNLOCK(sc);
4199 return 0;
4200 }
4201
4202 static int
hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)4203 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
4204 {
4205 struct hn_softc *sc = arg1;
4206 int chim_size, error;
4207
4208 chim_size = sc->hn_tx_ring[0].hn_chim_size;
4209 error = sysctl_handle_int(oidp, &chim_size, 0, req);
4210 if (error || req->newptr == NULL)
4211 return error;
4212
4213 if (chim_size > sc->hn_chim_szmax || chim_size <= 0)
4214 return EINVAL;
4215
4216 HN_LOCK(sc);
4217 hn_set_chim_size(sc, chim_size);
4218 HN_UNLOCK(sc);
4219 return 0;
4220 }
4221
4222 #if __FreeBSD_version < 1100095
4223 static int
hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS)4224 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS)
4225 {
4226 struct hn_softc *sc = arg1;
4227 int ofs = arg2, i, error;
4228 struct hn_rx_ring *rxr;
4229 uint64_t stat;
4230
4231 stat = 0;
4232 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4233 rxr = &sc->hn_rx_ring[i];
4234 stat += *((int *)((uint8_t *)rxr + ofs));
4235 }
4236
4237 error = sysctl_handle_64(oidp, &stat, 0, req);
4238 if (error || req->newptr == NULL)
4239 return error;
4240
4241 /* Zero out this stat. */
4242 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4243 rxr = &sc->hn_rx_ring[i];
4244 *((int *)((uint8_t *)rxr + ofs)) = 0;
4245 }
4246 return 0;
4247 }
4248 #else
4249 static int
hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)4250 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
4251 {
4252 struct hn_softc *sc = arg1;
4253 int ofs = arg2, i, error;
4254 struct hn_rx_ring *rxr;
4255 uint64_t stat;
4256
4257 stat = 0;
4258 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4259 rxr = &sc->hn_rx_ring[i];
4260 stat += *((uint64_t *)((uint8_t *)rxr + ofs));
4261 }
4262
4263 error = sysctl_handle_64(oidp, &stat, 0, req);
4264 if (error || req->newptr == NULL)
4265 return error;
4266
4267 /* Zero out this stat. */
4268 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4269 rxr = &sc->hn_rx_ring[i];
4270 *((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
4271 }
4272 return 0;
4273 }
4274
4275 #endif
4276
4277 static int
hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)4278 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
4279 {
4280 struct hn_softc *sc = arg1;
4281 int ofs = arg2, i, error;
4282 struct hn_rx_ring *rxr;
4283 u_long stat;
4284
4285 stat = 0;
4286 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4287 rxr = &sc->hn_rx_ring[i];
4288 stat += *((u_long *)((uint8_t *)rxr + ofs));
4289 }
4290
4291 error = sysctl_handle_long(oidp, &stat, 0, req);
4292 if (error || req->newptr == NULL)
4293 return error;
4294
4295 /* Zero out this stat. */
4296 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4297 rxr = &sc->hn_rx_ring[i];
4298 *((u_long *)((uint8_t *)rxr + ofs)) = 0;
4299 }
4300 return 0;
4301 }
4302
4303 static int
hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)4304 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
4305 {
4306 struct hn_softc *sc = arg1;
4307 int ofs = arg2, i, error;
4308 struct hn_tx_ring *txr;
4309 u_long stat;
4310
4311 stat = 0;
4312 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4313 txr = &sc->hn_tx_ring[i];
4314 stat += *((u_long *)((uint8_t *)txr + ofs));
4315 }
4316
4317 error = sysctl_handle_long(oidp, &stat, 0, req);
4318 if (error || req->newptr == NULL)
4319 return error;
4320
4321 /* Zero out this stat. */
4322 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4323 txr = &sc->hn_tx_ring[i];
4324 *((u_long *)((uint8_t *)txr + ofs)) = 0;
4325 }
4326 return 0;
4327 }
4328
4329 static int
hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)4330 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
4331 {
4332 struct hn_softc *sc = arg1;
4333 int ofs = arg2, i, error, conf;
4334 struct hn_tx_ring *txr;
4335
4336 txr = &sc->hn_tx_ring[0];
4337 conf = *((int *)((uint8_t *)txr + ofs));
4338
4339 error = sysctl_handle_int(oidp, &conf, 0, req);
4340 if (error || req->newptr == NULL)
4341 return error;
4342
4343 HN_LOCK(sc);
4344 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4345 txr = &sc->hn_tx_ring[i];
4346 *((int *)((uint8_t *)txr + ofs)) = conf;
4347 }
4348 HN_UNLOCK(sc);
4349
4350 return 0;
4351 }
4352
4353 static int
hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS)4354 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS)
4355 {
4356 struct hn_softc *sc = arg1;
4357 int error, size;
4358
4359 size = sc->hn_agg_size;
4360 error = sysctl_handle_int(oidp, &size, 0, req);
4361 if (error || req->newptr == NULL)
4362 return (error);
4363
4364 HN_LOCK(sc);
4365 sc->hn_agg_size = size;
4366 hn_set_txagg(sc);
4367 HN_UNLOCK(sc);
4368
4369 return (0);
4370 }
4371
4372 static int
hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS)4373 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS)
4374 {
4375 struct hn_softc *sc = arg1;
4376 int error, pkts;
4377
4378 pkts = sc->hn_agg_pkts;
4379 error = sysctl_handle_int(oidp, &pkts, 0, req);
4380 if (error || req->newptr == NULL)
4381 return (error);
4382
4383 HN_LOCK(sc);
4384 sc->hn_agg_pkts = pkts;
4385 hn_set_txagg(sc);
4386 HN_UNLOCK(sc);
4387
4388 return (0);
4389 }
4390
4391 static int
hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS)4392 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS)
4393 {
4394 struct hn_softc *sc = arg1;
4395 int pkts;
4396
4397 pkts = sc->hn_tx_ring[0].hn_agg_pktmax;
4398 return (sysctl_handle_int(oidp, &pkts, 0, req));
4399 }
4400
4401 static int
hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS)4402 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS)
4403 {
4404 struct hn_softc *sc = arg1;
4405 int align;
4406
4407 align = sc->hn_tx_ring[0].hn_agg_align;
4408 return (sysctl_handle_int(oidp, &align, 0, req));
4409 }
4410
4411 static void
hn_chan_polling(struct vmbus_channel * chan,u_int pollhz)4412 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz)
4413 {
4414 if (pollhz == 0)
4415 vmbus_chan_poll_disable(chan);
4416 else
4417 vmbus_chan_poll_enable(chan, pollhz);
4418 }
4419
4420 static void
hn_polling(struct hn_softc * sc,u_int pollhz)4421 hn_polling(struct hn_softc *sc, u_int pollhz)
4422 {
4423 int nsubch = sc->hn_rx_ring_inuse - 1;
4424
4425 HN_LOCK_ASSERT(sc);
4426
4427 if (nsubch > 0) {
4428 struct vmbus_channel **subch;
4429 int i;
4430
4431 subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
4432 for (i = 0; i < nsubch; ++i)
4433 hn_chan_polling(subch[i], pollhz);
4434 vmbus_subchan_rel(subch, nsubch);
4435 }
4436 hn_chan_polling(sc->hn_prichan, pollhz);
4437 }
4438
4439 static int
hn_polling_sysctl(SYSCTL_HANDLER_ARGS)4440 hn_polling_sysctl(SYSCTL_HANDLER_ARGS)
4441 {
4442 struct hn_softc *sc = arg1;
4443 int pollhz, error;
4444
4445 pollhz = sc->hn_pollhz;
4446 error = sysctl_handle_int(oidp, &pollhz, 0, req);
4447 if (error || req->newptr == NULL)
4448 return (error);
4449
4450 if (pollhz != 0 &&
4451 (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX))
4452 return (EINVAL);
4453
4454 HN_LOCK(sc);
4455 if (sc->hn_pollhz != pollhz) {
4456 sc->hn_pollhz = pollhz;
4457 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) &&
4458 (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
4459 hn_polling(sc, sc->hn_pollhz);
4460 }
4461 HN_UNLOCK(sc);
4462
4463 return (0);
4464 }
4465
4466 static int
hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)4467 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
4468 {
4469 struct hn_softc *sc = arg1;
4470 char verstr[16];
4471
4472 snprintf(verstr, sizeof(verstr), "%u.%u",
4473 HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
4474 HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
4475 return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
4476 }
4477
4478 static int
hn_caps_sysctl(SYSCTL_HANDLER_ARGS)4479 hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
4480 {
4481 struct hn_softc *sc = arg1;
4482 char caps_str[128];
4483 uint32_t caps;
4484
4485 HN_LOCK(sc);
4486 caps = sc->hn_caps;
4487 HN_UNLOCK(sc);
4488 snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS);
4489 return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req);
4490 }
4491
4492 static int
hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)4493 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
4494 {
4495 struct hn_softc *sc = arg1;
4496 char assist_str[128];
4497 uint32_t hwassist;
4498
4499 HN_LOCK(sc);
4500 hwassist = sc->hn_ifp->if_hwassist;
4501 HN_UNLOCK(sc);
4502 snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS);
4503 return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req);
4504 }
4505
4506 static int
hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS)4507 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS)
4508 {
4509 struct hn_softc *sc = arg1;
4510 char filter_str[128];
4511 uint32_t filter;
4512
4513 HN_LOCK(sc);
4514 filter = sc->hn_rx_filter;
4515 HN_UNLOCK(sc);
4516 snprintf(filter_str, sizeof(filter_str), "%b", filter,
4517 NDIS_PACKET_TYPES);
4518 return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req);
4519 }
4520
4521 #ifndef RSS
4522
4523 static int
hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)4524 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
4525 {
4526 struct hn_softc *sc = arg1;
4527 int error;
4528
4529 HN_LOCK(sc);
4530
4531 error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4532 if (error || req->newptr == NULL)
4533 goto back;
4534
4535 if ((sc->hn_flags & HN_FLAG_RXVF) ||
4536 (hn_xpnt_vf && sc->hn_vf_ifp != NULL)) {
4537 /*
4538 * RSS key is synchronized w/ VF's, don't allow users
4539 * to change it.
4540 */
4541 error = EBUSY;
4542 goto back;
4543 }
4544
4545 error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4546 if (error)
4547 goto back;
4548 sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
4549
4550 if (sc->hn_rx_ring_inuse > 1) {
4551 error = hn_rss_reconfig(sc);
4552 } else {
4553 /* Not RSS capable, at least for now; just save the RSS key. */
4554 error = 0;
4555 }
4556 back:
4557 HN_UNLOCK(sc);
4558 return (error);
4559 }
4560
4561 static int
hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)4562 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
4563 {
4564 struct hn_softc *sc = arg1;
4565 int error;
4566
4567 HN_LOCK(sc);
4568
4569 error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4570 if (error || req->newptr == NULL)
4571 goto back;
4572
4573 /*
4574 * Don't allow RSS indirect table change, if this interface is not
4575 * RSS capable currently.
4576 */
4577 if (sc->hn_rx_ring_inuse == 1) {
4578 error = EOPNOTSUPP;
4579 goto back;
4580 }
4581
4582 error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4583 if (error)
4584 goto back;
4585 sc->hn_flags |= HN_FLAG_HAS_RSSIND;
4586
4587 hn_rss_ind_fixup(sc);
4588 error = hn_rss_reconfig(sc);
4589 back:
4590 HN_UNLOCK(sc);
4591 return (error);
4592 }
4593
4594 #endif /* !RSS */
4595
4596 static int
hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS)4597 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS)
4598 {
4599 struct hn_softc *sc = arg1;
4600 char hash_str[128];
4601 uint32_t hash;
4602
4603 HN_LOCK(sc);
4604 hash = sc->hn_rss_hash;
4605 HN_UNLOCK(sc);
4606 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4607 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4608 }
4609
4610 static int
hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS)4611 hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS)
4612 {
4613 struct hn_softc *sc = arg1;
4614 char hash_str[128];
4615 uint32_t hash;
4616
4617 HN_LOCK(sc);
4618 hash = sc->hn_rss_hcap;
4619 HN_UNLOCK(sc);
4620 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4621 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4622 }
4623
4624 static int
hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS)4625 hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS)
4626 {
4627 struct hn_softc *sc = arg1;
4628 char hash_str[128];
4629 uint32_t hash;
4630
4631 HN_LOCK(sc);
4632 hash = sc->hn_rx_ring[0].hn_mbuf_hash;
4633 HN_UNLOCK(sc);
4634 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4635 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4636 }
4637
4638 static int
hn_vf_sysctl(SYSCTL_HANDLER_ARGS)4639 hn_vf_sysctl(SYSCTL_HANDLER_ARGS)
4640 {
4641 struct hn_softc *sc = arg1;
4642 char vf_name[IFNAMSIZ + 1];
4643 struct ifnet *vf_ifp;
4644
4645 HN_LOCK(sc);
4646 vf_name[0] = '\0';
4647 vf_ifp = sc->hn_vf_ifp;
4648 if (vf_ifp != NULL)
4649 snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname);
4650 HN_UNLOCK(sc);
4651 return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4652 }
4653
4654 static int
hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS)4655 hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS)
4656 {
4657 struct hn_softc *sc = arg1;
4658 char vf_name[IFNAMSIZ + 1];
4659 struct ifnet *vf_ifp;
4660
4661 HN_LOCK(sc);
4662 vf_name[0] = '\0';
4663 vf_ifp = sc->hn_rx_ring[0].hn_rxvf_ifp;
4664 if (vf_ifp != NULL)
4665 snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname);
4666 HN_UNLOCK(sc);
4667 return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4668 }
4669
4670 static int
hn_vflist_sysctl(SYSCTL_HANDLER_ARGS)4671 hn_vflist_sysctl(SYSCTL_HANDLER_ARGS)
4672 {
4673 struct rm_priotracker pt;
4674 struct sbuf *sb;
4675 int error, i;
4676 bool first;
4677
4678 error = sysctl_wire_old_buffer(req, 0);
4679 if (error != 0)
4680 return (error);
4681
4682 sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4683 if (sb == NULL)
4684 return (ENOMEM);
4685
4686 rm_rlock(&hn_vfmap_lock, &pt);
4687
4688 first = true;
4689 for (i = 0; i < hn_vfmap_size; ++i) {
4690 struct ifnet *ifp;
4691
4692 if (hn_vfmap[i] == NULL)
4693 continue;
4694
4695 ifp = ifnet_byindex(i);
4696 if (ifp != NULL) {
4697 if (first)
4698 sbuf_printf(sb, "%s", ifp->if_xname);
4699 else
4700 sbuf_printf(sb, " %s", ifp->if_xname);
4701 first = false;
4702 }
4703 }
4704
4705 rm_runlock(&hn_vfmap_lock, &pt);
4706
4707 error = sbuf_finish(sb);
4708 sbuf_delete(sb);
4709 return (error);
4710 }
4711
4712 static int
hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS)4713 hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS)
4714 {
4715 struct rm_priotracker pt;
4716 struct sbuf *sb;
4717 int error, i;
4718 bool first;
4719
4720 error = sysctl_wire_old_buffer(req, 0);
4721 if (error != 0)
4722 return (error);
4723
4724 sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4725 if (sb == NULL)
4726 return (ENOMEM);
4727
4728 rm_rlock(&hn_vfmap_lock, &pt);
4729
4730 first = true;
4731 for (i = 0; i < hn_vfmap_size; ++i) {
4732 struct ifnet *ifp, *hn_ifp;
4733
4734 hn_ifp = hn_vfmap[i];
4735 if (hn_ifp == NULL)
4736 continue;
4737
4738 ifp = ifnet_byindex(i);
4739 if (ifp != NULL) {
4740 if (first) {
4741 sbuf_printf(sb, "%s:%s", ifp->if_xname,
4742 hn_ifp->if_xname);
4743 } else {
4744 sbuf_printf(sb, " %s:%s", ifp->if_xname,
4745 hn_ifp->if_xname);
4746 }
4747 first = false;
4748 }
4749 }
4750
4751 rm_runlock(&hn_vfmap_lock, &pt);
4752
4753 error = sbuf_finish(sb);
4754 sbuf_delete(sb);
4755 return (error);
4756 }
4757
4758 static int
hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS)4759 hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS)
4760 {
4761 struct hn_softc *sc = arg1;
4762 int error, onoff = 0;
4763
4764 if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF)
4765 onoff = 1;
4766 error = sysctl_handle_int(oidp, &onoff, 0, req);
4767 if (error || req->newptr == NULL)
4768 return (error);
4769
4770 HN_LOCK(sc);
4771 /* NOTE: hn_vf_lock for hn_transmit() */
4772 rm_wlock(&sc->hn_vf_lock);
4773 if (onoff)
4774 sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
4775 else
4776 sc->hn_xvf_flags &= ~HN_XVFFLAG_ACCBPF;
4777 rm_wunlock(&sc->hn_vf_lock);
4778 HN_UNLOCK(sc);
4779
4780 return (0);
4781 }
4782
4783 static int
hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS)4784 hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS)
4785 {
4786 struct hn_softc *sc = arg1;
4787 int enabled = 0;
4788
4789 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
4790 enabled = 1;
4791 return (sysctl_handle_int(oidp, &enabled, 0, req));
4792 }
4793
4794 static int
hn_check_iplen(const struct mbuf * m,int hoff)4795 hn_check_iplen(const struct mbuf *m, int hoff)
4796 {
4797 const struct ip *ip;
4798 int len, iphlen, iplen;
4799 const struct tcphdr *th;
4800 int thoff; /* TCP data offset */
4801
4802 len = hoff + sizeof(struct ip);
4803
4804 /* The packet must be at least the size of an IP header. */
4805 if (m->m_pkthdr.len < len)
4806 return IPPROTO_DONE;
4807
4808 /* The fixed IP header must reside completely in the first mbuf. */
4809 if (m->m_len < len)
4810 return IPPROTO_DONE;
4811
4812 ip = mtodo(m, hoff);
4813
4814 /* Bound check the packet's stated IP header length. */
4815 iphlen = ip->ip_hl << 2;
4816 if (iphlen < sizeof(struct ip)) /* minimum header length */
4817 return IPPROTO_DONE;
4818
4819 /* The full IP header must reside completely in the one mbuf. */
4820 if (m->m_len < hoff + iphlen)
4821 return IPPROTO_DONE;
4822
4823 iplen = ntohs(ip->ip_len);
4824
4825 /*
4826 * Check that the amount of data in the buffers is as
4827 * at least much as the IP header would have us expect.
4828 */
4829 if (m->m_pkthdr.len < hoff + iplen)
4830 return IPPROTO_DONE;
4831
4832 /*
4833 * Ignore IP fragments.
4834 */
4835 if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
4836 return IPPROTO_DONE;
4837
4838 /*
4839 * The TCP/IP or UDP/IP header must be entirely contained within
4840 * the first fragment of a packet.
4841 */
4842 switch (ip->ip_p) {
4843 case IPPROTO_TCP:
4844 if (iplen < iphlen + sizeof(struct tcphdr))
4845 return IPPROTO_DONE;
4846 if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
4847 return IPPROTO_DONE;
4848 th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
4849 thoff = th->th_off << 2;
4850 if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
4851 return IPPROTO_DONE;
4852 if (m->m_len < hoff + iphlen + thoff)
4853 return IPPROTO_DONE;
4854 break;
4855 case IPPROTO_UDP:
4856 if (iplen < iphlen + sizeof(struct udphdr))
4857 return IPPROTO_DONE;
4858 if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
4859 return IPPROTO_DONE;
4860 break;
4861 default:
4862 if (iplen < iphlen)
4863 return IPPROTO_DONE;
4864 break;
4865 }
4866 return ip->ip_p;
4867 }
4868
4869 static void
hn_rxpkt_proto(const struct mbuf * m_new,int * l3proto,int * l4proto)4870 hn_rxpkt_proto(const struct mbuf *m_new, int *l3proto, int *l4proto)
4871 {
4872 const struct ether_header *eh;
4873 uint16_t etype;
4874 int hoff;
4875
4876 hoff = sizeof(*eh);
4877 /* Checked at the beginning of this function. */
4878 KASSERT(m_new->m_len >= hoff, ("not ethernet frame"));
4879
4880 eh = mtod(m_new, const struct ether_header *);
4881 etype = ntohs(eh->ether_type);
4882 if (etype == ETHERTYPE_VLAN) {
4883 const struct ether_vlan_header *evl;
4884
4885 hoff = sizeof(*evl);
4886 if (m_new->m_len < hoff)
4887 return;
4888 evl = mtod(m_new, const struct ether_vlan_header *);
4889 etype = ntohs(evl->evl_proto);
4890 }
4891 *l3proto = etype;
4892
4893 if (etype == ETHERTYPE_IP)
4894 *l4proto = hn_check_iplen(m_new, hoff);
4895 else
4896 *l4proto = IPPROTO_DONE;
4897 }
4898
4899 static int
hn_create_rx_data(struct hn_softc * sc,int ring_cnt)4900 hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
4901 {
4902 struct sysctl_oid_list *child;
4903 struct sysctl_ctx_list *ctx;
4904 device_t dev = sc->hn_dev;
4905 #if defined(INET) || defined(INET6)
4906 #if __FreeBSD_version >= 1100095
4907 int lroent_cnt;
4908 #endif
4909 #endif
4910 int i;
4911
4912 /*
4913 * Create RXBUF for reception.
4914 *
4915 * NOTE:
4916 * - It is shared by all channels.
4917 * - A large enough buffer is allocated, certain version of NVSes
4918 * may further limit the usable space.
4919 */
4920 sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
4921 PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma,
4922 BUS_DMA_WAITOK | BUS_DMA_ZERO);
4923 if (sc->hn_rxbuf == NULL) {
4924 device_printf(sc->hn_dev, "allocate rxbuf failed\n");
4925 return (ENOMEM);
4926 }
4927
4928 sc->hn_rx_ring_cnt = ring_cnt;
4929 sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;
4930
4931 sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
4932 M_DEVBUF, M_WAITOK | M_ZERO);
4933
4934 #if defined(INET) || defined(INET6)
4935 #if __FreeBSD_version >= 1100095
4936 lroent_cnt = hn_lro_entry_count;
4937 if (lroent_cnt < TCP_LRO_ENTRIES)
4938 lroent_cnt = TCP_LRO_ENTRIES;
4939 if (bootverbose)
4940 device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
4941 #endif
4942 #endif /* INET || INET6 */
4943
4944 ctx = device_get_sysctl_ctx(dev);
4945 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
4946
4947 /* Create dev.hn.UNIT.rx sysctl tree */
4948 sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
4949 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
4950
4951 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4952 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4953
4954 rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
4955 PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE,
4956 &rxr->hn_br_dma, BUS_DMA_WAITOK);
4957 if (rxr->hn_br == NULL) {
4958 device_printf(dev, "allocate bufring failed\n");
4959 return (ENOMEM);
4960 }
4961
4962 if (hn_trust_hosttcp)
4963 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
4964 if (hn_trust_hostudp)
4965 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
4966 if (hn_trust_hostip)
4967 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
4968 rxr->hn_mbuf_hash = NDIS_HASH_ALL;
4969 rxr->hn_ifp = sc->hn_ifp;
4970 if (i < sc->hn_tx_ring_cnt)
4971 rxr->hn_txr = &sc->hn_tx_ring[i];
4972 rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF;
4973 rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK);
4974 rxr->hn_rx_idx = i;
4975 rxr->hn_rxbuf = sc->hn_rxbuf;
4976
4977 /*
4978 * Initialize LRO.
4979 */
4980 #if defined(INET) || defined(INET6)
4981 #if __FreeBSD_version >= 1100095
4982 tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
4983 hn_lro_mbufq_depth);
4984 #else
4985 tcp_lro_init(&rxr->hn_lro);
4986 rxr->hn_lro.ifp = sc->hn_ifp;
4987 #endif
4988 #if __FreeBSD_version >= 1100099
4989 rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
4990 rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
4991 #endif
4992 #endif /* INET || INET6 */
4993
4994 if (sc->hn_rx_sysctl_tree != NULL) {
4995 char name[16];
4996
4997 /*
4998 * Create per RX ring sysctl tree:
4999 * dev.hn.UNIT.rx.RINGID
5000 */
5001 snprintf(name, sizeof(name), "%d", i);
5002 rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
5003 SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
5004 OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5005
5006 if (rxr->hn_rx_sysctl_tree != NULL) {
5007 SYSCTL_ADD_ULONG(ctx,
5008 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5009 OID_AUTO, "packets", CTLFLAG_RW,
5010 &rxr->hn_pkts, "# of packets received");
5011 SYSCTL_ADD_ULONG(ctx,
5012 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5013 OID_AUTO, "rss_pkts", CTLFLAG_RW,
5014 &rxr->hn_rss_pkts,
5015 "# of packets w/ RSS info received");
5016 SYSCTL_ADD_INT(ctx,
5017 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5018 OID_AUTO, "pktbuf_len", CTLFLAG_RD,
5019 &rxr->hn_pktbuf_len, 0,
5020 "Temporary channel packet buffer length");
5021 }
5022 }
5023 }
5024
5025 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
5026 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5027 __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
5028 #if __FreeBSD_version < 1100095
5029 hn_rx_stat_int_sysctl,
5030 #else
5031 hn_rx_stat_u64_sysctl,
5032 #endif
5033 "LU", "LRO queued");
5034 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
5035 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5036 __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
5037 #if __FreeBSD_version < 1100095
5038 hn_rx_stat_int_sysctl,
5039 #else
5040 hn_rx_stat_u64_sysctl,
5041 #endif
5042 "LU", "LRO flushed");
5043 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
5044 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5045 __offsetof(struct hn_rx_ring, hn_lro_tried),
5046 hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
5047 #if __FreeBSD_version >= 1100099
5048 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
5049 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5050 hn_lro_lenlim_sysctl, "IU",
5051 "Max # of data bytes to be aggregated by LRO");
5052 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
5053 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5054 hn_lro_ackcnt_sysctl, "I",
5055 "Max # of ACKs to be aggregated by LRO");
5056 #endif
5057 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
5058 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
5059 hn_trust_hcsum_sysctl, "I",
5060 "Trust tcp segement verification on host side, "
5061 "when csum info is missing");
5062 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
5063 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
5064 hn_trust_hcsum_sysctl, "I",
5065 "Trust udp datagram verification on host side, "
5066 "when csum info is missing");
5067 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
5068 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
5069 hn_trust_hcsum_sysctl, "I",
5070 "Trust ip packet verification on host side, "
5071 "when csum info is missing");
5072 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
5073 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5074 __offsetof(struct hn_rx_ring, hn_csum_ip),
5075 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
5076 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
5077 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5078 __offsetof(struct hn_rx_ring, hn_csum_tcp),
5079 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
5080 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
5081 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5082 __offsetof(struct hn_rx_ring, hn_csum_udp),
5083 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
5084 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
5085 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5086 __offsetof(struct hn_rx_ring, hn_csum_trusted),
5087 hn_rx_stat_ulong_sysctl, "LU",
5088 "# of packets that we trust host's csum verification");
5089 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
5090 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5091 __offsetof(struct hn_rx_ring, hn_small_pkts),
5092 hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
5093 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed",
5094 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5095 __offsetof(struct hn_rx_ring, hn_ack_failed),
5096 hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures");
5097 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
5098 CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
5099 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
5100 CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");
5101
5102 return (0);
5103 }
5104
5105 static void
hn_destroy_rx_data(struct hn_softc * sc)5106 hn_destroy_rx_data(struct hn_softc *sc)
5107 {
5108 int i;
5109
5110 if (sc->hn_rxbuf != NULL) {
5111 if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0)
5112 hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf);
5113 else
5114 device_printf(sc->hn_dev, "RXBUF is referenced\n");
5115 sc->hn_rxbuf = NULL;
5116 }
5117
5118 if (sc->hn_rx_ring_cnt == 0)
5119 return;
5120
5121 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
5122 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
5123
5124 if (rxr->hn_br == NULL)
5125 continue;
5126 if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) {
5127 hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br);
5128 } else {
5129 device_printf(sc->hn_dev,
5130 "%dth channel bufring is referenced", i);
5131 }
5132 rxr->hn_br = NULL;
5133
5134 #if defined(INET) || defined(INET6)
5135 tcp_lro_free(&rxr->hn_lro);
5136 #endif
5137 free(rxr->hn_pktbuf, M_DEVBUF);
5138 }
5139 free(sc->hn_rx_ring, M_DEVBUF);
5140 sc->hn_rx_ring = NULL;
5141
5142 sc->hn_rx_ring_cnt = 0;
5143 sc->hn_rx_ring_inuse = 0;
5144 }
5145
/*
 * Initialize TX ring number id of the device.
 *
 * This sets up the ring's locks, allocates its TX descriptor array,
 * selects the taskqueue and the transmit/txeof task functions, creates
 * the two DMA tags (RNDIS packet messages and TX data), prepares every
 * TX descriptor (RNDIS packet message memory, its loaded DMA map and a
 * data DMA map) and finally registers the per-ring sysctl nodes under
 * dev.hn.UNIT.tx.RINGID.
 *
 * Returns 0 on success, or the error returned by the failing bus_dma
 * call.  On failure only the resources of the descriptor being set up
 * are undone here; the tags, locks and previously prepared descriptors
 * are left as they are.
 * NOTE(review): the remaining cleanup is presumably done by the caller
 * -- confirm.
 */
static int
hn_tx_ring_create(struct hn_softc *sc, int id)
{
	struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
	device_t dev = sc->hn_dev;
	bus_dma_tag_t parent_dtag;
	int error, i;

	txr->hn_sc = sc;
	txr->hn_tx_idx = id;

#ifndef HN_USE_TXDESC_BUFRING
	mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
#endif
	mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);

	txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
	txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
	    M_DEVBUF, M_WAITOK | M_ZERO);
	/* Free descriptors live either on an SLIST or in a buf_ring. */
#ifndef HN_USE_TXDESC_BUFRING
	SLIST_INIT(&txr->hn_txlist);
#else
	txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF,
	    M_WAITOK, &txr->hn_tx_lock);
#endif

	/*
	 * Pick the taskqueue: either the vmbus event taskqueue for this
	 * ring's CPU, or one of the driver's own TX taskqueues, chosen
	 * by ring index modulo hn_tx_taskq_cnt.
	 */
	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) {
		txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ(
		    device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id));
	} else {
		txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt];
	}

	/*
	 * The if_start variants are used only when HN_IFSTART_SUPPORT is
	 * compiled in and hn_use_if_start is set; otherwise the xmit
	 * variants are used and an mbuf buf_ring is allocated.
	 */
#ifdef HN_IFSTART_SUPPORT
	if (hn_use_if_start) {
		txr->hn_txeof = hn_start_txeof;
		TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
		TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
	} else
#endif
	{
		int br_depth;

		txr->hn_txeof = hn_xmit_txeof;
		TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
		TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);

		br_depth = hn_get_txswq_depth(txr);
		txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF,
		    M_WAITOK, &txr->hn_tx_lock);
	}

	txr->hn_direct_tx_size = hn_direct_tx_size;

	/*
	 * Always schedule transmission instead of trying to do direct
	 * transmission.  This one gives the best performance so far.
	 */
	txr->hn_sched_tx = 1;

	parent_dtag = bus_get_dma_tag(dev);

	/* DMA tag for RNDIS packet messages. */
	error = bus_dma_tag_create(parent_dtag, /* parent */
	    HN_RNDIS_PKT_ALIGN,		/* alignment */
	    HN_RNDIS_PKT_BOUNDARY,	/* boundary */
	    BUS_SPACE_MAXADDR,		/* lowaddr */
	    BUS_SPACE_MAXADDR,		/* highaddr */
	    NULL, NULL,			/* filter, filterarg */
	    HN_RNDIS_PKT_LEN,		/* maxsize */
	    1,				/* nsegments */
	    HN_RNDIS_PKT_LEN,		/* maxsegsize */
	    0,				/* flags */
	    NULL,			/* lockfunc */
	    NULL,			/* lockfuncarg */
	    &txr->hn_tx_rndis_dtag);
	if (error) {
		device_printf(dev, "failed to create rndis dmatag\n");
		return error;
	}

	/* DMA tag for data. */
	error = bus_dma_tag_create(parent_dtag, /* parent */
	    1,				/* alignment */
	    HN_TX_DATA_BOUNDARY,	/* boundary */
	    BUS_SPACE_MAXADDR,		/* lowaddr */
	    BUS_SPACE_MAXADDR,		/* highaddr */
	    NULL, NULL,			/* filter, filterarg */
	    HN_TX_DATA_MAXSIZE,		/* maxsize */
	    HN_TX_DATA_SEGCNT_MAX,	/* nsegments */
	    HN_TX_DATA_SEGSIZE,		/* maxsegsize */
	    0,				/* flags */
	    NULL,			/* lockfunc */
	    NULL,			/* lockfuncarg */
	    &txr->hn_tx_data_dtag);
	if (error) {
		device_printf(dev, "failed to create data dmatag\n");
		return error;
	}

	/* Prepare every TX descriptor and put it on the free list. */
	for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
		struct hn_txdesc *txd = &txr->hn_txdesc[i];

		txd->txr = txr;
		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
		STAILQ_INIT(&txd->agg_list);

		/*
		 * Allocate and load RNDIS packet message.
		 */
		error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
		    (void **)&txd->rndis_pkt,
		    BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
		    &txd->rndis_pkt_dmap);
		if (error) {
			device_printf(dev,
			    "failed to allocate rndis_packet_msg, %d\n", i);
			return error;
		}

		error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
		    txd->rndis_pkt_dmap,
		    txd->rndis_pkt, HN_RNDIS_PKT_LEN,
		    hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
		    BUS_DMA_NOWAIT);
		if (error) {
			device_printf(dev,
			    "failed to load rndis_packet_msg, %d\n", i);
			/* Undo this descriptor's allocation only. */
			bus_dmamem_free(txr->hn_tx_rndis_dtag,
			    txd->rndis_pkt, txd->rndis_pkt_dmap);
			return error;
		}

		/* DMA map for TX data. */
		error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
		    &txd->data_dmap);
		if (error) {
			device_printf(dev,
			    "failed to allocate tx data dmamap\n");
			/* Undo this descriptor's load and allocation. */
			bus_dmamap_unload(txr->hn_tx_rndis_dtag,
			    txd->rndis_pkt_dmap);
			bus_dmamem_free(txr->hn_tx_rndis_dtag,
			    txd->rndis_pkt, txd->rndis_pkt_dmap);
			return error;
		}

		/* All set, put it to list */
		txd->flags |= HN_TXD_FLAG_ONLIST;
#ifndef HN_USE_TXDESC_BUFRING
		SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
#else
		buf_ring_enqueue(txr->hn_txdesc_br, txd);
#endif
	}
	/* Every descriptor is available at this point. */
	txr->hn_txdesc_avail = txr->hn_txdesc_cnt;

	if (sc->hn_tx_sysctl_tree != NULL) {
		struct sysctl_oid_list *child;
		struct sysctl_ctx_list *ctx;
		char name[16];

		/*
		 * Create per TX ring sysctl tree:
		 * dev.hn.UNIT.tx.RINGID
		 */
		ctx = device_get_sysctl_ctx(dev);
		child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);

		snprintf(name, sizeof(name), "%d", id);
		txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
		    name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");

		if (txr->hn_tx_sysctl_tree != NULL) {
			child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);

#ifdef HN_DEBUG
			SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
			    CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
			    "# of available TX descs");
#endif
			/* "oactive" is not exported in if_start mode. */
#ifdef HN_IFSTART_SUPPORT
			if (!hn_use_if_start)
#endif
			{
				SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
				    CTLFLAG_RD, &txr->hn_oactive, 0,
				    "over active");
			}
			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
			    CTLFLAG_RW, &txr->hn_pkts,
			    "# of packets transmitted");
			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends",
			    CTLFLAG_RW, &txr->hn_sends, "# of sends");
		}
	}

	return 0;
}
5344
5345 static void
hn_txdesc_dmamap_destroy(struct hn_txdesc * txd)5346 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
5347 {
5348 struct hn_tx_ring *txr = txd->txr;
5349
5350 KASSERT(txd->m == NULL, ("still has mbuf installed"));
5351 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
5352
5353 bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
5354 bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
5355 txd->rndis_pkt_dmap);
5356 bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
5357 }
5358
5359 static void
hn_txdesc_gc(struct hn_tx_ring * txr,struct hn_txdesc * txd)5360 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd)
5361 {
5362
5363 KASSERT(txd->refs == 0 || txd->refs == 1,
5364 ("invalid txd refs %d", txd->refs));
5365
5366 /* Aggregated txds will be freed by their aggregating txd. */
5367 if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) {
5368 int freed;
5369
5370 freed = hn_txdesc_put(txr, txd);
5371 KASSERT(freed, ("can't free txdesc"));
5372 }
5373 }
5374
5375 static void
hn_tx_ring_destroy(struct hn_tx_ring * txr)5376 hn_tx_ring_destroy(struct hn_tx_ring *txr)
5377 {
5378 int i;
5379
5380 if (txr->hn_txdesc == NULL)
5381 return;
5382
5383 /*
5384 * NOTE:
5385 * Because the freeing of aggregated txds will be deferred
5386 * to the aggregating txd, two passes are used here:
5387 * - The first pass GCes any pending txds. This GC is necessary,
5388 * since if the channels are revoked, hypervisor will not
5389 * deliver send-done for all pending txds.
5390 * - The second pass frees the busdma stuffs, i.e. after all txds
5391 * were freed.
5392 */
5393 for (i = 0; i < txr->hn_txdesc_cnt; ++i)
5394 hn_txdesc_gc(txr, &txr->hn_txdesc[i]);
5395 for (i = 0; i < txr->hn_txdesc_cnt; ++i)
5396 hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]);
5397
5398 if (txr->hn_tx_data_dtag != NULL)
5399 bus_dma_tag_destroy(txr->hn_tx_data_dtag);
5400 if (txr->hn_tx_rndis_dtag != NULL)
5401 bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
5402
5403 #ifdef HN_USE_TXDESC_BUFRING
5404 buf_ring_free(txr->hn_txdesc_br, M_DEVBUF);
5405 #endif
5406
5407 free(txr->hn_txdesc, M_DEVBUF);
5408 txr->hn_txdesc = NULL;
5409
5410 if (txr->hn_mbuf_br != NULL)
5411 buf_ring_free(txr->hn_mbuf_br, M_DEVBUF);
5412
5413 #ifndef HN_USE_TXDESC_BUFRING
5414 mtx_destroy(&txr->hn_txlist_spin);
5415 #endif
5416 mtx_destroy(&txr->hn_tx_lock);
5417 }
5418
5419 static int
hn_create_tx_data(struct hn_softc * sc,int ring_cnt)5420 hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
5421 {
5422 struct sysctl_oid_list *child;
5423 struct sysctl_ctx_list *ctx;
5424 int i;
5425
5426 /*
5427 * Create TXBUF for chimney sending.
5428 *
5429 * NOTE: It is shared by all channels.
5430 */
5431 sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev),
5432 PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma,
5433 BUS_DMA_WAITOK | BUS_DMA_ZERO);
5434 if (sc->hn_chim == NULL) {
5435 device_printf(sc->hn_dev, "allocate txbuf failed\n");
5436 return (ENOMEM);
5437 }
5438
5439 sc->hn_tx_ring_cnt = ring_cnt;
5440 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
5441
5442 sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
5443 M_DEVBUF, M_WAITOK | M_ZERO);
5444
5445 ctx = device_get_sysctl_ctx(sc->hn_dev);
5446 child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));
5447
5448 /* Create dev.hn.UNIT.tx sysctl tree */
5449 sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
5450 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5451
5452 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
5453 int error;
5454
5455 error = hn_tx_ring_create(sc, i);
5456 if (error)
5457 return error;
5458 }
5459
5460 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
5461 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5462 __offsetof(struct hn_tx_ring, hn_no_txdescs),
5463 hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
5464 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
5465 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5466 __offsetof(struct hn_tx_ring, hn_send_failed),
5467 hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure");
5468 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
5469 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5470 __offsetof(struct hn_tx_ring, hn_txdma_failed),
5471 hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure");
5472 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed",
5473 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5474 __offsetof(struct hn_tx_ring, hn_flush_failed),
5475 hn_tx_stat_ulong_sysctl, "LU",
5476 "# of packet transmission aggregation flush failure");
5477 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
5478 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5479 __offsetof(struct hn_tx_ring, hn_tx_collapsed),
5480 hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed");
5481 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
5482 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5483 __offsetof(struct hn_tx_ring, hn_tx_chimney),
5484 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send");
5485 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
5486 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5487 __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
5488 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
5489 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
5490 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
5491 "# of total TX descs");
5492 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
5493 CTLFLAG_RD, &sc->hn_chim_szmax, 0,
5494 "Chimney send packet size upper boundary");
5495 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
5496 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5497 hn_chim_size_sysctl, "I", "Chimney send packet size limit");
5498 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
5499 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5500 __offsetof(struct hn_tx_ring, hn_direct_tx_size),
5501 hn_tx_conf_int_sysctl, "I",
5502 "Size of the packet for direct transmission");
5503 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
5504 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5505 __offsetof(struct hn_tx_ring, hn_sched_tx),
5506 hn_tx_conf_int_sysctl, "I",
5507 "Always schedule transmission "
5508 "instead of doing direct transmission");
5509 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
5510 CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
5511 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
5512 CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
5513 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax",
5514 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0,
5515 "Applied packet transmission aggregation size");
5516 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax",
5517 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
5518 hn_txagg_pktmax_sysctl, "I",
5519 "Applied packet transmission aggregation packets");
5520 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align",
5521 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
5522 hn_txagg_align_sysctl, "I",
5523 "Applied packet transmission aggregation alignment");
5524
5525 return 0;
5526 }
5527
5528 static void
hn_set_chim_size(struct hn_softc * sc,int chim_size)5529 hn_set_chim_size(struct hn_softc *sc, int chim_size)
5530 {
5531 int i;
5532
5533 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5534 sc->hn_tx_ring[i].hn_chim_size = chim_size;
5535 }
5536
5537 static void
hn_set_tso_maxsize(struct hn_softc * sc,int tso_maxlen,int mtu)5538 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
5539 {
5540 struct ifnet *ifp = sc->hn_ifp;
5541 u_int hw_tsomax;
5542 int tso_minlen;
5543
5544 HN_LOCK_ASSERT(sc);
5545
5546 if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
5547 return;
5548
5549 KASSERT(sc->hn_ndis_tso_sgmin >= 2,
5550 ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin));
5551 tso_minlen = sc->hn_ndis_tso_sgmin * mtu;
5552
5553 KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen &&
5554 sc->hn_ndis_tso_szmax <= IP_MAXPACKET,
5555 ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax));
5556
5557 if (tso_maxlen < tso_minlen)
5558 tso_maxlen = tso_minlen;
5559 else if (tso_maxlen > IP_MAXPACKET)
5560 tso_maxlen = IP_MAXPACKET;
5561 if (tso_maxlen > sc->hn_ndis_tso_szmax)
5562 tso_maxlen = sc->hn_ndis_tso_szmax;
5563 hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
5564
5565 if (hn_xpnt_vf_isready(sc)) {
5566 if (hw_tsomax > sc->hn_vf_ifp->if_hw_tsomax)
5567 hw_tsomax = sc->hn_vf_ifp->if_hw_tsomax;
5568 }
5569 ifp->if_hw_tsomax = hw_tsomax;
5570 if (bootverbose)
5571 if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax);
5572 }
5573
5574 static void
hn_fixup_tx_data(struct hn_softc * sc)5575 hn_fixup_tx_data(struct hn_softc *sc)
5576 {
5577 uint64_t csum_assist;
5578 int i;
5579
5580 hn_set_chim_size(sc, sc->hn_chim_szmax);
5581 if (hn_tx_chimney_size > 0 &&
5582 hn_tx_chimney_size < sc->hn_chim_szmax)
5583 hn_set_chim_size(sc, hn_tx_chimney_size);
5584
5585 csum_assist = 0;
5586 if (sc->hn_caps & HN_CAP_IPCS)
5587 csum_assist |= CSUM_IP;
5588 if (sc->hn_caps & HN_CAP_TCP4CS)
5589 csum_assist |= CSUM_IP_TCP;
5590 if ((sc->hn_caps & HN_CAP_UDP4CS) && hn_enable_udp4cs)
5591 csum_assist |= CSUM_IP_UDP;
5592 if (sc->hn_caps & HN_CAP_TCP6CS)
5593 csum_assist |= CSUM_IP6_TCP;
5594 if ((sc->hn_caps & HN_CAP_UDP6CS) && hn_enable_udp6cs)
5595 csum_assist |= CSUM_IP6_UDP;
5596 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5597 sc->hn_tx_ring[i].hn_csum_assist = csum_assist;
5598
5599 if (sc->hn_caps & HN_CAP_HASHVAL) {
5600 /*
5601 * Support HASHVAL pktinfo on TX path.
5602 */
5603 if (bootverbose)
5604 if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n");
5605 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5606 sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
5607 }
5608 }
5609
5610 static void
hn_fixup_rx_data(struct hn_softc * sc)5611 hn_fixup_rx_data(struct hn_softc *sc)
5612 {
5613
5614 if (sc->hn_caps & HN_CAP_UDPHASH) {
5615 int i;
5616
5617 for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
5618 sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_UDP_HASH;
5619 }
5620 }
5621
5622 static void
hn_destroy_tx_data(struct hn_softc * sc)5623 hn_destroy_tx_data(struct hn_softc *sc)
5624 {
5625 int i;
5626
5627 if (sc->hn_chim != NULL) {
5628 if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) {
5629 hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim);
5630 } else {
5631 device_printf(sc->hn_dev,
5632 "chimney sending buffer is referenced");
5633 }
5634 sc->hn_chim = NULL;
5635 }
5636
5637 if (sc->hn_tx_ring_cnt == 0)
5638 return;
5639
5640 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5641 hn_tx_ring_destroy(&sc->hn_tx_ring[i]);
5642
5643 free(sc->hn_tx_ring, M_DEVBUF);
5644 sc->hn_tx_ring = NULL;
5645
5646 sc->hn_tx_ring_cnt = 0;
5647 sc->hn_tx_ring_inuse = 0;
5648 }
5649
5650 #ifdef HN_IFSTART_SUPPORT
5651
5652 static void
hn_start_taskfunc(void * xtxr,int pending __unused)5653 hn_start_taskfunc(void *xtxr, int pending __unused)
5654 {
5655 struct hn_tx_ring *txr = xtxr;
5656
5657 mtx_lock(&txr->hn_tx_lock);
5658 hn_start_locked(txr, 0);
5659 mtx_unlock(&txr->hn_tx_lock);
5660 }
5661
/*
 * Move packets from the interface send queue (if_snd) to the TX ring.
 *
 * The caller must hold txr->hn_tx_lock, and txr must be the first TX
 * ring; both are asserted below.
 *
 * If len is greater than zero, a packet longer than len is put back
 * to the head of the send queue and the loop stops; 1 is returned in
 * this case, so that the caller can dispatch the remaining work to the
 * TX taskqueue.  In all other cases 0 is returned.
 *
 * IFF_DRV_OACTIVE is set if no TX descriptor is available, or if
 * sending (hn_txpkt() or hn_flush_txagg()) fails.
 */
static int
hn_start_locked(struct hn_tx_ring *txr, int len)
{
	struct hn_softc *sc = txr->hn_sc;
	struct ifnet *ifp = sc->hn_ifp;
	int sched = 0;

	KASSERT(hn_use_if_start,
	    ("hn_start_locked is called, when if_start is disabled"));
	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));

	/* Nothing is sent while the TX ring is suspended. */
	if (__predict_false(txr->hn_suspended))
		return (0);

	/* Proceed only if RUNNING is set and OACTIVE is clear. */
	if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
	    IFF_DRV_RUNNING)
		return (0);

	while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
		struct hn_txdesc *txd;
		struct mbuf *m_head;
		int error;

		IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
		if (m_head == NULL)
			break;

		if (len > 0 && m_head->m_pkthdr.len > len) {
			/*
			 * This sending could be time consuming; let callers
			 * dispatch this packet sending (and sending of any
			 * following up packets) to tx taskqueue.
			 */
			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
			sched = 1;
			break;
		}

#if defined(INET6) || defined(INET)
		/*
		 * A NULL mbuf returned by the fixup functions is counted
		 * as an output error and the packet is skipped.
		 */
		if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
			m_head = hn_tso_fixup(m_head);
			if (__predict_false(m_head == NULL)) {
				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
				continue;
			}
		} else if (m_head->m_pkthdr.csum_flags &
		    (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
			m_head = hn_set_hlen(m_head);
			if (__predict_false(m_head == NULL)) {
				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
				continue;
			}
		}
#endif

		txd = hn_txdesc_get(txr);
		if (txd == NULL) {
			/*
			 * Out of TX descriptors; requeue the packet and
			 * mark the interface as OACTIVE.
			 */
			txr->hn_no_txdescs++;
			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
			atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
			break;
		}

		error = hn_encap(ifp, txr, txd, &m_head);
		if (error) {
			/* Both txd and m_head are freed */
			KASSERT(txr->hn_agg_txd == NULL,
			    ("encap failed w/ pending aggregating txdesc"));
			continue;
		}

		if (txr->hn_agg_pktleft == 0) {
			if (txr->hn_agg_txd != NULL) {
				/* Flush the pending aggregating txdesc. */
				KASSERT(m_head == NULL,
				    ("pending mbuf for aggregating txdesc"));
				error = hn_flush_txagg(ifp, txr);
				if (__predict_false(error)) {
					atomic_set_int(&ifp->if_drv_flags,
					    IFF_DRV_OACTIVE);
					break;
				}
			} else {
				/* No aggregation; send this txdesc directly. */
				KASSERT(m_head != NULL, ("mbuf was freed"));
				error = hn_txpkt(ifp, txr, txd);
				if (__predict_false(error)) {
					/* txd is freed, but m_head is not */
					IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
					atomic_set_int(&ifp->if_drv_flags,
					    IFF_DRV_OACTIVE);
					break;
				}
			}
		}
#ifdef INVARIANTS
		else {
			KASSERT(txr->hn_agg_txd != NULL,
			    ("no aggregating txdesc"));
			KASSERT(m_head == NULL,
			    ("pending mbuf for aggregating txdesc"));
		}
#endif
	}

	/* Flush pending aggregated transmission. */
	if (txr->hn_agg_txd != NULL)
		hn_flush_txagg(ifp, txr);
	return (sched);
}
5772
5773 static void
hn_start(struct ifnet * ifp)5774 hn_start(struct ifnet *ifp)
5775 {
5776 struct hn_softc *sc = ifp->if_softc;
5777 struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
5778
5779 if (txr->hn_sched_tx)
5780 goto do_sched;
5781
5782 if (mtx_trylock(&txr->hn_tx_lock)) {
5783 int sched;
5784
5785 sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5786 mtx_unlock(&txr->hn_tx_lock);
5787 if (!sched)
5788 return;
5789 }
5790 do_sched:
5791 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
5792 }
5793
5794 static void
hn_start_txeof_taskfunc(void * xtxr,int pending __unused)5795 hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
5796 {
5797 struct hn_tx_ring *txr = xtxr;
5798
5799 mtx_lock(&txr->hn_tx_lock);
5800 atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE);
5801 hn_start_locked(txr, 0);
5802 mtx_unlock(&txr->hn_tx_lock);
5803 }
5804
5805 static void
hn_start_txeof(struct hn_tx_ring * txr)5806 hn_start_txeof(struct hn_tx_ring *txr)
5807 {
5808 struct hn_softc *sc = txr->hn_sc;
5809 struct ifnet *ifp = sc->hn_ifp;
5810
5811 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5812
5813 if (txr->hn_sched_tx)
5814 goto do_sched;
5815
5816 if (mtx_trylock(&txr->hn_tx_lock)) {
5817 int sched;
5818
5819 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5820 sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5821 mtx_unlock(&txr->hn_tx_lock);
5822 if (sched) {
5823 taskqueue_enqueue(txr->hn_tx_taskq,
5824 &txr->hn_tx_task);
5825 }
5826 } else {
5827 do_sched:
5828 /*
5829 * Release the OACTIVE earlier, with the hope, that
5830 * others could catch up. The task will clear the
5831 * flag again with the hn_tx_lock to avoid possible
5832 * races.
5833 */
5834 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5835 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
5836 }
5837 }
5838
5839 #endif /* HN_IFSTART_SUPPORT */
5840
/*
 * Send the packets queued on the TX ring's mbuf buf_ring.
 *
 * The caller must hold txr->hn_tx_lock; this is asserted below.
 *
 * If len is greater than zero, a packet longer than len is put back
 * and the loop stops; 1 is returned in this case, so that the caller
 * can dispatch the remaining work to the TX taskqueue.  In all other
 * cases 0 is returned.
 *
 * txr->hn_oactive is set if no TX descriptor is available, or if
 * sending (hn_txpkt() or hn_flush_txagg()) fails.
 */
static int
hn_xmit(struct hn_tx_ring *txr, int len)
{
	struct hn_softc *sc = txr->hn_sc;
	struct ifnet *ifp = sc->hn_ifp;
	struct mbuf *m_head;
	int sched = 0;

	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
#ifdef HN_IFSTART_SUPPORT
	KASSERT(hn_use_if_start == 0,
	    ("hn_xmit is called, when if_start is enabled"));
#endif
	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));

	/* Nothing is sent while the TX ring is suspended. */
	if (__predict_false(txr->hn_suspended))
		return (0);

	/* Proceed only if the interface is RUNNING and the ring is not oactive. */
	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
		return (0);

	/*
	 * The head packet is only peeked at here; each path below either
	 * puts it back (drbr_putback) or consumes it (drbr_advance).
	 */
	while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
		struct hn_txdesc *txd;
		int error;

		if (len > 0 && m_head->m_pkthdr.len > len) {
			/*
			 * This sending could be time consuming; let callers
			 * dispatch this packet sending (and sending of any
			 * following up packets) to tx taskqueue.
			 */
			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
			sched = 1;
			break;
		}

		txd = hn_txdesc_get(txr);
		if (txd == NULL) {
			/* Out of TX descriptors; keep the packet queued. */
			txr->hn_no_txdescs++;
			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
			txr->hn_oactive = 1;
			break;
		}

		error = hn_encap(ifp, txr, txd, &m_head);
		if (error) {
			/* Both txd and m_head are freed; discard */
			KASSERT(txr->hn_agg_txd == NULL,
			    ("encap failed w/ pending aggregating txdesc"));
			drbr_advance(ifp, txr->hn_mbuf_br);
			continue;
		}

		if (txr->hn_agg_pktleft == 0) {
			if (txr->hn_agg_txd != NULL) {
				/* Flush the pending aggregating txdesc. */
				KASSERT(m_head == NULL,
				    ("pending mbuf for aggregating txdesc"));
				error = hn_flush_txagg(ifp, txr);
				if (__predict_false(error)) {
					txr->hn_oactive = 1;
					break;
				}
			} else {
				/* No aggregation; send this txdesc directly. */
				KASSERT(m_head != NULL, ("mbuf was freed"));
				error = hn_txpkt(ifp, txr, txd);
				if (__predict_false(error)) {
					/* txd is freed, but m_head is not */
					drbr_putback(ifp, txr->hn_mbuf_br,
					    m_head);
					txr->hn_oactive = 1;
					break;
				}
			}
		}
#ifdef INVARIANTS
		else {
			KASSERT(txr->hn_agg_txd != NULL,
			    ("no aggregating txdesc"));
			KASSERT(m_head == NULL,
			    ("pending mbuf for aggregating txdesc"));
		}
#endif

		/* Sent */
		drbr_advance(ifp, txr->hn_mbuf_br);
	}

	/* Flush pending aggregated transmission. */
	if (txr->hn_agg_txd != NULL)
		hn_flush_txagg(ifp, txr);
	return (sched);
}
5933
5934 static int
hn_transmit(struct ifnet * ifp,struct mbuf * m)5935 hn_transmit(struct ifnet *ifp, struct mbuf *m)
5936 {
5937 struct hn_softc *sc = ifp->if_softc;
5938 struct hn_tx_ring *txr;
5939 int error, idx = 0;
5940
5941 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
5942 struct rm_priotracker pt;
5943
5944 rm_rlock(&sc->hn_vf_lock, &pt);
5945 if (__predict_true(sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
5946 struct mbuf *m_bpf = NULL;
5947 int obytes, omcast;
5948
5949 obytes = m->m_pkthdr.len;
5950 if (m->m_flags & M_MCAST)
5951 omcast = 1;
5952
5953 if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) {
5954 if (bpf_peers_present(ifp->if_bpf)) {
5955 m_bpf = m_copypacket(m, M_NOWAIT);
5956 if (m_bpf == NULL) {
5957 /*
5958 * Failed to grab a shallow
5959 * copy; tap now.
5960 */
5961 ETHER_BPF_MTAP(ifp, m);
5962 }
5963 }
5964 } else {
5965 ETHER_BPF_MTAP(ifp, m);
5966 }
5967
5968 error = sc->hn_vf_ifp->if_transmit(sc->hn_vf_ifp, m);
5969 rm_runlock(&sc->hn_vf_lock, &pt);
5970
5971 if (m_bpf != NULL) {
5972 if (!error)
5973 ETHER_BPF_MTAP(ifp, m_bpf);
5974 m_freem(m_bpf);
5975 }
5976
5977 if (error == ENOBUFS) {
5978 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
5979 } else if (error) {
5980 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5981 } else {
5982 if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
5983 if_inc_counter(ifp, IFCOUNTER_OBYTES, obytes);
5984 if (omcast) {
5985 if_inc_counter(ifp, IFCOUNTER_OMCASTS,
5986 omcast);
5987 }
5988 }
5989 return (error);
5990 }
5991 rm_runlock(&sc->hn_vf_lock, &pt);
5992 }
5993
5994 #if defined(INET6) || defined(INET)
5995 /*
5996 * Perform TSO packet header fixup or get l2/l3 header length now,
5997 * since packet headers should be cache-hot.
5998 */
5999 if (m->m_pkthdr.csum_flags & CSUM_TSO) {
6000 m = hn_tso_fixup(m);
6001 if (__predict_false(m == NULL)) {
6002 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
6003 return EIO;
6004 }
6005 } else if (m->m_pkthdr.csum_flags &
6006 (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
6007 m = hn_set_hlen(m);
6008 if (__predict_false(m == NULL)) {
6009 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
6010 return EIO;
6011 }
6012 }
6013 #endif
6014
6015 /*
6016 * Select the TX ring based on flowid
6017 */
6018 if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
6019 #ifdef RSS
6020 uint32_t bid;
6021
6022 if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m),
6023 &bid) == 0)
6024 idx = bid % sc->hn_tx_ring_inuse;
6025 else
6026 #endif
6027 {
6028 #if defined(INET6) || defined(INET)
6029 int tcpsyn = 0;
6030
6031 if (m->m_pkthdr.len < 128 &&
6032 (m->m_pkthdr.csum_flags &
6033 (CSUM_IP_TCP | CSUM_IP6_TCP)) &&
6034 (m->m_pkthdr.csum_flags & CSUM_TSO) == 0) {
6035 m = hn_check_tcpsyn(m, &tcpsyn);
6036 if (__predict_false(m == NULL)) {
6037 if_inc_counter(ifp,
6038 IFCOUNTER_OERRORS, 1);
6039 return (EIO);
6040 }
6041 }
6042 #else
6043 const int tcpsyn = 0;
6044 #endif
6045 if (tcpsyn)
6046 idx = 0;
6047 else
6048 idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
6049 }
6050 }
6051 txr = &sc->hn_tx_ring[idx];
6052
6053 error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
6054 if (error) {
6055 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
6056 return error;
6057 }
6058
6059 if (txr->hn_oactive)
6060 return 0;
6061
6062 if (txr->hn_sched_tx)
6063 goto do_sched;
6064
6065 if (mtx_trylock(&txr->hn_tx_lock)) {
6066 int sched;
6067
6068 sched = hn_xmit(txr, txr->hn_direct_tx_size);
6069 mtx_unlock(&txr->hn_tx_lock);
6070 if (!sched)
6071 return 0;
6072 }
6073 do_sched:
6074 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
6075 return 0;
6076 }
6077
6078 static void
hn_tx_ring_qflush(struct hn_tx_ring * txr)6079 hn_tx_ring_qflush(struct hn_tx_ring *txr)
6080 {
6081 struct mbuf *m;
6082
6083 mtx_lock(&txr->hn_tx_lock);
6084 while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
6085 m_freem(m);
6086 mtx_unlock(&txr->hn_tx_lock);
6087 }
6088
6089 static void
hn_xmit_qflush(struct ifnet * ifp)6090 hn_xmit_qflush(struct ifnet *ifp)
6091 {
6092 struct hn_softc *sc = ifp->if_softc;
6093 struct rm_priotracker pt;
6094 int i;
6095
6096 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
6097 hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
6098 if_qflush(ifp);
6099
6100 rm_rlock(&sc->hn_vf_lock, &pt);
6101 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
6102 sc->hn_vf_ifp->if_qflush(sc->hn_vf_ifp);
6103 rm_runlock(&sc->hn_vf_lock, &pt);
6104 }
6105
6106 static void
hn_xmit_txeof(struct hn_tx_ring * txr)6107 hn_xmit_txeof(struct hn_tx_ring *txr)
6108 {
6109
6110 if (txr->hn_sched_tx)
6111 goto do_sched;
6112
6113 if (mtx_trylock(&txr->hn_tx_lock)) {
6114 int sched;
6115
6116 txr->hn_oactive = 0;
6117 sched = hn_xmit(txr, txr->hn_direct_tx_size);
6118 mtx_unlock(&txr->hn_tx_lock);
6119 if (sched) {
6120 taskqueue_enqueue(txr->hn_tx_taskq,
6121 &txr->hn_tx_task);
6122 }
6123 } else {
6124 do_sched:
6125 /*
6126 * Release the oactive earlier, with the hope, that
6127 * others could catch up. The task will clear the
6128 * oactive again with the hn_tx_lock to avoid possible
6129 * races.
6130 */
6131 txr->hn_oactive = 0;
6132 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
6133 }
6134 }
6135
6136 static void
hn_xmit_taskfunc(void * xtxr,int pending __unused)6137 hn_xmit_taskfunc(void *xtxr, int pending __unused)
6138 {
6139 struct hn_tx_ring *txr = xtxr;
6140
6141 mtx_lock(&txr->hn_tx_lock);
6142 hn_xmit(txr, 0);
6143 mtx_unlock(&txr->hn_tx_lock);
6144 }
6145
6146 static void
hn_xmit_txeof_taskfunc(void * xtxr,int pending __unused)6147 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
6148 {
6149 struct hn_tx_ring *txr = xtxr;
6150
6151 mtx_lock(&txr->hn_tx_lock);
6152 txr->hn_oactive = 0;
6153 hn_xmit(txr, 0);
6154 mtx_unlock(&txr->hn_tx_lock);
6155 }
6156
6157 static int
hn_chan_attach(struct hn_softc * sc,struct vmbus_channel * chan)6158 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
6159 {
6160 struct vmbus_chan_br cbr;
6161 struct hn_rx_ring *rxr;
6162 struct hn_tx_ring *txr = NULL;
6163 int idx, error;
6164
6165 idx = vmbus_chan_subidx(chan);
6166
6167 /*
6168 * Link this channel to RX/TX ring.
6169 */
6170 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
6171 ("invalid channel index %d, should > 0 && < %d",
6172 idx, sc->hn_rx_ring_inuse));
6173 rxr = &sc->hn_rx_ring[idx];
6174 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
6175 ("RX ring %d already attached", idx));
6176 rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
6177 rxr->hn_chan = chan;
6178
6179 if (bootverbose) {
6180 if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
6181 idx, vmbus_chan_id(chan));
6182 }
6183
6184 if (idx < sc->hn_tx_ring_inuse) {
6185 txr = &sc->hn_tx_ring[idx];
6186 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
6187 ("TX ring %d already attached", idx));
6188 txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
6189
6190 txr->hn_chan = chan;
6191 if (bootverbose) {
6192 if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
6193 idx, vmbus_chan_id(chan));
6194 }
6195 }
6196
6197 /* Bind this channel to a proper CPU. */
6198 vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx));
6199
6200 /*
6201 * Open this channel
6202 */
6203 cbr.cbr = rxr->hn_br;
6204 cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr;
6205 cbr.cbr_txsz = HN_TXBR_SIZE;
6206 cbr.cbr_rxsz = HN_RXBR_SIZE;
6207 error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
6208 if (error) {
6209 if (error == EISCONN) {
6210 if_printf(sc->hn_ifp, "bufring is connected after "
6211 "chan%u open failure\n", vmbus_chan_id(chan));
6212 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
6213 } else {
6214 if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
6215 vmbus_chan_id(chan), error);
6216 }
6217 }
6218 return (error);
6219 }
6220
6221 static void
hn_chan_detach(struct hn_softc * sc,struct vmbus_channel * chan)6222 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
6223 {
6224 struct hn_rx_ring *rxr;
6225 int idx, error;
6226
6227 idx = vmbus_chan_subidx(chan);
6228
6229 /*
6230 * Link this channel to RX/TX ring.
6231 */
6232 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
6233 ("invalid channel index %d, should > 0 && < %d",
6234 idx, sc->hn_rx_ring_inuse));
6235 rxr = &sc->hn_rx_ring[idx];
6236 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
6237 ("RX ring %d is not attached", idx));
6238 rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
6239
6240 if (idx < sc->hn_tx_ring_inuse) {
6241 struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
6242
6243 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
6244 ("TX ring %d is not attached attached", idx));
6245 txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
6246 }
6247
6248 /*
6249 * Close this channel.
6250 *
6251 * NOTE:
6252 * Channel closing does _not_ destroy the target channel.
6253 */
6254 error = vmbus_chan_close_direct(chan);
6255 if (error == EISCONN) {
6256 if_printf(sc->hn_ifp, "chan%u bufring is connected "
6257 "after being closed\n", vmbus_chan_id(chan));
6258 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
6259 } else if (error) {
6260 if_printf(sc->hn_ifp, "chan%u close failed: %d\n",
6261 vmbus_chan_id(chan), error);
6262 }
6263 }
6264
6265 static int
hn_attach_subchans(struct hn_softc * sc)6266 hn_attach_subchans(struct hn_softc *sc)
6267 {
6268 struct vmbus_channel **subchans;
6269 int subchan_cnt = sc->hn_rx_ring_inuse - 1;
6270 int i, error = 0;
6271
6272 KASSERT(subchan_cnt > 0, ("no sub-channels"));
6273
6274 /* Attach the sub-channels. */
6275 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
6276 for (i = 0; i < subchan_cnt; ++i) {
6277 int error1;
6278
6279 error1 = hn_chan_attach(sc, subchans[i]);
6280 if (error1) {
6281 error = error1;
6282 /* Move on; all channels will be detached later. */
6283 }
6284 }
6285 vmbus_subchan_rel(subchans, subchan_cnt);
6286
6287 if (error) {
6288 if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
6289 } else {
6290 if (bootverbose) {
6291 if_printf(sc->hn_ifp, "%d sub-channels attached\n",
6292 subchan_cnt);
6293 }
6294 }
6295 return (error);
6296 }
6297
6298 static void
hn_detach_allchans(struct hn_softc * sc)6299 hn_detach_allchans(struct hn_softc *sc)
6300 {
6301 struct vmbus_channel **subchans;
6302 int subchan_cnt = sc->hn_rx_ring_inuse - 1;
6303 int i;
6304
6305 if (subchan_cnt == 0)
6306 goto back;
6307
6308 /* Detach the sub-channels. */
6309 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
6310 for (i = 0; i < subchan_cnt; ++i)
6311 hn_chan_detach(sc, subchans[i]);
6312 vmbus_subchan_rel(subchans, subchan_cnt);
6313
6314 back:
6315 /*
6316 * Detach the primary channel, _after_ all sub-channels
6317 * are detached.
6318 */
6319 hn_chan_detach(sc, sc->hn_prichan);
6320
6321 /* Wait for sub-channels to be destroyed, if any. */
6322 vmbus_subchan_drain(sc->hn_prichan);
6323
6324 #ifdef INVARIANTS
6325 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
6326 KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
6327 HN_RX_FLAG_ATTACHED) == 0,
6328 ("%dth RX ring is still attached", i));
6329 }
6330 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
6331 KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
6332 HN_TX_FLAG_ATTACHED) == 0,
6333 ("%dth TX ring is still attached", i));
6334 }
6335 #endif
6336 }
6337
6338 static int
hn_synth_alloc_subchans(struct hn_softc * sc,int * nsubch)6339 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
6340 {
6341 struct vmbus_channel **subchans;
6342 int nchan, rxr_cnt, error;
6343
6344 nchan = *nsubch + 1;
6345 if (nchan == 1) {
6346 /*
6347 * Multiple RX/TX rings are not requested.
6348 */
6349 *nsubch = 0;
6350 return (0);
6351 }
6352
6353 /*
6354 * Query RSS capabilities, e.g. # of RX rings, and # of indirect
6355 * table entries.
6356 */
6357 error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
6358 if (error) {
6359 /* No RSS; this is benign. */
6360 *nsubch = 0;
6361 return (0);
6362 }
6363 if (bootverbose) {
6364 if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
6365 rxr_cnt, nchan);
6366 }
6367
6368 if (nchan > rxr_cnt)
6369 nchan = rxr_cnt;
6370 if (nchan == 1) {
6371 if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
6372 *nsubch = 0;
6373 return (0);
6374 }
6375
6376 /*
6377 * Allocate sub-channels from NVS.
6378 */
6379 *nsubch = nchan - 1;
6380 error = hn_nvs_alloc_subchans(sc, nsubch);
6381 if (error || *nsubch == 0) {
6382 /* Failed to allocate sub-channels. */
6383 *nsubch = 0;
6384 return (0);
6385 }
6386
6387 /*
6388 * Wait for all sub-channels to become ready before moving on.
6389 */
6390 subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
6391 vmbus_subchan_rel(subchans, *nsubch);
6392 return (0);
6393 }
6394
6395 static bool
hn_synth_attachable(const struct hn_softc * sc)6396 hn_synth_attachable(const struct hn_softc *sc)
6397 {
6398 int i;
6399
6400 if (sc->hn_flags & HN_FLAG_ERRORS)
6401 return (false);
6402
6403 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
6404 const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
6405
6406 if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF)
6407 return (false);
6408 }
6409 return (true);
6410 }
6411
/*
 * Force the RX filter back to zero once RNDIS has been initialized.
 *
 * NOTE:
 * Some Hyper-V versions, under certain conditions, leave the RNDIS
 * rxfilter non-zero on the hypervisor side after a successful RNDIS
 * initialization.  That violates the RNDIS API contract and breaks
 * the assumptions of the code that follows.  So clear the rxfilter
 * explicitly, then drain the packets that sneaked through together
 * with the interrupt taskqueues those stealth packets scheduled.
 */
static void
hn_rndis_init_fixat(struct hn_softc *sc, int nchan)
{

	/* Clear the filter first ... */
	hn_disable_rx(sc);
	/* ... then flush the 'nchan' channels. */
	hn_drain_rxtx(sc, nchan);
}
6432
6433 static int
hn_synth_attach(struct hn_softc * sc,int mtu)6434 hn_synth_attach(struct hn_softc *sc, int mtu)
6435 {
6436 #define ATTACHED_NVS 0x0002
6437 #define ATTACHED_RNDIS 0x0004
6438
6439 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
6440 int error, nsubch, nchan = 1, i, rndis_inited;
6441 uint32_t old_caps, attached = 0;
6442
6443 KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
6444 ("synthetic parts were attached"));
6445
6446 if (!hn_synth_attachable(sc))
6447 return (ENXIO);
6448
6449 /* Save capabilities for later verification. */
6450 old_caps = sc->hn_caps;
6451 sc->hn_caps = 0;
6452
6453 /* Clear RSS stuffs. */
6454 sc->hn_rss_ind_size = 0;
6455 sc->hn_rss_hash = 0;
6456 sc->hn_rss_hcap = 0;
6457
6458 /*
6459 * Attach the primary channel _before_ attaching NVS and RNDIS.
6460 */
6461 error = hn_chan_attach(sc, sc->hn_prichan);
6462 if (error)
6463 goto failed;
6464
6465 /*
6466 * Attach NVS.
6467 */
6468 error = hn_nvs_attach(sc, mtu);
6469 if (error)
6470 goto failed;
6471 attached |= ATTACHED_NVS;
6472
6473 /*
6474 * Attach RNDIS _after_ NVS is attached.
6475 */
6476 error = hn_rndis_attach(sc, mtu, &rndis_inited);
6477 if (rndis_inited)
6478 attached |= ATTACHED_RNDIS;
6479 if (error)
6480 goto failed;
6481
6482 /*
6483 * Make sure capabilities are not changed.
6484 */
6485 if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
6486 if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
6487 old_caps, sc->hn_caps);
6488 error = ENXIO;
6489 goto failed;
6490 }
6491
6492 /*
6493 * Allocate sub-channels for multi-TX/RX rings.
6494 *
6495 * NOTE:
6496 * The # of RX rings that can be used is equivalent to the # of
6497 * channels to be requested.
6498 */
6499 nsubch = sc->hn_rx_ring_cnt - 1;
6500 error = hn_synth_alloc_subchans(sc, &nsubch);
6501 if (error)
6502 goto failed;
6503 /* NOTE: _Full_ synthetic parts detach is required now. */
6504 sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
6505
6506 /*
6507 * Set the # of TX/RX rings that could be used according to
6508 * the # of channels that NVS offered.
6509 */
6510 nchan = nsubch + 1;
6511 hn_set_ring_inuse(sc, nchan);
6512 if (nchan == 1) {
6513 /* Only the primary channel can be used; done */
6514 goto back;
6515 }
6516
6517 /*
6518 * Attach the sub-channels.
6519 *
6520 * NOTE: hn_set_ring_inuse() _must_ have been called.
6521 */
6522 error = hn_attach_subchans(sc);
6523 if (error)
6524 goto failed;
6525
6526 /*
6527 * Configure RSS key and indirect table _after_ all sub-channels
6528 * are attached.
6529 */
6530 if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
6531 /*
6532 * RSS key is not set yet; set it to the default RSS key.
6533 */
6534 if (bootverbose)
6535 if_printf(sc->hn_ifp, "setup default RSS key\n");
6536 #ifdef RSS
6537 rss_getkey(rss->rss_key);
6538 #else
6539 memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
6540 #endif
6541 sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
6542 }
6543
6544 if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
6545 /*
6546 * RSS indirect table is not set yet; set it up in round-
6547 * robin fashion.
6548 */
6549 if (bootverbose) {
6550 if_printf(sc->hn_ifp, "setup default RSS indirect "
6551 "table\n");
6552 }
6553 for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
6554 uint32_t subidx;
6555
6556 #ifdef RSS
6557 subidx = rss_get_indirection_to_bucket(i);
6558 #else
6559 subidx = i;
6560 #endif
6561 rss->rss_ind[i] = subidx % nchan;
6562 }
6563 sc->hn_flags |= HN_FLAG_HAS_RSSIND;
6564 } else {
6565 /*
6566 * # of usable channels may be changed, so we have to
6567 * make sure that all entries in RSS indirect table
6568 * are valid.
6569 *
6570 * NOTE: hn_set_ring_inuse() _must_ have been called.
6571 */
6572 hn_rss_ind_fixup(sc);
6573 }
6574
6575 sc->hn_rss_hash = sc->hn_rss_hcap;
6576 if ((sc->hn_flags & HN_FLAG_RXVF) ||
6577 (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
6578 /* NOTE: Don't reconfigure RSS; will do immediately. */
6579 hn_vf_rss_fixup(sc, false);
6580 }
6581 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
6582 if (error)
6583 goto failed;
6584 back:
6585 /*
6586 * Fixup transmission aggregation setup.
6587 */
6588 hn_set_txagg(sc);
6589 hn_rndis_init_fixat(sc, nchan);
6590 return (0);
6591
6592 failed:
6593 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
6594 hn_rndis_init_fixat(sc, nchan);
6595 hn_synth_detach(sc);
6596 } else {
6597 if (attached & ATTACHED_RNDIS) {
6598 hn_rndis_init_fixat(sc, nchan);
6599 hn_rndis_detach(sc);
6600 }
6601 if (attached & ATTACHED_NVS)
6602 hn_nvs_detach(sc);
6603 hn_chan_detach(sc, sc->hn_prichan);
6604 /* Restore old capabilities. */
6605 sc->hn_caps = old_caps;
6606 }
6607 return (error);
6608
6609 #undef ATTACHED_RNDIS
6610 #undef ATTACHED_NVS
6611 }
6612
6613 /*
6614 * NOTE:
6615 * The interface must have been suspended though hn_suspend(), before
6616 * this function get called.
6617 */
6618 static void
hn_synth_detach(struct hn_softc * sc)6619 hn_synth_detach(struct hn_softc *sc)
6620 {
6621
6622 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
6623 ("synthetic parts were not attached"));
6624
6625 /* Detach the RNDIS first. */
6626 hn_rndis_detach(sc);
6627
6628 /* Detach NVS. */
6629 hn_nvs_detach(sc);
6630
6631 /* Detach all of the channels. */
6632 hn_detach_allchans(sc);
6633
6634 sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
6635 }
6636
6637 static void
hn_set_ring_inuse(struct hn_softc * sc,int ring_cnt)6638 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
6639 {
6640 KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
6641 ("invalid ring count %d", ring_cnt));
6642
6643 if (sc->hn_tx_ring_cnt > ring_cnt)
6644 sc->hn_tx_ring_inuse = ring_cnt;
6645 else
6646 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
6647 sc->hn_rx_ring_inuse = ring_cnt;
6648
6649 #ifdef RSS
6650 if (sc->hn_rx_ring_inuse != rss_getnumbuckets()) {
6651 if_printf(sc->hn_ifp, "# of RX rings (%d) does not match "
6652 "# of RSS buckets (%d)\n", sc->hn_rx_ring_inuse,
6653 rss_getnumbuckets());
6654 }
6655 #endif
6656
6657 if (bootverbose) {
6658 if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
6659 sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
6660 }
6661 }
6662
6663 static void
hn_chan_drain(struct hn_softc * sc,struct vmbus_channel * chan)6664 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan)
6665 {
6666
6667 /*
6668 * NOTE:
6669 * The TX bufring will not be drained by the hypervisor,
6670 * if the primary channel is revoked.
6671 */
6672 while (!vmbus_chan_rx_empty(chan) ||
6673 (!vmbus_chan_is_revoked(sc->hn_prichan) &&
6674 !vmbus_chan_tx_empty(chan)))
6675 pause("waitch", 1);
6676 vmbus_chan_intr_drain(chan);
6677 }
6678
6679 static void
hn_disable_rx(struct hn_softc * sc)6680 hn_disable_rx(struct hn_softc *sc)
6681 {
6682
6683 /*
6684 * Disable RX by clearing RX filter forcefully.
6685 */
6686 sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE;
6687 hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); /* ignore error */
6688
6689 /*
6690 * Give RNDIS enough time to flush all pending data packets.
6691 */
6692 pause("waitrx", (200 * hz) / 1000);
6693 }
6694
6695 /*
6696 * NOTE:
6697 * RX/TX _must_ have been suspended/disabled, before this function
6698 * is called.
6699 */
6700 static void
hn_drain_rxtx(struct hn_softc * sc,int nchan)6701 hn_drain_rxtx(struct hn_softc *sc, int nchan)
6702 {
6703 struct vmbus_channel **subch = NULL;
6704 int nsubch;
6705
6706 /*
6707 * Drain RX/TX bufrings and interrupts.
6708 */
6709 nsubch = nchan - 1;
6710 if (nsubch > 0)
6711 subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
6712
6713 if (subch != NULL) {
6714 int i;
6715
6716 for (i = 0; i < nsubch; ++i)
6717 hn_chan_drain(sc, subch[i]);
6718 }
6719 hn_chan_drain(sc, sc->hn_prichan);
6720
6721 if (subch != NULL)
6722 vmbus_subchan_rel(subch, nsubch);
6723 }
6724
6725 static void
hn_suspend_data(struct hn_softc * sc)6726 hn_suspend_data(struct hn_softc *sc)
6727 {
6728 struct hn_tx_ring *txr;
6729 int i;
6730
6731 HN_LOCK_ASSERT(sc);
6732
6733 /*
6734 * Suspend TX.
6735 */
6736 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6737 txr = &sc->hn_tx_ring[i];
6738
6739 mtx_lock(&txr->hn_tx_lock);
6740 txr->hn_suspended = 1;
6741 mtx_unlock(&txr->hn_tx_lock);
6742 /* No one is able send more packets now. */
6743
6744 /*
6745 * Wait for all pending sends to finish.
6746 *
6747 * NOTE:
6748 * We will _not_ receive all pending send-done, if the
6749 * primary channel is revoked.
6750 */
6751 while (hn_tx_ring_pending(txr) &&
6752 !vmbus_chan_is_revoked(sc->hn_prichan))
6753 pause("hnwtx", 1 /* 1 tick */);
6754 }
6755
6756 /*
6757 * Disable RX.
6758 */
6759 hn_disable_rx(sc);
6760
6761 /*
6762 * Drain RX/TX.
6763 */
6764 hn_drain_rxtx(sc, sc->hn_rx_ring_inuse);
6765
6766 /*
6767 * Drain any pending TX tasks.
6768 *
6769 * NOTE:
6770 * The above hn_drain_rxtx() can dispatch TX tasks, so the TX
6771 * tasks will have to be drained _after_ the above hn_drain_rxtx().
6772 */
6773 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6774 txr = &sc->hn_tx_ring[i];
6775
6776 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
6777 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
6778 }
6779 }
6780
6781 static void
hn_suspend_mgmt_taskfunc(void * xsc,int pending __unused)6782 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
6783 {
6784
6785 ((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
6786 }
6787
6788 static void
hn_suspend_mgmt(struct hn_softc * sc)6789 hn_suspend_mgmt(struct hn_softc *sc)
6790 {
6791 struct task task;
6792
6793 HN_LOCK_ASSERT(sc);
6794
6795 /*
6796 * Make sure that hn_mgmt_taskq0 can nolonger be accessed
6797 * through hn_mgmt_taskq.
6798 */
6799 TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
6800 vmbus_chan_run_task(sc->hn_prichan, &task);
6801
6802 /*
6803 * Make sure that all pending management tasks are completed.
6804 */
6805 taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
6806 taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
6807 taskqueue_drain_all(sc->hn_mgmt_taskq0);
6808 }
6809
6810 static void
hn_suspend(struct hn_softc * sc)6811 hn_suspend(struct hn_softc *sc)
6812 {
6813
6814 /* Disable polling. */
6815 hn_polling(sc, 0);
6816
6817 /*
6818 * If the non-transparent mode VF is activated, the synthetic
6819 * device is receiving packets, so the data path of the
6820 * synthetic device must be suspended.
6821 */
6822 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
6823 (sc->hn_flags & HN_FLAG_RXVF))
6824 hn_suspend_data(sc);
6825 hn_suspend_mgmt(sc);
6826 }
6827
6828 static void
hn_resume_tx(struct hn_softc * sc,int tx_ring_cnt)6829 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
6830 {
6831 int i;
6832
6833 KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
6834 ("invalid TX ring count %d", tx_ring_cnt));
6835
6836 for (i = 0; i < tx_ring_cnt; ++i) {
6837 struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6838
6839 mtx_lock(&txr->hn_tx_lock);
6840 txr->hn_suspended = 0;
6841 mtx_unlock(&txr->hn_tx_lock);
6842 }
6843 }
6844
6845 static void
hn_resume_data(struct hn_softc * sc)6846 hn_resume_data(struct hn_softc *sc)
6847 {
6848 int i;
6849
6850 HN_LOCK_ASSERT(sc);
6851
6852 /*
6853 * Re-enable RX.
6854 */
6855 hn_rxfilter_config(sc);
6856
6857 /*
6858 * Make sure to clear suspend status on "all" TX rings,
6859 * since hn_tx_ring_inuse can be changed after
6860 * hn_suspend_data().
6861 */
6862 hn_resume_tx(sc, sc->hn_tx_ring_cnt);
6863
6864 #ifdef HN_IFSTART_SUPPORT
6865 if (!hn_use_if_start)
6866 #endif
6867 {
6868 /*
6869 * Flush unused drbrs, since hn_tx_ring_inuse may be
6870 * reduced.
6871 */
6872 for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
6873 hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
6874 }
6875
6876 /*
6877 * Kick start TX.
6878 */
6879 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6880 struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6881
6882 /*
6883 * Use txeof task, so that any pending oactive can be
6884 * cleared properly.
6885 */
6886 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
6887 }
6888 }
6889
6890 static void
hn_resume_mgmt(struct hn_softc * sc)6891 hn_resume_mgmt(struct hn_softc *sc)
6892 {
6893
6894 sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
6895
6896 /*
6897 * Kick off network change detection, if it was pending.
6898 * If no network change was pending, start link status
6899 * checks, which is more lightweight than network change
6900 * detection.
6901 */
6902 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
6903 hn_change_network(sc);
6904 else
6905 hn_update_link_status(sc);
6906 }
6907
6908 static void
hn_resume(struct hn_softc * sc)6909 hn_resume(struct hn_softc *sc)
6910 {
6911
6912 /*
6913 * If the non-transparent mode VF is activated, the synthetic
6914 * device have to receive packets, so the data path of the
6915 * synthetic device must be resumed.
6916 */
6917 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
6918 (sc->hn_flags & HN_FLAG_RXVF))
6919 hn_resume_data(sc);
6920
6921 /*
6922 * Don't resume link status change if VF is attached/activated.
6923 * - In the non-transparent VF mode, the synthetic device marks
6924 * link down until the VF is deactivated; i.e. VF is down.
6925 * - In transparent VF mode, VF's media status is used until
6926 * the VF is detached.
6927 */
6928 if ((sc->hn_flags & HN_FLAG_RXVF) == 0 &&
6929 !(hn_xpnt_vf && sc->hn_vf_ifp != NULL))
6930 hn_resume_mgmt(sc);
6931
6932 /*
6933 * Re-enable polling if this interface is running and
6934 * the polling is requested.
6935 */
6936 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && sc->hn_pollhz > 0)
6937 hn_polling(sc, sc->hn_pollhz);
6938 }
6939
6940 static void
hn_rndis_rx_status(struct hn_softc * sc,const void * data,int dlen)6941 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen)
6942 {
6943 const struct rndis_status_msg *msg;
6944 int ofs;
6945
6946 if (dlen < sizeof(*msg)) {
6947 if_printf(sc->hn_ifp, "invalid RNDIS status\n");
6948 return;
6949 }
6950 msg = data;
6951
6952 switch (msg->rm_status) {
6953 case RNDIS_STATUS_MEDIA_CONNECT:
6954 case RNDIS_STATUS_MEDIA_DISCONNECT:
6955 hn_update_link_status(sc);
6956 break;
6957
6958 case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG:
6959 case RNDIS_STATUS_LINK_SPEED_CHANGE:
6960 /* Not really useful; ignore. */
6961 break;
6962
6963 case RNDIS_STATUS_NETWORK_CHANGE:
6964 ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset);
6965 if (dlen < ofs + msg->rm_stbuflen ||
6966 msg->rm_stbuflen < sizeof(uint32_t)) {
6967 if_printf(sc->hn_ifp, "network changed\n");
6968 } else {
6969 uint32_t change;
6970
6971 memcpy(&change, ((const uint8_t *)msg) + ofs,
6972 sizeof(change));
6973 if_printf(sc->hn_ifp, "network changed, change %u\n",
6974 change);
6975 }
6976 hn_change_network(sc);
6977 break;
6978
6979 default:
6980 if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n",
6981 msg->rm_status);
6982 break;
6983 }
6984 }
6985
6986 static int
hn_rndis_rxinfo(const void * info_data,int info_dlen,struct hn_rxinfo * info)6987 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info)
6988 {
6989 const struct rndis_pktinfo *pi = info_data;
6990 uint32_t mask = 0;
6991
6992 while (info_dlen != 0) {
6993 const void *data;
6994 uint32_t dlen;
6995
6996 if (__predict_false(info_dlen < sizeof(*pi)))
6997 return (EINVAL);
6998 if (__predict_false(info_dlen < pi->rm_size))
6999 return (EINVAL);
7000 info_dlen -= pi->rm_size;
7001
7002 if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK))
7003 return (EINVAL);
7004 if (__predict_false(pi->rm_size < pi->rm_pktinfooffset))
7005 return (EINVAL);
7006 dlen = pi->rm_size - pi->rm_pktinfooffset;
7007 data = pi->rm_data;
7008
7009 switch (pi->rm_type) {
7010 case NDIS_PKTINFO_TYPE_VLAN:
7011 if (__predict_false(dlen < NDIS_VLAN_INFO_SIZE))
7012 return (EINVAL);
7013 info->vlan_info = *((const uint32_t *)data);
7014 mask |= HN_RXINFO_VLAN;
7015 break;
7016
7017 case NDIS_PKTINFO_TYPE_CSUM:
7018 if (__predict_false(dlen < NDIS_RXCSUM_INFO_SIZE))
7019 return (EINVAL);
7020 info->csum_info = *((const uint32_t *)data);
7021 mask |= HN_RXINFO_CSUM;
7022 break;
7023
7024 case HN_NDIS_PKTINFO_TYPE_HASHVAL:
7025 if (__predict_false(dlen < HN_NDIS_HASH_VALUE_SIZE))
7026 return (EINVAL);
7027 info->hash_value = *((const uint32_t *)data);
7028 mask |= HN_RXINFO_HASHVAL;
7029 break;
7030
7031 case HN_NDIS_PKTINFO_TYPE_HASHINF:
7032 if (__predict_false(dlen < HN_NDIS_HASH_INFO_SIZE))
7033 return (EINVAL);
7034 info->hash_info = *((const uint32_t *)data);
7035 mask |= HN_RXINFO_HASHINF;
7036 break;
7037
7038 default:
7039 goto next;
7040 }
7041
7042 if (mask == HN_RXINFO_ALL) {
7043 /* All found; done */
7044 break;
7045 }
7046 next:
7047 pi = (const struct rndis_pktinfo *)
7048 ((const uint8_t *)pi + pi->rm_size);
7049 }
7050
7051 /*
7052 * Final fixup.
7053 * - If there is no hash value, invalidate the hash info.
7054 */
7055 if ((mask & HN_RXINFO_HASHVAL) == 0)
7056 info->hash_info = HN_NDIS_HASH_INFO_INVALID;
7057 return (0);
7058 }
7059
/*
 * Tell whether the range [off, off + len) overlaps the range
 * [check_off, check_off + check_len).  Two ranges that start at the
 * same offset are always reported as overlapping.
 */
static __inline bool
hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
{

	if (off == check_off)
		return (true);

	if (off < check_off) {
		/* This range comes first; it has to end by check_off. */
		return (!__predict_true(off + len <= check_off));
	}
	/* The checked range comes first; it has to end by off. */
	return (!__predict_true(check_off + check_len <= off));
}
7073
7074 static void
hn_rndis_rx_data(struct hn_rx_ring * rxr,const void * data,int dlen)7075 hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
7076 {
7077 const struct rndis_packet_msg *pkt;
7078 struct hn_rxinfo info;
7079 int data_off, pktinfo_off, data_len, pktinfo_len;
7080
7081 /*
7082 * Check length.
7083 */
7084 if (__predict_false(dlen < sizeof(*pkt))) {
7085 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
7086 return;
7087 }
7088 pkt = data;
7089
7090 if (__predict_false(dlen < pkt->rm_len)) {
7091 if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
7092 "dlen %d, msglen %u\n", dlen, pkt->rm_len);
7093 return;
7094 }
7095 if (__predict_false(pkt->rm_len <
7096 pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
7097 if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
7098 "msglen %u, data %u, oob %u, pktinfo %u\n",
7099 pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
7100 pkt->rm_pktinfolen);
7101 return;
7102 }
7103 if (__predict_false(pkt->rm_datalen == 0)) {
7104 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
7105 return;
7106 }
7107
7108 /*
7109 * Check offests.
7110 */
7111 #define IS_OFFSET_INVALID(ofs) \
7112 ((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN || \
7113 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))
7114
7115 /* XXX Hyper-V does not meet data offset alignment requirement */
7116 if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
7117 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7118 "data offset %u\n", pkt->rm_dataoffset);
7119 return;
7120 }
7121 if (__predict_false(pkt->rm_oobdataoffset > 0 &&
7122 IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
7123 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7124 "oob offset %u\n", pkt->rm_oobdataoffset);
7125 return;
7126 }
7127 if (__predict_true(pkt->rm_pktinfooffset > 0) &&
7128 __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
7129 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7130 "pktinfo offset %u\n", pkt->rm_pktinfooffset);
7131 return;
7132 }
7133
7134 #undef IS_OFFSET_INVALID
7135
7136 data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
7137 data_len = pkt->rm_datalen;
7138 pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
7139 pktinfo_len = pkt->rm_pktinfolen;
7140
7141 /*
7142 * Check OOB coverage.
7143 */
7144 if (__predict_false(pkt->rm_oobdatalen != 0)) {
7145 int oob_off, oob_len;
7146
7147 if_printf(rxr->hn_ifp, "got oobdata\n");
7148 oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
7149 oob_len = pkt->rm_oobdatalen;
7150
7151 if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
7152 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7153 "oob overflow, msglen %u, oob abs %d len %d\n",
7154 pkt->rm_len, oob_off, oob_len);
7155 return;
7156 }
7157
7158 /*
7159 * Check against data.
7160 */
7161 if (hn_rndis_check_overlap(oob_off, oob_len,
7162 data_off, data_len)) {
7163 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7164 "oob overlaps data, oob abs %d len %d, "
7165 "data abs %d len %d\n",
7166 oob_off, oob_len, data_off, data_len);
7167 return;
7168 }
7169
7170 /*
7171 * Check against pktinfo.
7172 */
7173 if (pktinfo_len != 0 &&
7174 hn_rndis_check_overlap(oob_off, oob_len,
7175 pktinfo_off, pktinfo_len)) {
7176 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7177 "oob overlaps pktinfo, oob abs %d len %d, "
7178 "pktinfo abs %d len %d\n",
7179 oob_off, oob_len, pktinfo_off, pktinfo_len);
7180 return;
7181 }
7182 }
7183
7184 /*
7185 * Check per-packet-info coverage and find useful per-packet-info.
7186 */
7187 info.vlan_info = HN_NDIS_VLAN_INFO_INVALID;
7188 info.csum_info = HN_NDIS_RXCSUM_INFO_INVALID;
7189 info.hash_info = HN_NDIS_HASH_INFO_INVALID;
7190 if (__predict_true(pktinfo_len != 0)) {
7191 bool overlap;
7192 int error;
7193
7194 if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) {
7195 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7196 "pktinfo overflow, msglen %u, "
7197 "pktinfo abs %d len %d\n",
7198 pkt->rm_len, pktinfo_off, pktinfo_len);
7199 return;
7200 }
7201
7202 /*
7203 * Check packet info coverage.
7204 */
7205 overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len,
7206 data_off, data_len);
7207 if (__predict_false(overlap)) {
7208 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7209 "pktinfo overlap data, pktinfo abs %d len %d, "
7210 "data abs %d len %d\n",
7211 pktinfo_off, pktinfo_len, data_off, data_len);
7212 return;
7213 }
7214
7215 /*
7216 * Find useful per-packet-info.
7217 */
7218 error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off,
7219 pktinfo_len, &info);
7220 if (__predict_false(error)) {
7221 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg "
7222 "pktinfo\n");
7223 return;
7224 }
7225 }
7226
7227 if (__predict_false(data_off + data_len > pkt->rm_len)) {
7228 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7229 "data overflow, msglen %u, data abs %d len %d\n",
7230 pkt->rm_len, data_off, data_len);
7231 return;
7232 }
7233 hn_rxpkt(rxr, ((const uint8_t *)pkt) + data_off, data_len, &info);
7234 }
7235
7236 static __inline void
hn_rndis_rxpkt(struct hn_rx_ring * rxr,const void * data,int dlen)7237 hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen)
7238 {
7239 const struct rndis_msghdr *hdr;
7240
7241 if (__predict_false(dlen < sizeof(*hdr))) {
7242 if_printf(rxr->hn_ifp, "invalid RNDIS msg\n");
7243 return;
7244 }
7245 hdr = data;
7246
7247 if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) {
7248 /* Hot data path. */
7249 hn_rndis_rx_data(rxr, data, dlen);
7250 /* Done! */
7251 return;
7252 }
7253
7254 if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG)
7255 hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen);
7256 else
7257 hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen);
7258 }
7259
7260 static void
hn_nvs_handle_notify(struct hn_softc * sc,const struct vmbus_chanpkt_hdr * pkt)7261 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
7262 {
7263 const struct hn_nvs_hdr *hdr;
7264
7265 if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
7266 if_printf(sc->hn_ifp, "invalid nvs notify\n");
7267 return;
7268 }
7269 hdr = VMBUS_CHANPKT_CONST_DATA(pkt);
7270
7271 if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
7272 /* Useless; ignore */
7273 return;
7274 }
7275 if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
7276 }
7277
7278 static void
hn_nvs_handle_comp(struct hn_softc * sc,struct vmbus_channel * chan,const struct vmbus_chanpkt_hdr * pkt)7279 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
7280 const struct vmbus_chanpkt_hdr *pkt)
7281 {
7282 struct hn_nvs_sendctx *sndc;
7283
7284 sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid;
7285 sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
7286 VMBUS_CHANPKT_DATALEN(pkt));
7287 /*
7288 * NOTE:
7289 * 'sndc' CAN NOT be accessed anymore, since it can be freed by
7290 * its callback.
7291 */
7292 }
7293
7294 static void
hn_nvs_handle_rxbuf(struct hn_rx_ring * rxr,struct vmbus_channel * chan,const struct vmbus_chanpkt_hdr * pkthdr)7295 hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
7296 const struct vmbus_chanpkt_hdr *pkthdr)
7297 {
7298 const struct vmbus_chanpkt_rxbuf *pkt;
7299 const struct hn_nvs_hdr *nvs_hdr;
7300 int count, i, hlen;
7301
7302 if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
7303 if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
7304 return;
7305 }
7306 nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);
7307
7308 /* Make sure that this is a RNDIS message. */
7309 if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
7310 if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
7311 nvs_hdr->nvs_type);
7312 return;
7313 }
7314
7315 hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
7316 if (__predict_false(hlen < sizeof(*pkt))) {
7317 if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
7318 return;
7319 }
7320 pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;
7321
7322 if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
7323 if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
7324 pkt->cp_rxbuf_id);
7325 return;
7326 }
7327
7328 count = pkt->cp_rxbuf_cnt;
7329 if (__predict_false(hlen <
7330 __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
7331 if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
7332 return;
7333 }
7334
7335 /* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
7336 for (i = 0; i < count; ++i) {
7337 int ofs, len;
7338
7339 ofs = pkt->cp_rxbuf[i].rb_ofs;
7340 len = pkt->cp_rxbuf[i].rb_len;
7341 if (__predict_false(ofs + len > HN_RXBUF_SIZE)) {
7342 if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, "
7343 "ofs %d, len %d\n", i, ofs, len);
7344 continue;
7345 }
7346 hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len);
7347 }
7348
7349 /*
7350 * Ack the consumed RXBUF associated w/ this channel packet,
7351 * so that this RXBUF can be recycled by the hypervisor.
7352 */
7353 hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
7354 }
7355
7356 static void
hn_nvs_ack_rxbuf(struct hn_rx_ring * rxr,struct vmbus_channel * chan,uint64_t tid)7357 hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
7358 uint64_t tid)
7359 {
7360 struct hn_nvs_rndis_ack ack;
7361 int retries, error;
7362
7363 ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
7364 ack.nvs_status = HN_NVS_STATUS_OK;
7365
7366 retries = 0;
7367 again:
7368 error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
7369 VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
7370 if (__predict_false(error == EAGAIN)) {
7371 /*
7372 * NOTE:
7373 * This should _not_ happen in real world, since the
7374 * consumption of the TX bufring from the TX path is
7375 * controlled.
7376 */
7377 if (rxr->hn_ack_failed == 0)
7378 if_printf(rxr->hn_ifp, "RXBUF ack retry\n");
7379 rxr->hn_ack_failed++;
7380 retries++;
7381 if (retries < 10) {
7382 DELAY(100);
7383 goto again;
7384 }
7385 /* RXBUF leaks! */
7386 if_printf(rxr->hn_ifp, "RXBUF ack failed\n");
7387 }
7388 }
7389
7390 static void
hn_chan_callback(struct vmbus_channel * chan,void * xrxr)7391 hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
7392 {
7393 struct hn_rx_ring *rxr = xrxr;
7394 struct hn_softc *sc = rxr->hn_ifp->if_softc;
7395
7396 for (;;) {
7397 struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf;
7398 int error, pktlen;
7399
7400 pktlen = rxr->hn_pktbuf_len;
7401 error = vmbus_chan_recv_pkt(chan, pkt, &pktlen);
7402 if (__predict_false(error == ENOBUFS)) {
7403 void *nbuf;
7404 int nlen;
7405
7406 /*
7407 * Expand channel packet buffer.
7408 *
7409 * XXX
7410 * Use M_WAITOK here, since allocation failure
7411 * is fatal.
7412 */
7413 nlen = rxr->hn_pktbuf_len * 2;
7414 while (nlen < pktlen)
7415 nlen *= 2;
7416 nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);
7417
7418 if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
7419 rxr->hn_pktbuf_len, nlen);
7420
7421 free(rxr->hn_pktbuf, M_DEVBUF);
7422 rxr->hn_pktbuf = nbuf;
7423 rxr->hn_pktbuf_len = nlen;
7424 /* Retry! */
7425 continue;
7426 } else if (__predict_false(error == EAGAIN)) {
7427 /* No more channel packets; done! */
7428 break;
7429 }
7430 KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));
7431
7432 switch (pkt->cph_type) {
7433 case VMBUS_CHANPKT_TYPE_COMP:
7434 hn_nvs_handle_comp(sc, chan, pkt);
7435 break;
7436
7437 case VMBUS_CHANPKT_TYPE_RXBUF:
7438 hn_nvs_handle_rxbuf(rxr, chan, pkt);
7439 break;
7440
7441 case VMBUS_CHANPKT_TYPE_INBAND:
7442 hn_nvs_handle_notify(sc, pkt);
7443 break;
7444
7445 default:
7446 if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
7447 pkt->cph_type);
7448 break;
7449 }
7450 }
7451 hn_chan_rollup(rxr, rxr->hn_txr);
7452 }
7453
7454 static void
hn_sysinit(void * arg __unused)7455 hn_sysinit(void *arg __unused)
7456 {
7457 int i;
7458
7459 hn_udpcs_fixup = counter_u64_alloc(M_WAITOK);
7460
7461 #ifdef HN_IFSTART_SUPPORT
7462 /*
7463 * Don't use ifnet.if_start if transparent VF mode is requested;
7464 * mainly due to the IFF_DRV_OACTIVE flag.
7465 */
7466 if (hn_xpnt_vf && hn_use_if_start) {
7467 hn_use_if_start = 0;
7468 printf("hn: tranparent VF mode, if_transmit will be used, "
7469 "instead of if_start\n");
7470 }
7471 #endif
7472 if (hn_xpnt_vf_attwait < HN_XPNT_VF_ATTWAIT_MIN) {
7473 printf("hn: invalid transparent VF attach routing "
7474 "wait timeout %d, reset to %d\n",
7475 hn_xpnt_vf_attwait, HN_XPNT_VF_ATTWAIT_MIN);
7476 hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
7477 }
7478
7479 /*
7480 * Initialize VF map.
7481 */
7482 rm_init_flags(&hn_vfmap_lock, "hn_vfmap", RM_SLEEPABLE);
7483 hn_vfmap_size = HN_VFMAP_SIZE_DEF;
7484 hn_vfmap = malloc(sizeof(struct ifnet *) * hn_vfmap_size, M_DEVBUF,
7485 M_WAITOK | M_ZERO);
7486
7487 /*
7488 * Fix the # of TX taskqueues.
7489 */
7490 if (hn_tx_taskq_cnt <= 0)
7491 hn_tx_taskq_cnt = 1;
7492 else if (hn_tx_taskq_cnt > mp_ncpus)
7493 hn_tx_taskq_cnt = mp_ncpus;
7494
7495 /*
7496 * Fix the TX taskqueue mode.
7497 */
7498 switch (hn_tx_taskq_mode) {
7499 case HN_TX_TASKQ_M_INDEP:
7500 case HN_TX_TASKQ_M_GLOBAL:
7501 case HN_TX_TASKQ_M_EVTTQ:
7502 break;
7503 default:
7504 hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
7505 break;
7506 }
7507
7508 if (vm_guest != VM_GUEST_HV)
7509 return;
7510
7511 if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL)
7512 return;
7513
7514 hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
7515 M_DEVBUF, M_WAITOK);
7516 for (i = 0; i < hn_tx_taskq_cnt; ++i) {
7517 hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK,
7518 taskqueue_thread_enqueue, &hn_tx_taskque[i]);
7519 taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET,
7520 "hn tx%d", i);
7521 }
7522 }
7523 SYSINIT(hn_sysinit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysinit, NULL);
7524
7525 static void
hn_sysuninit(void * arg __unused)7526 hn_sysuninit(void *arg __unused)
7527 {
7528
7529 if (hn_tx_taskque != NULL) {
7530 int i;
7531
7532 for (i = 0; i < hn_tx_taskq_cnt; ++i)
7533 taskqueue_free(hn_tx_taskque[i]);
7534 free(hn_tx_taskque, M_DEVBUF);
7535 }
7536
7537 if (hn_vfmap != NULL)
7538 free(hn_vfmap, M_DEVBUF);
7539 rm_destroy(&hn_vfmap_lock);
7540
7541 counter_u64_free(hn_udpcs_fixup);
7542 }
7543 SYSUNINIT(hn_sysuninit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysuninit, NULL);
7544