1 /*
2 * Copyright (c) 2004 Topspin Communications. All rights reserved.
3 * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
4 * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
5 *
6 * This software is available to you under a choice of one of two
7 * licenses. You may choose to be licensed under the terms of the GNU
8 * General Public License (GPL) Version 2, available from the file
9 * COPYING in the main directory of this source tree, or the
10 * OpenIB.org BSD license below:
11 *
12 * Redistribution and use in source and binary forms, with or
13 * without modification, are permitted provided that the following
14 * conditions are met:
15 *
16 * - Redistributions of source code must retain the above
17 * copyright notice, this list of conditions and the following
18 * disclaimer.
19 *
20 * - Redistributions in binary form must reproduce the above
21 * copyright notice, this list of conditions and the following
22 * disclaimer in the documentation and/or other materials
23 * provided with the distribution.
24 *
25 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
26 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
27 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
28 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
29 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
30 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
31 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
32 * SOFTWARE.
33 */
34
35 #include <sys/cdefs.h>
36 __FBSDID("$FreeBSD$");
37
38 #include "ipoib.h"
39
40 static int ipoib_resolvemulti(struct ifnet *, struct sockaddr **,
41 struct sockaddr *);
42
43
44 #include <linux/module.h>
45
46 #include <linux/slab.h>
47 #include <linux/kernel.h>
48 #include <linux/vmalloc.h>
49
50 #include <linux/if_arp.h> /* For ARPHRD_xxx */
51 #include <linux/if_vlan.h>
52 #include <net/ip.h>
53 #include <net/ipv6.h>
54
55 #include <rdma/ib_cache.h>
56
57 MODULE_AUTHOR("Roland Dreier");
58 MODULE_DESCRIPTION("IP-over-InfiniBand net driver");
59 MODULE_LICENSE("Dual BSD/GPL");
60
61 int ipoib_sendq_size = IPOIB_TX_RING_SIZE;
62 int ipoib_recvq_size = IPOIB_RX_RING_SIZE;
63
64 module_param_named(send_queue_size, ipoib_sendq_size, int, 0444);
65 MODULE_PARM_DESC(send_queue_size, "Number of descriptors in send queue");
66 module_param_named(recv_queue_size, ipoib_recvq_size, int, 0444);
67 MODULE_PARM_DESC(recv_queue_size, "Number of descriptors in receive queue");
68
69 #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
70 int ipoib_debug_level = 1;
71
72 module_param_named(debug_level, ipoib_debug_level, int, 0644);
73 MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0");
74 #endif
75
76 struct ipoib_path_iter {
77 struct ipoib_dev_priv *priv;
78 struct ipoib_path path;
79 };
80
81 static const u8 ipv4_bcast_addr[] = {
82 0x00, 0xff, 0xff, 0xff,
83 0xff, 0x12, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00,
84 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff
85 };
86
87 struct workqueue_struct *ipoib_workqueue;
88
89 struct ib_sa_client ipoib_sa_client;
90
91 static void ipoib_add_one(struct ib_device *device);
92 static void ipoib_remove_one(struct ib_device *device, void *client_data);
93 static struct net_device *ipoib_get_net_dev_by_params(
94 struct ib_device *dev, u8 port, u16 pkey,
95 const union ib_gid *gid, const struct sockaddr *addr,
96 void *client_data);
97 static void ipoib_start(struct ifnet *dev);
98 static int ipoib_output(struct ifnet *ifp, struct mbuf *m,
99 const struct sockaddr *dst, struct route *ro);
100 static int ipoib_ioctl(struct ifnet *ifp, u_long command, caddr_t data);
101 static void ipoib_input(struct ifnet *ifp, struct mbuf *m);
102
/*
 * Hand an outgoing mbuf (which starts with an ipoib_header) to BPF, but only
 * when a listener is attached to the interface.
 */
#define	IPOIB_MTAP(_ifp, _m)						\
	do {								\
		if (bpf_peers_present((_ifp)->if_bpf)) {		\
			M_ASSERTVALID(_m);				\
			ipoib_mtap_mb((_ifp), (_m));			\
		}							\
	} while (0)
110
111 static struct unrhdr *ipoib_unrhdr;
112
113 static void
ipoib_unrhdr_init(void * arg)114 ipoib_unrhdr_init(void *arg)
115 {
116
117 ipoib_unrhdr = new_unrhdr(0, 65535, NULL);
118 }
119 SYSINIT(ipoib_unrhdr_init, SI_SUB_KLD - 1, SI_ORDER_ANY, ipoib_unrhdr_init, NULL);
120
121 static void
ipoib_unrhdr_uninit(void * arg)122 ipoib_unrhdr_uninit(void *arg)
123 {
124
125 if (ipoib_unrhdr != NULL) {
126 struct unrhdr *hdr;
127
128 hdr = ipoib_unrhdr;
129 ipoib_unrhdr = NULL;
130
131 delete_unrhdr(hdr);
132 }
133 }
134 SYSUNINIT(ipoib_unrhdr_uninit, SI_SUB_KLD - 1, SI_ORDER_ANY, ipoib_unrhdr_uninit, NULL);
135
136 /*
137 * This is for clients that have an ipoib_header in the mbuf.
138 */
139 static void
ipoib_mtap_mb(struct ifnet * ifp,struct mbuf * mb)140 ipoib_mtap_mb(struct ifnet *ifp, struct mbuf *mb)
141 {
142 struct ipoib_header *ih;
143 struct ether_header eh;
144
145 ih = mtod(mb, struct ipoib_header *);
146 eh.ether_type = ih->proto;
147 bcopy(ih->hwaddr, &eh.ether_dhost, ETHER_ADDR_LEN);
148 bzero(&eh.ether_shost, ETHER_ADDR_LEN);
149 mb->m_data += sizeof(struct ipoib_header);
150 mb->m_len -= sizeof(struct ipoib_header);
151 bpf_mtap2(ifp->if_bpf, &eh, sizeof(eh), mb);
152 mb->m_data -= sizeof(struct ipoib_header);
153 mb->m_len += sizeof(struct ipoib_header);
154 }
155
156 void
ipoib_mtap_proto(struct ifnet * ifp,struct mbuf * mb,uint16_t proto)157 ipoib_mtap_proto(struct ifnet *ifp, struct mbuf *mb, uint16_t proto)
158 {
159 struct ether_header eh;
160
161 eh.ether_type = proto;
162 bzero(&eh.ether_shost, ETHER_ADDR_LEN);
163 bzero(&eh.ether_dhost, ETHER_ADDR_LEN);
164 bpf_mtap2(ifp->if_bpf, &eh, sizeof(eh), mb);
165 }
166
167 static struct ib_client ipoib_client = {
168 .name = "ipoib",
169 .add = ipoib_add_one,
170 .remove = ipoib_remove_one,
171 .get_net_dev_by_params = ipoib_get_net_dev_by_params,
172 };
173
174 int
ipoib_open(struct ipoib_dev_priv * priv)175 ipoib_open(struct ipoib_dev_priv *priv)
176 {
177 struct ifnet *dev = priv->dev;
178
179 ipoib_dbg(priv, "bringing up interface\n");
180
181 set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
182
183 if (ipoib_pkey_dev_delay_open(priv))
184 return 0;
185
186 if (ipoib_ib_dev_open(priv))
187 goto err_disable;
188
189 if (ipoib_ib_dev_up(priv))
190 goto err_stop;
191
192 if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
193 struct ipoib_dev_priv *cpriv;
194
195 /* Bring up any child interfaces too */
196 mutex_lock(&priv->vlan_mutex);
197 list_for_each_entry(cpriv, &priv->child_intfs, list)
198 if ((cpriv->dev->if_drv_flags & IFF_DRV_RUNNING) == 0)
199 ipoib_open(cpriv);
200 mutex_unlock(&priv->vlan_mutex);
201 }
202 dev->if_drv_flags |= IFF_DRV_RUNNING;
203 dev->if_drv_flags &= ~IFF_DRV_OACTIVE;
204
205 return 0;
206
207 err_stop:
208 ipoib_ib_dev_stop(priv, 1);
209
210 err_disable:
211 clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
212
213 return -EINVAL;
214 }
215
216 static void
ipoib_init(void * arg)217 ipoib_init(void *arg)
218 {
219 struct ifnet *dev;
220 struct ipoib_dev_priv *priv;
221
222 priv = arg;
223 dev = priv->dev;
224 if ((dev->if_drv_flags & IFF_DRV_RUNNING) == 0)
225 ipoib_open(priv);
226 queue_work(ipoib_workqueue, &priv->flush_light);
227 }
228
229
230 static int
ipoib_stop(struct ipoib_dev_priv * priv)231 ipoib_stop(struct ipoib_dev_priv *priv)
232 {
233 struct ifnet *dev = priv->dev;
234
235 ipoib_dbg(priv, "stopping interface\n");
236
237 clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
238
239 dev->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE);
240
241 ipoib_ib_dev_down(priv, 0);
242 ipoib_ib_dev_stop(priv, 0);
243
244 if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
245 struct ipoib_dev_priv *cpriv;
246
247 /* Bring down any child interfaces too */
248 mutex_lock(&priv->vlan_mutex);
249 list_for_each_entry(cpriv, &priv->child_intfs, list)
250 if ((cpriv->dev->if_drv_flags & IFF_DRV_RUNNING) != 0)
251 ipoib_stop(cpriv);
252 mutex_unlock(&priv->vlan_mutex);
253 }
254
255 return 0;
256 }
257
258 static int
ipoib_propagate_ifnet_mtu(struct ipoib_dev_priv * priv,int new_mtu,bool propagate)259 ipoib_propagate_ifnet_mtu(struct ipoib_dev_priv *priv, int new_mtu,
260 bool propagate)
261 {
262 struct ifnet *ifp;
263 struct ifreq ifr;
264 int error;
265
266 ifp = priv->dev;
267 if (ifp->if_mtu == new_mtu)
268 return (0);
269 if (propagate) {
270 strlcpy(ifr.ifr_name, if_name(ifp), IFNAMSIZ);
271 ifr.ifr_mtu = new_mtu;
272 CURVNET_SET(ifp->if_vnet);
273 error = ifhwioctl(SIOCSIFMTU, ifp, (caddr_t)&ifr, curthread);
274 CURVNET_RESTORE();
275 } else {
276 ifp->if_mtu = new_mtu;
277 error = 0;
278 }
279 return (error);
280 }
281
282 int
ipoib_change_mtu(struct ipoib_dev_priv * priv,int new_mtu,bool propagate)283 ipoib_change_mtu(struct ipoib_dev_priv *priv, int new_mtu, bool propagate)
284 {
285 int error, prev_admin_mtu;
286
287 /* dev->if_mtu > 2K ==> connected mode */
288 if (ipoib_cm_admin_enabled(priv)) {
289 if (new_mtu > IPOIB_CM_MTU(ipoib_cm_max_mtu(priv)))
290 return -EINVAL;
291
292 if (new_mtu > priv->mcast_mtu)
293 ipoib_warn(priv, "mtu > %d will cause multicast packet drops.\n",
294 priv->mcast_mtu);
295
296 return (ipoib_propagate_ifnet_mtu(priv, new_mtu, propagate));
297 }
298
299 if (new_mtu > IPOIB_UD_MTU(priv->max_ib_mtu))
300 return -EINVAL;
301
302 prev_admin_mtu = priv->admin_mtu;
303 priv->admin_mtu = new_mtu;
304 error = ipoib_propagate_ifnet_mtu(priv, min(priv->mcast_mtu,
305 priv->admin_mtu), propagate);
306 if (error == 0) {
307 /* check for MTU change to avoid infinite loop */
308 if (prev_admin_mtu != new_mtu)
309 queue_work(ipoib_workqueue, &priv->flush_light);
310 } else
311 priv->admin_mtu = prev_admin_mtu;
312 return (error);
313 }
314
315 static int
ipoib_ioctl(struct ifnet * ifp,u_long command,caddr_t data)316 ipoib_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
317 {
318 struct ipoib_dev_priv *priv = ifp->if_softc;
319 struct ifaddr *ifa = (struct ifaddr *) data;
320 struct ifreq *ifr = (struct ifreq *) data;
321 int error = 0;
322
323 /* check if detaching */
324 if (priv == NULL || priv->gone != 0)
325 return (ENXIO);
326
327 switch (command) {
328 case SIOCSIFFLAGS:
329 if (ifp->if_flags & IFF_UP) {
330 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
331 error = -ipoib_open(priv);
332 } else
333 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
334 ipoib_stop(priv);
335 break;
336 case SIOCADDMULTI:
337 case SIOCDELMULTI:
338 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
339 queue_work(ipoib_workqueue, &priv->restart_task);
340 break;
341 case SIOCSIFADDR:
342 ifp->if_flags |= IFF_UP;
343
344 switch (ifa->ifa_addr->sa_family) {
345 #ifdef INET
346 case AF_INET:
347 ifp->if_init(ifp->if_softc); /* before arpwhohas */
348 arp_ifinit(ifp, ifa);
349 break;
350 #endif
351 default:
352 ifp->if_init(ifp->if_softc);
353 break;
354 }
355 break;
356
357 case SIOCGIFADDR:
358 bcopy(IF_LLADDR(ifp), &ifr->ifr_addr.sa_data[0],
359 INFINIBAND_ALEN);
360 break;
361
362 case SIOCSIFMTU:
363 /*
364 * Set the interface MTU.
365 */
366 error = -ipoib_change_mtu(priv, ifr->ifr_mtu, false);
367 break;
368 default:
369 error = EINVAL;
370 break;
371 }
372 return (error);
373 }
374
375
376 static struct ipoib_path *
__path_find(struct ipoib_dev_priv * priv,void * gid)377 __path_find(struct ipoib_dev_priv *priv, void *gid)
378 {
379 struct rb_node *n = priv->path_tree.rb_node;
380 struct ipoib_path *path;
381 int ret;
382
383 while (n) {
384 path = rb_entry(n, struct ipoib_path, rb_node);
385
386 ret = memcmp(gid, path->pathrec.dgid.raw,
387 sizeof (union ib_gid));
388
389 if (ret < 0)
390 n = n->rb_left;
391 else if (ret > 0)
392 n = n->rb_right;
393 else
394 return path;
395 }
396
397 return NULL;
398 }
399
400 static int
__path_add(struct ipoib_dev_priv * priv,struct ipoib_path * path)401 __path_add(struct ipoib_dev_priv *priv, struct ipoib_path *path)
402 {
403 struct rb_node **n = &priv->path_tree.rb_node;
404 struct rb_node *pn = NULL;
405 struct ipoib_path *tpath;
406 int ret;
407
408 while (*n) {
409 pn = *n;
410 tpath = rb_entry(pn, struct ipoib_path, rb_node);
411
412 ret = memcmp(path->pathrec.dgid.raw, tpath->pathrec.dgid.raw,
413 sizeof (union ib_gid));
414 if (ret < 0)
415 n = &pn->rb_left;
416 else if (ret > 0)
417 n = &pn->rb_right;
418 else
419 return -EEXIST;
420 }
421
422 rb_link_node(&path->rb_node, pn, n);
423 rb_insert_color(&path->rb_node, &priv->path_tree);
424
425 list_add_tail(&path->list, &priv->path_list);
426
427 return 0;
428 }
429
430 void
ipoib_path_free(struct ipoib_dev_priv * priv,struct ipoib_path * path)431 ipoib_path_free(struct ipoib_dev_priv *priv, struct ipoib_path *path)
432 {
433
434 _IF_DRAIN(&path->queue);
435
436 if (path->ah)
437 ipoib_put_ah(path->ah);
438 if (ipoib_cm_get(path))
439 ipoib_cm_destroy_tx(ipoib_cm_get(path));
440
441 kfree(path);
442 }
443
444 #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
445
446 struct ipoib_path_iter *
ipoib_path_iter_init(struct ipoib_dev_priv * priv)447 ipoib_path_iter_init(struct ipoib_dev_priv *priv)
448 {
449 struct ipoib_path_iter *iter;
450
451 iter = kmalloc(sizeof *iter, GFP_KERNEL);
452 if (!iter)
453 return NULL;
454
455 iter->priv = priv;
456 memset(iter->path.pathrec.dgid.raw, 0, 16);
457
458 if (ipoib_path_iter_next(iter)) {
459 kfree(iter);
460 return NULL;
461 }
462
463 return iter;
464 }
465
466 int
ipoib_path_iter_next(struct ipoib_path_iter * iter)467 ipoib_path_iter_next(struct ipoib_path_iter *iter)
468 {
469 struct ipoib_dev_priv *priv = iter->priv;
470 struct rb_node *n;
471 struct ipoib_path *path;
472 int ret = 1;
473
474 spin_lock_irq(&priv->lock);
475
476 n = rb_first(&priv->path_tree);
477
478 while (n) {
479 path = rb_entry(n, struct ipoib_path, rb_node);
480
481 if (memcmp(iter->path.pathrec.dgid.raw, path->pathrec.dgid.raw,
482 sizeof (union ib_gid)) < 0) {
483 iter->path = *path;
484 ret = 0;
485 break;
486 }
487
488 n = rb_next(n);
489 }
490
491 spin_unlock_irq(&priv->lock);
492
493 return ret;
494 }
495
496 void
ipoib_path_iter_read(struct ipoib_path_iter * iter,struct ipoib_path * path)497 ipoib_path_iter_read(struct ipoib_path_iter *iter, struct ipoib_path *path)
498 {
499 *path = iter->path;
500 }
501
502 #endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */
503
504 void
ipoib_mark_paths_invalid(struct ipoib_dev_priv * priv)505 ipoib_mark_paths_invalid(struct ipoib_dev_priv *priv)
506 {
507 struct ipoib_path *path, *tp;
508
509 spin_lock_irq(&priv->lock);
510
511 list_for_each_entry_safe(path, tp, &priv->path_list, list) {
512 ipoib_dbg(priv, "mark path LID 0x%04x GID %16D invalid\n",
513 be16_to_cpu(path->pathrec.dlid),
514 path->pathrec.dgid.raw, ":");
515 path->valid = 0;
516 }
517
518 spin_unlock_irq(&priv->lock);
519 }
520
521 void
ipoib_flush_paths(struct ipoib_dev_priv * priv)522 ipoib_flush_paths(struct ipoib_dev_priv *priv)
523 {
524 struct ipoib_path *path, *tp;
525 LIST_HEAD(remove_list);
526 unsigned long flags;
527
528 spin_lock_irqsave(&priv->lock, flags);
529
530 list_splice_init(&priv->path_list, &remove_list);
531
532 list_for_each_entry(path, &remove_list, list)
533 rb_erase(&path->rb_node, &priv->path_tree);
534
535 list_for_each_entry_safe(path, tp, &remove_list, list) {
536 if (path->query)
537 ib_sa_cancel_query(path->query_id, path->query);
538 spin_unlock_irqrestore(&priv->lock, flags);
539 wait_for_completion(&path->done);
540 ipoib_path_free(priv, path);
541 spin_lock_irqsave(&priv->lock, flags);
542 }
543
544 spin_unlock_irqrestore(&priv->lock, flags);
545 }
546
547 static void
path_rec_completion(int status,struct ib_sa_path_rec * pathrec,void * path_ptr)548 path_rec_completion(int status, struct ib_sa_path_rec *pathrec, void *path_ptr)
549 {
550 struct ipoib_path *path = path_ptr;
551 struct ipoib_dev_priv *priv = path->priv;
552 struct ifnet *dev = priv->dev;
553 struct ipoib_ah *ah = NULL;
554 struct ipoib_ah *old_ah = NULL;
555 struct ifqueue mbqueue;
556 struct mbuf *mb;
557 unsigned long flags;
558
559 if (!status)
560 ipoib_dbg(priv, "PathRec LID 0x%04x for GID %16D\n",
561 be16_to_cpu(pathrec->dlid), pathrec->dgid.raw, ":");
562 else
563 ipoib_dbg(priv, "PathRec status %d for GID %16D\n",
564 status, path->pathrec.dgid.raw, ":");
565
566 bzero(&mbqueue, sizeof(mbqueue));
567
568 if (!status) {
569 struct ib_ah_attr av;
570
571 if (!ib_init_ah_from_path(priv->ca, priv->port, pathrec, &av))
572 ah = ipoib_create_ah(priv, priv->pd, &av);
573 }
574
575 spin_lock_irqsave(&priv->lock, flags);
576
577 if (ah) {
578 path->pathrec = *pathrec;
579
580 old_ah = path->ah;
581 path->ah = ah;
582
583 ipoib_dbg(priv, "created address handle %p for LID 0x%04x, SL %d\n",
584 ah, be16_to_cpu(pathrec->dlid), pathrec->sl);
585
586 for (;;) {
587 _IF_DEQUEUE(&path->queue, mb);
588 if (mb == NULL)
589 break;
590 _IF_ENQUEUE(&mbqueue, mb);
591 }
592
593 #ifdef CONFIG_INFINIBAND_IPOIB_CM
594 if (ipoib_cm_enabled(priv, path->hwaddr) && !ipoib_cm_get(path))
595 ipoib_cm_set(path, ipoib_cm_create_tx(priv, path));
596 #endif
597
598 path->valid = 1;
599 }
600
601 path->query = NULL;
602 complete(&path->done);
603
604 spin_unlock_irqrestore(&priv->lock, flags);
605
606 if (old_ah)
607 ipoib_put_ah(old_ah);
608
609 for (;;) {
610 _IF_DEQUEUE(&mbqueue, mb);
611 if (mb == NULL)
612 break;
613 mb->m_pkthdr.rcvif = dev;
614 if (dev->if_transmit(dev, mb))
615 ipoib_warn(priv, "dev_queue_xmit failed "
616 "to requeue packet\n");
617 }
618 }
619
620 static struct ipoib_path *
path_rec_create(struct ipoib_dev_priv * priv,uint8_t * hwaddr)621 path_rec_create(struct ipoib_dev_priv *priv, uint8_t *hwaddr)
622 {
623 struct ipoib_path *path;
624
625 if (!priv->broadcast)
626 return NULL;
627
628 path = kzalloc(sizeof *path, GFP_ATOMIC);
629 if (!path)
630 return NULL;
631
632 path->priv = priv;
633
634 bzero(&path->queue, sizeof(path->queue));
635
636 #ifdef CONFIG_INFINIBAND_IPOIB_CM
637 memcpy(&path->hwaddr, hwaddr, INFINIBAND_ALEN);
638 #endif
639 memcpy(path->pathrec.dgid.raw, &hwaddr[4], sizeof (union ib_gid));
640 path->pathrec.sgid = priv->local_gid;
641 path->pathrec.pkey = cpu_to_be16(priv->pkey);
642 path->pathrec.numb_path = 1;
643 path->pathrec.traffic_class = priv->broadcast->mcmember.traffic_class;
644
645 return path;
646 }
647
648 static int
path_rec_start(struct ipoib_dev_priv * priv,struct ipoib_path * path)649 path_rec_start(struct ipoib_dev_priv *priv, struct ipoib_path *path)
650 {
651 struct ifnet *dev = priv->dev;
652
653 ib_sa_comp_mask comp_mask = IB_SA_PATH_REC_MTU_SELECTOR | IB_SA_PATH_REC_MTU;
654 struct ib_sa_path_rec p_rec;
655
656 p_rec = path->pathrec;
657 p_rec.mtu_selector = IB_SA_GT;
658
659 switch (roundup_pow_of_two(dev->if_mtu + IPOIB_ENCAP_LEN)) {
660 case 512:
661 p_rec.mtu = IB_MTU_256;
662 break;
663 case 1024:
664 p_rec.mtu = IB_MTU_512;
665 break;
666 case 2048:
667 p_rec.mtu = IB_MTU_1024;
668 break;
669 case 4096:
670 p_rec.mtu = IB_MTU_2048;
671 break;
672 default:
673 /* Wildcard everything */
674 comp_mask = 0;
675 p_rec.mtu = 0;
676 p_rec.mtu_selector = 0;
677 }
678
679 ipoib_dbg(priv, "Start path record lookup for %16D MTU > %d\n",
680 p_rec.dgid.raw, ":",
681 comp_mask ? ib_mtu_enum_to_int(p_rec.mtu) : 0);
682
683 init_completion(&path->done);
684
685 path->query_id =
686 ib_sa_path_rec_get(&ipoib_sa_client, priv->ca, priv->port,
687 &p_rec, comp_mask |
688 IB_SA_PATH_REC_DGID |
689 IB_SA_PATH_REC_SGID |
690 IB_SA_PATH_REC_NUMB_PATH |
691 IB_SA_PATH_REC_TRAFFIC_CLASS |
692 IB_SA_PATH_REC_PKEY,
693 1000, GFP_ATOMIC,
694 path_rec_completion,
695 path, &path->query);
696 if (path->query_id < 0) {
697 ipoib_warn(priv, "ib_sa_path_rec_get failed: %d\n", path->query_id);
698 path->query = NULL;
699 complete(&path->done);
700 return path->query_id;
701 }
702
703 return 0;
704 }
705
706 static void
ipoib_unicast_send(struct mbuf * mb,struct ipoib_dev_priv * priv,struct ipoib_header * eh)707 ipoib_unicast_send(struct mbuf *mb, struct ipoib_dev_priv *priv, struct ipoib_header *eh)
708 {
709 struct ipoib_path *path;
710
711 path = __path_find(priv, eh->hwaddr + 4);
712 if (!path || !path->valid) {
713 int new_path = 0;
714
715 if (!path) {
716 path = path_rec_create(priv, eh->hwaddr);
717 new_path = 1;
718 }
719 if (path) {
720 if (_IF_QLEN(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE)
721 _IF_ENQUEUE(&path->queue, mb);
722 else {
723 if_inc_counter(priv->dev, IFCOUNTER_OERRORS, 1);
724 m_freem(mb);
725 }
726
727 if (!path->query && path_rec_start(priv, path)) {
728 if (new_path)
729 ipoib_path_free(priv, path);
730 return;
731 } else
732 __path_add(priv, path);
733 } else {
734 if_inc_counter(priv->dev, IFCOUNTER_OERRORS, 1);
735 m_freem(mb);
736 }
737
738 return;
739 }
740
741 if (ipoib_cm_get(path) && ipoib_cm_up(path)) {
742 ipoib_cm_send(priv, mb, ipoib_cm_get(path));
743 } else if (path->ah) {
744 ipoib_send(priv, mb, path->ah, IPOIB_QPN(eh->hwaddr));
745 } else if ((path->query || !path_rec_start(priv, path)) &&
746 path->queue.ifq_len < IPOIB_MAX_PATH_REC_QUEUE) {
747 _IF_ENQUEUE(&path->queue, mb);
748 } else {
749 if_inc_counter(priv->dev, IFCOUNTER_OERRORS, 1);
750 m_freem(mb);
751 }
752 }
753
754 static int
ipoib_send_one(struct ipoib_dev_priv * priv,struct mbuf * mb)755 ipoib_send_one(struct ipoib_dev_priv *priv, struct mbuf *mb)
756 {
757 struct ipoib_header *eh;
758
759 eh = mtod(mb, struct ipoib_header *);
760 if (IPOIB_IS_MULTICAST(eh->hwaddr)) {
761 /* Add in the P_Key for multicast*/
762 eh->hwaddr[8] = (priv->pkey >> 8) & 0xff;
763 eh->hwaddr[9] = priv->pkey & 0xff;
764
765 ipoib_mcast_send(priv, eh->hwaddr + 4, mb);
766 } else
767 ipoib_unicast_send(mb, priv, eh);
768
769 return 0;
770 }
771
772 void
ipoib_start_locked(struct ifnet * dev,struct ipoib_dev_priv * priv)773 ipoib_start_locked(struct ifnet *dev, struct ipoib_dev_priv *priv)
774 {
775 struct mbuf *mb;
776
777 assert_spin_locked(&priv->lock);
778
779 while (!IFQ_DRV_IS_EMPTY(&dev->if_snd) &&
780 (dev->if_drv_flags & IFF_DRV_OACTIVE) == 0) {
781 IFQ_DRV_DEQUEUE(&dev->if_snd, mb);
782 if (mb == NULL)
783 break;
784 IPOIB_MTAP(dev, mb);
785 ipoib_send_one(priv, mb);
786 }
787 }
788
789 static void
_ipoib_start(struct ifnet * dev,struct ipoib_dev_priv * priv)790 _ipoib_start(struct ifnet *dev, struct ipoib_dev_priv *priv)
791 {
792
793 if ((dev->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) !=
794 IFF_DRV_RUNNING)
795 return;
796
797 spin_lock(&priv->lock);
798 ipoib_start_locked(dev, priv);
799 spin_unlock(&priv->lock);
800 }
801
802 static void
ipoib_start(struct ifnet * dev)803 ipoib_start(struct ifnet *dev)
804 {
805 _ipoib_start(dev, dev->if_softc);
806 }
807
808 static void
ipoib_vlan_start(struct ifnet * dev)809 ipoib_vlan_start(struct ifnet *dev)
810 {
811 struct ipoib_dev_priv *priv;
812 struct mbuf *mb;
813
814 priv = VLAN_COOKIE(dev);
815 if (priv != NULL)
816 return _ipoib_start(dev, priv);
817 while (!IFQ_DRV_IS_EMPTY(&dev->if_snd)) {
818 IFQ_DRV_DEQUEUE(&dev->if_snd, mb);
819 if (mb == NULL)
820 break;
821 m_freem(mb);
822 if_inc_counter(dev, IFCOUNTER_OERRORS, 1);
823 }
824 }
825
826 int
ipoib_dev_init(struct ipoib_dev_priv * priv,struct ib_device * ca,int port)827 ipoib_dev_init(struct ipoib_dev_priv *priv, struct ib_device *ca, int port)
828 {
829
830 /* Allocate RX/TX "rings" to hold queued mbs */
831 priv->rx_ring = kzalloc(ipoib_recvq_size * sizeof *priv->rx_ring,
832 GFP_KERNEL);
833 if (!priv->rx_ring) {
834 printk(KERN_WARNING "%s: failed to allocate RX ring (%d entries)\n",
835 ca->name, ipoib_recvq_size);
836 goto out;
837 }
838
839 priv->tx_ring = kzalloc(ipoib_sendq_size * sizeof *priv->tx_ring, GFP_KERNEL);
840 if (!priv->tx_ring) {
841 printk(KERN_WARNING "%s: failed to allocate TX ring (%d entries)\n",
842 ca->name, ipoib_sendq_size);
843 goto out_rx_ring_cleanup;
844 }
845 memset(priv->tx_ring, 0, ipoib_sendq_size * sizeof *priv->tx_ring);
846
847 /* priv->tx_head, tx_tail & tx_outstanding are already 0 */
848
849 if (ipoib_ib_dev_init(priv, ca, port))
850 goto out_tx_ring_cleanup;
851
852 return 0;
853
854 out_tx_ring_cleanup:
855 kfree(priv->tx_ring);
856
857 out_rx_ring_cleanup:
858 kfree(priv->rx_ring);
859
860 out:
861 return -ENOMEM;
862 }
863
864 static void
ipoib_detach(struct ipoib_dev_priv * priv)865 ipoib_detach(struct ipoib_dev_priv *priv)
866 {
867 struct ifnet *dev;
868
869 dev = priv->dev;
870 if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
871 priv->gone = 1;
872 bpfdetach(dev);
873 if_detach(dev);
874 if_free(dev);
875 free_unr(ipoib_unrhdr, priv->unit);
876 } else
877 VLAN_SETCOOKIE(priv->dev, NULL);
878
879 free(priv, M_TEMP);
880 }
881
882 void
ipoib_dev_cleanup(struct ipoib_dev_priv * priv)883 ipoib_dev_cleanup(struct ipoib_dev_priv *priv)
884 {
885 struct ipoib_dev_priv *cpriv, *tcpriv;
886
887 /* Delete any child interfaces first */
888 list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) {
889 ipoib_dev_cleanup(cpriv);
890 ipoib_detach(cpriv);
891 }
892
893 ipoib_ib_dev_cleanup(priv);
894
895 kfree(priv->rx_ring);
896 kfree(priv->tx_ring);
897
898 priv->rx_ring = NULL;
899 priv->tx_ring = NULL;
900 }
901
902 static struct ipoib_dev_priv *
ipoib_priv_alloc(void)903 ipoib_priv_alloc(void)
904 {
905 struct ipoib_dev_priv *priv;
906
907 priv = malloc(sizeof(struct ipoib_dev_priv), M_TEMP, M_ZERO|M_WAITOK);
908 spin_lock_init(&priv->lock);
909 spin_lock_init(&priv->drain_lock);
910 mutex_init(&priv->vlan_mutex);
911 INIT_LIST_HEAD(&priv->path_list);
912 INIT_LIST_HEAD(&priv->child_intfs);
913 INIT_LIST_HEAD(&priv->dead_ahs);
914 INIT_LIST_HEAD(&priv->multicast_list);
915 INIT_DELAYED_WORK(&priv->pkey_poll_task, ipoib_pkey_poll);
916 INIT_DELAYED_WORK(&priv->mcast_task, ipoib_mcast_join_task);
917 INIT_WORK(&priv->carrier_on_task, ipoib_mcast_carrier_on_task);
918 INIT_WORK(&priv->flush_light, ipoib_ib_dev_flush_light);
919 INIT_WORK(&priv->flush_normal, ipoib_ib_dev_flush_normal);
920 INIT_WORK(&priv->flush_heavy, ipoib_ib_dev_flush_heavy);
921 INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task);
922 INIT_DELAYED_WORK(&priv->ah_reap_task, ipoib_reap_ah);
923 memcpy(priv->broadcastaddr, ipv4_bcast_addr, INFINIBAND_ALEN);
924
925 return (priv);
926 }
927
928 struct ipoib_dev_priv *
ipoib_intf_alloc(const char * name)929 ipoib_intf_alloc(const char *name)
930 {
931 struct ipoib_dev_priv *priv;
932 struct sockaddr_dl *sdl;
933 struct ifnet *dev;
934
935 priv = ipoib_priv_alloc();
936 dev = priv->dev = if_alloc(IFT_INFINIBAND);
937 if (!dev) {
938 free(priv, M_TEMP);
939 return NULL;
940 }
941 dev->if_softc = priv;
942 priv->unit = alloc_unr(ipoib_unrhdr);
943 if (priv->unit == -1) {
944 if_free(dev);
945 free(priv, M_TEMP);
946 return NULL;
947 }
948 if_initname(dev, name, priv->unit);
949 dev->if_flags = IFF_BROADCAST | IFF_MULTICAST;
950 dev->if_addrlen = INFINIBAND_ALEN;
951 dev->if_hdrlen = IPOIB_HEADER_LEN;
952 if_attach(dev);
953 dev->if_init = ipoib_init;
954 dev->if_ioctl = ipoib_ioctl;
955 dev->if_start = ipoib_start;
956 dev->if_output = ipoib_output;
957 dev->if_input = ipoib_input;
958 dev->if_resolvemulti = ipoib_resolvemulti;
959 dev->if_baudrate = IF_Gbps(10);
960 dev->if_broadcastaddr = priv->broadcastaddr;
961 dev->if_snd.ifq_maxlen = ipoib_sendq_size * 2;
962 sdl = (struct sockaddr_dl *)dev->if_addr->ifa_addr;
963 sdl->sdl_type = IFT_INFINIBAND;
964 sdl->sdl_alen = dev->if_addrlen;
965 priv->dev = dev;
966 if_link_state_change(dev, LINK_STATE_DOWN);
967 bpfattach(dev, DLT_EN10MB, ETHER_HDR_LEN);
968
969 return dev->if_softc;
970 }
971
972 int
ipoib_set_dev_features(struct ipoib_dev_priv * priv,struct ib_device * hca)973 ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca)
974 {
975 struct ib_device_attr *device_attr = &hca->attrs;
976
977 priv->hca_caps = device_attr->device_cap_flags;
978
979 priv->dev->if_hwassist = 0;
980 priv->dev->if_capabilities = 0;
981
982 #ifndef CONFIG_INFINIBAND_IPOIB_CM
983 if (priv->hca_caps & IB_DEVICE_UD_IP_CSUM) {
984 set_bit(IPOIB_FLAG_CSUM, &priv->flags);
985 priv->dev->if_hwassist = CSUM_IP | CSUM_TCP | CSUM_UDP;
986 priv->dev->if_capabilities = IFCAP_HWCSUM | IFCAP_VLAN_HWCSUM;
987 }
988
989 #if 0
990 if (priv->dev->features & NETIF_F_SG && priv->hca_caps & IB_DEVICE_UD_TSO) {
991 priv->dev->if_capabilities |= IFCAP_TSO4;
992 priv->dev->if_hwassist |= CSUM_TSO;
993 }
994 #endif
995 #endif
996 priv->dev->if_capabilities |=
997 IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU | IFCAP_LINKSTATE;
998 priv->dev->if_capenable = priv->dev->if_capabilities;
999
1000 return 0;
1001 }
1002
1003
1004 static struct ifnet *
ipoib_add_port(const char * format,struct ib_device * hca,u8 port)1005 ipoib_add_port(const char *format, struct ib_device *hca, u8 port)
1006 {
1007 struct ipoib_dev_priv *priv;
1008 struct ib_port_attr attr;
1009 int result = -ENOMEM;
1010
1011 priv = ipoib_intf_alloc(format);
1012 if (!priv)
1013 goto alloc_mem_failed;
1014
1015 if (!ib_query_port(hca, port, &attr))
1016 priv->max_ib_mtu = ib_mtu_enum_to_int(attr.max_mtu);
1017 else {
1018 printk(KERN_WARNING "%s: ib_query_port %d failed\n",
1019 hca->name, port);
1020 goto device_init_failed;
1021 }
1022
1023 /* MTU will be reset when mcast join happens */
1024 priv->dev->if_mtu = IPOIB_UD_MTU(priv->max_ib_mtu);
1025 priv->mcast_mtu = priv->admin_mtu = priv->dev->if_mtu;
1026
1027 result = ib_query_pkey(hca, port, 0, &priv->pkey);
1028 if (result) {
1029 printk(KERN_WARNING "%s: ib_query_pkey port %d failed (ret = %d)\n",
1030 hca->name, port, result);
1031 goto device_init_failed;
1032 }
1033
1034 if (ipoib_set_dev_features(priv, hca))
1035 goto device_init_failed;
1036
1037 /*
1038 * Set the full membership bit, so that we join the right
1039 * broadcast group, etc.
1040 */
1041 priv->pkey |= 0x8000;
1042
1043 priv->broadcastaddr[8] = priv->pkey >> 8;
1044 priv->broadcastaddr[9] = priv->pkey & 0xff;
1045
1046 result = ib_query_gid(hca, port, 0, &priv->local_gid, NULL);
1047 if (result) {
1048 printk(KERN_WARNING "%s: ib_query_gid port %d failed (ret = %d)\n",
1049 hca->name, port, result);
1050 goto device_init_failed;
1051 }
1052 memcpy(IF_LLADDR(priv->dev) + 4, priv->local_gid.raw, sizeof (union ib_gid));
1053
1054 result = ipoib_dev_init(priv, hca, port);
1055 if (result < 0) {
1056 printk(KERN_WARNING "%s: failed to initialize port %d (ret = %d)\n",
1057 hca->name, port, result);
1058 goto device_init_failed;
1059 }
1060 if (ipoib_cm_admin_enabled(priv))
1061 priv->dev->if_mtu = IPOIB_CM_MTU(ipoib_cm_max_mtu(priv));
1062
1063 INIT_IB_EVENT_HANDLER(&priv->event_handler,
1064 priv->ca, ipoib_event);
1065 result = ib_register_event_handler(&priv->event_handler);
1066 if (result < 0) {
1067 printk(KERN_WARNING "%s: ib_register_event_handler failed for "
1068 "port %d (ret = %d)\n",
1069 hca->name, port, result);
1070 goto event_failed;
1071 }
1072 if_printf(priv->dev, "Attached to %s port %d\n", hca->name, port);
1073
1074 return priv->dev;
1075
1076 event_failed:
1077 ipoib_dev_cleanup(priv);
1078
1079 device_init_failed:
1080 ipoib_detach(priv);
1081
1082 alloc_mem_failed:
1083 return ERR_PTR(result);
1084 }
1085
1086 static void
ipoib_add_one(struct ib_device * device)1087 ipoib_add_one(struct ib_device *device)
1088 {
1089 struct list_head *dev_list;
1090 struct ifnet *dev;
1091 struct ipoib_dev_priv *priv;
1092 int s, e, p;
1093
1094 if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
1095 return;
1096
1097 dev_list = kmalloc(sizeof *dev_list, GFP_KERNEL);
1098 if (!dev_list)
1099 return;
1100
1101 INIT_LIST_HEAD(dev_list);
1102
1103 if (device->node_type == RDMA_NODE_IB_SWITCH) {
1104 s = 0;
1105 e = 0;
1106 } else {
1107 s = 1;
1108 e = device->phys_port_cnt;
1109 }
1110
1111 for (p = s; p <= e; ++p) {
1112 if (rdma_port_get_link_layer(device, p) != IB_LINK_LAYER_INFINIBAND)
1113 continue;
1114 dev = ipoib_add_port("ib", device, p);
1115 if (!IS_ERR(dev)) {
1116 priv = dev->if_softc;
1117 list_add_tail(&priv->list, dev_list);
1118 }
1119 }
1120
1121 ib_set_client_data(device, &ipoib_client, dev_list);
1122 }
1123
1124 static void
ipoib_remove_one(struct ib_device * device,void * client_data)1125 ipoib_remove_one(struct ib_device *device, void *client_data)
1126 {
1127 struct ipoib_dev_priv *priv, *tmp;
1128 struct list_head *dev_list = client_data;
1129
1130 if (!dev_list)
1131 return;
1132
1133 if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
1134 return;
1135
1136 list_for_each_entry_safe(priv, tmp, dev_list, list) {
1137 if (rdma_port_get_link_layer(device, priv->port) != IB_LINK_LAYER_INFINIBAND)
1138 continue;
1139
1140 ipoib_stop(priv);
1141
1142 ib_unregister_event_handler(&priv->event_handler);
1143
1144 /* dev_change_flags(priv->dev, priv->dev->flags & ~IFF_UP); */
1145
1146 flush_workqueue(ipoib_workqueue);
1147
1148 ipoib_dev_cleanup(priv);
1149 ipoib_detach(priv);
1150 }
1151
1152 kfree(dev_list);
1153 }
1154
1155 static int
ipoib_match_dev_addr(const struct sockaddr * addr,struct net_device * dev)1156 ipoib_match_dev_addr(const struct sockaddr *addr, struct net_device *dev)
1157 {
1158 struct ifaddr *ifa;
1159 int retval = 0;
1160
1161 CURVNET_SET(dev->if_vnet);
1162 IF_ADDR_RLOCK(dev);
1163 TAILQ_FOREACH(ifa, &dev->if_addrhead, ifa_link) {
1164 if (ifa->ifa_addr == NULL ||
1165 ifa->ifa_addr->sa_family != addr->sa_family ||
1166 ifa->ifa_addr->sa_len != addr->sa_len) {
1167 continue;
1168 }
1169 if (memcmp(ifa->ifa_addr, addr, addr->sa_len) == 0) {
1170 retval = 1;
1171 break;
1172 }
1173 }
1174 IF_ADDR_RUNLOCK(dev);
1175 CURVNET_RESTORE();
1176
1177 return (retval);
1178 }
1179
1180 /*
1181 * ipoib_match_gid_pkey_addr - returns the number of IPoIB netdevs on
1182 * top a given ipoib device matching a pkey_index and address, if one
1183 * exists.
1184 *
1185 * @found_net_dev: contains a matching net_device if the return value
1186 * >= 1, with a reference held.
1187 */
1188 static int
ipoib_match_gid_pkey_addr(struct ipoib_dev_priv * priv,const union ib_gid * gid,u16 pkey_index,const struct sockaddr * addr,struct net_device ** found_net_dev)1189 ipoib_match_gid_pkey_addr(struct ipoib_dev_priv *priv,
1190 const union ib_gid *gid, u16 pkey_index, const struct sockaddr *addr,
1191 struct net_device **found_net_dev)
1192 {
1193 struct ipoib_dev_priv *child_priv;
1194 int matches = 0;
1195
1196 if (priv->pkey_index == pkey_index &&
1197 (!gid || !memcmp(gid, &priv->local_gid, sizeof(*gid)))) {
1198 if (addr == NULL || ipoib_match_dev_addr(addr, priv->dev) != 0) {
1199 if (*found_net_dev == NULL) {
1200 struct net_device *net_dev;
1201
1202 if (priv->parent != NULL)
1203 net_dev = priv->parent;
1204 else
1205 net_dev = priv->dev;
1206 *found_net_dev = net_dev;
1207 dev_hold(net_dev);
1208 }
1209 matches++;
1210 }
1211 }
1212
1213 /* Check child interfaces */
1214 mutex_lock(&priv->vlan_mutex);
1215 list_for_each_entry(child_priv, &priv->child_intfs, list) {
1216 matches += ipoib_match_gid_pkey_addr(child_priv, gid,
1217 pkey_index, addr, found_net_dev);
1218 if (matches > 1)
1219 break;
1220 }
1221 mutex_unlock(&priv->vlan_mutex);
1222
1223 return matches;
1224 }
1225
1226 /*
1227 * __ipoib_get_net_dev_by_params - returns the number of matching
1228 * net_devs found (between 0 and 2). Also return the matching
1229 * net_device in the @net_dev parameter, holding a reference to the
1230 * net_device, if the number of matches >= 1
1231 */
1232 static int
__ipoib_get_net_dev_by_params(struct list_head * dev_list,u8 port,u16 pkey_index,const union ib_gid * gid,const struct sockaddr * addr,struct net_device ** net_dev)1233 __ipoib_get_net_dev_by_params(struct list_head *dev_list, u8 port,
1234 u16 pkey_index, const union ib_gid *gid,
1235 const struct sockaddr *addr, struct net_device **net_dev)
1236 {
1237 struct ipoib_dev_priv *priv;
1238 int matches = 0;
1239
1240 *net_dev = NULL;
1241
1242 list_for_each_entry(priv, dev_list, list) {
1243 if (priv->port != port)
1244 continue;
1245
1246 matches += ipoib_match_gid_pkey_addr(priv, gid, pkey_index,
1247 addr, net_dev);
1248
1249 if (matches > 1)
1250 break;
1251 }
1252
1253 return matches;
1254 }
1255
1256 static struct net_device *
ipoib_get_net_dev_by_params(struct ib_device * dev,u8 port,u16 pkey,const union ib_gid * gid,const struct sockaddr * addr,void * client_data)1257 ipoib_get_net_dev_by_params(struct ib_device *dev, u8 port, u16 pkey,
1258 const union ib_gid *gid, const struct sockaddr *addr, void *client_data)
1259 {
1260 struct net_device *net_dev;
1261 struct list_head *dev_list = client_data;
1262 u16 pkey_index;
1263 int matches;
1264 int ret;
1265
1266 if (!rdma_protocol_ib(dev, port))
1267 return NULL;
1268
1269 ret = ib_find_cached_pkey(dev, port, pkey, &pkey_index);
1270 if (ret)
1271 return NULL;
1272
1273 if (!dev_list)
1274 return NULL;
1275
1276 /* See if we can find a unique device matching the L2 parameters */
1277 matches = __ipoib_get_net_dev_by_params(dev_list, port, pkey_index,
1278 gid, NULL, &net_dev);
1279
1280 switch (matches) {
1281 case 0:
1282 return NULL;
1283 case 1:
1284 return net_dev;
1285 }
1286
1287 dev_put(net_dev);
1288
1289 /* Couldn't find a unique device with L2 parameters only. Use L3
1290 * address to uniquely match the net device */
1291 matches = __ipoib_get_net_dev_by_params(dev_list, port, pkey_index,
1292 gid, addr, &net_dev);
1293 switch (matches) {
1294 case 0:
1295 return NULL;
1296 default:
1297 dev_warn_ratelimited(&dev->dev,
1298 "duplicate IP address detected\n");
1299 /* Fall through */
1300 case 1:
1301 return net_dev;
1302 }
1303 }
1304
1305 static void
ipoib_config_vlan(void * arg,struct ifnet * ifp,u_int16_t vtag)1306 ipoib_config_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag)
1307 {
1308 struct ipoib_dev_priv *parent;
1309 struct ipoib_dev_priv *priv;
1310 struct ifnet *dev;
1311 uint16_t pkey;
1312 int error;
1313
1314 if (ifp->if_type != IFT_INFINIBAND)
1315 return;
1316 dev = VLAN_DEVAT(ifp, vtag);
1317 if (dev == NULL)
1318 return;
1319 priv = NULL;
1320 error = 0;
1321 parent = ifp->if_softc;
1322 /* We only support 15 bits of pkey. */
1323 if (vtag & 0x8000)
1324 return;
1325 pkey = vtag | 0x8000; /* Set full membership bit. */
1326 if (pkey == parent->pkey)
1327 return;
1328 /* Check for dups */
1329 mutex_lock(&parent->vlan_mutex);
1330 list_for_each_entry(priv, &parent->child_intfs, list) {
1331 if (priv->pkey == pkey) {
1332 priv = NULL;
1333 error = EBUSY;
1334 goto out;
1335 }
1336 }
1337 priv = ipoib_priv_alloc();
1338 priv->dev = dev;
1339 priv->max_ib_mtu = parent->max_ib_mtu;
1340 priv->mcast_mtu = priv->admin_mtu = parent->dev->if_mtu;
1341 set_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags);
1342 error = ipoib_set_dev_features(priv, parent->ca);
1343 if (error)
1344 goto out;
1345 priv->pkey = pkey;
1346 priv->broadcastaddr[8] = pkey >> 8;
1347 priv->broadcastaddr[9] = pkey & 0xff;
1348 dev->if_broadcastaddr = priv->broadcastaddr;
1349 error = ipoib_dev_init(priv, parent->ca, parent->port);
1350 if (error)
1351 goto out;
1352 priv->parent = parent->dev;
1353 list_add_tail(&priv->list, &parent->child_intfs);
1354 VLAN_SETCOOKIE(dev, priv);
1355 dev->if_start = ipoib_vlan_start;
1356 dev->if_drv_flags &= ~IFF_DRV_RUNNING;
1357 dev->if_hdrlen = IPOIB_HEADER_LEN;
1358 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
1359 ipoib_open(priv);
1360 mutex_unlock(&parent->vlan_mutex);
1361 return;
1362 out:
1363 mutex_unlock(&parent->vlan_mutex);
1364 if (priv)
1365 free(priv, M_TEMP);
1366 if (error)
1367 ipoib_warn(parent,
1368 "failed to initialize subinterface: device %s, port %d vtag 0x%X",
1369 parent->ca->name, parent->port, vtag);
1370 return;
1371 }
1372
1373 static void
ipoib_unconfig_vlan(void * arg,struct ifnet * ifp,u_int16_t vtag)1374 ipoib_unconfig_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag)
1375 {
1376 struct ipoib_dev_priv *parent;
1377 struct ipoib_dev_priv *priv;
1378 struct ifnet *dev;
1379 uint16_t pkey;
1380
1381 if (ifp->if_type != IFT_INFINIBAND)
1382 return;
1383
1384 dev = VLAN_DEVAT(ifp, vtag);
1385 if (dev)
1386 VLAN_SETCOOKIE(dev, NULL);
1387 pkey = vtag | 0x8000;
1388 parent = ifp->if_softc;
1389 mutex_lock(&parent->vlan_mutex);
1390 list_for_each_entry(priv, &parent->child_intfs, list) {
1391 if (priv->pkey == pkey) {
1392 ipoib_dev_cleanup(priv);
1393 list_del(&priv->list);
1394 break;
1395 }
1396 }
1397 mutex_unlock(&parent->vlan_mutex);
1398 }
1399
1400 eventhandler_tag ipoib_vlan_attach;
1401 eventhandler_tag ipoib_vlan_detach;
1402
1403 static int __init
ipoib_init_module(void)1404 ipoib_init_module(void)
1405 {
1406 int ret;
1407
1408 ipoib_recvq_size = roundup_pow_of_two(ipoib_recvq_size);
1409 ipoib_recvq_size = min(ipoib_recvq_size, IPOIB_MAX_QUEUE_SIZE);
1410 ipoib_recvq_size = max(ipoib_recvq_size, IPOIB_MIN_QUEUE_SIZE);
1411
1412 ipoib_sendq_size = roundup_pow_of_two(ipoib_sendq_size);
1413 ipoib_sendq_size = min(ipoib_sendq_size, IPOIB_MAX_QUEUE_SIZE);
1414 ipoib_sendq_size = max(ipoib_sendq_size, max(2 * MAX_SEND_CQE,
1415 IPOIB_MIN_QUEUE_SIZE));
1416 #ifdef CONFIG_INFINIBAND_IPOIB_CM
1417 ipoib_max_conn_qp = min(ipoib_max_conn_qp, IPOIB_CM_MAX_CONN_QP);
1418 #endif
1419
1420 ipoib_vlan_attach = EVENTHANDLER_REGISTER(vlan_config,
1421 ipoib_config_vlan, NULL, EVENTHANDLER_PRI_FIRST);
1422 ipoib_vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig,
1423 ipoib_unconfig_vlan, NULL, EVENTHANDLER_PRI_FIRST);
1424
1425 /*
1426 * We create our own workqueue mainly because we want to be
1427 * able to flush it when devices are being removed. We can't
1428 * use schedule_work()/flush_scheduled_work() because both
1429 * unregister_netdev() and linkwatch_event take the rtnl lock,
1430 * so flush_scheduled_work() can deadlock during device
1431 * removal.
1432 */
1433 ipoib_workqueue = create_singlethread_workqueue("ipoib");
1434 if (!ipoib_workqueue) {
1435 ret = -ENOMEM;
1436 goto err_fs;
1437 }
1438
1439 ib_sa_register_client(&ipoib_sa_client);
1440
1441 ret = ib_register_client(&ipoib_client);
1442 if (ret)
1443 goto err_sa;
1444
1445 return 0;
1446
1447 err_sa:
1448 ib_sa_unregister_client(&ipoib_sa_client);
1449 destroy_workqueue(ipoib_workqueue);
1450
1451 err_fs:
1452 return ret;
1453 }
1454
1455 static void __exit
ipoib_cleanup_module(void)1456 ipoib_cleanup_module(void)
1457 {
1458
1459 EVENTHANDLER_DEREGISTER(vlan_config, ipoib_vlan_attach);
1460 EVENTHANDLER_DEREGISTER(vlan_unconfig, ipoib_vlan_detach);
1461 ib_unregister_client(&ipoib_client);
1462 ib_sa_unregister_client(&ipoib_sa_client);
1463 destroy_workqueue(ipoib_workqueue);
1464 }
1465
1466 /*
1467 * Infiniband output routine.
1468 */
1469 static int
ipoib_output(struct ifnet * ifp,struct mbuf * m,const struct sockaddr * dst,struct route * ro)1470 ipoib_output(struct ifnet *ifp, struct mbuf *m,
1471 const struct sockaddr *dst, struct route *ro)
1472 {
1473 u_char edst[INFINIBAND_ALEN];
1474 #if defined(INET) || defined(INET6)
1475 struct llentry *lle = NULL;
1476 #endif
1477 struct ipoib_header *eh;
1478 int error = 0, is_gw = 0;
1479 short type;
1480
1481 if (ro != NULL)
1482 is_gw = (ro->ro_flags & RT_HAS_GW) != 0;
1483 #ifdef MAC
1484 error = mac_ifnet_check_transmit(ifp, m);
1485 if (error)
1486 goto bad;
1487 #endif
1488
1489 M_PROFILE(m);
1490 if (ifp->if_flags & IFF_MONITOR) {
1491 error = ENETDOWN;
1492 goto bad;
1493 }
1494 if (!((ifp->if_flags & IFF_UP) &&
1495 (ifp->if_drv_flags & IFF_DRV_RUNNING))) {
1496 error = ENETDOWN;
1497 goto bad;
1498 }
1499
1500 switch (dst->sa_family) {
1501 #ifdef INET
1502 case AF_INET:
1503 if (lle != NULL && (lle->la_flags & LLE_VALID))
1504 memcpy(edst, lle->ll_addr, sizeof(edst));
1505 else if (m->m_flags & M_MCAST)
1506 ip_ib_mc_map(((struct sockaddr_in *)dst)->sin_addr.s_addr, ifp->if_broadcastaddr, edst);
1507 else
1508 error = arpresolve(ifp, is_gw, m, dst, edst, NULL, NULL);
1509 if (error)
1510 return (error == EWOULDBLOCK ? 0 : error);
1511 type = htons(ETHERTYPE_IP);
1512 break;
1513 case AF_ARP:
1514 {
1515 struct arphdr *ah;
1516 ah = mtod(m, struct arphdr *);
1517 ah->ar_hrd = htons(ARPHRD_INFINIBAND);
1518
1519 switch(ntohs(ah->ar_op)) {
1520 case ARPOP_REVREQUEST:
1521 case ARPOP_REVREPLY:
1522 type = htons(ETHERTYPE_REVARP);
1523 break;
1524 case ARPOP_REQUEST:
1525 case ARPOP_REPLY:
1526 default:
1527 type = htons(ETHERTYPE_ARP);
1528 break;
1529 }
1530
1531 if (m->m_flags & M_BCAST)
1532 bcopy(ifp->if_broadcastaddr, edst, INFINIBAND_ALEN);
1533 else
1534 bcopy(ar_tha(ah), edst, INFINIBAND_ALEN);
1535
1536 }
1537 break;
1538 #endif
1539 #ifdef INET6
1540 case AF_INET6:
1541 if (lle != NULL && (lle->la_flags & LLE_VALID))
1542 memcpy(edst, lle->ll_addr, sizeof(edst));
1543 else if (m->m_flags & M_MCAST)
1544 ipv6_ib_mc_map(&((struct sockaddr_in6 *)dst)->sin6_addr, ifp->if_broadcastaddr, edst);
1545 else
1546 error = nd6_resolve(ifp, is_gw, m, dst, edst, NULL, NULL);
1547 if (error)
1548 return error;
1549 type = htons(ETHERTYPE_IPV6);
1550 break;
1551 #endif
1552
1553 default:
1554 if_printf(ifp, "can't handle af%d\n", dst->sa_family);
1555 error = EAFNOSUPPORT;
1556 goto bad;
1557 }
1558
1559 /*
1560 * Add local net header. If no space in first mbuf,
1561 * allocate another.
1562 */
1563 M_PREPEND(m, IPOIB_HEADER_LEN, M_NOWAIT);
1564 if (m == NULL) {
1565 error = ENOBUFS;
1566 goto bad;
1567 }
1568 eh = mtod(m, struct ipoib_header *);
1569 (void)memcpy(&eh->proto, &type, sizeof(eh->proto));
1570 (void)memcpy(&eh->hwaddr, edst, sizeof (edst));
1571
1572 /*
1573 * Queue message on interface, update output statistics if
1574 * successful, and start output if interface not yet active.
1575 */
1576 return ((ifp->if_transmit)(ifp, m));
1577 bad:
1578 if (m != NULL)
1579 m_freem(m);
1580 return (error);
1581 }
1582
1583 /*
1584 * Upper layer processing for a received Infiniband packet.
1585 */
1586 void
ipoib_demux(struct ifnet * ifp,struct mbuf * m,u_short proto)1587 ipoib_demux(struct ifnet *ifp, struct mbuf *m, u_short proto)
1588 {
1589 int isr;
1590
1591 #ifdef MAC
1592 /*
1593 * Tag the mbuf with an appropriate MAC label before any other
1594 * consumers can get to it.
1595 */
1596 mac_ifnet_create_mbuf(ifp, m);
1597 #endif
1598 /* Allow monitor mode to claim this frame, after stats are updated. */
1599 if (ifp->if_flags & IFF_MONITOR) {
1600 if_printf(ifp, "discard frame at IFF_MONITOR\n");
1601 m_freem(m);
1602 return;
1603 }
1604 /* Direct packet to correct FIB based on interface config */
1605 M_SETFIB(m, ifp->if_fib);
1606 /*
1607 * Dispatch frame to upper layer.
1608 */
1609 switch (proto) {
1610 #ifdef INET
1611 case ETHERTYPE_IP:
1612 isr = NETISR_IP;
1613 break;
1614
1615 case ETHERTYPE_ARP:
1616 if (ifp->if_flags & IFF_NOARP) {
1617 /* Discard packet if ARP is disabled on interface */
1618 m_freem(m);
1619 return;
1620 }
1621 isr = NETISR_ARP;
1622 break;
1623 #endif
1624 #ifdef INET6
1625 case ETHERTYPE_IPV6:
1626 isr = NETISR_IPV6;
1627 break;
1628 #endif
1629 default:
1630 goto discard;
1631 }
1632 netisr_dispatch(isr, m);
1633 return;
1634
1635 discard:
1636 m_freem(m);
1637 }
1638
1639 /*
1640 * Process a received Infiniband packet.
1641 */
1642 static void
ipoib_input(struct ifnet * ifp,struct mbuf * m)1643 ipoib_input(struct ifnet *ifp, struct mbuf *m)
1644 {
1645 struct ipoib_header *eh;
1646
1647 if ((ifp->if_flags & IFF_UP) == 0) {
1648 m_freem(m);
1649 return;
1650 }
1651 CURVNET_SET_QUIET(ifp->if_vnet);
1652
1653 /* Let BPF have it before we strip the header. */
1654 IPOIB_MTAP(ifp, m);
1655 eh = mtod(m, struct ipoib_header *);
1656 /*
1657 * Reset layer specific mbuf flags to avoid confusing upper layers.
1658 * Strip off Infiniband header.
1659 */
1660 m->m_flags &= ~M_VLANTAG;
1661 m_clrprotoflags(m);
1662 m_adj(m, IPOIB_HEADER_LEN);
1663
1664 if (IPOIB_IS_MULTICAST(eh->hwaddr)) {
1665 if (memcmp(eh->hwaddr, ifp->if_broadcastaddr,
1666 ifp->if_addrlen) == 0)
1667 m->m_flags |= M_BCAST;
1668 else
1669 m->m_flags |= M_MCAST;
1670 if_inc_counter(ifp, IFCOUNTER_IMCASTS, 1);
1671 }
1672
1673 ipoib_demux(ifp, m, ntohs(eh->proto));
1674 CURVNET_RESTORE();
1675 }
1676
1677 static int
ipoib_resolvemulti(struct ifnet * ifp,struct sockaddr ** llsa,struct sockaddr * sa)1678 ipoib_resolvemulti(struct ifnet *ifp, struct sockaddr **llsa,
1679 struct sockaddr *sa)
1680 {
1681 struct sockaddr_dl *sdl;
1682 #ifdef INET
1683 struct sockaddr_in *sin;
1684 #endif
1685 #ifdef INET6
1686 struct sockaddr_in6 *sin6;
1687 #endif
1688 u_char *e_addr;
1689
1690 switch(sa->sa_family) {
1691 case AF_LINK:
1692 /*
1693 * No mapping needed. Just check that it's a valid MC address.
1694 */
1695 sdl = (struct sockaddr_dl *)sa;
1696 e_addr = LLADDR(sdl);
1697 if (!IPOIB_IS_MULTICAST(e_addr))
1698 return EADDRNOTAVAIL;
1699 *llsa = NULL;
1700 return 0;
1701
1702 #ifdef INET
1703 case AF_INET:
1704 sin = (struct sockaddr_in *)sa;
1705 if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
1706 return EADDRNOTAVAIL;
1707 sdl = link_init_sdl(ifp, *llsa, IFT_INFINIBAND);
1708 sdl->sdl_alen = INFINIBAND_ALEN;
1709 e_addr = LLADDR(sdl);
1710 ip_ib_mc_map(sin->sin_addr.s_addr, ifp->if_broadcastaddr,
1711 e_addr);
1712 *llsa = (struct sockaddr *)sdl;
1713 return 0;
1714 #endif
1715 #ifdef INET6
1716 case AF_INET6:
1717 sin6 = (struct sockaddr_in6 *)sa;
1718 /*
1719 * An IP6 address of 0 means listen to all
1720 * of the multicast address used for IP6.
1721 * This has no meaning in ipoib.
1722 */
1723 if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
1724 return EADDRNOTAVAIL;
1725 if (!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr))
1726 return EADDRNOTAVAIL;
1727 sdl = link_init_sdl(ifp, *llsa, IFT_INFINIBAND);
1728 sdl->sdl_alen = INFINIBAND_ALEN;
1729 e_addr = LLADDR(sdl);
1730 ipv6_ib_mc_map(&sin6->sin6_addr, ifp->if_broadcastaddr, e_addr);
1731 *llsa = (struct sockaddr *)sdl;
1732 return 0;
1733 #endif
1734
1735 default:
1736 return EAFNOSUPPORT;
1737 }
1738 }
1739
/* Hook module init and exit into the module framework, both at SI_ORDER_FIFTH. */
module_init_order(ipoib_init_module, SI_ORDER_FIFTH);
module_exit_order(ipoib_cleanup_module, SI_ORDER_FIFTH);
1742
1743 static int
ipoib_evhand(module_t mod,int event,void * arg)1744 ipoib_evhand(module_t mod, int event, void *arg)
1745 {
1746 return (0);
1747 }
1748
1749 static moduledata_t ipoib_mod = {
1750 .name = "ipoib",
1751 .evhand = ipoib_evhand,
1752 };
1753
1754 DECLARE_MODULE(ipoib, ipoib_mod, SI_SUB_LAST, SI_ORDER_ANY);
1755 MODULE_DEPEND(ipoib, ibcore, 1, 1, 1);
1756 MODULE_DEPEND(ipoib, linuxkpi, 1, 1, 1);
1757