1 /*-
2 * Copyright (c) 2011 NetApp, Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 *
26 * $FreeBSD$
27 */
28
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31
32 #include <sys/param.h>
33 #include <sys/systm.h>
34 #include <sys/kernel.h>
35 #include <sys/malloc.h>
36 #include <sys/module.h>
37 #include <sys/bus.h>
38 #include <sys/pciio.h>
39 #include <sys/rman.h>
40 #include <sys/smp.h>
41 #include <sys/sysctl.h>
42
43 #include <dev/pci/pcivar.h>
44 #include <dev/pci/pcireg.h>
45
46 #include <machine/resource.h>
47
48 #include <machine/vmm.h>
49 #include <machine/vmm_dev.h>
50
51 #include "vmm_lapic.h"
52 #include "vmm_ktr.h"
53
54 #include "iommu.h"
55 #include "ppt.h"
56
57 /* XXX locking */
58
/*
 * Capacity of the fixed per-device MSI bookkeeping arrays in struct pptdev
 * (msi.res, msi.cookie and msi.arg).  ppt_setup_msi() rejects requests for
 * more vectors than this.
 */
#define MAX_MSIMSGS 32

/*
 * If the MSI-X table is located in the middle of a BAR then that MMIO
 * region gets split into two segments - one segment above the MSI-X table
 * and the other segment below the MSI-X table - with a hole in place of
 * the MSI-X table so accesses to it can be trapped and emulated.
 *
 * So, allocate a MMIO segment for each BAR register + 1 additional segment.
 */
#define MAX_MMIOSEGS ((PCIR_MAX_BAR_0 + 1) + 1)

/*
 * Malloc type for the per-vector MSI-X arrays, which are sized at run time
 * in ppt_setup_msix() and freed in ppt_teardown_msix().
 */
MALLOC_DEFINE(M_PPTMSIX, "pptmsix", "Passthru MSI-X resources");
72
/*
 * Argument registered with each interrupt handler.  pptintr() uses it to
 * find the passthru device and the MSI address/data pair that it forwards
 * to the owning VM through lapic_intr_msi().
 */
struct pptintr_arg { /* pptintr(pptintr_arg) */
	struct pptdev *pptdev;	/* device this vector belongs to */
	uint64_t addr;		/* address argument for lapic_intr_msi() */
	uint64_t msg_data;	/* data argument for lapic_intr_msi() */
};
78
/*
 * One MMIO range of a passthru device that has been mapped into a guest.
 * A slot whose len is 0 is treated as unused everywhere in this file.
 */
struct pptseg {
	vm_paddr_t gpa;		/* guest physical address of the mapping */
	size_t len;		/* length of the mapping; 0 marks a free slot */
	int wired;		/* not referenced by the code visible here */
};
84
/*
 * Softc of a passthru device.  One instance exists per device attached to
 * the ppt driver and is linked on pptdev_list while attached.
 */
struct pptdev {
	device_t dev;			/* set in ppt_attach() */
	struct vm *vm; /* owner of this device */
	TAILQ_ENTRY(pptdev) next;	/* linkage on pptdev_list */
	struct pptseg mmio[MAX_MMIOSEGS]; /* guest mappings, see ppt_map_mmio() */
	/* Legacy/MSI interrupt state, managed by ppt_setup_msi(). */
	struct {
		int num_msgs; /* guest state */

		int startrid; /* host state */
		struct resource *res[MAX_MSIMSGS];	/* IRQ resource per vector */
		void *cookie[MAX_MSIMSGS];		/* bus_setup_intr() cookie */
		struct pptintr_arg arg[MAX_MSIMSGS];	/* handler argument */
	} msi;

	/*
	 * MSI-X interrupt state, managed by ppt_setup_msix().  The res,
	 * cookie and arg arrays are allocated with num_msgs entries each.
	 */
	struct {
		int num_msgs;		/* 0 until first-time configuration */
		int startrid;		/* rid of vector 0 */
		int msix_table_rid;	/* rid of the BAR holding the table */
		int msix_pba_rid;	/* rid of the PBA BAR, if distinct */
		struct resource *msix_table_res;
		struct resource *msix_pba_res;	/* NULL when PBA shares the table BAR */
		struct resource **res;		/* IRQ resource per vector */
		void **cookie;			/* bus_setup_intr() cookie per vector */
		struct pptintr_arg *arg;	/* handler argument per vector */
	} msix;
};
111
SYSCTL_DECL(_hw_vmm);
/* Sysctl subtree for passthru device information. */
SYSCTL_NODE(_hw_vmm, OID_AUTO, ppt, CTLFLAG_RW, 0, "bhyve passthru devices");

/*
 * Number of devices currently attached to this driver; incremented in
 * ppt_attach(), decremented in ppt_detach() and exported read-only.
 */
static int num_pptdevs;
SYSCTL_INT(_hw_vmm_ppt, OID_AUTO, devices, CTLFLAG_RD, &num_pptdevs, 0,
    "number of pci passthru devices");

/* All attached passthru devices, in attach order. */
static TAILQ_HEAD(, pptdev) pptdev_list = TAILQ_HEAD_INITIALIZER(pptdev_list);
120
121 static int
ppt_probe(device_t dev)122 ppt_probe(device_t dev)
123 {
124 int bus, slot, func;
125 struct pci_devinfo *dinfo;
126
127 dinfo = (struct pci_devinfo *)device_get_ivars(dev);
128
129 bus = pci_get_bus(dev);
130 slot = pci_get_slot(dev);
131 func = pci_get_function(dev);
132
133 /*
134 * To qualify as a pci passthrough device a device must:
135 * - be allowed by administrator to be used in this role
136 * - be an endpoint device
137 */
138 if ((dinfo->cfg.hdrtype & PCIM_HDRTYPE) != PCIM_HDRTYPE_NORMAL)
139 return (ENXIO);
140 else if (vmm_is_pptdev(bus, slot, func))
141 return (0);
142 else
143 /*
144 * Returning BUS_PROBE_NOWILDCARD here matches devices that the
145 * SR-IOV infrastructure specified as "ppt" passthrough devices.
146 * All normal devices that did not have "ppt" specified as their
147 * driver will not be matched by this.
148 */
149 return (BUS_PROBE_NOWILDCARD);
150 }
151
152 static int
ppt_attach(device_t dev)153 ppt_attach(device_t dev)
154 {
155 struct pptdev *ppt;
156
157 ppt = device_get_softc(dev);
158
159 iommu_remove_device(iommu_host_domain(), pci_get_rid(dev));
160 num_pptdevs++;
161 TAILQ_INSERT_TAIL(&pptdev_list, ppt, next);
162 ppt->dev = dev;
163
164 if (bootverbose)
165 device_printf(dev, "attached\n");
166
167 return (0);
168 }
169
170 static int
ppt_detach(device_t dev)171 ppt_detach(device_t dev)
172 {
173 struct pptdev *ppt;
174
175 ppt = device_get_softc(dev);
176
177 if (ppt->vm != NULL)
178 return (EBUSY);
179 num_pptdevs--;
180 TAILQ_REMOVE(&pptdev_list, ppt, next);
181 pci_disable_busmaster(dev);
182 iommu_add_device(iommu_host_domain(), pci_get_rid(dev));
183
184 return (0);
185 }
186
187 static device_method_t ppt_methods[] = {
188 /* Device interface */
189 DEVMETHOD(device_probe, ppt_probe),
190 DEVMETHOD(device_attach, ppt_attach),
191 DEVMETHOD(device_detach, ppt_detach),
192 {0, 0}
193 };
194
195 static devclass_t ppt_devclass;
196 DEFINE_CLASS_0(ppt, ppt_driver, ppt_methods, sizeof(struct pptdev));
197 DRIVER_MODULE(ppt, pci, ppt_driver, ppt_devclass, NULL, NULL);
198
199 static struct pptdev *
ppt_find(int bus,int slot,int func)200 ppt_find(int bus, int slot, int func)
201 {
202 device_t dev;
203 struct pptdev *ppt;
204 int b, s, f;
205
206 TAILQ_FOREACH(ppt, &pptdev_list, next) {
207 dev = ppt->dev;
208 b = pci_get_bus(dev);
209 s = pci_get_slot(dev);
210 f = pci_get_function(dev);
211 if (bus == b && slot == s && func == f)
212 return (ppt);
213 }
214 return (NULL);
215 }
216
217 static void
ppt_unmap_mmio(struct vm * vm,struct pptdev * ppt)218 ppt_unmap_mmio(struct vm *vm, struct pptdev *ppt)
219 {
220 int i;
221 struct pptseg *seg;
222
223 for (i = 0; i < MAX_MMIOSEGS; i++) {
224 seg = &ppt->mmio[i];
225 if (seg->len == 0)
226 continue;
227 (void)vm_unmap_mmio(vm, seg->gpa, seg->len);
228 bzero(seg, sizeof(struct pptseg));
229 }
230 }
231
232 static void
ppt_teardown_msi(struct pptdev * ppt)233 ppt_teardown_msi(struct pptdev *ppt)
234 {
235 int i, rid;
236 void *cookie;
237 struct resource *res;
238
239 if (ppt->msi.num_msgs == 0)
240 return;
241
242 for (i = 0; i < ppt->msi.num_msgs; i++) {
243 rid = ppt->msi.startrid + i;
244 res = ppt->msi.res[i];
245 cookie = ppt->msi.cookie[i];
246
247 if (cookie != NULL)
248 bus_teardown_intr(ppt->dev, res, cookie);
249
250 if (res != NULL)
251 bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, res);
252
253 ppt->msi.res[i] = NULL;
254 ppt->msi.cookie[i] = NULL;
255 }
256
257 if (ppt->msi.startrid == 1)
258 pci_release_msi(ppt->dev);
259
260 ppt->msi.num_msgs = 0;
261 }
262
263 static void
ppt_teardown_msix_intr(struct pptdev * ppt,int idx)264 ppt_teardown_msix_intr(struct pptdev *ppt, int idx)
265 {
266 int rid;
267 struct resource *res;
268 void *cookie;
269
270 rid = ppt->msix.startrid + idx;
271 res = ppt->msix.res[idx];
272 cookie = ppt->msix.cookie[idx];
273
274 if (cookie != NULL)
275 bus_teardown_intr(ppt->dev, res, cookie);
276
277 if (res != NULL)
278 bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, res);
279
280 ppt->msix.res[idx] = NULL;
281 ppt->msix.cookie[idx] = NULL;
282 }
283
284 static void
ppt_teardown_msix(struct pptdev * ppt)285 ppt_teardown_msix(struct pptdev *ppt)
286 {
287 int i;
288
289 if (ppt->msix.num_msgs == 0)
290 return;
291
292 for (i = 0; i < ppt->msix.num_msgs; i++)
293 ppt_teardown_msix_intr(ppt, i);
294
295 free(ppt->msix.res, M_PPTMSIX);
296 free(ppt->msix.cookie, M_PPTMSIX);
297 free(ppt->msix.arg, M_PPTMSIX);
298
299 pci_release_msi(ppt->dev);
300
301 if (ppt->msix.msix_table_res) {
302 bus_release_resource(ppt->dev, SYS_RES_MEMORY,
303 ppt->msix.msix_table_rid,
304 ppt->msix.msix_table_res);
305 ppt->msix.msix_table_res = NULL;
306 ppt->msix.msix_table_rid = 0;
307 }
308 if (ppt->msix.msix_pba_res) {
309 bus_release_resource(ppt->dev, SYS_RES_MEMORY,
310 ppt->msix.msix_pba_rid,
311 ppt->msix.msix_pba_res);
312 ppt->msix.msix_pba_res = NULL;
313 ppt->msix.msix_pba_rid = 0;
314 }
315
316 ppt->msix.num_msgs = 0;
317 }
318
319 int
ppt_avail_devices(void)320 ppt_avail_devices(void)
321 {
322
323 return (num_pptdevs);
324 }
325
326 int
ppt_assigned_devices(struct vm * vm)327 ppt_assigned_devices(struct vm *vm)
328 {
329 struct pptdev *ppt;
330 int num;
331
332 num = 0;
333 TAILQ_FOREACH(ppt, &pptdev_list, next) {
334 if (ppt->vm == vm)
335 num++;
336 }
337 return (num);
338 }
339
340 bool
ppt_is_mmio(struct vm * vm,vm_paddr_t gpa)341 ppt_is_mmio(struct vm *vm, vm_paddr_t gpa)
342 {
343 int i;
344 struct pptdev *ppt;
345 struct pptseg *seg;
346
347 TAILQ_FOREACH(ppt, &pptdev_list, next) {
348 if (ppt->vm != vm)
349 continue;
350
351 for (i = 0; i < MAX_MMIOSEGS; i++) {
352 seg = &ppt->mmio[i];
353 if (seg->len == 0)
354 continue;
355 if (gpa >= seg->gpa && gpa < seg->gpa + seg->len)
356 return (true);
357 }
358 }
359
360 return (false);
361 }
362
363 static void
ppt_pci_reset(device_t dev)364 ppt_pci_reset(device_t dev)
365 {
366
367 if (pcie_flr(dev,
368 max(pcie_get_max_completion_timeout(dev) / 1000, 10), true))
369 return;
370
371 pci_power_reset(dev);
372 }
373
374 int
ppt_assign_device(struct vm * vm,int bus,int slot,int func)375 ppt_assign_device(struct vm *vm, int bus, int slot, int func)
376 {
377 struct pptdev *ppt;
378
379 ppt = ppt_find(bus, slot, func);
380 if (ppt != NULL) {
381 /*
382 * If this device is owned by a different VM then we
383 * cannot change its owner.
384 */
385 if (ppt->vm != NULL && ppt->vm != vm)
386 return (EBUSY);
387
388 pci_save_state(ppt->dev);
389 ppt_pci_reset(ppt->dev);
390 pci_restore_state(ppt->dev);
391 ppt->vm = vm;
392 iommu_add_device(vm_iommu_domain(vm), pci_get_rid(ppt->dev));
393 return (0);
394 }
395 return (ENOENT);
396 }
397
398 int
ppt_unassign_device(struct vm * vm,int bus,int slot,int func)399 ppt_unassign_device(struct vm *vm, int bus, int slot, int func)
400 {
401 struct pptdev *ppt;
402
403 ppt = ppt_find(bus, slot, func);
404 if (ppt != NULL) {
405 /*
406 * If this device is not owned by this 'vm' then bail out.
407 */
408 if (ppt->vm != vm)
409 return (EBUSY);
410
411 pci_save_state(ppt->dev);
412 ppt_pci_reset(ppt->dev);
413 pci_restore_state(ppt->dev);
414 ppt_unmap_mmio(vm, ppt);
415 ppt_teardown_msi(ppt);
416 ppt_teardown_msix(ppt);
417 iommu_remove_device(vm_iommu_domain(vm), pci_get_rid(ppt->dev));
418 ppt->vm = NULL;
419 return (0);
420 }
421 return (ENOENT);
422 }
423
424 int
ppt_unassign_all(struct vm * vm)425 ppt_unassign_all(struct vm *vm)
426 {
427 struct pptdev *ppt;
428 int bus, slot, func;
429 device_t dev;
430
431 TAILQ_FOREACH(ppt, &pptdev_list, next) {
432 if (ppt->vm == vm) {
433 dev = ppt->dev;
434 bus = pci_get_bus(dev);
435 slot = pci_get_slot(dev);
436 func = pci_get_function(dev);
437 vm_unassign_pptdev(vm, bus, slot, func);
438 }
439 }
440
441 return (0);
442 }
443
444 int
ppt_map_mmio(struct vm * vm,int bus,int slot,int func,vm_paddr_t gpa,size_t len,vm_paddr_t hpa)445 ppt_map_mmio(struct vm *vm, int bus, int slot, int func,
446 vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
447 {
448 int i, error;
449 struct pptseg *seg;
450 struct pptdev *ppt;
451
452 ppt = ppt_find(bus, slot, func);
453 if (ppt != NULL) {
454 if (ppt->vm != vm)
455 return (EBUSY);
456
457 for (i = 0; i < MAX_MMIOSEGS; i++) {
458 seg = &ppt->mmio[i];
459 if (seg->len == 0) {
460 error = vm_map_mmio(vm, gpa, len, hpa);
461 if (error == 0) {
462 seg->gpa = gpa;
463 seg->len = len;
464 }
465 return (error);
466 }
467 }
468 return (ENOSPC);
469 }
470 return (ENOENT);
471 }
472
473 static int
pptintr(void * arg)474 pptintr(void *arg)
475 {
476 struct pptdev *ppt;
477 struct pptintr_arg *pptarg;
478
479 pptarg = arg;
480 ppt = pptarg->pptdev;
481
482 if (ppt->vm != NULL)
483 lapic_intr_msi(ppt->vm, pptarg->addr, pptarg->msg_data);
484 else {
485 /*
486 * XXX
487 * This is not expected to happen - panic?
488 */
489 }
490
491 /*
492 * For legacy interrupts give other filters a chance in case
493 * the interrupt was not generated by the passthrough device.
494 */
495 if (ppt->msi.startrid == 0)
496 return (FILTER_STRAY);
497 else
498 return (FILTER_HANDLED);
499 }
500
501 int
ppt_setup_msi(struct vm * vm,int vcpu,int bus,int slot,int func,uint64_t addr,uint64_t msg,int numvec)502 ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func,
503 uint64_t addr, uint64_t msg, int numvec)
504 {
505 int i, rid, flags;
506 int msi_count, startrid, error, tmp;
507 struct pptdev *ppt;
508
509 if (numvec < 0 || numvec > MAX_MSIMSGS)
510 return (EINVAL);
511
512 ppt = ppt_find(bus, slot, func);
513 if (ppt == NULL)
514 return (ENOENT);
515 if (ppt->vm != vm) /* Make sure we own this device */
516 return (EBUSY);
517
518 /* Free any allocated resources */
519 ppt_teardown_msi(ppt);
520
521 if (numvec == 0) /* nothing more to do */
522 return (0);
523
524 flags = RF_ACTIVE;
525 msi_count = pci_msi_count(ppt->dev);
526 if (msi_count == 0) {
527 startrid = 0; /* legacy interrupt */
528 msi_count = 1;
529 flags |= RF_SHAREABLE;
530 } else
531 startrid = 1; /* MSI */
532
533 /*
534 * The device must be capable of supporting the number of vectors
535 * the guest wants to allocate.
536 */
537 if (numvec > msi_count)
538 return (EINVAL);
539
540 /*
541 * Make sure that we can allocate all the MSI vectors that are needed
542 * by the guest.
543 */
544 if (startrid == 1) {
545 tmp = numvec;
546 error = pci_alloc_msi(ppt->dev, &tmp);
547 if (error)
548 return (error);
549 else if (tmp != numvec) {
550 pci_release_msi(ppt->dev);
551 return (ENOSPC);
552 } else {
553 /* success */
554 }
555 }
556
557 ppt->msi.startrid = startrid;
558
559 /*
560 * Allocate the irq resource and attach it to the interrupt handler.
561 */
562 for (i = 0; i < numvec; i++) {
563 ppt->msi.num_msgs = i + 1;
564 ppt->msi.cookie[i] = NULL;
565
566 rid = startrid + i;
567 ppt->msi.res[i] = bus_alloc_resource_any(ppt->dev, SYS_RES_IRQ,
568 &rid, flags);
569 if (ppt->msi.res[i] == NULL)
570 break;
571
572 ppt->msi.arg[i].pptdev = ppt;
573 ppt->msi.arg[i].addr = addr;
574 ppt->msi.arg[i].msg_data = msg + i;
575
576 error = bus_setup_intr(ppt->dev, ppt->msi.res[i],
577 INTR_TYPE_NET | INTR_MPSAFE,
578 pptintr, NULL, &ppt->msi.arg[i],
579 &ppt->msi.cookie[i]);
580 if (error != 0)
581 break;
582 }
583
584 if (i < numvec) {
585 ppt_teardown_msi(ppt);
586 return (ENXIO);
587 }
588
589 return (0);
590 }
591
592 int
ppt_setup_msix(struct vm * vm,int vcpu,int bus,int slot,int func,int idx,uint64_t addr,uint64_t msg,uint32_t vector_control)593 ppt_setup_msix(struct vm *vm, int vcpu, int bus, int slot, int func,
594 int idx, uint64_t addr, uint64_t msg, uint32_t vector_control)
595 {
596 struct pptdev *ppt;
597 struct pci_devinfo *dinfo;
598 int numvec, alloced, rid, error;
599 size_t res_size, cookie_size, arg_size;
600
601 ppt = ppt_find(bus, slot, func);
602 if (ppt == NULL)
603 return (ENOENT);
604 if (ppt->vm != vm) /* Make sure we own this device */
605 return (EBUSY);
606
607 dinfo = device_get_ivars(ppt->dev);
608 if (!dinfo)
609 return (ENXIO);
610
611 /*
612 * First-time configuration:
613 * Allocate the MSI-X table
614 * Allocate the IRQ resources
615 * Set up some variables in ppt->msix
616 */
617 if (ppt->msix.num_msgs == 0) {
618 numvec = pci_msix_count(ppt->dev);
619 if (numvec <= 0)
620 return (EINVAL);
621
622 ppt->msix.startrid = 1;
623 ppt->msix.num_msgs = numvec;
624
625 res_size = numvec * sizeof(ppt->msix.res[0]);
626 cookie_size = numvec * sizeof(ppt->msix.cookie[0]);
627 arg_size = numvec * sizeof(ppt->msix.arg[0]);
628
629 ppt->msix.res = malloc(res_size, M_PPTMSIX, M_WAITOK | M_ZERO);
630 ppt->msix.cookie = malloc(cookie_size, M_PPTMSIX,
631 M_WAITOK | M_ZERO);
632 ppt->msix.arg = malloc(arg_size, M_PPTMSIX, M_WAITOK | M_ZERO);
633
634 rid = dinfo->cfg.msix.msix_table_bar;
635 ppt->msix.msix_table_res = bus_alloc_resource_any(ppt->dev,
636 SYS_RES_MEMORY, &rid, RF_ACTIVE);
637
638 if (ppt->msix.msix_table_res == NULL) {
639 ppt_teardown_msix(ppt);
640 return (ENOSPC);
641 }
642 ppt->msix.msix_table_rid = rid;
643
644 if (dinfo->cfg.msix.msix_table_bar !=
645 dinfo->cfg.msix.msix_pba_bar) {
646 rid = dinfo->cfg.msix.msix_pba_bar;
647 ppt->msix.msix_pba_res = bus_alloc_resource_any(
648 ppt->dev, SYS_RES_MEMORY, &rid, RF_ACTIVE);
649
650 if (ppt->msix.msix_pba_res == NULL) {
651 ppt_teardown_msix(ppt);
652 return (ENOSPC);
653 }
654 ppt->msix.msix_pba_rid = rid;
655 }
656
657 alloced = numvec;
658 error = pci_alloc_msix(ppt->dev, &alloced);
659 if (error || alloced != numvec) {
660 ppt_teardown_msix(ppt);
661 return (error == 0 ? ENOSPC: error);
662 }
663 }
664
665 if ((vector_control & PCIM_MSIX_VCTRL_MASK) == 0) {
666 /* Tear down the IRQ if it's already set up */
667 ppt_teardown_msix_intr(ppt, idx);
668
669 /* Allocate the IRQ resource */
670 ppt->msix.cookie[idx] = NULL;
671 rid = ppt->msix.startrid + idx;
672 ppt->msix.res[idx] = bus_alloc_resource_any(ppt->dev, SYS_RES_IRQ,
673 &rid, RF_ACTIVE);
674 if (ppt->msix.res[idx] == NULL)
675 return (ENXIO);
676
677 ppt->msix.arg[idx].pptdev = ppt;
678 ppt->msix.arg[idx].addr = addr;
679 ppt->msix.arg[idx].msg_data = msg;
680
681 /* Setup the MSI-X interrupt */
682 error = bus_setup_intr(ppt->dev, ppt->msix.res[idx],
683 INTR_TYPE_NET | INTR_MPSAFE,
684 pptintr, NULL, &ppt->msix.arg[idx],
685 &ppt->msix.cookie[idx]);
686
687 if (error != 0) {
688 bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, ppt->msix.res[idx]);
689 ppt->msix.cookie[idx] = NULL;
690 ppt->msix.res[idx] = NULL;
691 return (ENXIO);
692 }
693 } else {
694 /* Masked, tear it down if it's already been set up */
695 ppt_teardown_msix_intr(ppt, idx);
696 }
697
698 return (0);
699 }
700