1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3 *
4 * Copyright (c) 2011 NetApp, Inc.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 *
28 * $FreeBSD$
29 */
30
31 #include <sys/cdefs.h>
32 __FBSDID("$FreeBSD$");
33
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/kernel.h>
37 #include <sys/malloc.h>
38 #include <sys/module.h>
39 #include <sys/bus.h>
40 #include <sys/pciio.h>
41 #include <sys/rman.h>
42 #include <sys/smp.h>
43 #include <sys/sysctl.h>
44
45 #include <dev/pci/pcivar.h>
46 #include <dev/pci/pcireg.h>
47
48 #include <machine/resource.h>
49
50 #include <machine/vmm.h>
51 #include <machine/vmm_dev.h>
52
53 #include "vmm_lapic.h"
54 #include "vmm_ktr.h"
55
56 #include "iommu.h"
57 #include "ppt.h"
58
59 /* XXX locking */
60
61 #define MAX_MSIMSGS 32
62
63 /*
64 * If the MSI-X table is located in the middle of a BAR then that MMIO
65 * region gets split into two segments - one segment above the MSI-X table
66 * and the other segment below the MSI-X table - with a hole in place of
67 * the MSI-X table so accesses to it can be trapped and emulated.
68 *
69 * So, allocate a MMIO segment for each BAR register + 1 additional segment.
70 */
71 #define MAX_MMIOSEGS ((PCIR_MAX_BAR_0 + 1) + 1)
72
73 MALLOC_DEFINE(M_PPTMSIX, "pptmsix", "Passthru MSI-X resources");
74
/*
 * Argument handed to pptintr() for a single interrupt vector.
 */
struct pptintr_arg {
	struct pptdev	*pptdev;	/* device the vector belongs to */
	uint64_t	addr;		/* forwarded to lapic_intr_msi() */
	uint64_t	msg_data;	/* forwarded to lapic_intr_msi() */
};
80
81 struct pptseg {
82 vm_paddr_t gpa;
83 size_t len;
84 int wired;
85 };
86
87 struct pptdev {
88 device_t dev;
89 struct vm *vm; /* owner of this device */
90 TAILQ_ENTRY(pptdev) next;
91 struct pptseg mmio[MAX_MMIOSEGS];
92 struct {
93 int num_msgs; /* guest state */
94
95 int startrid; /* host state */
96 struct resource *res[MAX_MSIMSGS];
97 void *cookie[MAX_MSIMSGS];
98 struct pptintr_arg arg[MAX_MSIMSGS];
99 } msi;
100
101 struct {
102 int num_msgs;
103 int startrid;
104 int msix_table_rid;
105 int msix_pba_rid;
106 struct resource *msix_table_res;
107 struct resource *msix_pba_res;
108 struct resource **res;
109 void **cookie;
110 struct pptintr_arg *arg;
111 } msix;
112 };
113
114 SYSCTL_DECL(_hw_vmm);
115 SYSCTL_NODE(_hw_vmm, OID_AUTO, ppt, CTLFLAG_RW, 0, "bhyve passthru devices");
116
117 static int num_pptdevs;
118 SYSCTL_INT(_hw_vmm_ppt, OID_AUTO, devices, CTLFLAG_RD, &num_pptdevs, 0,
119 "number of pci passthru devices");
120
121 static TAILQ_HEAD(, pptdev) pptdev_list = TAILQ_HEAD_INITIALIZER(pptdev_list);
122
123 static int
ppt_probe(device_t dev)124 ppt_probe(device_t dev)
125 {
126 int bus, slot, func;
127 struct pci_devinfo *dinfo;
128
129 dinfo = (struct pci_devinfo *)device_get_ivars(dev);
130
131 bus = pci_get_bus(dev);
132 slot = pci_get_slot(dev);
133 func = pci_get_function(dev);
134
135 /*
136 * To qualify as a pci passthrough device a device must:
137 * - be allowed by administrator to be used in this role
138 * - be an endpoint device
139 */
140 if ((dinfo->cfg.hdrtype & PCIM_HDRTYPE) != PCIM_HDRTYPE_NORMAL)
141 return (ENXIO);
142 else if (vmm_is_pptdev(bus, slot, func))
143 return (0);
144 else
145 /*
146 * Returning BUS_PROBE_NOWILDCARD here matches devices that the
147 * SR-IOV infrastructure specified as "ppt" passthrough devices.
148 * All normal devices that did not have "ppt" specified as their
149 * driver will not be matched by this.
150 */
151 return (BUS_PROBE_NOWILDCARD);
152 }
153
154 static int
ppt_attach(device_t dev)155 ppt_attach(device_t dev)
156 {
157 struct pptdev *ppt;
158
159 ppt = device_get_softc(dev);
160
161 iommu_remove_device(iommu_host_domain(), pci_get_rid(dev));
162 num_pptdevs++;
163 TAILQ_INSERT_TAIL(&pptdev_list, ppt, next);
164 ppt->dev = dev;
165
166 if (bootverbose)
167 device_printf(dev, "attached\n");
168
169 return (0);
170 }
171
172 static int
ppt_detach(device_t dev)173 ppt_detach(device_t dev)
174 {
175 struct pptdev *ppt;
176
177 ppt = device_get_softc(dev);
178
179 if (ppt->vm != NULL)
180 return (EBUSY);
181 num_pptdevs--;
182 TAILQ_REMOVE(&pptdev_list, ppt, next);
183 pci_disable_busmaster(dev);
184 iommu_add_device(iommu_host_domain(), pci_get_rid(dev));
185
186 return (0);
187 }
188
189 static device_method_t ppt_methods[] = {
190 /* Device interface */
191 DEVMETHOD(device_probe, ppt_probe),
192 DEVMETHOD(device_attach, ppt_attach),
193 DEVMETHOD(device_detach, ppt_detach),
194 {0, 0}
195 };
196
197 static devclass_t ppt_devclass;
198 DEFINE_CLASS_0(ppt, ppt_driver, ppt_methods, sizeof(struct pptdev));
199 DRIVER_MODULE(ppt, pci, ppt_driver, ppt_devclass, NULL, NULL);
200
201 static struct pptdev *
ppt_find(int bus,int slot,int func)202 ppt_find(int bus, int slot, int func)
203 {
204 device_t dev;
205 struct pptdev *ppt;
206 int b, s, f;
207
208 TAILQ_FOREACH(ppt, &pptdev_list, next) {
209 dev = ppt->dev;
210 b = pci_get_bus(dev);
211 s = pci_get_slot(dev);
212 f = pci_get_function(dev);
213 if (bus == b && slot == s && func == f)
214 return (ppt);
215 }
216 return (NULL);
217 }
218
219 static void
ppt_unmap_mmio(struct vm * vm,struct pptdev * ppt)220 ppt_unmap_mmio(struct vm *vm, struct pptdev *ppt)
221 {
222 int i;
223 struct pptseg *seg;
224
225 for (i = 0; i < MAX_MMIOSEGS; i++) {
226 seg = &ppt->mmio[i];
227 if (seg->len == 0)
228 continue;
229 (void)vm_unmap_mmio(vm, seg->gpa, seg->len);
230 bzero(seg, sizeof(struct pptseg));
231 }
232 }
233
234 static void
ppt_teardown_msi(struct pptdev * ppt)235 ppt_teardown_msi(struct pptdev *ppt)
236 {
237 int i, rid;
238 void *cookie;
239 struct resource *res;
240
241 if (ppt->msi.num_msgs == 0)
242 return;
243
244 for (i = 0; i < ppt->msi.num_msgs; i++) {
245 rid = ppt->msi.startrid + i;
246 res = ppt->msi.res[i];
247 cookie = ppt->msi.cookie[i];
248
249 if (cookie != NULL)
250 bus_teardown_intr(ppt->dev, res, cookie);
251
252 if (res != NULL)
253 bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, res);
254
255 ppt->msi.res[i] = NULL;
256 ppt->msi.cookie[i] = NULL;
257 }
258
259 if (ppt->msi.startrid == 1)
260 pci_release_msi(ppt->dev);
261
262 ppt->msi.num_msgs = 0;
263 }
264
265 static void
ppt_teardown_msix_intr(struct pptdev * ppt,int idx)266 ppt_teardown_msix_intr(struct pptdev *ppt, int idx)
267 {
268 int rid;
269 struct resource *res;
270 void *cookie;
271
272 rid = ppt->msix.startrid + idx;
273 res = ppt->msix.res[idx];
274 cookie = ppt->msix.cookie[idx];
275
276 if (cookie != NULL)
277 bus_teardown_intr(ppt->dev, res, cookie);
278
279 if (res != NULL)
280 bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, res);
281
282 ppt->msix.res[idx] = NULL;
283 ppt->msix.cookie[idx] = NULL;
284 }
285
286 static void
ppt_teardown_msix(struct pptdev * ppt)287 ppt_teardown_msix(struct pptdev *ppt)
288 {
289 int i;
290
291 if (ppt->msix.num_msgs == 0)
292 return;
293
294 for (i = 0; i < ppt->msix.num_msgs; i++)
295 ppt_teardown_msix_intr(ppt, i);
296
297 free(ppt->msix.res, M_PPTMSIX);
298 free(ppt->msix.cookie, M_PPTMSIX);
299 free(ppt->msix.arg, M_PPTMSIX);
300
301 pci_release_msi(ppt->dev);
302
303 if (ppt->msix.msix_table_res) {
304 bus_release_resource(ppt->dev, SYS_RES_MEMORY,
305 ppt->msix.msix_table_rid,
306 ppt->msix.msix_table_res);
307 ppt->msix.msix_table_res = NULL;
308 ppt->msix.msix_table_rid = 0;
309 }
310 if (ppt->msix.msix_pba_res) {
311 bus_release_resource(ppt->dev, SYS_RES_MEMORY,
312 ppt->msix.msix_pba_rid,
313 ppt->msix.msix_pba_res);
314 ppt->msix.msix_pba_res = NULL;
315 ppt->msix.msix_pba_rid = 0;
316 }
317
318 ppt->msix.num_msgs = 0;
319 }
320
321 int
ppt_avail_devices(void)322 ppt_avail_devices(void)
323 {
324
325 return (num_pptdevs);
326 }
327
328 int
ppt_assigned_devices(struct vm * vm)329 ppt_assigned_devices(struct vm *vm)
330 {
331 struct pptdev *ppt;
332 int num;
333
334 num = 0;
335 TAILQ_FOREACH(ppt, &pptdev_list, next) {
336 if (ppt->vm == vm)
337 num++;
338 }
339 return (num);
340 }
341
342 bool
ppt_is_mmio(struct vm * vm,vm_paddr_t gpa)343 ppt_is_mmio(struct vm *vm, vm_paddr_t gpa)
344 {
345 int i;
346 struct pptdev *ppt;
347 struct pptseg *seg;
348
349 TAILQ_FOREACH(ppt, &pptdev_list, next) {
350 if (ppt->vm != vm)
351 continue;
352
353 for (i = 0; i < MAX_MMIOSEGS; i++) {
354 seg = &ppt->mmio[i];
355 if (seg->len == 0)
356 continue;
357 if (gpa >= seg->gpa && gpa < seg->gpa + seg->len)
358 return (true);
359 }
360 }
361
362 return (false);
363 }
364
365 static void
ppt_pci_reset(device_t dev)366 ppt_pci_reset(device_t dev)
367 {
368
369 if (pcie_flr(dev,
370 max(pcie_get_max_completion_timeout(dev) / 1000, 10), true))
371 return;
372
373 pci_power_reset(dev);
374 }
375
376 int
ppt_assign_device(struct vm * vm,int bus,int slot,int func)377 ppt_assign_device(struct vm *vm, int bus, int slot, int func)
378 {
379 struct pptdev *ppt;
380
381 ppt = ppt_find(bus, slot, func);
382 if (ppt != NULL) {
383 /*
384 * If this device is owned by a different VM then we
385 * cannot change its owner.
386 */
387 if (ppt->vm != NULL && ppt->vm != vm)
388 return (EBUSY);
389
390 pci_save_state(ppt->dev);
391 ppt_pci_reset(ppt->dev);
392 pci_restore_state(ppt->dev);
393 ppt->vm = vm;
394 iommu_add_device(vm_iommu_domain(vm), pci_get_rid(ppt->dev));
395 return (0);
396 }
397 return (ENOENT);
398 }
399
400 int
ppt_unassign_device(struct vm * vm,int bus,int slot,int func)401 ppt_unassign_device(struct vm *vm, int bus, int slot, int func)
402 {
403 struct pptdev *ppt;
404
405 ppt = ppt_find(bus, slot, func);
406 if (ppt != NULL) {
407 /*
408 * If this device is not owned by this 'vm' then bail out.
409 */
410 if (ppt->vm != vm)
411 return (EBUSY);
412
413 pci_save_state(ppt->dev);
414 ppt_pci_reset(ppt->dev);
415 pci_restore_state(ppt->dev);
416 ppt_unmap_mmio(vm, ppt);
417 ppt_teardown_msi(ppt);
418 ppt_teardown_msix(ppt);
419 iommu_remove_device(vm_iommu_domain(vm), pci_get_rid(ppt->dev));
420 ppt->vm = NULL;
421 return (0);
422 }
423 return (ENOENT);
424 }
425
426 int
ppt_unassign_all(struct vm * vm)427 ppt_unassign_all(struct vm *vm)
428 {
429 struct pptdev *ppt;
430 int bus, slot, func;
431 device_t dev;
432
433 TAILQ_FOREACH(ppt, &pptdev_list, next) {
434 if (ppt->vm == vm) {
435 dev = ppt->dev;
436 bus = pci_get_bus(dev);
437 slot = pci_get_slot(dev);
438 func = pci_get_function(dev);
439 vm_unassign_pptdev(vm, bus, slot, func);
440 }
441 }
442
443 return (0);
444 }
445
446 int
ppt_map_mmio(struct vm * vm,int bus,int slot,int func,vm_paddr_t gpa,size_t len,vm_paddr_t hpa)447 ppt_map_mmio(struct vm *vm, int bus, int slot, int func,
448 vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
449 {
450 int i, error;
451 struct pptseg *seg;
452 struct pptdev *ppt;
453
454 ppt = ppt_find(bus, slot, func);
455 if (ppt != NULL) {
456 if (ppt->vm != vm)
457 return (EBUSY);
458
459 for (i = 0; i < MAX_MMIOSEGS; i++) {
460 seg = &ppt->mmio[i];
461 if (seg->len == 0) {
462 error = vm_map_mmio(vm, gpa, len, hpa);
463 if (error == 0) {
464 seg->gpa = gpa;
465 seg->len = len;
466 }
467 return (error);
468 }
469 }
470 return (ENOSPC);
471 }
472 return (ENOENT);
473 }
474
475 static int
pptintr(void * arg)476 pptintr(void *arg)
477 {
478 struct pptdev *ppt;
479 struct pptintr_arg *pptarg;
480
481 pptarg = arg;
482 ppt = pptarg->pptdev;
483
484 if (ppt->vm != NULL)
485 lapic_intr_msi(ppt->vm, pptarg->addr, pptarg->msg_data);
486 else {
487 /*
488 * XXX
489 * This is not expected to happen - panic?
490 */
491 }
492
493 /*
494 * For legacy interrupts give other filters a chance in case
495 * the interrupt was not generated by the passthrough device.
496 */
497 if (ppt->msi.startrid == 0)
498 return (FILTER_STRAY);
499 else
500 return (FILTER_HANDLED);
501 }
502
503 int
ppt_setup_msi(struct vm * vm,int vcpu,int bus,int slot,int func,uint64_t addr,uint64_t msg,int numvec)504 ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func,
505 uint64_t addr, uint64_t msg, int numvec)
506 {
507 int i, rid, flags;
508 int msi_count, startrid, error, tmp;
509 struct pptdev *ppt;
510
511 if (numvec < 0 || numvec > MAX_MSIMSGS)
512 return (EINVAL);
513
514 ppt = ppt_find(bus, slot, func);
515 if (ppt == NULL)
516 return (ENOENT);
517 if (ppt->vm != vm) /* Make sure we own this device */
518 return (EBUSY);
519
520 /* Free any allocated resources */
521 ppt_teardown_msi(ppt);
522
523 if (numvec == 0) /* nothing more to do */
524 return (0);
525
526 flags = RF_ACTIVE;
527 msi_count = pci_msi_count(ppt->dev);
528 if (msi_count == 0) {
529 startrid = 0; /* legacy interrupt */
530 msi_count = 1;
531 flags |= RF_SHAREABLE;
532 } else
533 startrid = 1; /* MSI */
534
535 /*
536 * The device must be capable of supporting the number of vectors
537 * the guest wants to allocate.
538 */
539 if (numvec > msi_count)
540 return (EINVAL);
541
542 /*
543 * Make sure that we can allocate all the MSI vectors that are needed
544 * by the guest.
545 */
546 if (startrid == 1) {
547 tmp = numvec;
548 error = pci_alloc_msi(ppt->dev, &tmp);
549 if (error)
550 return (error);
551 else if (tmp != numvec) {
552 pci_release_msi(ppt->dev);
553 return (ENOSPC);
554 } else {
555 /* success */
556 }
557 }
558
559 ppt->msi.startrid = startrid;
560
561 /*
562 * Allocate the irq resource and attach it to the interrupt handler.
563 */
564 for (i = 0; i < numvec; i++) {
565 ppt->msi.num_msgs = i + 1;
566 ppt->msi.cookie[i] = NULL;
567
568 rid = startrid + i;
569 ppt->msi.res[i] = bus_alloc_resource_any(ppt->dev, SYS_RES_IRQ,
570 &rid, flags);
571 if (ppt->msi.res[i] == NULL)
572 break;
573
574 ppt->msi.arg[i].pptdev = ppt;
575 ppt->msi.arg[i].addr = addr;
576 ppt->msi.arg[i].msg_data = msg + i;
577
578 error = bus_setup_intr(ppt->dev, ppt->msi.res[i],
579 INTR_TYPE_NET | INTR_MPSAFE,
580 pptintr, NULL, &ppt->msi.arg[i],
581 &ppt->msi.cookie[i]);
582 if (error != 0)
583 break;
584 }
585
586 if (i < numvec) {
587 ppt_teardown_msi(ppt);
588 return (ENXIO);
589 }
590
591 return (0);
592 }
593
594 int
ppt_setup_msix(struct vm * vm,int vcpu,int bus,int slot,int func,int idx,uint64_t addr,uint64_t msg,uint32_t vector_control)595 ppt_setup_msix(struct vm *vm, int vcpu, int bus, int slot, int func,
596 int idx, uint64_t addr, uint64_t msg, uint32_t vector_control)
597 {
598 struct pptdev *ppt;
599 struct pci_devinfo *dinfo;
600 int numvec, alloced, rid, error;
601 size_t res_size, cookie_size, arg_size;
602
603 ppt = ppt_find(bus, slot, func);
604 if (ppt == NULL)
605 return (ENOENT);
606 if (ppt->vm != vm) /* Make sure we own this device */
607 return (EBUSY);
608
609 dinfo = device_get_ivars(ppt->dev);
610 if (!dinfo)
611 return (ENXIO);
612
613 /*
614 * First-time configuration:
615 * Allocate the MSI-X table
616 * Allocate the IRQ resources
617 * Set up some variables in ppt->msix
618 */
619 if (ppt->msix.num_msgs == 0) {
620 numvec = pci_msix_count(ppt->dev);
621 if (numvec <= 0)
622 return (EINVAL);
623
624 ppt->msix.startrid = 1;
625 ppt->msix.num_msgs = numvec;
626
627 res_size = numvec * sizeof(ppt->msix.res[0]);
628 cookie_size = numvec * sizeof(ppt->msix.cookie[0]);
629 arg_size = numvec * sizeof(ppt->msix.arg[0]);
630
631 ppt->msix.res = malloc(res_size, M_PPTMSIX, M_WAITOK | M_ZERO);
632 ppt->msix.cookie = malloc(cookie_size, M_PPTMSIX,
633 M_WAITOK | M_ZERO);
634 ppt->msix.arg = malloc(arg_size, M_PPTMSIX, M_WAITOK | M_ZERO);
635
636 rid = dinfo->cfg.msix.msix_table_bar;
637 ppt->msix.msix_table_res = bus_alloc_resource_any(ppt->dev,
638 SYS_RES_MEMORY, &rid, RF_ACTIVE);
639
640 if (ppt->msix.msix_table_res == NULL) {
641 ppt_teardown_msix(ppt);
642 return (ENOSPC);
643 }
644 ppt->msix.msix_table_rid = rid;
645
646 if (dinfo->cfg.msix.msix_table_bar !=
647 dinfo->cfg.msix.msix_pba_bar) {
648 rid = dinfo->cfg.msix.msix_pba_bar;
649 ppt->msix.msix_pba_res = bus_alloc_resource_any(
650 ppt->dev, SYS_RES_MEMORY, &rid, RF_ACTIVE);
651
652 if (ppt->msix.msix_pba_res == NULL) {
653 ppt_teardown_msix(ppt);
654 return (ENOSPC);
655 }
656 ppt->msix.msix_pba_rid = rid;
657 }
658
659 alloced = numvec;
660 error = pci_alloc_msix(ppt->dev, &alloced);
661 if (error || alloced != numvec) {
662 ppt_teardown_msix(ppt);
663 return (error == 0 ? ENOSPC: error);
664 }
665 }
666
667 if ((vector_control & PCIM_MSIX_VCTRL_MASK) == 0) {
668 /* Tear down the IRQ if it's already set up */
669 ppt_teardown_msix_intr(ppt, idx);
670
671 /* Allocate the IRQ resource */
672 ppt->msix.cookie[idx] = NULL;
673 rid = ppt->msix.startrid + idx;
674 ppt->msix.res[idx] = bus_alloc_resource_any(ppt->dev, SYS_RES_IRQ,
675 &rid, RF_ACTIVE);
676 if (ppt->msix.res[idx] == NULL)
677 return (ENXIO);
678
679 ppt->msix.arg[idx].pptdev = ppt;
680 ppt->msix.arg[idx].addr = addr;
681 ppt->msix.arg[idx].msg_data = msg;
682
683 /* Setup the MSI-X interrupt */
684 error = bus_setup_intr(ppt->dev, ppt->msix.res[idx],
685 INTR_TYPE_NET | INTR_MPSAFE,
686 pptintr, NULL, &ppt->msix.arg[idx],
687 &ppt->msix.cookie[idx]);
688
689 if (error != 0) {
690 bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, ppt->msix.res[idx]);
691 ppt->msix.cookie[idx] = NULL;
692 ppt->msix.res[idx] = NULL;
693 return (ENXIO);
694 }
695 } else {
696 /* Masked, tear it down if it's already been set up */
697 ppt_teardown_msix_intr(ppt, idx);
698 }
699
700 return (0);
701 }
702