1 /* $NetBSD: privcmd.c,v 1.66 2022/09/01 15:32:16 bouyer Exp $ */
2 
3 /*-
4  * Copyright (c) 2004 Christian Limpach.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 
29 #include <sys/cdefs.h>
30 __KERNEL_RCSID(0, "$NetBSD: privcmd.c,v 1.66 2022/09/01 15:32:16 bouyer Exp $");
31 
32 #include "opt_xen.h"
33 
34 #include "opt_xen.h"
35 
36 #include <sys/param.h>
37 #include <sys/systm.h>
38 #include <sys/vnode.h>
39 #include <sys/dirent.h>
40 #include <sys/stat.h>
41 #include <sys/proc.h>
42 
43 #include <miscfs/specfs/specdev.h>
44 #include <miscfs/kernfs/kernfs.h>
45 
46 #include <uvm/uvm.h>
47 #include <uvm/uvm_fault.h>
48 #include <uvm/uvm_fault_i.h>
49 
50 #include <xen/kernfs_machdep.h>
51 #include <xen/hypervisor.h>
52 #include <xen/xen.h>
53 #include <xen/xenio.h>
54 #include <xen/xenmem.h>
55 #include <xen/xenpmap.h>
56 #include <xen/granttables.h>
57 
58 #define   PRIVCMD_MODE        (S_IRUSR)
59 
60 /* Magic value is used to mark invalid pages.
61  * This must be a value within the page-offset.
62  * Page-aligned values including 0x0 are used by the guest.
63  */
64 #define INVALID_PAGE          0xfff
65 
/*
 * Discriminator stored in struct privcmd_object.type; it tells which
 * member of the object's union is in use.
 */
typedef enum _privcmd_type {
	PTYPE_PRIVCMD = 0,		/* set by privcmd_mmap{,batch}() */
	PTYPE_PRIVCMD_PHYSMAP = 1,	/* set by mmapbatch_v2/mmap_resource */
	PTYPE_GNTDEV_REF = 2,		/* set by privcmd_map_gref() */
	PTYPE_GNTDEV_ALLOC = 3		/* set by privcmd_alloc_gref() */
} privcmd_type;
72 
73 struct privcmd_object_privcmd {
74           paddr_t base_paddr; /* base address of physical space */
75         paddr_t *maddr; /* array of machine address to map */
76         int     domid;
77         bool    no_translate;
78 };
79 
80 struct privcmd_object_gntref {
81           paddr_t base_paddr; /* base address of physical space */
82         struct ioctl_gntdev_grant_notify notify;
83           struct gnttab_map_grant_ref ops[1]; /* variable length */
84 };
85 
86 struct privcmd_object_gntalloc {
87         vaddr_t     gntva;    /* granted area mapped in kernel */
88         uint16_t domid;
89         uint16_t flags;
90         struct ioctl_gntdev_grant_notify notify;
91           uint32_t gref_ids[1]; /* variable length */
92 };
93 
94 struct privcmd_object {
95           struct uvm_object uobj;
96           privcmd_type type;
97           int       npages;
98           union {
99                     struct privcmd_object_privcmd pc;
100                     struct privcmd_object_gntref gr;
101                     struct privcmd_object_gntalloc ga;
102           } u;
103 };
104 
/*
 * Bytes to allocate for a struct privcmd_object whose u.gr.ops[] array
 * carries `count' entries: the base structure plus (count - 1) further
 * struct gnttab_map_grant_ref elements.
 */
#define PGO_GNTREF_LEN(count)						\
	(sizeof(struct privcmd_object) +				\
	    ((count) - 1) * sizeof(struct gnttab_map_grant_ref))
108 
/*
 * Bytes to allocate for a struct privcmd_object whose u.ga.gref_ids[]
 * array carries `count' entries: the base structure plus (count - 1)
 * further uint32_t elements.
 */
#define PGO_GNTA_LEN(count)						\
	(sizeof(struct privcmd_object) +				\
	    ((count) - 1) * sizeof(uint32_t))
112 
113 int privcmd_nobjects = 0;
114 
115 static void privpgop_reference(struct uvm_object *);
116 static void privpgop_detach(struct uvm_object *);
117 static int privpgop_fault(struct uvm_faultinfo *, vaddr_t , struct vm_page **,
118                                 int, int, vm_prot_t, int);
119 static int privcmd_map_obj(struct vm_map *, vaddr_t,
120                                  struct privcmd_object *, vm_prot_t);
121 
122 
/*
 * Convert an error value returned by Xen (negative) into a native errno.
 *
 * Xen uses System V error codes.  In order to keep bloat as minimal as
 * possible, only the codes that really impact us are translated.  A code
 * that is not in the table is reported on the console and returned
 * unchanged (as -error).
 */
static int
privcmd_xen2bsd_errno(int error)
{
	static const struct {
		int	xen_code;	/* positive Xen/System V number */
		int	bsd_code;	/* native errno */
	} errmap[] = {
		{ 0, 0 },
		{ 1, EPERM },
		{ 2, ENOENT },
		{ 3, ESRCH },
		{ 4, EINTR },
		{ 5, EIO },
		{ 6, ENXIO },
		{ 7, E2BIG },
		{ 8, ENOEXEC },
		{ 9, EBADF },
		{ 10, ECHILD },
		{ 11, EAGAIN },
		{ 12, ENOMEM },
		{ 13, EACCES },
		{ 14, EFAULT },
		{ 15, ENOTBLK },
		{ 16, EBUSY },
		{ 17, EEXIST },
		{ 18, EXDEV },
		{ 19, ENODEV },
		{ 20, ENOTDIR },
		{ 21, EISDIR },
		{ 22, EINVAL },
		{ 23, ENFILE },
		{ 24, EMFILE },
		{ 25, ENOTTY },
		{ 26, ETXTBSY },
		{ 27, EFBIG },
		{ 28, ENOSPC },
		{ 29, ESPIPE },
		{ 30, EROFS },
		{ 31, EMLINK },
		{ 32, EPIPE },
		{ 33, EDOM },
		{ 34, ERANGE },
		{ 35, EDEADLK },
		{ 36, ENAMETOOLONG },
		{ 37, ENOLCK },
		{ 38, ENOSYS },
		{ 39, ENOTEMPTY },
		{ 40, ELOOP },
		{ 42, ENOMSG },
		{ 43, EIDRM },
		{ 60, ENOSTR },
		{ 61, ENODATA },
		{ 62, ETIME },
		{ 63, ENOSR },
		{ 66, EREMOTE },
		{ 74, EBADMSG },
		{ 75, EOVERFLOW },
		{ 84, EILSEQ },
		{ 87, EUSERS },
		{ 88, ENOTSOCK },
		{ 89, EDESTADDRREQ },
		{ 90, EMSGSIZE },
		{ 91, EPROTOTYPE },
		{ 92, ENOPROTOOPT },
		{ 93, EPROTONOSUPPORT },
		{ 94, ESOCKTNOSUPPORT },
		{ 95, EOPNOTSUPP },
		{ 96, EPFNOSUPPORT },
		{ 97, EAFNOSUPPORT },
		{ 98, EADDRINUSE },
		{ 99, EADDRNOTAVAIL },
		{ 100, ENETDOWN },
		{ 101, ENETUNREACH },
		{ 102, ENETRESET },
		{ 103, ECONNABORTED },
		{ 104, ECONNRESET },
		{ 105, ENOBUFS },
		{ 106, EISCONN },
		{ 107, ENOTCONN },
		{ 108, ESHUTDOWN },
		{ 109, ETOOMANYREFS },
		{ 110, ETIMEDOUT },
		{ 111, ECONNREFUSED },
		{ 112, EHOSTDOWN },
		{ 113, EHOSTUNREACH },
		{ 114, EALREADY },
		{ 115, EINPROGRESS },
		{ 116, ESTALE },
		{ 122, EDQUOT },
	};
	const int xen_code = -error;
	unsigned int k;

	for (k = 0; k < sizeof(errmap) / sizeof(errmap[0]); k++) {
		if (errmap[k].xen_code == xen_code)
			return errmap[k].bsd_code;
	}

	/* not a code we translate: log it and pass it through */
	printf("unknown xen error code %d\n", xen_code);
	return xen_code;
}
302 
303 static vm_prot_t
privcmd_get_map_prot(struct vm_map * map,vaddr_t start,off_t size)304 privcmd_get_map_prot(struct vm_map *map, vaddr_t start, off_t size)
305 {
306           vm_prot_t prot;
307 
308           vm_map_lock_read(map);
309           /* get protections. This also check for validity of mapping */
310           if (uvm_map_checkprot(map, start, start + size - 1, VM_PROT_WRITE))
311                     prot = VM_PROT_READ | VM_PROT_WRITE;
312           else if (uvm_map_checkprot(map, start, start + size - 1, VM_PROT_READ))
313                     prot = VM_PROT_READ;
314           else {
315                     printf("privcmd_get_map_prot 0x%lx -> 0x%lx "
316                         "failed\n",
317                         start, (unsigned long)(start + size - 1));
318                     prot = UVM_PROT_NONE;
319           }
320           vm_map_unlock_read(map);
321           return prot;
322 }
323 
324 static int
privcmd_mmap(struct vop_ioctl_args * ap)325 privcmd_mmap(struct vop_ioctl_args *ap)
326 {
327 #ifndef XENPV
328           printf("IOCTL_PRIVCMD_MMAP not supported\n");
329           return EINVAL;
330 #else
331           int i, j;
332           privcmd_mmap_t *mcmd = ap->a_data;
333           privcmd_mmap_entry_t mentry;
334           vaddr_t va;
335           paddr_t ma;
336           struct vm_map *vmm = &curlwp->l_proc->p_vmspace->vm_map;
337           paddr_t *maddr;
338           struct privcmd_object *obj;
339           vm_prot_t prot;
340           int error;
341 
342           for (i = 0; i < mcmd->num; i++) {
343                     error = copyin(&mcmd->entry[i], &mentry, sizeof(mentry));
344                     if (error)
345                               return EINVAL;
346                     if (mentry.npages == 0)
347                               return EINVAL;
348                     if (mentry.va > VM_MAXUSER_ADDRESS)
349                               return EINVAL;
350                     va = mentry.va & ~PAGE_MASK;
351                     prot = privcmd_get_map_prot(vmm, va, mentry.npages * PAGE_SIZE);
352                     if (prot == UVM_PROT_NONE)
353                               return EINVAL;
354                     maddr = kmem_alloc(sizeof(paddr_t) * mentry.npages,
355                         KM_SLEEP);
356                     ma = ((paddr_t)mentry.mfn) <<  PGSHIFT;
357                     for (j = 0; j < mentry.npages; j++) {
358                               maddr[j] = ma;
359                               ma += PAGE_SIZE;
360                     }
361                     obj = kmem_alloc(sizeof(*obj), KM_SLEEP);
362                     obj->type = PTYPE_PRIVCMD;
363                     obj->u.pc.maddr = maddr;
364                     obj->u.pc.no_translate = false;
365                     obj->npages = mentry.npages;
366                     obj->u.pc.domid = mcmd->dom;
367                     error  = privcmd_map_obj(vmm, va, obj, prot);
368                     if (error)
369                               return error;
370           }
371           return 0;
372 #endif
373 }
374 
375 static int
privcmd_mmapbatch(struct vop_ioctl_args * ap)376 privcmd_mmapbatch(struct vop_ioctl_args *ap)
377 {
378 #ifndef XENPV
379           printf("IOCTL_PRIVCMD_MMAPBATCH not supported\n");
380           return EINVAL;
381 #else
382           int i;
383           privcmd_mmapbatch_t* pmb = ap->a_data;
384           vaddr_t va0;
385           u_long mfn;
386           paddr_t ma;
387           struct vm_map *vmm;
388           vaddr_t trymap;
389           paddr_t *maddr;
390           struct privcmd_object *obj;
391           vm_prot_t prot;
392           int error;
393 
394           vmm = &curlwp->l_proc->p_vmspace->vm_map;
395           va0 = pmb->addr & ~PAGE_MASK;
396 
397           if (pmb->num == 0)
398                     return EINVAL;
399           if (va0 > VM_MAXUSER_ADDRESS)
400                     return EINVAL;
401           if (((VM_MAXUSER_ADDRESS - va0) >> PGSHIFT) < pmb->num)
402                     return EINVAL;
403 
404           prot = privcmd_get_map_prot(vmm, va0, PAGE_SIZE);
405           if (prot == UVM_PROT_NONE)
406                     return EINVAL;
407 
408           maddr = kmem_alloc(sizeof(paddr_t) * pmb->num, KM_SLEEP);
409           /* get a page of KVA to check mappins */
410           trymap = uvm_km_alloc(kernel_map, PAGE_SIZE, PAGE_SIZE,
411               UVM_KMF_VAONLY);
412           if (trymap == 0) {
413                     kmem_free(maddr, sizeof(paddr_t) * pmb->num);
414                     return ENOMEM;
415           }
416 
417           obj = kmem_alloc(sizeof(*obj), KM_SLEEP);
418           obj->type = PTYPE_PRIVCMD;
419           obj->u.pc.maddr = maddr;
420           obj->u.pc.no_translate = false;
421           obj->npages = pmb->num;
422           obj->u.pc.domid = pmb->dom;
423 
424           for(i = 0; i < pmb->num; ++i) {
425                     error = copyin(&pmb->arr[i], &mfn, sizeof(mfn));
426                     if (error != 0) {
427                               /* XXX: mappings */
428                               pmap_update(pmap_kernel());
429                               kmem_free(maddr, sizeof(paddr_t) * pmb->num);
430                               uvm_km_free(kernel_map, trymap, PAGE_SIZE,
431                                   UVM_KMF_VAONLY);
432                               return error;
433                     }
434                     ma = ((paddr_t)mfn) << PGSHIFT;
435                     if ((error = pmap_enter_ma(pmap_kernel(), trymap, ma, 0,
436                         prot, PMAP_CANFAIL | prot, pmb->dom))) {
437                               mfn |= 0xF0000000;
438                               copyout(&mfn, &pmb->arr[i], sizeof(mfn));
439                               maddr[i] = INVALID_PAGE;
440                     } else {
441                               pmap_remove(pmap_kernel(), trymap,
442                                   trymap + PAGE_SIZE);
443                               maddr[i] = ma;
444                     }
445           }
446           pmap_update(pmap_kernel());
447           uvm_km_free(kernel_map, trymap, PAGE_SIZE, UVM_KMF_VAONLY);
448 
449           error = privcmd_map_obj(vmm, va0, obj, prot);
450 
451           return error;
452 #endif
453 }
454 
455 static int
privcmd_mmapbatch_v2(struct vop_ioctl_args * ap)456 privcmd_mmapbatch_v2(struct vop_ioctl_args *ap)
457 {
458           int i;
459           privcmd_mmapbatch_v2_t* pmb = ap->a_data;
460           vaddr_t va0;
461           u_long mfn;
462           struct vm_map *vmm;
463           paddr_t *maddr;
464           struct privcmd_object *obj;
465           vm_prot_t prot;
466           int error;
467           paddr_t base_paddr = 0;
468 
469           vmm = &curlwp->l_proc->p_vmspace->vm_map;
470           va0 = pmb->addr & ~PAGE_MASK;
471 
472           if (pmb->num == 0)
473                     return EINVAL;
474           if (va0 > VM_MAXUSER_ADDRESS)
475                     return EINVAL;
476           if (((VM_MAXUSER_ADDRESS - va0) >> PGSHIFT) < pmb->num)
477                     return EINVAL;
478 
479           prot = privcmd_get_map_prot(vmm, va0, PAGE_SIZE);
480           if (prot == UVM_PROT_NONE)
481                     return EINVAL;
482 
483 #ifndef XENPV
484           KASSERT(xen_feature(XENFEAT_auto_translated_physmap));
485           base_paddr = xenmem_alloc_pa(pmb->num * PAGE_SIZE, PAGE_SIZE, true);
486           KASSERT(base_paddr != 0);
487 #endif
488           maddr = kmem_alloc(sizeof(paddr_t) * pmb->num, KM_SLEEP);
489           obj = kmem_alloc(sizeof(*obj), KM_SLEEP);
490           obj->type = PTYPE_PRIVCMD_PHYSMAP;
491           obj->u.pc.maddr = maddr;
492           obj->u.pc.base_paddr = base_paddr;
493           obj->u.pc.no_translate = false;
494           obj->npages = pmb->num;
495           obj->u.pc.domid = pmb->dom;
496 
497           for(i = 0; i < pmb->num; ++i) {
498                     error = copyin(&pmb->arr[i], &mfn, sizeof(mfn));
499                     if (error != 0) {
500                               kmem_free(maddr, sizeof(paddr_t) * pmb->num);
501                               kmem_free(obj, sizeof(*obj));
502 #ifndef XENPV
503                               xenmem_free_pa(base_paddr, pmb->num * PAGE_SIZE);
504 #endif
505                               return error;
506                     }
507 #ifdef XENPV
508                     maddr[i] = ((paddr_t)mfn) << PGSHIFT;
509 #else
510                     maddr[i] = mfn; /* TMP argument for XENMEM_add_to_physmap */
511 #endif
512 
513           }
514           error = privcmd_map_obj(vmm, va0, obj, prot);
515           if (error)
516                     return error;
517 
518           /*
519            * map the range in user process now.
520            * If Xenr return -ENOENT, retry (paging in progress)
521            */
522           for(i = 0; i < pmb->num; i++, va0 += PAGE_SIZE) {
523                     int err, cerr;
524 #ifdef XENPV
525                     for (int j = 0 ; j < 10; j++) {
526                               err = pmap_enter_ma(vmm->pmap, va0, maddr[i], 0,
527                                   prot, PMAP_CANFAIL | prot,
528                                   pmb->dom);
529                               if (err != -2) /* Xen ENOENT */
530                                         break;
531                               if (kpause("xnoent", 1, mstohz(100), NULL))
532                                         break;
533                     }
534                     if (err) {
535                               maddr[i] = INVALID_PAGE;
536                     }
537 #else /* XENPV */
538                     xen_add_to_physmap_batch_t add;
539                     u_long idx;
540                     xen_pfn_t gpfn;
541                     int err2;
542                     memset(&add, 0, sizeof(add));
543 
544                     add.domid = DOMID_SELF;
545                     add.space = XENMAPSPACE_gmfn_foreign;
546                     add.size = 1;
547                     add.foreign_domid = pmb->dom;
548                     idx = maddr[i];
549                     set_xen_guest_handle(add.idxs, &idx);
550                     maddr[i] = INVALID_PAGE;
551                     gpfn = (base_paddr >> PGSHIFT) + i;
552                     set_xen_guest_handle(add.gpfns, &gpfn);
553                     err2 = 0;
554                     set_xen_guest_handle(add.errs, &err2);
555                     err = HYPERVISOR_memory_op(XENMEM_add_to_physmap_batch, &add);
556                     if (err < 0) {
557                               printf("privcmd_mmapbatch_v2: XENMEM_add_to_physmap_batch failed %d\n", err);
558                               privpgop_detach(&obj->uobj);
559                               return privcmd_xen2bsd_errno(err);
560                     }
561                     err = err2;
562                     if (err == 0)
563                               maddr[i] = base_paddr + i * PAGE_SIZE;
564 #endif /* XENPV */
565 
566                     cerr = copyout(&err, &pmb->err[i], sizeof(pmb->err[i]));
567                     if (cerr) {
568                               privpgop_detach(&obj->uobj);
569                               return cerr;
570                     }
571           }
572           return 0;
573 }
574 
575 static int
privcmd_mmap_resource(struct vop_ioctl_args * ap)576 privcmd_mmap_resource(struct vop_ioctl_args *ap)
577 {
578           int i;
579           privcmd_mmap_resource_t* pmr = ap->a_data;
580           vaddr_t va0;
581           struct vm_map *vmm;
582           struct privcmd_object *obj;
583           vm_prot_t prot;
584           int error;
585           struct xen_mem_acquire_resource op;
586           xen_pfn_t *pfns;
587           paddr_t *maddr;
588           paddr_t base_paddr = 0;
589 
590           vmm = &curlwp->l_proc->p_vmspace->vm_map;
591           va0 = pmr->addr & ~PAGE_MASK;
592 
593           if (pmr->num == 0)
594                     return EINVAL;
595           if (va0 > VM_MAXUSER_ADDRESS)
596                     return EINVAL;
597           if (((VM_MAXUSER_ADDRESS - va0) >> PGSHIFT) < pmr->num)
598                     return EINVAL;
599 
600           prot = privcmd_get_map_prot(vmm, va0, PAGE_SIZE);
601           if (prot == UVM_PROT_NONE)
602                     return EINVAL;
603 
604           pfns = kmem_alloc(sizeof(xen_pfn_t) * pmr->num, KM_SLEEP);
605 #ifndef XENPV
606           KASSERT(xen_feature(XENFEAT_auto_translated_physmap));
607           base_paddr = xenmem_alloc_pa(pmr->num * PAGE_SIZE, PAGE_SIZE, true);
608           KASSERT(base_paddr != 0);
609           for (i = 0; i < pmr->num; i++) {
610                     pfns[i] = (base_paddr >> PGSHIFT) + i;
611           }
612 #else
613           KASSERT(!xen_feature(XENFEAT_auto_translated_physmap));
614 #endif
615 
616           memset(&op, 0, sizeof(op));
617           op.domid = pmr->dom;
618           op.type = pmr->type;
619           op.id = pmr->id;
620           op.frame = pmr->idx;
621           op.nr_frames = pmr->num;
622           set_xen_guest_handle(op.frame_list, pfns);
623 
624           error = HYPERVISOR_memory_op(XENMEM_acquire_resource, &op);
625           if (error) {
626                     printf("%s: XENMEM_acquire_resource failed: %d\n",
627                         __func__, error);
628                     return privcmd_xen2bsd_errno(error);
629           }
630           maddr = kmem_alloc(sizeof(paddr_t) * pmr->num, KM_SLEEP);
631           for (i = 0; i < pmr->num; i++) {
632                     maddr[i] = pfns[i] << PGSHIFT;
633           }
634           kmem_free(pfns, sizeof(xen_pfn_t) * pmr->num);
635 
636           obj = kmem_alloc(sizeof(*obj), KM_SLEEP);
637           obj->type = PTYPE_PRIVCMD_PHYSMAP;
638           obj->u.pc.base_paddr = base_paddr;
639           obj->u.pc.maddr = maddr;
640           obj->u.pc.no_translate = true;
641           obj->npages = pmr->num;
642           obj->u.pc.domid = (op.flags & XENMEM_rsrc_acq_caller_owned) ?
643               DOMID_SELF : pmr->dom;
644 
645           error = privcmd_map_obj(vmm, va0, obj, prot);
646           return error;
647 }
648 
649 static int
privcmd_map_gref(struct vop_ioctl_args * ap)650 privcmd_map_gref(struct vop_ioctl_args *ap)
651 {
652           struct ioctl_gntdev_mmap_grant_ref *mgr = ap->a_data;
653           struct vm_map *vmm = &curlwp->l_proc->p_vmspace->vm_map;
654           struct privcmd_object *obj;
655           vaddr_t va0 = (vaddr_t)mgr->va & ~PAGE_MASK;
656           vm_prot_t prot;
657           int error;
658 
659           if (mgr->count == 0)
660                     return EINVAL;
661           if (va0 > VM_MAXUSER_ADDRESS)
662                     return EINVAL;
663           if (((VM_MAXUSER_ADDRESS - va0) >> PGSHIFT) < mgr->count)
664                     return EINVAL;
665           if (mgr->notify.offset < 0 || mgr->notify.offset > mgr->count)
666                     return EINVAL;
667 
668           prot = privcmd_get_map_prot(vmm, va0, PAGE_SIZE);
669           if (prot == UVM_PROT_NONE)
670                     return EINVAL;
671 
672           obj = kmem_alloc(PGO_GNTREF_LEN(mgr->count), KM_SLEEP);
673 
674           obj->type  = PTYPE_GNTDEV_REF;
675           obj->npages = mgr->count;
676           memcpy(&obj->u.gr.notify, &mgr->notify,
677               sizeof(obj->u.gr.notify));
678 #ifndef XENPV
679           KASSERT(xen_feature(XENFEAT_auto_translated_physmap));
680           obj->u.gr.base_paddr = xenmem_alloc_pa(obj->npages * PAGE_SIZE,
681               PAGE_SIZE, true);
682           KASSERT(obj->u.gr.base_paddr != 0);
683 #else
684           obj->u.gr.base_paddr = 0;
685 #endif /* !XENPV */
686 
687           for (int i = 0; i < obj->npages; ++i) {
688                     struct ioctl_gntdev_grant_ref gref;
689                     error = copyin(&mgr->refs[i], &gref, sizeof(gref));
690                     if (error != 0) {
691                               goto err1;
692                     }
693 #ifdef XENPV
694                     obj->u.gr.ops[i].host_addr = 0;
695                     obj->u.gr.ops[i].flags = GNTMAP_host_map |
696                         GNTMAP_application_map | GNTMAP_contains_pte;
697 #else /* XENPV */
698                     obj->u.gr.ops[i].host_addr =
699                         obj->u.gr.base_paddr + PAGE_SIZE * i;
700                     obj->u.gr.ops[i].flags = GNTMAP_host_map;
701 #endif /* XENPV */
702                     obj->u.gr.ops[i].dev_bus_addr = 0;
703                     obj->u.gr.ops[i].ref = gref.ref;
704                     obj->u.gr.ops[i].dom = gref.domid;
705                     obj->u.gr.ops[i].handle = -1;
706                     if (prot == UVM_PROT_READ)
707                               obj->u.gr.ops[i].flags |= GNTMAP_readonly;
708           }
709           error = privcmd_map_obj(vmm, va0, obj, prot);
710           return error;
711 err1:
712 #ifndef XENPV
713           xenmem_free_pa(obj->u.gr.base_paddr, obj->npages * PAGE_SIZE);
714 #endif
715           kmem_free(obj, PGO_GNTREF_LEN(obj->npages));
716           return error;
717 }
718 
719 static int
privcmd_alloc_gref(struct vop_ioctl_args * ap)720 privcmd_alloc_gref(struct vop_ioctl_args *ap)
721 {
722           struct ioctl_gntdev_alloc_grant_ref *mga = ap->a_data;
723           struct vm_map *vmm = &curlwp->l_proc->p_vmspace->vm_map;
724           struct privcmd_object *obj;
725           vaddr_t va0 = (vaddr_t)mga->va & ~PAGE_MASK;
726           vm_prot_t prot;
727           int error, ret;
728 
729           if (mga->count == 0)
730                     return EINVAL;
731           if (va0 > VM_MAXUSER_ADDRESS)
732                     return EINVAL;
733           if (((VM_MAXUSER_ADDRESS - va0) >> PGSHIFT) < mga->count)
734                     return EINVAL;
735           if (mga->notify.offset < 0 || mga->notify.offset > mga->count)
736                     return EINVAL;
737 
738           prot = privcmd_get_map_prot(vmm, va0, PAGE_SIZE);
739           if (prot == UVM_PROT_NONE)
740                     return EINVAL;
741 
742           obj = kmem_alloc(PGO_GNTA_LEN(mga->count), KM_SLEEP);
743 
744           obj->type  = PTYPE_GNTDEV_ALLOC;
745           obj->npages = mga->count;
746           obj->u.ga.domid = mga->domid;
747           memcpy(&obj->u.ga.notify, &mga->notify,
748               sizeof(obj->u.ga.notify));
749           obj->u.ga.gntva = uvm_km_alloc(kernel_map,
750               PAGE_SIZE * obj->npages, PAGE_SIZE, UVM_KMF_WIRED | UVM_KMF_ZERO);
751           if (obj->u.ga.gntva == 0) {
752                     error = ENOMEM;
753                     goto err1;
754           }
755 
756           for (int i = 0; i < obj->npages; ++i) {
757                     paddr_t ma;
758                     vaddr_t va = obj->u.ga.gntva + i * PAGE_SIZE;
759                     grant_ref_t id;
760                     bool ro = ((mga->flags & GNTDEV_ALLOC_FLAG_WRITABLE) == 0);
761                     (void)pmap_extract_ma(pmap_kernel(), va, &ma);
762                     if ((ret = xengnt_grant_access(mga->domid, ma, ro, &id)) != 0) {
763                               printf("%s: xengnt_grant_access failed: %d\n",
764                                   __func__, ret);
765                               for (int j = 0; j < i; j++) {
766                                         xengnt_revoke_access(obj->u.ga.gref_ids[j]);
767                                         error = ret;
768                                         goto err2;
769                               }
770                     }
771                     obj->u.ga.gref_ids[i] = id;
772           }
773 
774           error = copyout(&obj->u.ga.gref_ids[0], mga->gref_ids,
775               sizeof(uint32_t) * obj->npages);
776           if (error) {
777                     for (int i = 0; i < obj->npages; ++i) {
778                               xengnt_revoke_access(obj->u.ga.gref_ids[i]);
779                     }
780                     goto err2;
781           }
782 
783           error = privcmd_map_obj(vmm, va0, obj, prot);
784           return error;
785 
786 err2:
787           uvm_km_free(kernel_map, obj->u.ga.gntva,
788               PAGE_SIZE * obj->npages, UVM_KMF_WIRED);
789 err1:
790           kmem_free(obj, PGO_GNTA_LEN(obj->npages));
791           return error;
792 }
793 
794 static int
privcmd_ioctl(void * v)795 privcmd_ioctl(void *v)
796 {
797           struct vop_ioctl_args /* {
798                     const struct vnodeop_desc *a_desc;
799                     struct vnode *a_vp;
800                     u_long a_command;
801                     void *a_data;
802                     int a_fflag;
803                     kauth_cred_t a_cred;
804           } */ *ap = v;
805           int error = 0;
806 
807           switch (ap->a_command) {
808           case IOCTL_PRIVCMD_HYPERCALL:
809           case IOCTL_PRIVCMD_HYPERCALL_OLD:
810           /*
811            * oprivcmd_hypercall_t is privcmd_hypercall_t without the last entry
812            */
813           {
814                     privcmd_hypercall_t *hc = ap->a_data;
815                     if (hc->op >= (PAGE_SIZE >> 5))
816                               return EINVAL;
817                     error = -EOPNOTSUPP;
818 #if defined(__i386__)
819                     __asm volatile (
820                               "pushl %%ebx; pushl %%ecx; pushl %%edx;"
821                               "pushl %%esi; pushl %%edi; "
822                               "movl  4(%%eax),%%ebx ;"
823                               "movl  8(%%eax),%%ecx ;"
824                               "movl 12(%%eax),%%edx ;"
825                               "movl 16(%%eax),%%esi ;"
826                               "movl 20(%%eax),%%edi ;"
827                               "movl   (%%eax),%%eax ;"
828                               "shll $5,%%eax ;"
829                               "addl $hypercall_page,%%eax ;"
830                               "call *%%eax ;"
831                               "popl %%edi; popl %%esi; popl %%edx;"
832                               "popl %%ecx; popl %%ebx"
833                               : "=a" (error) : "0" (ap->a_data) : "memory" );
834 #endif /* __i386__ */
835 #if defined(__x86_64__)
836 #ifndef XENPV
837                     /* hypervisor can't access user memory if SMAP is enabled */
838                     smap_disable();
839 #endif
840                     {
841                     long i1, i2, i3;
842                     __asm volatile (
843                               "movq %8,%%r10; movq %9,%%r8;"
844                               "shll $5,%%eax ;"
845                               "addq $hypercall_page,%%rax ;"
846                               "call *%%rax"
847                               : "=a" (error), "=D" (i1),
848                                 "=S" (i2), "=d" (i3)
849                               : "0" ((unsigned int)hc->op),
850                                 "1" (hc->arg[0]),
851                                 "2" (hc->arg[1]),
852                                 "3" (hc->arg[2]),
853                                 "g" (hc->arg[3]),
854                                 "g" (hc->arg[4])
855                               : "r8", "r10", "memory" );
856                     }
857 #ifndef XENPV
858                     smap_enable();
859 #endif
860 #endif /* __x86_64__ */
861                     if (ap->a_command == IOCTL_PRIVCMD_HYPERCALL) {
862                               if (error >= 0) {
863                                         hc->retval = error;
864                                         error = 0;
865                               } else {
866                                         /* error occurred, return the errno */
867                                         error = privcmd_xen2bsd_errno(error);
868                                         hc->retval = 0;
869                               }
870                     } else {
871                               error = privcmd_xen2bsd_errno(error);
872                     }
873                     break;
874           }
875           case IOCTL_PRIVCMD_MMAP:
876                     return privcmd_mmap(ap);
877 
878           case IOCTL_PRIVCMD_MMAPBATCH:
879                     return privcmd_mmapbatch(ap);
880 
881           case IOCTL_PRIVCMD_MMAPBATCH_V2:
882                     return privcmd_mmapbatch_v2(ap);
883 
884           case IOCTL_PRIVCMD_MMAP_RESOURCE:
885                     return privcmd_mmap_resource(ap);
886 
887           case IOCTL_GNTDEV_MMAP_GRANT_REF:
888                     return privcmd_map_gref(ap);
889 
890           case IOCTL_GNTDEV_ALLOC_GRANT_REF:
891                     return privcmd_alloc_gref(ap);
892           default:
893                     error = EINVAL;
894           }
895 
896           return error;
897 }
898 
899 static const struct uvm_pagerops privpgops = {
900   .pgo_reference = privpgop_reference,
901   .pgo_detach = privpgop_detach,
902   .pgo_fault = privpgop_fault,
903 };
904 
905 static void
privpgop_reference(struct uvm_object * uobj)906 privpgop_reference(struct uvm_object *uobj)
907 {
908           rw_enter(uobj->vmobjlock, RW_WRITER);
909           uobj->uo_refs++;
910           rw_exit(uobj->vmobjlock);
911 }
912 
913 static void
privcmd_notify(struct ioctl_gntdev_grant_notify * notify,vaddr_t va,struct gnttab_map_grant_ref * gmops)914 privcmd_notify(struct ioctl_gntdev_grant_notify *notify, vaddr_t va,
915     struct gnttab_map_grant_ref *gmops)
916 {
917           if (notify->action & UNMAP_NOTIFY_SEND_EVENT) {
918                     hypervisor_notify_via_evtchn(notify->event_channel_port);
919           }
920           if ((notify->action & UNMAP_NOTIFY_CLEAR_BYTE) == 0) {
921                     notify->action = 0;
922                     return;
923           }
924           if (va == 0) {
925                     struct gnttab_map_grant_ref op;
926                     struct gnttab_unmap_grant_ref uop;
927                     int i = notify->offset / PAGE_SIZE;
928                     int o = notify->offset % PAGE_SIZE;
929                     int err;
930 #ifndef XENPV
931                     paddr_t base_paddr;
932                     base_paddr = xenmem_alloc_pa(PAGE_SIZE, PAGE_SIZE, true);
933 #endif
934 
935                     KASSERT(gmops != NULL);
936                     va = uvm_km_alloc(kernel_map, PAGE_SIZE, PAGE_SIZE,
937                         UVM_KMF_VAONLY | UVM_KMF_WAITVA);
938 #ifndef XENPV
939                     op.host_addr = base_paddr;
940 #else
941                     op.host_addr = va;
942 #endif
943                     op.dev_bus_addr = 0;
944                     op.ref = gmops[i].ref;
945                     op.dom = gmops[i].dom;
946                     op.handle = -1;
947                     op.flags = GNTMAP_host_map;
948                     err = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1);
949                     if (err == 0 && op.status == GNTST_okay) {
950 #ifndef XENPV
951                               pmap_kenter_pa(va, base_paddr,
952                                   VM_PROT_READ | VM_PROT_WRITE, 0);
953 #endif
954                               char *n = (void *)(va + o);
955                               *n = 0;
956 #ifndef XENPV
957                               pmap_kremove(va, PAGE_SIZE);
958                               uop.host_addr = base_paddr;
959 #else
960                               uop.host_addr = va;
961 #endif
962                               uop.handle = op.handle;
963                               uop.dev_bus_addr = 0;
964                               (void)HYPERVISOR_grant_table_op(
965                                   GNTTABOP_unmap_grant_ref, &uop, 1);
966                     }
967                     uvm_km_free(kernel_map, va, PAGE_SIZE, UVM_KMF_VAONLY);
968 #ifndef XENPV
969                     xenmem_free_pa(base_paddr, PAGE_SIZE);
970 #endif
971           } else {
972                     KASSERT(gmops == NULL);
973                     char *n = (void *)(va + notify->offset);
974                     *n = 0;
975           }
976           notify->action = 0;
977 }
978 
979 static void
privpgop_detach(struct uvm_object * uobj)980 privpgop_detach(struct uvm_object *uobj)
981 {
982           struct privcmd_object *pobj = (struct privcmd_object *)uobj;
983 
984           rw_enter(uobj->vmobjlock, RW_WRITER);
985           KASSERT(uobj->uo_refs > 0);
986           if (uobj->uo_refs > 1) {
987                     uobj->uo_refs--;
988                     rw_exit(uobj->vmobjlock);
989                     return;
990           }
991           rw_exit(uobj->vmobjlock);
992           switch (pobj->type) {
993           case PTYPE_PRIVCMD_PHYSMAP:
994 #ifndef XENPV
995                     for (int i = 0; i < pobj->npages; i++) {
996                               if (pobj->u.pc.maddr[i] != INVALID_PAGE) {
997                                         struct xen_remove_from_physmap rm;
998                                         rm.domid = DOMID_SELF;
999                                         rm.gpfn = pobj->u.pc.maddr[i] >> PGSHIFT;
1000                                         HYPERVISOR_memory_op(
1001                                             XENMEM_remove_from_physmap, &rm);
1002                               }
1003                     }
1004                     xenmem_free_pa(pobj->u.pc.base_paddr, pobj->npages * PAGE_SIZE);
1005 #endif
1006                     /* FALLTHROUGH */
1007           case PTYPE_PRIVCMD:
1008                     kmem_free(pobj->u.pc.maddr, sizeof(paddr_t) * pobj->npages);
1009                     uvm_obj_destroy(uobj, true);
1010                     kmem_free(pobj, sizeof(struct privcmd_object));
1011                     break;
1012           case PTYPE_GNTDEV_REF:
1013           {
1014                     privcmd_notify(&pobj->u.gr.notify, 0, pobj->u.gr.ops);
1015 #ifndef XENPV
1016                     KASSERT(pobj->u.gr.base_paddr != 0);
1017                     for (int i = 0; i < pobj->npages; i++) {
1018                               struct xen_remove_from_physmap rm;
1019                               rm.domid = DOMID_SELF;
1020                               rm.gpfn = (pobj->u.gr.base_paddr << PGSHIFT) + i;
1021                               HYPERVISOR_memory_op(XENMEM_remove_from_physmap, &rm);
1022                     }
1023                     xenmem_free_pa(pobj->u.gr.base_paddr, pobj->npages * PAGE_SIZE);
1024 #endif
1025                     kmem_free(pobj, PGO_GNTREF_LEN(pobj->npages));
1026                     break;
1027           }
1028           case PTYPE_GNTDEV_ALLOC:
1029                     privcmd_notify(&pobj->u.ga.notify, pobj->u.ga.gntva, NULL);
1030                     for (int i = 0; i < pobj->npages; ++i) {
1031                               xengnt_revoke_access(pobj->u.ga.gref_ids[i]);
1032                     }
1033                     uvm_km_free(kernel_map, pobj->u.ga.gntva,
1034                         PAGE_SIZE * pobj->npages, UVM_KMF_WIRED);
1035                     kmem_free(pobj, PGO_GNTA_LEN(pobj->npages));
1036           }
1037           privcmd_nobjects--;
1038 }
1039 
1040 static int
privpgop_fault(struct uvm_faultinfo * ufi,vaddr_t vaddr,struct vm_page ** pps,int npages,int centeridx,vm_prot_t access_type,int flags)1041 privpgop_fault(struct uvm_faultinfo *ufi, vaddr_t vaddr, struct vm_page **pps,
1042     int npages, int centeridx, vm_prot_t access_type, int flags)
1043 {
1044           struct vm_map_entry *entry = ufi->entry;
1045           struct uvm_object *uobj = entry->object.uvm_obj;
1046           struct privcmd_object *pobj = (struct privcmd_object*)uobj;
1047           int maddr_i, i, error = 0;
1048 
1049           /* compute offset from start of map */
1050           maddr_i = (entry->offset + (vaddr - entry->start)) >> PAGE_SHIFT;
1051           if (maddr_i + npages > pobj->npages) {
1052                     return EINVAL;
1053           }
1054           for (i = 0; i < npages; i++, maddr_i++, vaddr+= PAGE_SIZE) {
1055                     if ((flags & PGO_ALLPAGES) == 0 && i != centeridx)
1056                               continue;
1057                     if (pps[i] == PGO_DONTCARE)
1058                               continue;
1059                     switch(pobj->type) {
1060                     case PTYPE_PRIVCMD:
1061                     case PTYPE_PRIVCMD_PHYSMAP:
1062                     {
1063                               u_int pm_flags = PMAP_CANFAIL | ufi->entry->protection;
1064 #ifdef XENPV
1065                               if (pobj->u.pc.no_translate)
1066                                         pm_flags |= PMAP_MD_XEN_NOTR;
1067 #endif
1068                               if (pobj->u.pc.maddr[maddr_i] == INVALID_PAGE) {
1069                                         /* This has already been flagged as error. */
1070                                         error = EFAULT;
1071                                         goto out;
1072                               }
1073                               error = pmap_enter_ma(ufi->orig_map->pmap, vaddr,
1074                                   pobj->u.pc.maddr[maddr_i], 0,
1075                                   ufi->entry->protection, pm_flags,
1076                                   pobj->u.pc.domid);
1077                               if (error == ENOMEM) {
1078                                         goto out;
1079                               }
1080                               if (error) {
1081                                         pobj->u.pc.maddr[maddr_i] = INVALID_PAGE;
1082                                         error = EFAULT;
1083                               }
1084                               break;
1085                     }
1086                     case PTYPE_GNTDEV_REF:
1087                     {
1088                               struct pmap *pmap = ufi->orig_map->pmap;
1089                               if (pmap_enter_gnt(pmap, vaddr, entry->start, pobj->npages, &pobj->u.gr.ops[0]) != GNTST_okay) {
1090                                         error = EFAULT;
1091                                         goto out;
1092                               }
1093                               break;
1094                     }
1095                     case PTYPE_GNTDEV_ALLOC:
1096                     {
1097                               paddr_t pa;
1098                               if (!pmap_extract(pmap_kernel(),
1099                                   pobj->u.ga.gntva + maddr_i * PAGE_SIZE, &pa)) {
1100                                         error = EFAULT;
1101                                         goto out;
1102                               }
1103                               error = pmap_enter(ufi->orig_map->pmap, vaddr, pa,
1104                                   ufi->entry->protection,
1105                                   PMAP_CANFAIL | ufi->entry->protection);
1106                               if (error == ENOMEM) {
1107                                         goto out;
1108                               }
1109                               break;
1110                     }
1111                     }
1112                     if (error) {
1113                               /* XXX for proper ptp accountings */
1114                               pmap_remove(ufi->orig_map->pmap, vaddr,
1115                                   vaddr + PAGE_SIZE);
1116                     }
1117           }
1118 out:
1119           pmap_update(ufi->orig_map->pmap);
1120           uvmfault_unlockall(ufi, ufi->entry->aref.ar_amap, uobj);
1121           return error;
1122 }
1123 
1124 static int
privcmd_map_obj(struct vm_map * map,vaddr_t start,struct privcmd_object * obj,vm_prot_t prot)1125 privcmd_map_obj(struct vm_map *map, vaddr_t start, struct privcmd_object *obj,
1126     vm_prot_t prot)
1127 {
1128           int error;
1129           uvm_flag_t uvmflag;
1130           vaddr_t newstart = start;
1131           off_t size = ((off_t)obj->npages << PGSHIFT);
1132 
1133           privcmd_nobjects++;
1134           uvm_obj_init(&obj->uobj, &privpgops, true, 1);
1135           uvmflag = UVM_MAPFLAG(prot, prot, UVM_INH_NONE, UVM_ADV_NORMAL,
1136               UVM_FLAG_FIXED | UVM_FLAG_UNMAP | UVM_FLAG_NOMERGE);
1137           error = uvm_map(map, &newstart, size, &obj->uobj, 0, 0, uvmflag);
1138 
1139           if (error)
1140                     obj->uobj.pgops->pgo_detach(&obj->uobj);
1141           return error;
1142 }
1143 
1144 static const struct kernfs_fileop privcmd_fileops[] = {
1145   { .kf_fileop = KERNFS_FILEOP_IOCTL, .kf_vop = privcmd_ioctl },
1146 };
1147 
1148 void
xenprivcmd_init(void)1149 xenprivcmd_init(void)
1150 {
1151           kernfs_entry_t *dkt;
1152           kfstype kfst;
1153 
1154           if (!xendomain_is_privileged())
1155                     return;
1156 
1157           kfst = KERNFS_ALLOCTYPE(privcmd_fileops);
1158 
1159           KERNFS_ALLOCENTRY(dkt, KM_SLEEP);
1160           KERNFS_INITENTRY(dkt, DT_REG, "privcmd", NULL, kfst, VREG,
1161               PRIVCMD_MODE);
1162           kernfs_addentry(kernxen_pkt, dkt);
1163 }
1164