xref: /dragonfly/sys/vm/vm_vmspace.c (revision 5229377c915d2a82af954d67267edb514bfcca3f)
1 /*
2  * (MPSAFE)
3  *
4  * Copyright (c) 2006 The DragonFly Project.  All rights reserved.
5  *
6  * This code is derived from software contributed to The DragonFly Project
7  * by Matthew Dillon <dillon@backplane.com>
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  *
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in
17  *    the documentation and/or other materials provided with the
18  *    distribution.
19  * 3. Neither the name of The DragonFly Project nor the names of its
20  *    contributors may be used to endorse or promote products derived
21  *    from this software without specific, prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
26  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
27  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
28  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
29  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
31  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
32  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
33  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  */
36 
37 #include <sys/param.h>
38 #include <sys/kernel.h>
39 #include <sys/systm.h>
40 #include <sys/sysmsg.h>
41 #include <sys/kern_syscall.h>
42 #include <sys/mman.h>
43 #include <sys/thread.h>
44 #include <sys/proc.h>
45 #include <sys/malloc.h>
46 #include <sys/sysctl.h>
47 #include <sys/vkernel.h>
48 #include <sys/vmspace.h>
49 
50 #include <vm/vm_extern.h>
51 #include <vm/pmap.h>
52 
53 #include <machine/vmparam.h>
54 
55 static struct vmspace_entry *vkernel_find_vmspace(struct vkernel_proc *vkp,
56                                                               void *id, int havetoken);
57 static int vmspace_entry_delete(struct vmspace_entry *ve,
58                                          struct vkernel_proc *vkp, int refs);
59 static void vmspace_entry_cache_ref(struct vmspace_entry *ve);
60 static void vmspace_entry_cache_drop(struct vmspace_entry *ve);
61 static void vmspace_entry_drop(struct vmspace_entry *ve);
62 
63 static MALLOC_DEFINE(M_VKERNEL, "vkernel", "VKernel structures");
64 
65 /*
66  * vmspace_create (void *id, int type, void *data)
67  *
68  * Create a VMSPACE under the control of the caller with the specified id.
69  * An id of NULL cannot be used.  The type and data fields must currently
70  * be 0.
71  *
72  * The vmspace starts out completely empty.  Memory may be mapped into the
73  * VMSPACE with vmspace_mmap().
74  *
75  * No requirements.
76  */
77 int
sys_vmspace_create(struct sysmsg * sysmsg,const struct vmspace_create_args * uap)78 sys_vmspace_create(struct sysmsg *sysmsg,
79                        const struct vmspace_create_args *uap)
80 {
81           struct vmspace_entry *ve;
82           struct vkernel_proc *vkp;
83           struct proc *p = curproc;
84           int error;
85 
86           if (vkernel_enable == 0)
87                     return (EOPNOTSUPP);
88 
89           /*
90            * Create a virtual kernel side-structure for the process if one
91            * does not exist.
92            *
93            * Implement a simple resolution for SMP races.
94            */
95           if ((vkp = p->p_vkernel) == NULL) {
96                     vkp = kmalloc(sizeof(*vkp), M_VKERNEL, M_WAITOK|M_ZERO);
97                     lwkt_gettoken(&p->p_token);
98                     if (p->p_vkernel == NULL) {
99                               vkp->refs = 1;
100                               lwkt_token_init(&vkp->token, "vkernel");
101                               RB_INIT(&vkp->root);
102                               p->p_vkernel = vkp;
103                     } else {
104                               kfree(vkp, M_VKERNEL);
105                               vkp = p->p_vkernel;
106                     }
107                     lwkt_reltoken(&p->p_token);
108           }
109 
110           /*
111            * Create a new VMSPACE, disallow conflicting ids
112            */
113           ve = kmalloc(sizeof(struct vmspace_entry), M_VKERNEL, M_WAITOK|M_ZERO);
114           ve->vmspace = vmspace_alloc(VM_MIN_USER_ADDRESS, VM_MAX_USER_ADDRESS);
115           ve->id = uap->id;
116           ve->refs = 0;                 /* active refs (none) */
117           ve->cache_refs = 1; /* on-tree, not deleted (prevent kfree) */
118           pmap_pinit2(vmspace_pmap(ve->vmspace));
119 
120           lwkt_gettoken(&vkp->token);
121           if (RB_INSERT(vmspace_rb_tree, &vkp->root, ve)) {
122                     vmspace_rel(ve->vmspace);
123                     ve->vmspace = NULL; /* safety */
124                     kfree(ve, M_VKERNEL);
125                     error = EEXIST;
126           } else {
127                     error = 0;
128           }
129           lwkt_reltoken(&vkp->token);
130 
131           return (error);
132 }
133 
134 /*
135  * Destroy a VMSPACE given its identifier.
136  *
137  * No requirements.
138  */
139 int
sys_vmspace_destroy(struct sysmsg * sysmsg,const struct vmspace_destroy_args * uap)140 sys_vmspace_destroy(struct sysmsg *sysmsg,
141                         const struct vmspace_destroy_args *uap)
142 {
143           struct vkernel_proc *vkp;
144           struct vmspace_entry *ve;
145           int error;
146 
147           if ((vkp = curproc->p_vkernel) == NULL)
148                     return EINVAL;
149 
150           /*
151            * vkp->token protects the deletion against a new RB tree search.
152            */
153           lwkt_gettoken(&vkp->token);
154           error = ENOENT;
155           if ((ve = vkernel_find_vmspace(vkp, uap->id, 1)) != NULL) {
156                     error = vmspace_entry_delete(ve, vkp, 1);
157                     if (error == 0)
158                               vmspace_entry_cache_drop(ve);
159           }
160           lwkt_reltoken(&vkp->token);
161 
162           return(error);
163 }
164 
165 /*
166  * vmspace_ctl (void *id, int cmd, struct trapframe *tframe,
167  *                  struct vextframe *vframe);
168  *
169  * Transfer control to a VMSPACE.  Control is returned after the specified
170  * number of microseconds or if a page fault, signal, trap, or system call
171  * occurs.  The context is updated as appropriate.
172  *
173  * No requirements.
174  */
175 int
sys_vmspace_ctl(struct sysmsg * sysmsg,const struct vmspace_ctl_args * uap)176 sys_vmspace_ctl(struct sysmsg *sysmsg,
177                     const struct vmspace_ctl_args *uap)
178 {
179           struct vmspace_ctl_args ua = *uap;
180           struct vkernel_proc *vkp;
181           struct vkernel_lwp *vklp;
182           struct vmspace_entry *ve = NULL;
183           struct lwp *lp;
184           struct proc *p;
185           int framesz;
186           int error;
187 
188           lp = curthread->td_lwp;
189           p = lp->lwp_proc;
190 
191           if ((vkp = p->p_vkernel) == NULL)
192                     return (EINVAL);
193 
194           /*
195            * NOTE: We have to copy *uap into ua because uap is an aliased
196            *         pointer into the sysframe, which we are replacing.
197            */
198           if ((ve = vkernel_find_vmspace(vkp, ua.id, 0)) == NULL) {
199                     error = ENOENT;
200                     goto done;
201           }
202 
203           switch(ua.cmd) {
204           case VMSPACE_CTL_RUN:
205                     /*
206                      * Save the caller's register context, swap VM spaces, and
207                      * install the passed register context.  Return with
208                      * EJUSTRETURN so the syscall code doesn't adjust the context.
209                      */
210                     framesz = sizeof(struct trapframe);
211                     if ((vklp = lp->lwp_vkernel) == NULL) {
212                               vklp = kmalloc(sizeof(*vklp), M_VKERNEL,
213                                                M_WAITOK|M_ZERO);
214                               lp->lwp_vkernel = vklp;
215                     }
216                     if (ve && vklp->ve_cache != ve) {
217                               vmspace_entry_cache_ref(ve);
218                               if (vklp->ve_cache)
219                                         vmspace_entry_cache_drop(vklp->ve_cache);
220                               vklp->ve_cache = ve;
221                     }
222                     vklp->user_trapframe = ua.tframe;
223                     vklp->user_vextframe = ua.vframe;
224                     bcopy(sysmsg->sysmsg_frame, &vklp->save_trapframe, framesz);
225                     bcopy(&curthread->td_tls, &vklp->save_vextframe.vx_tls,
226                           sizeof(vklp->save_vextframe.vx_tls));
227                     error = copyin(ua.tframe, sysmsg->sysmsg_frame, framesz);
228                     if (error == 0) {
229                               error = copyin(&ua.vframe->vx_tls,
230                                                &curthread->td_tls,
231                                                sizeof(struct savetls));
232                     }
233                     if (error == 0)
234                               error = cpu_sanitize_frame(sysmsg->sysmsg_frame);
235                     if (error == 0)
236                               error = cpu_sanitize_tls(&curthread->td_tls);
237                     if (error) {
238                               bcopy(&vklp->save_trapframe, sysmsg->sysmsg_frame,
239                                     framesz);
240                               bcopy(&vklp->save_vextframe.vx_tls, &curthread->td_tls,
241                                     sizeof(vklp->save_vextframe.vx_tls));
242                               set_user_TLS();
243                     } else {
244                               vklp->ve = ve;
245                               atomic_add_int(&ve->refs, 1);
246                               pmap_setlwpvm(lp, ve->vmspace);
247                               set_user_TLS();
248                               set_vkernel_fp(sysmsg->sysmsg_frame);
249                               error = EJUSTRETURN;
250                     }
251                     break;
252           default:
253                     error = EOPNOTSUPP;
254                     break;
255           }
256 done:
257           if (ve)
258                     vmspace_entry_drop(ve);
259 
260           return(error);
261 }
262 
263 /*
264  * vmspace_mmap(id, addr, len, prot, flags, fd, offset)
265  *
266  * map memory within a VMSPACE.  This function is just like a normal mmap()
267  * but operates on the vmspace's memory map.
268  *
269  * No requirements.
270  */
271 int
sys_vmspace_mmap(struct sysmsg * sysmsg,const struct vmspace_mmap_args * uap)272 sys_vmspace_mmap(struct sysmsg *sysmsg,
273                      const struct vmspace_mmap_args *uap)
274 {
275           struct vkernel_proc *vkp;
276           struct vmspace_entry *ve;
277           int error;
278 
279           if ((vkp = curproc->p_vkernel) == NULL) {
280                     error = EINVAL;
281                     goto done2;
282           }
283 
284           if ((ve = vkernel_find_vmspace(vkp, uap->id, 0)) == NULL) {
285                     error = ENOENT;
286                     goto done2;
287           }
288 
289           error = kern_mmap(ve->vmspace, uap->addr, uap->len,
290                                 uap->prot, uap->flags,
291                                 uap->fd, uap->offset, &sysmsg->sysmsg_resultp);
292 
293           vmspace_entry_drop(ve);
294 done2:
295           return (error);
296 }
297 
298 /*
299  * vmspace_munmap(id, addr, len)
300  *
301  * unmap memory within a VMSPACE.
302  *
303  * No requirements.
304  */
305 int
sys_vmspace_munmap(struct sysmsg * sysmsg,const struct vmspace_munmap_args * uap)306 sys_vmspace_munmap(struct sysmsg *sysmsg,
307                        const struct vmspace_munmap_args *uap)
308 {
309           struct vkernel_proc *vkp;
310           struct vmspace_entry *ve;
311           vm_offset_t addr;
312           vm_offset_t tmpaddr;
313           vm_size_t size, pageoff;
314           vm_map_t map;
315           int error;
316 
317           if ((vkp = curproc->p_vkernel) == NULL) {
318                     error = EINVAL;
319                     goto done2;
320           }
321 
322           if ((ve = vkernel_find_vmspace(vkp, uap->id, 0)) == NULL) {
323                     error = ENOENT;
324                     goto done2;
325           }
326 
327           /*
328            * NOTE: kern_munmap() can block so we need to temporarily
329            *         ref ve->refs.
330            */
331 
332           /*
333            * Copied from sys_munmap()
334            */
335           addr = (vm_offset_t)uap->addr;
336           size = uap->len;
337 
338           pageoff = (addr & PAGE_MASK);
339           addr -= pageoff;
340           size += pageoff;
341           size = (vm_size_t)round_page(size);
342           if (size < uap->len) {                  /* wrap */
343                     error = EINVAL;
344                     goto done1;
345           }
346           tmpaddr = addr + size;                  /* workaround gcc4 opt */
347           if (tmpaddr < addr) {                   /* wrap */
348                     error = EINVAL;
349                     goto done1;
350           }
351           if (size == 0) {
352                     error = 0;
353                     goto done1;
354           }
355 
356           if (VM_MAX_USER_ADDRESS > 0 && tmpaddr > VM_MAX_USER_ADDRESS) {
357                     error = EINVAL;
358                     goto done1;
359           }
360           if (VM_MIN_USER_ADDRESS > 0 && addr < VM_MIN_USER_ADDRESS) {
361                     error = EINVAL;
362                     goto done1;
363           }
364           map = &ve->vmspace->vm_map;
365           if (!vm_map_check_protection(map, addr, tmpaddr, VM_PROT_NONE, FALSE)) {
366                     error = EINVAL;
367                     goto done1;
368           }
369           vm_map_remove(map, addr, addr + size);
370           error = 0;
371 done1:
372           vmspace_entry_drop(ve);
373 done2:
374           return (error);
375 }
376 
377 /*
378  * vmspace_pread(id, buf, nbyte, flags, offset)
379  *
380  * Read data from a vmspace.  The number of bytes read is returned or
381  * -1 if an unrecoverable error occured.  If the number of bytes read is
382  * less then the request size, a page fault occured in the VMSPACE which
383  * the caller must resolve in order to proceed.
384  *
385  * (not implemented yet)
386  * No requirements.
387  */
388 int
sys_vmspace_pread(struct sysmsg * sysmsg,const struct vmspace_pread_args * uap)389 sys_vmspace_pread(struct sysmsg *sysmsg,
390                       const struct vmspace_pread_args *uap)
391 {
392           struct vkernel_proc *vkp;
393           struct vmspace_entry *ve;
394           int error;
395 
396           if ((vkp = curproc->p_vkernel) == NULL) {
397                     error = EINVAL;
398                     goto done3;
399           }
400 
401           if ((ve = vkernel_find_vmspace(vkp, uap->id, 0)) == NULL) {
402                     error = ENOENT;
403                     goto done3;
404           }
405           vmspace_entry_drop(ve);
406           error = EINVAL;
407 done3:
408           return (error);
409 }
410 
411 /*
412  * vmspace_pwrite(id, buf, nbyte, flags, offset)
413  *
414  * Write data to a vmspace.  The number of bytes written is returned or
415  * -1 if an unrecoverable error occured.  If the number of bytes written is
416  * less then the request size, a page fault occured in the VMSPACE which
417  * the caller must resolve in order to proceed.
418  *
419  * (not implemented yet)
420  * No requirements.
421  */
422 int
sys_vmspace_pwrite(struct sysmsg * sysmsg,const struct vmspace_pwrite_args * uap)423 sys_vmspace_pwrite(struct sysmsg *sysmsg,
424                        const struct vmspace_pwrite_args *uap)
425 {
426           struct vkernel_proc *vkp;
427           struct vmspace_entry *ve;
428           int error;
429 
430           if ((vkp = curproc->p_vkernel) == NULL) {
431                     error = EINVAL;
432                     goto done3;
433           }
434           if ((ve = vkernel_find_vmspace(vkp, uap->id, 0)) == NULL) {
435                     error = ENOENT;
436                     goto done3;
437           }
438           vmspace_entry_drop(ve);
439           error = EINVAL;
440 done3:
441           return (error);
442 }
443 
444 /*
445  * vmspace_mcontrol(id, addr, len, behav, value)
446  *
447  * madvise/mcontrol support for a vmspace.
448  *
449  * No requirements.
450  */
451 int
sys_vmspace_mcontrol(struct sysmsg * sysmsg,const struct vmspace_mcontrol_args * uap)452 sys_vmspace_mcontrol(struct sysmsg *sysmsg,
453                          const struct vmspace_mcontrol_args *uap)
454 {
455           struct vkernel_proc *vkp;
456           struct vmspace_entry *ve;
457           struct lwp *lp;
458           vm_offset_t start, end;
459           vm_offset_t tmpaddr = (vm_offset_t)uap->addr + uap->len;
460           int error;
461 
462           lp = curthread->td_lwp;
463           if ((vkp = curproc->p_vkernel) == NULL) {
464                     error = EINVAL;
465                     goto done3;
466           }
467 
468           if ((ve = vkernel_find_vmspace(vkp, uap->id, 0)) == NULL) {
469                     error = ENOENT;
470                     goto done3;
471           }
472 
473           /*
474            * This code is basically copied from sys_mcontrol()
475            */
476           if (uap->behav < 0 || uap->behav > MADV_CONTROL_END) {
477                     error = EINVAL;
478                     goto done1;
479           }
480 
481           if (tmpaddr < (vm_offset_t)uap->addr) {
482                     error = EINVAL;
483                     goto done1;
484           }
485           if (VM_MAX_USER_ADDRESS > 0 && tmpaddr > VM_MAX_USER_ADDRESS) {
486                     error = EINVAL;
487                     goto done1;
488           }
489         if (VM_MIN_USER_ADDRESS > 0 && uap->addr < VM_MIN_USER_ADDRESS) {
490                     error = EINVAL;
491                     goto done1;
492           }
493 
494           start = trunc_page((vm_offset_t) uap->addr);
495           end = round_page(tmpaddr);
496 
497           error = vm_map_madvise(&ve->vmspace->vm_map, start, end,
498                                         uap->behav, uap->value);
499 done1:
500           vmspace_entry_drop(ve);
501 done3:
502           return (error);
503 }
504 
505 /*
506  * Red black tree functions
507  */
508 static int rb_vmspace_compare(struct vmspace_entry *, struct vmspace_entry *);
509 RB_GENERATE(vmspace_rb_tree, vmspace_entry, rb_entry, rb_vmspace_compare);
510 
511 /*
512  * a->start is address, and the only field has to be initialized.
513  * The caller must hold vkp->token.
514  *
515  * The caller must hold vkp->token.
516  */
517 static int
rb_vmspace_compare(struct vmspace_entry * a,struct vmspace_entry * b)518 rb_vmspace_compare(struct vmspace_entry *a, struct vmspace_entry *b)
519 {
520         if ((char *)a->id < (char *)b->id)
521                 return(-1);
522         else if ((char *)a->id > (char *)b->id)
523                 return(1);
524         return(0);
525 }
526 
527 /*
528  * The caller must hold vkp->token.
529  */
530 static
531 int
rb_vmspace_delete(struct vmspace_entry * ve,void * data)532 rb_vmspace_delete(struct vmspace_entry *ve, void *data)
533 {
534           struct vkernel_proc *vkp = data;
535 
536           if (vmspace_entry_delete(ve, vkp, 0) == 0)
537                     vmspace_entry_cache_drop(ve);
538           else
539                     panic("rb_vmspace_delete: invalid refs %d", ve->refs);
540           return(0);
541 }
542 
543 /*
544  * Remove a vmspace_entry from the RB tree and destroy it.  We have to clean
545  * up the pmap, the vm_map, then destroy the vmspace.  We gain control of
546  * the associated cache_refs ref, which the caller will drop for us.
547  *
548  * The ve must not have any active references other than those from the
549  * caller.  If it does, EBUSY is returned.  The ve may still maintain
550  * any number of cache references which will drop as the related LWPs
551  * execute vmspace operations or exit.
552  *
553  * 0 is returned on success, EBUSY on failure.  On success the caller must
554  * drop the last cache_refs.  We have dropped the callers active refs.
555  *
556  * The caller must hold vkp->token.
557  */
558 static
559 int
vmspace_entry_delete(struct vmspace_entry * ve,struct vkernel_proc * vkp,int refs)560 vmspace_entry_delete(struct vmspace_entry *ve, struct vkernel_proc *vkp,
561                          int refs)
562 {
563           /*
564            * Interlocked by vkp->token.
565            *
566            * Drop the callers refs and set VKE_REF_DELETED atomically, if
567            * the remaining refs match exactly.  Dropping refs and setting
568            * the DELETED flag atomically protects other threads from trying
569            * to use the ve.
570            *
571            * The caller now owns the final cache_ref that was previously
572            * associated with the live state of the ve.
573            */
574           if (atomic_cmpset_int(&ve->refs, refs, VKE_REF_DELETED) == 0) {
575                     KKASSERT(ve->refs >= refs);
576                     return EBUSY;
577           }
578           RB_REMOVE(vmspace_rb_tree, &vkp->root, ve);
579 
580           pmap_remove_pages(vmspace_pmap(ve->vmspace),
581                                 VM_MIN_USER_ADDRESS, VM_MAX_USER_ADDRESS);
582           vm_map_remove(&ve->vmspace->vm_map,
583                                 VM_MIN_USER_ADDRESS, VM_MAX_USER_ADDRESS);
584           vmspace_rel(ve->vmspace);
585           ve->vmspace = NULL; /* safety */
586 
587           return 0;
588 }
589 
590 /*
591  * Ref a ve for cache purposes
592  */
593 static
594 void
vmspace_entry_cache_ref(struct vmspace_entry * ve)595 vmspace_entry_cache_ref(struct vmspace_entry *ve)
596 {
597           atomic_add_int(&ve->cache_refs, 1);
598 }
599 
600 /*
601  * The ve cache_drop is the final word for a ve.  It gains an extra ref
602  * representing it being on the RB tree and not being in a deleted state.
603  * Removal from the RB tree and deletion manipulate this ref.  The last
604  * drop will thus include full deletion of the ve in addition to the last
605  * cached user going away.
606  */
607 static
608 void
vmspace_entry_cache_drop(struct vmspace_entry * ve)609 vmspace_entry_cache_drop(struct vmspace_entry *ve)
610 {
611           if (atomic_fetchadd_int(&ve->cache_refs, -1) == 1) {
612                     KKASSERT(ve->refs & VKE_REF_DELETED);
613                     kfree(ve, M_VKERNEL);
614           }
615 }
616 
617 /*
618  * Drop primary reference.  The ve cannot be freed on the 1->0 transition.
619  * Instead, ve deletion interlocks the final kfree() via cache_refs.
620  */
621 static
622 void
vmspace_entry_drop(struct vmspace_entry * ve)623 vmspace_entry_drop(struct vmspace_entry *ve)
624 {
625           atomic_fetchadd_int(&ve->refs, -1);
626 }
627 
628 /*
629  * Locate the ve for (id), return the ve or NULL.  If found this function
630  * will bump ve->refs which prevents the ve from being immediately destroyed
631  * (but it can still be removed).
632  *
633  * The cache can potentially contain a stale ve, check by testing ve->vmspace.
634  *
635  * The caller must hold vkp->token if excl is non-zero.
636  */
637 static
638 struct vmspace_entry *
vkernel_find_vmspace(struct vkernel_proc * vkp,void * id,int excl)639 vkernel_find_vmspace(struct vkernel_proc *vkp, void *id, int excl)
640 {
641           struct vmspace_entry *ve;
642           struct vmspace_entry key;
643           struct vkernel_lwp *vklp;
644           struct lwp *lp = curthread->td_lwp;
645 
646           /*
647            * Cache check.  Since we already hold a ref on the cache entry
648            * the ve cannot be ripped out from under us while we cycle
649            * ve->refs.
650            */
651           if ((vklp = lp->lwp_vkernel) != NULL) {
652                     ve = vklp->ve_cache;
653                     if (ve && ve->id == id) {
654                               uint32_t n;
655 
656                               /*
657                                * Bump active refs, check to see if the cache
658                                * entry is stale.  If not, we are good.
659                                */
660                               n = atomic_fetchadd_int(&ve->refs, 1);
661                               if ((n & VKE_REF_DELETED) == 0) {
662                                         KKASSERT(ve->vmspace);
663                                         return ve;
664                               }
665 
666                               /*
667                                * Cache is stale, clean it out and fall through
668                                * to a normal search.
669                                */
670                               vklp->ve_cache = NULL;
671                               vmspace_entry_drop(ve);
672                               vmspace_entry_cache_drop(ve);
673                     }
674           }
675 
676           /*
677            * Normal search protected by vkp->token.  No new ve's can be marked
678            * DELETED while we hold the token so we are safe.
679            */
680           if (excl == 0)
681                     lwkt_gettoken_shared(&vkp->token);
682           key.id = id;
683           ve = RB_FIND(vmspace_rb_tree, &vkp->root, &key);
684           if (ve) {
685                     if (atomic_fetchadd_int(&ve->refs, 1) & VKE_REF_DELETED) {
686                               vmspace_entry_drop(ve);
687                               ve = NULL;
688                     }
689           }
690           if (excl == 0)
691                     lwkt_reltoken(&vkp->token);
692           return (ve);
693 }
694 
695 /*
696  * Manage vkernel refs, used by the kernel when fork()ing or exit()ing
697  * a vkernel process.
698  *
699  * No requirements.
700  */
701 void
vkernel_inherit(struct proc * p1,struct proc * p2)702 vkernel_inherit(struct proc *p1, struct proc *p2)
703 {
704           struct vkernel_proc *vkp;
705 
706           vkp = p1->p_vkernel;
707           KKASSERT(vkp->refs > 0);
708           atomic_add_int(&vkp->refs, 1);
709           p2->p_vkernel = vkp;
710 }
711 
712 /*
713  * No requirements.
714  */
715 void
vkernel_exit(struct proc * p)716 vkernel_exit(struct proc *p)
717 {
718           struct vkernel_proc *vkp;
719           struct lwp *lp;
720 
721           vkp = p->p_vkernel;
722 
723           /*
724            * Restore the original VM context if we are killed while running
725            * a different one.
726            *
727            * This isn't supposed to happen.  What is supposed to happen is
728            * that the process should enter vkernel_trap() before the handling
729            * the signal.
730            */
731           RB_FOREACH(lp, lwp_rb_tree, &p->p_lwp_tree) {
732                     vkernel_lwp_exit(lp);
733           }
734 
735           /*
736            * Dereference the common area
737            */
738           p->p_vkernel = NULL;
739           KKASSERT(vkp->refs > 0);
740 
741           if (atomic_fetchadd_int(&vkp->refs, -1) == 1) {
742                     lwkt_gettoken(&vkp->token);
743                     RB_SCAN(vmspace_rb_tree, &vkp->root, NULL,
744                               rb_vmspace_delete, vkp);
745                     lwkt_reltoken(&vkp->token);
746                     kfree(vkp, M_VKERNEL);
747           }
748 }
749 
750 /*
751  * No requirements.
752  */
753 void
vkernel_lwp_exit(struct lwp * lp)754 vkernel_lwp_exit(struct lwp *lp)
755 {
756           struct vkernel_lwp *vklp;
757           struct vmspace_entry *ve;
758 
759           if ((vklp = lp->lwp_vkernel) != NULL) {
760                     /*
761                      * vkernel thread
762                      */
763                     if ((ve = vklp->ve) != NULL) {
764                               kprintf("Warning, pid %d killed with "
765                                   "active VC!\n", lp->lwp_proc->p_pid);
766                               pmap_setlwpvm(lp, lp->lwp_proc->p_vmspace);
767                               vklp->ve = NULL;
768                               KKASSERT(ve->refs > 0);
769                               vmspace_entry_drop(ve);
770                     }
771                     if ((ve = vklp->ve_cache) != NULL) {
772                               vklp->ve_cache = NULL;
773                               vmspace_entry_cache_drop(ve);
774                     }
775 
776                     lp->lwp_vkernel = NULL;
777                     kfree(vklp, M_VKERNEL);
778           }
779 }
780 
781 /*
782  * A VM space under virtual kernel control trapped out or made a system call
783  * or otherwise needs to return control to the virtual kernel context.
784  *
785  * No requirements.
786  */
787 void
vkernel_trap(struct lwp * lp,struct trapframe * frame)788 vkernel_trap(struct lwp *lp, struct trapframe *frame)
789 {
790           struct proc *p = lp->lwp_proc;
791           struct vmspace_entry *ve;
792           struct vkernel_lwp *vklp;
793           int error;
794 
795           /*
796            * Which vmspace entry was running?
797            */
798           vklp = lp->lwp_vkernel;
799           KKASSERT(vklp);
800 
801           ve = vklp->ve;
802           KKASSERT(ve != NULL);
803 
804           /*
805            * Switch the LWP vmspace back to the virtual kernel's VM space.
806            */
807           vklp->ve = NULL;
808           pmap_setlwpvm(lp, p->p_vmspace);
809           KKASSERT(ve->refs > 0);
810           vmspace_entry_drop(ve);
811           /* ve is invalid once we kill our ref */
812 
813           /*
814            * Copy the emulated process frame to the virtual kernel process.
815            * The emulated process cannot change TLS descriptors so don't
816            * bother saving them, we already have a copy.
817            *
818            * Restore the virtual kernel's saved context so the virtual kernel
819            * process can resume.
820            */
821           error = copyout(frame, vklp->user_trapframe, sizeof(*frame));
822           bcopy(&vklp->save_trapframe, frame, sizeof(*frame));
823           bcopy(&vklp->save_vextframe.vx_tls, &curthread->td_tls,
824                 sizeof(vklp->save_vextframe.vx_tls));
825           set_user_TLS();
826           cpu_vkernel_trap(frame, error);
827 }
828