1 /**	$MirOS: src/sys/uvm/uvm_mmap.c,v 1.5 2014/07/13 12:35:45 tg Exp $ */
2 /*	$OpenBSD: uvm_mmap.c,v 1.55 2005/01/15 06:54:51 otto Exp $	*/
3 /*	+OpenBSD: uvm_mmap.c,v 1.91 2012/07/21 06:46:58 matthew Exp $	*/
4 /*	$NetBSD: uvm_mmap.c,v 1.49 2001/02/18 21:19:08 chs Exp $	*/
5 
6 /*
7  * Copyright (c) 1997 Charles D. Cranor and Washington University.
8  * Copyright (c) 1991, 1993 The Regents of the University of California.
9  * Copyright (c) 1988 University of Utah.
10  *
11  * All rights reserved.
12  *
13  * This code is derived from software contributed to Berkeley by
14  * the Systems Programming Group of the University of Utah Computer
15  * Science Department.
16  *
17  * Redistribution and use in source and binary forms, with or without
18  * modification, are permitted provided that the following conditions
19  * are met:
20  * 1. Redistributions of source code must retain the above copyright
21  *    notice, this list of conditions and the following disclaimer.
22  * 2. Redistributions in binary form must reproduce the above copyright
23  *    notice, this list of conditions and the following disclaimer in the
24  *    documentation and/or other materials provided with the distribution.
25  * 3. All advertising materials mentioning features or use of this software
26  *    must display the following acknowledgement:
27  *      This product includes software developed by the Charles D. Cranor,
28  *	Washington University, University of California, Berkeley and
29  *	its contributors.
30  * 4. Neither the name of the University nor the names of its contributors
31  *    may be used to endorse or promote products derived from this software
32  *    without specific prior written permission.
33  *
34  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
35  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
36  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
37  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
38  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
39  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
40  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
41  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
42  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
43  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
44  * SUCH DAMAGE.
45  *
46  * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
47  *      @(#)vm_mmap.c   8.5 (Berkeley) 5/19/94
48  * from: Id: uvm_mmap.c,v 1.1.2.14 1998/01/05 21:04:26 chuck Exp
49  */
50 
51 /*
52  * uvm_mmap.c: system call interface into VM system, plus kernel vm_mmap
53  * function.
54  */
55 #include <sys/param.h>
56 #include <sys/systm.h>
57 #include <sys/file.h>
58 #include <sys/filedesc.h>
59 #include <sys/resourcevar.h>
60 #include <sys/mman.h>
61 #include <sys/mount.h>
62 #include <sys/proc.h>
63 #include <sys/malloc.h>
64 #include <sys/vnode.h>
65 #include <sys/conf.h>
66 #include <sys/stat.h>
67 
68 #include <machine/exec.h>	/* for __LDPGSZ */
69 
70 #include <miscfs/specfs/specdev.h>
71 
72 #include <sys/syscallargs.h>
73 
74 #include <uvm/uvm.h>
75 #include <uvm/uvm_device.h>
76 #include <uvm/uvm_vnode.h>
77 
78 
79 /*
80  * unimplemented VM system calls:
81  */
82 
83 /*
84  * sys_mquery: provide mapping hints to applications that do fixed mappings
85  *
86  * flags: 0 or MAP_FIXED (MAP_FIXED - means that we insist on this addr and
87  *	don't care about PMAP_PREFER or such)
88  * addr: hint where we'd like to place the mapping.
89  * size: size of the mapping
90  * fd: fd of the file we want to map
91  * off: offset within the file
92  */
93 
94 int
sys_mquery(p,v,retval)95 sys_mquery(p, v, retval)
96 	struct proc *p;
97 	void *v;
98 	register_t *retval;
99 {
100 	struct sys_mquery_args /* {
101 		syscallarg(void *) addr;
102 		syscallarg(size_t) len;
103 		syscallarg(int) prot;
104 		syscallarg(int) flags;
105 		syscallarg(int) fd;
106 		syscallarg(long) pad;
107 		syscallarg(off_t) pos;
108 	} */ *uap = v;
109 	struct file *fp;
110 	struct uvm_object *uobj;
111 	voff_t uoff;
112 	int error;
113 	vaddr_t vaddr;
114 	int flags = 0;
115 	vsize_t size;
116 	vm_prot_t prot;
117 	int fd;
118 
119 	vaddr = (vaddr_t) SCARG(uap, addr);
120 	prot = SCARG(uap, prot);
121 	size = (vsize_t) SCARG(uap, len);
122 	fd = SCARG(uap, fd);
123 
124 	if ((prot & VM_PROT_ALL) != prot)
125 		return (EINVAL);
126 
127 	if (SCARG(uap, flags) & MAP_FIXED)
128 		flags |= UVM_FLAG_FIXED;
129 
130 	if (fd >= 0) {
131 		if ((error = getvnode(p->p_fd, fd, &fp)) != 0)
132 			return (error);
133 		uobj = &((struct vnode *)fp->f_data)->v_uvm.u_obj;
134 		uoff = SCARG(uap, pos);
135 	} else {
136 		fp = NULL;
137 		uobj = NULL;
138 		uoff = 0;
139 	}
140 
141 	if (vaddr == 0)
142 		vaddr = uvm_map_hint(p, prot);
143 
144 	/* prevent a user requested address from falling in heap space */
145 	if ((vaddr + size > (vaddr_t)p->p_vmspace->vm_daddr) &&
146 	    (vaddr < (vaddr_t)p->p_vmspace->vm_daddr + MAXDSIZ)) {
147 		if (flags & UVM_FLAG_FIXED) {
148 			error = EINVAL;
149 			goto done;
150 		}
151 		vaddr = round_page((vaddr_t)p->p_vmspace->vm_daddr + MAXDSIZ);
152 	}
153 again:
154 
155 	if (uvm_map_findspace(&p->p_vmspace->vm_map, vaddr, size,
156 	    &vaddr, uobj, uoff, 0, flags) == NULL) {
157 		if (flags & UVM_FLAG_FIXED)
158 			error = EINVAL;
159 		else
160 			error = ENOMEM;
161 	} else {
162 		/* prevent a returned address from falling in heap space */
163 		if ((vaddr + size > (vaddr_t)p->p_vmspace->vm_daddr)
164 		    && (vaddr < (vaddr_t)p->p_vmspace->vm_daddr + MAXDSIZ)) {
165 			vaddr = round_page((vaddr_t)p->p_vmspace->vm_daddr +
166 			    MAXDSIZ);
167 			goto again;
168 		}
169 		error = 0;
170 		*retval = (register_t)(vaddr);
171 	}
172 done:
173 	if (fp != NULL)
174 		FRELE(fp);
175 	return (error);
176 }
177 
/*
 * sys_mincore: determine if pages are in core or not.
 *
 * Syscall arguments (struct sys_mincore_args):
 *	addr: start of the range; must be page aligned.
 *	len:  length of the range; rounded up to a whole number of pages.
 *	vec:  user buffer receiving one status byte per page (1 if a page
 *	      was found for that address, 0 otherwise).
 *
 * Returns 0 on success, EINVAL for an unaligned start or an empty or
 * wrapping range, ENOMEM if the start is unmapped or the range contains
 * a hole, or the error from uvm_vslock().
 */

/* ARGSUSED */
int
sys_mincore(p, v, retval)
	struct proc *p;
	void *v;
	register_t *retval;
{
	struct sys_mincore_args /* {
		syscallarg(void *) addr;
		syscallarg(size_t) len;
		syscallarg(char *) vec;
	} */ *uap = v;
	vm_page_t m;
	char *vec, pgi;
	struct uvm_object *uobj;
	struct vm_amap *amap;
	struct vm_anon *anon;
	vm_map_entry_t entry;
	vaddr_t start, end, lim;
	vm_map_t map;
	vsize_t len;
	int error = 0, npgs;

	map = &p->p_vmspace->vm_map;

	start = (vaddr_t)SCARG(uap, addr);
	len = SCARG(uap, len);
	vec = SCARG(uap, vec);

	if (start & PAGE_MASK)
		return (EINVAL);
	len = round_page(len);
	end = start + len;
	/* rejects both a zero length and a range wrapping the address space */
	if (end <= start)
		return (EINVAL);

	/* one status byte is written per page, so vec spans npgs bytes */
	npgs = len >> PAGE_SHIFT;

	/*
	 * Lock down vec, so our returned status isn't outdated by
	 * storing the status byte for a page.
	 */
	if ((error = uvm_vslock(p, vec, npgs, VM_PROT_WRITE)) != 0)
		return (error);

	vm_map_lock_read(map);

	if (uvm_map_lookup_entry(map, start, &entry) == FALSE) {
		error = ENOMEM;
		goto out;
	}

	/*
	 * Walk the map entries covering [start, end).  "start" and "vec"
	 * are advanced together by the inner loops, one page and one
	 * status byte at a time.
	 */
	for (/* nothing */;
	     entry != &map->header && entry->start < end;
	     entry = entry->next) {
		KASSERT(!UVM_ET_ISSUBMAP(entry));
		KASSERT(start >= entry->start);

		/* Make sure there are no holes. */
		if (entry->end < end &&
		     (entry->next == &map->header ||
		      entry->next->start > entry->end)) {
			error = ENOMEM;
			goto out;
		}

		/* only look at the part of this entry inside the range */
		lim = end < entry->end ? end : entry->end;

		/*
		 * Special case for objects with no "real" pages.  Those
		 * are always considered resident (mapped devices).
		 */
		if (UVM_ET_ISOBJ(entry)) {
			KASSERT(!UVM_OBJ_IS_KERN_OBJECT(entry->object.uvm_obj));
			if (entry->object.uvm_obj->pgops->pgo_releasepg
			    == NULL) {
				pgi = 1;
				/* NOTE(review): copyout() result is ignored */
				for (/* nothing */; start < lim;
				     start += PAGE_SIZE, vec++)
					copyout(&pgi, vec, sizeof(char));
				continue;
			}
		}

		amap = entry->aref.ar_amap;	/* top layer */
		uobj = entry->object.uvm_obj;	/* bottom layer */

		/* take the amap lock first, then the object lock */
		if (amap != NULL)
			amap_lock(amap);
		if (uobj != NULL)
			simple_lock(&uobj->vmobjlock);

		for (/* nothing */; start < lim; start += PAGE_SIZE, vec++) {
			pgi = 0;
			if (amap != NULL) {
				/* Check the top layer first. */
				anon = amap_lookup(&entry->aref,
				    start - entry->start);
				/* Don't need to lock anon here. */
				if (anon != NULL && anon->u.an_page != NULL) {
					/*
					 * Anon has the page for this entry
					 * offset.
					 */
					pgi = 1;
				}
			}

			/* only consult the object if the amap had no page */
			if (uobj != NULL && pgi == 0) {
				/* Check the bottom layer. */
				m = uvm_pagelookup(uobj,
				    entry->offset + (start - entry->start));
				if (m != NULL) {
					/*
					 * Object has the page for this entry
					 * offset.
					 */
					pgi = 1;
				}
			}

			/* NOTE(review): copyout() result is ignored here too */
			copyout(&pgi, vec, sizeof(char));
		}

		/* release in the reverse order of acquisition */
		if (uobj != NULL)
			simple_unlock(&uobj->vmobjlock);
		if (amap != NULL)
			amap_unlock(amap);
	}

 out:
	vm_map_unlock_read(map);
	/* "vec" was advanced above, so unlock from the original pointer */
	uvm_vsunlock(p, SCARG(uap, vec), npgs);
	return (error);
}
317 
318 /*
319  * sys_mmap: mmap system call.
320  *
321  * => file offset and address may not be page aligned
322  *    - if MAP_FIXED, offset and address must have remainder mod PAGE_SIZE
323  *    - if address isn't page aligned the mapping starts at trunc_page(addr)
324  *      and the return value is adjusted up by the page offset.
325  */
326 
327 int
sys_mmap(p,v,retval)328 sys_mmap(p, v, retval)
329 	struct proc *p;
330 	void *v;
331 	register_t *retval;
332 {
333 	struct sys_mmap_args /* {
334 		syscallarg(void *) addr;
335 		syscallarg(size_t) len;
336 		syscallarg(int) prot;
337 		syscallarg(int) flags;
338 		syscallarg(int) fd;
339 		syscallarg(long) pad;
340 		syscallarg(off_t) pos;
341 	} */ *uap = v;
342 	vaddr_t addr;
343 	struct vattr va;
344 	off_t pos;
345 	vsize_t size, pageoff;
346 	vm_prot_t prot, maxprot;
347 	int flags, fd;
348 	vaddr_t vm_min_address = VM_MIN_ADDRESS;
349 	struct filedesc *fdp = p->p_fd;
350 	struct file *fp = NULL;
351 	struct vnode *vp;
352 	caddr_t handle;
353 	int error;
354 
355 	/*
356 	 * first, extract syscall args from the uap.
357 	 */
358 
359 	addr = (vaddr_t) SCARG(uap, addr);
360 	size = (vsize_t) SCARG(uap, len);
361 	prot = SCARG(uap, prot);
362 	flags = SCARG(uap, flags);
363 	fd = SCARG(uap, fd);
364 	pos = SCARG(uap, pos);
365 
366 	/*
367 	 * Fixup the old deprecated MAP_COPY into MAP_PRIVATE, and
368 	 * validate the flags.
369 	 */
370 	if ((prot & VM_PROT_ALL) != prot)
371 		return (EINVAL);
372 	if ((flags & MAP_FLAGMASK) != flags)
373 		return (EINVAL);
374 	if (flags & MAP_COPY)
375 		flags = (flags & ~MAP_COPY) | MAP_PRIVATE;
376 	if ((flags & (MAP_SHARED|MAP_PRIVATE)) == (MAP_SHARED|MAP_PRIVATE))
377 		return (EINVAL);
378 	if ((flags & (MAP_FIXED|__MAP_NOREPLACE)) == __MAP_NOREPLACE)
379 		return (EINVAL);
380 
381 	/*
382 	 * align file position and save offset.  adjust size.
383 	 */
384 
385 	pageoff = (pos & PAGE_MASK);
386 	pos  -= pageoff;
387 	size += pageoff;			/* add offset */
388 	if (size != 0) {
389 		size = (vsize_t) round_page(size);	/* round up */
390 		if (size == 0)
391 			return (ENOMEM);		/* don't allow wrap */
392 	}
393 
394 	/*
395 	 * now check (MAP_FIXED) or get (!MAP_FIXED) the "addr"
396 	 */
397 
398 	if (flags & MAP_FIXED) {
399 
400 		/* ensure address and file offset are aligned properly */
401 		addr -= pageoff;
402 		if (addr & PAGE_MASK)
403 			return (EINVAL);
404 
405 		if (VM_MAXUSER_ADDRESS > 0 &&
406 		    (addr + size) > VM_MAXUSER_ADDRESS)
407 			return (EINVAL);
408 		if (vm_min_address > 0 && addr < vm_min_address)
409 			return (EINVAL);
410 		if (addr > addr + size)
411 			return (EINVAL);		/* no wrapping! */
412 
413 	} else {
414 
415 		/*
416 		 * not fixed: make sure we skip over the largest possible heap.
417 		 * we will refine our guess later (e.g. to account for VAC, etc)
418 		 */
419 		if (addr == 0)
420 			addr = uvm_map_hint(p, prot);
421 		else if (!(flags & MAP_TRYFIXED) &&
422 		    addr < (vaddr_t)p->p_vmspace->vm_daddr)
423 			addr = uvm_map_hint(p, prot);
424 	}
425 
426 	/*
427 	 * check for file mappings (i.e. not anonymous) and verify file.
428 	 */
429 	if ((flags & MAP_ANON) == 0) {
430 
431 		if ((fp = fd_getfile(fdp, fd)) == NULL)
432 			return (EBADF);
433 
434 		FREF(fp);
435 
436 		if (fp->f_type != DTYPE_VNODE) {
437 			error = ENODEV;		/* only mmap vnodes! */
438 			goto out;
439 		}
440 		vp = (struct vnode *)fp->f_data;	/* convert to vnode */
441 
442 		if (vp->v_type != VREG && vp->v_type != VCHR &&
443 		    vp->v_type != VBLK) {
444 			error = ENODEV; /* only REG/CHR/BLK support mmap */
445 			goto out;
446 		}
447 
448 		if (vp->v_type == VREG && (pos + size) < pos) {
449 			error = EINVAL;		/* no offset wrapping */
450 			goto out;
451 		}
452 
453 		/* special case: catch SunOS style /dev/zero */
454 		if (vp->v_type == VCHR && iszerodev(vp->v_rdev)) {
455 			flags |= MAP_ANON;
456 			FRELE(fp);
457 			fp = NULL;
458 			goto is_anon;
459 		}
460 
461 		/*
462 		 * Old programs may not select a specific sharing type, so
463 		 * default to an appropriate one.
464 		 *
465 		 * XXX: how does MAP_ANON fit in the picture?
466 		 */
467 		if ((flags & (MAP_SHARED|MAP_PRIVATE)) == 0) {
468 #if defined(DEBUG)
469 			printf("WARNING: defaulted mmap() share type to "
470 			   "%s (pid %d comm %s)\n", vp->v_type == VCHR ?
471 			   "MAP_SHARED" : "MAP_PRIVATE", p->p_pid,
472 			    p->p_comm);
473 #endif
474 			if (vp->v_type == VCHR)
475 				flags |= MAP_SHARED;	/* for a device */
476 			else
477 				flags |= MAP_PRIVATE;	/* for a file */
478 		}
479 
480 		/*
481 		 * MAP_PRIVATE device mappings don't make sense (and aren't
482 		 * supported anyway).  However, some programs rely on this,
483 		 * so just change it to MAP_SHARED.
484 		 */
485 		if (vp->v_type == VCHR && (flags & MAP_PRIVATE) != 0) {
486 			flags = (flags & ~MAP_PRIVATE) | MAP_SHARED;
487 		}
488 
489 		/*
490 		 * now check protection
491 		 */
492 
493 		maxprot = VM_PROT_EXECUTE;
494 
495 		/* check read access */
496 		if (fp->f_flag & FREAD)
497 			maxprot |= VM_PROT_READ;
498 		else if (prot & PROT_READ) {
499 			error = EACCES;
500 			goto out;
501 		}
502 
503 		/* check write access, shared case first */
504 		if (flags & MAP_SHARED) {
505 			/*
506 			 * if the file is writable, only add PROT_WRITE to
507 			 * maxprot if the file is not immutable, append-only.
508 			 * otherwise, if we have asked for PROT_WRITE, return
509 			 * EPERM.
510 			 */
511 			if (fp->f_flag & FWRITE) {
512 				if ((error =
513 				    VOP_GETATTR(vp, &va, p->p_ucred, p)))
514 					goto out;
515 				if ((va.va_flags & (IMMUTABLE|APPEND)) == 0)
516 					maxprot |= VM_PROT_WRITE;
517 				else if (prot & PROT_WRITE) {
518 					error = EPERM;
519 					goto out;
520 				}
521 			} else if (prot & PROT_WRITE) {
522 				error = EACCES;
523 				goto out;
524 			}
525 		} else {
526 			/* MAP_PRIVATE mappings can always write to */
527 			maxprot |= VM_PROT_WRITE;
528 		}
529 
530 		/*
531 		 * set handle to vnode
532 		 */
533 
534 		handle = (caddr_t)vp;
535 
536 	} else {		/* MAP_ANON case */
537 		/*
538 		 * XXX What do we do about (MAP_SHARED|MAP_PRIVATE) == 0?
539 		 */
540 		if (fd != -1) {
541 			error = EINVAL;
542 			goto out;
543 		}
544 
545  is_anon:		/* label for SunOS style /dev/zero */
546 		handle = NULL;
547 		maxprot = VM_PROT_ALL;
548 		pos = 0;
549 	}
550 
551 	/*
552 	 * XXX (in)sanity check.  We don't do proper datasize checking
553 	 * XXX for anonymous (or private writable) mmap().  However,
554 	 * XXX know that if we're trying to allocate more than the amount
555 	 * XXX remaining under our current data size limit, _that_ should
556 	 * XXX be disallowed.
557 	 */
558 	if ((flags & MAP_ANON) != 0 ||
559 	    ((flags & MAP_PRIVATE) != 0 && (prot & PROT_WRITE) != 0)) {
560 		if (size >
561 		    (p->p_rlimit[RLIMIT_DATA].rlim_cur - ctob(p->p_vmspace->vm_dsize))) {
562 			error = ENOMEM;
563 			goto out;
564 		}
565 	}
566 
567 	/*
568 	 * now let kernel internal function uvm_mmap do the work.
569 	 */
570 
571 	error = uvm_mmap(&p->p_vmspace->vm_map, &addr, size, prot, maxprot,
572 	    flags, handle, pos, p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur);
573 
574 	if (error == 0)
575 		/* remember to add offset */
576 		*retval = (register_t)(addr + pageoff);
577 
578 out:
579 	if (fp)
580 		FRELE(fp);
581 	return (error);
582 }
583 
584 /*
585  * sys_msync: the msync system call (a front-end for flush)
586  */
587 
588 int
sys_msync(p,v,retval)589 sys_msync(p, v, retval)
590 	struct proc *p;
591 	void *v;
592 	register_t *retval;
593 {
594 	struct sys_msync_args /* {
595 		syscallarg(void *) addr;
596 		syscallarg(size_t) len;
597 		syscallarg(int) flags;
598 	} */ *uap = v;
599 	vaddr_t addr;
600 	vsize_t size, pageoff;
601 	vm_map_t map;
602 	int rv, flags, uvmflags;
603 
604 	/*
605 	 * extract syscall args from the uap
606 	 */
607 
608 	addr = (vaddr_t)SCARG(uap, addr);
609 	size = (vsize_t)SCARG(uap, len);
610 	flags = SCARG(uap, flags);
611 
612 	/* sanity check flags */
613 	if ((flags & ~(MS_ASYNC | MS_SYNC | MS_INVALIDATE)) != 0 ||
614 			(flags & (MS_ASYNC | MS_SYNC | MS_INVALIDATE)) == 0 ||
615 			(flags & (MS_ASYNC | MS_SYNC)) == (MS_ASYNC | MS_SYNC))
616 	  return (EINVAL);
617 	if ((flags & (MS_ASYNC | MS_SYNC)) == 0)
618 	  flags |= MS_SYNC;
619 
620 	/*
621 	 * align the address to a page boundary, and adjust the size accordingly
622 	 */
623 
624 	pageoff = (addr & PAGE_MASK);
625 	addr -= pageoff;
626 	size += pageoff;
627 	size = (vsize_t) round_page(size);
628 
629 	/* disallow wrap-around. */
630 	if (addr + (ssize_t)size < addr)
631 		return (EINVAL);
632 
633 	/*
634 	 * get map
635 	 */
636 
637 	map = &p->p_vmspace->vm_map;
638 
639 	/*
640 	 * XXXCDC: do we really need this semantic?
641 	 *
642 	 * XXX Gak!  If size is zero we are supposed to sync "all modified
643 	 * pages with the region containing addr".  Unfortunately, we
644 	 * don't really keep track of individual mmaps so we approximate
645 	 * by flushing the range of the map entry containing addr.
646 	 * This can be incorrect if the region splits or is coalesced
647 	 * with a neighbor.
648 	 */
649 	if (size == 0) {
650 		vm_map_entry_t entry;
651 
652 		vm_map_lock_read(map);
653 		rv = uvm_map_lookup_entry(map, addr, &entry);
654 		if (rv == TRUE) {
655 			addr = entry->start;
656 			size = entry->end - entry->start;
657 		}
658 		vm_map_unlock_read(map);
659 		if (rv == FALSE)
660 			return (EINVAL);
661 	}
662 
663 	/*
664 	 * translate MS_ flags into PGO_ flags
665 	 */
666 	uvmflags = PGO_CLEANIT;
667 	if (flags & MS_INVALIDATE)
668 		uvmflags |= PGO_FREE;
669 	if (flags & MS_SYNC)
670 		uvmflags |= PGO_SYNCIO;
671 	else
672 		uvmflags |= PGO_SYNCIO;	 /* XXXCDC: force sync for now! */
673 
674 	/*
675 	 * doit!
676 	 */
677 	rv = uvm_map_clean(map, addr, addr+size, uvmflags);
678 
679 	/*
680 	 * and return...
681 	 */
682 	return (rv);
683 }
684 
685 /*
686  * sys_munmap: unmap a users memory
687  */
688 
689 int
sys_munmap(p,v,retval)690 sys_munmap(p, v, retval)
691 	struct proc *p;
692 	void *v;
693 	register_t *retval;
694 {
695 	struct sys_munmap_args /* {
696 		syscallarg(void *) addr;
697 		syscallarg(size_t) len;
698 	} */ *uap = v;
699 	vaddr_t addr;
700 	vsize_t size, pageoff;
701 	vm_map_t map;
702 	vaddr_t vm_min_address = VM_MIN_ADDRESS;
703 	struct vm_map_entry *dead_entries;
704 
705 	/*
706 	 * get syscall args...
707 	 */
708 
709 	addr = (vaddr_t) SCARG(uap, addr);
710 	size = (vsize_t) SCARG(uap, len);
711 
712 	/*
713 	 * align the address to a page boundary, and adjust the size accordingly
714 	 */
715 
716 	pageoff = (addr & PAGE_MASK);
717 	addr -= pageoff;
718 	size += pageoff;
719 	size = (vsize_t) round_page(size);
720 
721 	if ((ssize_t)size < 0)
722 		return (EINVAL);
723 	if (size == 0)
724 		return (0);
725 
726 	/*
727 	 * Check for illegal addresses.  Watch out for address wrap...
728 	 * Note that VM_*_ADDRESS are not constants due to casts (argh).
729 	 */
730 	if (VM_MAXUSER_ADDRESS > 0 && addr + size > VM_MAXUSER_ADDRESS)
731 		return (EINVAL);
732 	if (vm_min_address > 0 && addr < vm_min_address)
733 		return (EINVAL);
734 	if (addr > addr + size)
735 		return (EINVAL);
736 	map = &p->p_vmspace->vm_map;
737 
738 
739 	vm_map_lock(map);	/* lock map so we can checkprot */
740 
741 	/*
742 	 * interesting system call semantic: make sure entire range is
743 	 * allocated before allowing an unmap.
744 	 */
745 
746 	if (!uvm_map_checkprot(map, addr, addr + size, VM_PROT_NONE)) {
747 		vm_map_unlock(map);
748 		return (EINVAL);
749 	}
750 
751 	/*
752 	 * doit!
753 	 */
754 	uvm_unmap_remove(map, addr, addr + size, &dead_entries);
755 
756 	vm_map_unlock(map);	/* and unlock */
757 
758 	if (dead_entries != NULL)
759 		uvm_unmap_detach(dead_entries, 0);
760 
761 	return (0);
762 }
763 
764 /*
765  * sys_mprotect: the mprotect system call
766  */
767 
768 int
sys_mprotect(p,v,retval)769 sys_mprotect(p, v, retval)
770 	struct proc *p;
771 	void *v;
772 	register_t *retval;
773 {
774 	struct sys_mprotect_args /* {
775 		syscallarg(void *) addr;
776 		syscallarg(size_t) len;
777 		syscallarg(int) prot;
778 	} */ *uap = v;
779 	vaddr_t addr;
780 	vsize_t size, pageoff;
781 	vm_prot_t prot;
782 	int rv;
783 
784 	/*
785 	 * extract syscall args from uap
786 	 */
787 
788 	addr = (vaddr_t)SCARG(uap, addr);
789 	size = (vsize_t)SCARG(uap, len);
790 	prot = SCARG(uap, prot);
791 
792 	if ((prot & VM_PROT_ALL) != prot)
793 		return (EINVAL);
794 
795 	/*
796 	 * align the address to a page boundary, and adjust the size accordingly
797 	 */
798 	pageoff = (addr & PAGE_MASK);
799 	addr -= pageoff;
800 	size += pageoff;
801 	size = (vsize_t) round_page(size);
802 	if ((ssize_t)size < 0)
803 		return (EINVAL);
804 
805 	/*
806 	 * doit
807 	 */
808 
809 	rv = uvm_map_protect(&p->p_vmspace->vm_map,
810 			   addr, addr+size, prot, FALSE);
811 
812 	if (rv == KERN_SUCCESS)
813 		return (0);
814 	if (rv == KERN_PROTECTION_FAILURE)
815 		return (EACCES);
816 	return (EINVAL);
817 }
818 
819 /*
820  * sys_minherit: the minherit system call
821  */
822 
823 int
sys_minherit(p,v,retval)824 sys_minherit(p, v, retval)
825 	struct proc *p;
826 	void *v;
827 	register_t *retval;
828 {
829 	struct sys_minherit_args /* {
830 		syscallarg(void *) addr;
831 		syscallarg(size_t) len;
832 		syscallarg(int) inherit;
833 	} */ *uap = v;
834 	vaddr_t addr;
835 	vsize_t size, pageoff;
836 	vm_inherit_t inherit;
837 
838 	addr = (vaddr_t)SCARG(uap, addr);
839 	size = (vsize_t)SCARG(uap, len);
840 	inherit = SCARG(uap, inherit);
841 	/*
842 	 * align the address to a page boundary, and adjust the size accordingly
843 	 */
844 
845 	pageoff = (addr & PAGE_MASK);
846 	addr -= pageoff;
847 	size += pageoff;
848 	size = (vsize_t) round_page(size);
849 
850 	if ((ssize_t)size < 0)
851 		return (EINVAL);
852 
853 	switch (uvm_map_inherit(&p->p_vmspace->vm_map, addr, addr+size,
854 			 inherit)) {
855 	case KERN_SUCCESS:
856 		return (0);
857 	case KERN_PROTECTION_FAILURE:
858 		return (EACCES);
859 	}
860 	return (EINVAL);
861 }
862 
863 /*
864  * sys_madvise: give advice about memory usage.
865  */
866 
867 /* ARGSUSED */
868 int
sys_madvise(p,v,retval)869 sys_madvise(p, v, retval)
870 	struct proc *p;
871 	void *v;
872 	register_t *retval;
873 {
874 	struct sys_madvise_args /* {
875 		syscallarg(void *) addr;
876 		syscallarg(size_t) len;
877 		syscallarg(int) behav;
878 	} */ *uap = v;
879 	vaddr_t addr;
880 	vsize_t size, pageoff;
881 	int advice, rv;
882 
883 	addr = (vaddr_t)SCARG(uap, addr);
884 	size = (vsize_t)SCARG(uap, len);
885 	advice = SCARG(uap, behav);
886 
887 	/*
888 	 * align the address to a page boundary, and adjust the size accordingly
889 	 */
890 	pageoff = (addr & PAGE_MASK);
891 	addr -= pageoff;
892 	size += pageoff;
893 	size = (vsize_t) round_page(size);
894 
895 	if ((ssize_t)size <= 0)
896 		return (EINVAL);
897 
898 	switch (advice) {
899 	case MADV_NORMAL:
900 	case MADV_RANDOM:
901 	case MADV_SEQUENTIAL:
902 		rv = uvm_map_advice(&p->p_vmspace->vm_map, addr, addr + size,
903 		    advice);
904 		break;
905 
906 	case MADV_WILLNEED:
907 		/*
908 		 * Activate all these pages, pre-faulting them in if
909 		 * necessary.
910 		 */
911 		/*
912 		 * XXX IMPLEMENT ME.
913 		 * Should invent a "weak" mode for uvm_fault()
914 		 * which would only do the PGO_LOCKED pgo_get().
915 		 */
916 		return (0);
917 
918 	case MADV_DONTNEED:
919 		/*
920 		 * Deactivate all these pages.  We don't need them
921 		 * any more.  We don't, however, toss the data in
922 		 * the pages.
923 		 */
924 		rv = uvm_map_clean(&p->p_vmspace->vm_map, addr, addr + size,
925 		    PGO_DEACTIVATE);
926 		break;
927 
928 	case MADV_FREE:
929 		/*
930 		 * These pages contain no valid data, and may be
931 		 * garbage-collected.  Toss all resources, including
932 		 * any swap space in use.
933 		 */
934 		rv = uvm_map_clean(&p->p_vmspace->vm_map, addr, addr + size,
935 		    PGO_FREE);
936 		break;
937 
938 	case MADV_SPACEAVAIL:
939 		/*
940 		 * XXXMRG What is this?  I think it's:
941 		 *
942 		 *	Ensure that we have allocated backing-store
943 		 *	for these pages.
944 		 *
945 		 * This is going to require changes to the page daemon,
946 		 * as it will free swap space allocated to pages in core.
947 		 * There's also what to do for device/file/anonymous memory.
948 		 */
949 		return (EINVAL);
950 
951 	default:
952 		return (EINVAL);
953 	}
954 
955 	return (rv);
956 }
957 
958 /*
959  * sys_mlock: memory lock
960  */
961 
962 int
sys_mlock(p,v,retval)963 sys_mlock(p, v, retval)
964 	struct proc *p;
965 	void *v;
966 	register_t *retval;
967 {
968 	struct sys_mlock_args /* {
969 		syscallarg(const void *) addr;
970 		syscallarg(size_t) len;
971 	} */ *uap = v;
972 	vaddr_t addr;
973 	vsize_t size, pageoff;
974 	int error;
975 
976 	/*
977 	 * extract syscall args from uap
978 	 */
979 	addr = (vaddr_t)SCARG(uap, addr);
980 	size = (vsize_t)SCARG(uap, len);
981 
982 	/*
983 	 * align the address to a page boundary and adjust the size accordingly
984 	 */
985 	pageoff = (addr & PAGE_MASK);
986 	addr -= pageoff;
987 	size += pageoff;
988 	size = (vsize_t) round_page(size);
989 
990 	/* disallow wrap-around. */
991 	if (addr + (ssize_t)size < addr)
992 		return (EINVAL);
993 
994 	if (atop(size) + uvmexp.wired > uvmexp.wiredmax)
995 		return (EAGAIN);
996 
997 #ifdef pmap_wired_count
998 	if (size + ptoa(pmap_wired_count(vm_map_pmap(&p->p_vmspace->vm_map))) >
999 			p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur)
1000 		return (EAGAIN);
1001 #else
1002 	if ((error = suser(p, 0)) != 0)
1003 		return (error);
1004 #endif
1005 
1006 	error = uvm_map_pageable(&p->p_vmspace->vm_map, addr, addr+size, FALSE,
1007 	    0);
1008 	return (error == KERN_SUCCESS ? 0 : ENOMEM);
1009 }
1010 
1011 /*
1012  * sys_munlock: unlock wired pages
1013  */
1014 
1015 int
sys_munlock(p,v,retval)1016 sys_munlock(p, v, retval)
1017 	struct proc *p;
1018 	void *v;
1019 	register_t *retval;
1020 {
1021 	struct sys_munlock_args /* {
1022 		syscallarg(const void *) addr;
1023 		syscallarg(size_t) len;
1024 	} */ *uap = v;
1025 	vaddr_t addr;
1026 	vsize_t size, pageoff;
1027 	int error;
1028 
1029 	/*
1030 	 * extract syscall args from uap
1031 	 */
1032 
1033 	addr = (vaddr_t)SCARG(uap, addr);
1034 	size = (vsize_t)SCARG(uap, len);
1035 
1036 	/*
1037 	 * align the address to a page boundary, and adjust the size accordingly
1038 	 */
1039 	pageoff = (addr & PAGE_MASK);
1040 	addr -= pageoff;
1041 	size += pageoff;
1042 	size = (vsize_t) round_page(size);
1043 
1044 	/* disallow wrap-around. */
1045 	if (addr + (ssize_t)size < addr)
1046 		return (EINVAL);
1047 
1048 #ifndef pmap_wired_count
1049 	if ((error = suser(p, 0)) != 0)
1050 		return (error);
1051 #endif
1052 
1053 	error = uvm_map_pageable(&p->p_vmspace->vm_map, addr, addr+size, TRUE,
1054 	    0);
1055 	return (error == KERN_SUCCESS ? 0 : ENOMEM);
1056 }
1057 
1058 /*
1059  * sys_mlockall: lock all pages mapped into an address space.
1060  */
1061 
1062 int
sys_mlockall(p,v,retval)1063 sys_mlockall(p, v, retval)
1064 	struct proc *p;
1065 	void *v;
1066 	register_t *retval;
1067 {
1068 	struct sys_mlockall_args /* {
1069 		syscallarg(int) flags;
1070 	} */ *uap = v;
1071 	int error, flags;
1072 
1073 	flags = SCARG(uap, flags);
1074 
1075 	if (flags == 0 ||
1076 	    (flags & ~(MCL_CURRENT|MCL_FUTURE)) != 0)
1077 		return (EINVAL);
1078 
1079 #ifndef pmap_wired_count
1080 	if ((error = suser(p, 0)) != 0)
1081 		return (error);
1082 #endif
1083 
1084 	error = uvm_map_pageable_all(&p->p_vmspace->vm_map, flags,
1085 	    p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur);
1086 	switch (error) {
1087 	case KERN_SUCCESS:
1088 		error = 0;
1089 		break;
1090 
1091 	case KERN_NO_SPACE:	/* XXX overloaded */
1092 		error = ENOMEM;
1093 		break;
1094 
1095 	default:
1096 		/*
1097 		 * "Some or all of the memory could not be locked when
1098 		 * the call was made."
1099 		 */
1100 		error = EAGAIN;
1101 	}
1102 
1103 	return (error);
1104 }
1105 
1106 /*
1107  * sys_munlockall: unlock all pages mapped into an address space.
1108  */
1109 
1110 int
sys_munlockall(p,v,retval)1111 sys_munlockall(p, v, retval)
1112 	struct proc *p;
1113 	void *v;
1114 	register_t *retval;
1115 {
1116 
1117 	(void) uvm_map_pageable_all(&p->p_vmspace->vm_map, 0, 0);
1118 	return (0);
1119 }
1120 
1121 /*
1122  * uvm_mmap: internal version of mmap
1123  *
1124  * - used by sys_mmap, exec, and sysv shm
1125  * - handle is a vnode pointer or NULL for MAP_ANON (XXX: not true,
1126  *	sysv shm uses "named anonymous memory")
1127  * - caller must page-align the file offset
1128  */
1129 
1130 int
uvm_mmap(map,addr,size,prot,maxprot,flags,handle,foff,locklimit)1131 uvm_mmap(map, addr, size, prot, maxprot, flags, handle, foff, locklimit)
1132 	vm_map_t map;
1133 	vaddr_t *addr;
1134 	vsize_t size;
1135 	vm_prot_t prot, maxprot;
1136 	int flags;
1137 	caddr_t handle;		/* XXX: VNODE? */
1138 	voff_t foff;
1139 	vsize_t locklimit;
1140 {
1141 	struct uvm_object *uobj;
1142 	struct vnode *vp;
1143 	int retval;
1144 	int advice = UVM_ADV_NORMAL;
1145 	uvm_flag_t uvmflag = 0;
1146 	vsize_t align = 0;	/* userland page size */
1147 
1148 	/*
1149 	 * check params
1150 	 */
1151 
1152 	if (size == 0)
1153 		return(0);
1154 	if (foff & PAGE_MASK)
1155 		return(EINVAL);
1156 	if ((prot & maxprot) != prot)
1157 		return(EINVAL);
1158 
1159 	/*
1160 	 * for non-fixed mappings, round off the suggested address.
1161 	 * for fixed mappings, check alignment and zap old mappings.
1162 	 */
1163 
1164 	if ((flags & MAP_FIXED) == 0) {
1165 		*addr = round_page(*addr);	/* round */
1166 	} else {
1167 		if (*addr & PAGE_MASK)
1168 			return(EINVAL);
1169 
1170 		uvmflag |= UVM_FLAG_FIXED;
1171 		if ((flags & __MAP_NOREPLACE) == 0)
1172 			uvm_unmap(map, *addr, *addr + size);	/* zap! */
1173 	}
1174 
1175 	/*
1176 	 * handle anon vs. non-anon mappings.   for non-anon mappings attach
1177 	 * to underlying vm object.
1178 	 */
1179 
1180 	if (flags & MAP_ANON) {
1181 		if ((flags & MAP_FIXED) == 0 && size >= __LDPGSZ)
1182 			align = __LDPGSZ;
1183 		foff = UVM_UNKNOWN_OFFSET;
1184 		uobj = NULL;
1185 		if ((flags & MAP_SHARED) == 0)
1186 			/* XXX: defer amap create */
1187 			uvmflag |= UVM_FLAG_COPYONW;
1188 		else
1189 			/* shared: create amap now */
1190 			uvmflag |= UVM_FLAG_OVERLAY;
1191 
1192 	} else {
1193 
1194 		vp = (struct vnode *) handle;	/* get vnode */
1195 		if (vp->v_type != VCHR) {
1196 			uobj = uvn_attach((void *) vp, (flags & MAP_SHARED) ?
1197 			   maxprot : (maxprot & ~VM_PROT_WRITE));
1198 
1199 #ifndef UBC
1200 			/*
1201 			 * XXXCDC: hack from old code
1202 			 * don't allow vnodes which have been mapped
1203 			 * shared-writeable to persist [forces them to be
1204 			 * flushed out when last reference goes].
1205 			 * XXXCDC: interesting side effect: avoids a bug.
1206 			 * note that in WRITE [ufs_readwrite.c] that we
1207 			 * allocate buffer, uncache, and then do the write.
1208 			 * the problem with this is that if the uncache causes
1209 			 * VM data to be flushed to the same area of the file
1210 			 * we are writing to... in that case we've got the
1211 			 * buffer locked and our process goes to sleep forever.
1212 			 *
1213 			 * XXXCDC: checking maxprot protects us from the
1214 			 * "persistbug" program but this is not a long term
1215 			 * solution.
1216 			 *
1217 			 * XXXCDC: we don't bother calling uncache with the vp
1218 			 * VOP_LOCKed since we know that we are already
1219 			 * holding a valid reference to the uvn (from the
1220 			 * uvn_attach above), and thus it is impossible for
1221 			 * the uncache to kill the uvn and trigger I/O.
1222 			 */
1223 			if (flags & MAP_SHARED) {
1224 				if ((prot & VM_PROT_WRITE) ||
1225 				    (maxprot & VM_PROT_WRITE)) {
1226 					uvm_vnp_uncache(vp);
1227 				}
1228 			}
1229 #else
1230 			/* XXX for now, attach doesn't gain a ref */
1231 			VREF(vp);
1232 #endif
1233 		} else {
1234 			uobj = udv_attach((void *) &vp->v_rdev,
1235 			    (flags & MAP_SHARED) ? maxprot :
1236 			    (maxprot & ~VM_PROT_WRITE), foff, size);
1237 			/*
1238 			 * XXX Some devices don't like to be mapped with
1239 			 * XXX PROT_EXEC, but we don't really have a
1240 			 * XXX better way of handling this, right now
1241 			 */
1242 			if (uobj == NULL && (prot & PROT_EXEC) == 0) {
1243 				maxprot &= ~VM_PROT_EXECUTE;
1244 				uobj = udv_attach((void *) &vp->v_rdev,
1245 				    (flags & MAP_SHARED) ? maxprot :
1246 				    (maxprot & ~VM_PROT_WRITE), foff, size);
1247 			}
1248 			advice = UVM_ADV_RANDOM;
1249 		}
1250 
1251 		if (uobj == NULL)
1252 			return((vp->v_type == VREG) ? ENOMEM : EINVAL);
1253 
1254 		if ((flags & MAP_SHARED) == 0)
1255 			uvmflag |= UVM_FLAG_COPYONW;
1256 	}
1257 
1258 	/*
1259 	 * set up mapping flags
1260 	 */
1261 
1262 	uvmflag = UVM_MAPFLAG(prot, maxprot,
1263 			(flags & MAP_SHARED) ? UVM_INH_SHARE : UVM_INH_COPY,
1264 			advice, uvmflag);
1265 
1266 	/*
1267 	 * do it!
1268 	 */
1269 
1270 	retval = uvm_map(map, addr, size, uobj, foff, align, uvmflag);
1271 
1272 	if (retval == KERN_SUCCESS) {
1273 		/*
1274 		 * POSIX 1003.1b -- if our address space was configured
1275 		 * to lock all future mappings, wire the one we just made.
1276 		 */
1277 		if (prot == VM_PROT_NONE) {
1278 			/*
1279 			 * No more work to do in this case.
1280 			 */
1281 			return (0);
1282 		}
1283 
1284 		vm_map_lock(map);
1285 
1286 		if (map->flags & VM_MAP_WIREFUTURE) {
1287 			if ((atop(size) + uvmexp.wired) > uvmexp.wiredmax
1288 #ifdef pmap_wired_count
1289 			    || (locklimit != 0 && (size +
1290 			         ptoa(pmap_wired_count(vm_map_pmap(map)))) >
1291 			        locklimit)
1292 #endif
1293 			) {
1294 				retval = KERN_RESOURCE_SHORTAGE;
1295 				vm_map_unlock(map);
1296 				/* unmap the region! */
1297 				uvm_unmap(map, *addr, *addr + size);
1298 				goto bad;
1299 			}
1300 			/*
1301 			 * uvm_map_pageable() always returns the map
1302 			 * unlocked.
1303 			 */
1304 			retval = uvm_map_pageable(map, *addr, *addr + size,
1305 			    FALSE, UVM_LK_ENTER);
1306 			if (retval != KERN_SUCCESS) {
1307 				/* unmap the region! */
1308 				uvm_unmap(map, *addr, *addr + size);
1309 				goto bad;
1310 			}
1311 			return (0);
1312 		}
1313 
1314 		vm_map_unlock(map);
1315 
1316 		return (0);
1317 	}
1318 
1319 	/*
1320 	 * errors: first detach from the uobj, if any.
1321 	 */
1322 
1323 	if (uobj)
1324 		uobj->pgops->pgo_detach(uobj);
1325 
1326  bad:
1327 	return (retval);
1328 }
1329