1 /*--
2  * Copyright (c) 1997, Duke University
3  * All rights reserved.
4  *
5  * Author:
6  *         Andrew Gallatin <gallatin@cs.duke.edu>
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. The name of Duke University may not be used to endorse or promote
17  *    products derived from this software without specific prior written
18  *    permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY DUKE UNIVERSITY ``AS IS'' AND ANY
21  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
23  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL DUKE UNIVERSITY BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
28  * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
29  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
30  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31  */
32 
33 /*
34  * This is a set of routines for enabling and disabling copy on write
35  * protection for data written into sockets.
36  */
37 
38 #include <sys/cdefs.h>
39 __FBSDID("$FreeBSD: stable/9/sys/kern/uipc_cow.c 219028 2011-02-25 10:11:01Z netchild $");
40 
41 #include <sys/param.h>
42 #include <sys/systm.h>
43 #include <sys/sysctl.h>
44 #include <sys/kernel.h>
45 #include <sys/proc.h>
46 #include <sys/lock.h>
47 #include <sys/mutex.h>
48 #include <sys/mbuf.h>
49 #include <sys/sf_buf.h>
50 #include <sys/socketvar.h>
51 #include <sys/uio.h>
52 
53 #include <vm/vm.h>
54 #include <vm/vm_extern.h>
55 #include <vm/vm_param.h>
56 #include <vm/pmap.h>
57 #include <vm/vm_map.h>
58 #include <vm/vm_page.h>
59 #include <vm/vm_object.h>
60 
61 FEATURE(zero_copy_sockets, "Zero copy sockets support");
62 
/*
 * Event counters for the socket copy-on-write send path.  The type and its
 * single file-private instance are declared together; static storage means
 * every counter starts out at zero.
 */
static struct netsend_cow_stats {
	int attempted;		/* bumped on every entry to socow_setup() */
	int fail_not_mapped;	/* user page could not be held for reading */
	int fail_sf_buf;	/* sf_buf_alloc() returned NULL */
	int success;		/* page was attached to the mbuf */
	int iodone;		/* socow_iodone() ran to completion */
} socow_stats;
72 
73 static void socow_iodone(void *addr, void *args);
74 
75 static void
socow_iodone(void * addr,void * args)76 socow_iodone(void *addr, void *args)
77 {
78 	struct sf_buf *sf;
79 	vm_page_t pp;
80 
81 	sf = args;
82 	pp = sf_buf_page(sf);
83 	sf_buf_free(sf);
84 	/* remove COW mapping  */
85 	vm_page_lock(pp);
86 	vm_page_cowclear(pp);
87 	vm_page_unwire(pp, 0);
88 	/*
89 	 * Check for the object going away on us. This can
90 	 * happen since we don't hold a reference to it.
91 	 * If so, we're responsible for freeing the page.
92 	 */
93 	if (pp->wire_count == 0 && pp->object == NULL)
94 		vm_page_free(pp);
95 	vm_page_unlock(pp);
96 	socow_stats.iodone++;
97 }
98 
99 int
socow_setup(struct mbuf * m0,struct uio * uio)100 socow_setup(struct mbuf *m0, struct uio *uio)
101 {
102 	struct sf_buf *sf;
103 	vm_page_t pp;
104 	struct iovec *iov;
105 	struct vmspace *vmspace;
106 	struct vm_map *map;
107 	vm_offset_t offset, uva;
108 	vm_size_t len;
109 
110 	socow_stats.attempted++;
111 	vmspace = curproc->p_vmspace;
112 	map = &vmspace->vm_map;
113 	uva = (vm_offset_t) uio->uio_iov->iov_base;
114 	offset = uva & PAGE_MASK;
115 	len = PAGE_SIZE - offset;
116 
117 	/*
118 	 * Verify that access to the given address is allowed from user-space.
119 	 */
120 	if (vm_fault_quick_hold_pages(map, uva, len, VM_PROT_READ, &pp, 1) <
121 	    0) {
122 		socow_stats.fail_not_mapped++;
123 		return(0);
124 	}
125 
126 	/*
127 	 * set up COW
128 	 */
129 	vm_page_lock(pp);
130 	if (vm_page_cowsetup(pp) != 0) {
131 		vm_page_unhold(pp);
132 		vm_page_unlock(pp);
133 		return (0);
134 	}
135 
136 	/*
137 	 * wire the page for I/O
138 	 */
139 	vm_page_wire(pp);
140 	vm_page_unhold(pp);
141 	vm_page_unlock(pp);
142 	/*
143 	 * Allocate an sf buf
144 	 */
145 	sf = sf_buf_alloc(pp, SFB_CATCH);
146 	if (sf == NULL) {
147 		vm_page_lock(pp);
148 		vm_page_cowclear(pp);
149 		vm_page_unwire(pp, 0);
150 		/*
151 		 * Check for the object going away on us. This can
152 		 * happen since we don't hold a reference to it.
153 		 * If so, we're responsible for freeing the page.
154 		 */
155 		if (pp->wire_count == 0 && pp->object == NULL)
156 			vm_page_free(pp);
157 		vm_page_unlock(pp);
158 		socow_stats.fail_sf_buf++;
159 		return(0);
160 	}
161 	/*
162 	 * attach to mbuf
163 	 */
164 	MEXTADD(m0, sf_buf_kva(sf), PAGE_SIZE, socow_iodone,
165 	    (void*)sf_buf_kva(sf), sf, M_RDONLY, EXT_SFBUF);
166 	m0->m_len = len;
167 	m0->m_data = (caddr_t)sf_buf_kva(sf) + offset;
168 	socow_stats.success++;
169 
170 	iov = uio->uio_iov;
171 	iov->iov_base = (char *)iov->iov_base + m0->m_len;
172 	iov->iov_len -= m0->m_len;
173 	uio->uio_resid -= m0->m_len;
174 	uio->uio_offset += m0->m_len;
175 	if (iov->iov_len == 0) {
176 		uio->uio_iov++;
177 		uio->uio_iovcnt--;
178 	}
179 
180 	return(m0->m_len);
181 }
182