1 /*        $NetBSD: sys_memfd.c,v 1.11 2023/08/12 23:22:49 christos Exp $        */
2 
3 /*-
4  * Copyright (c) 2023 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Theodore Preduta.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 #include <sys/cdefs.h>
33 __KERNEL_RCSID(0, "$NetBSD: sys_memfd.c,v 1.11 2023/08/12 23:22:49 christos Exp $");
34 
35 #include <sys/param.h>
36 #include <sys/types.h>
37 
38 #include <sys/fcntl.h>
39 #include <sys/file.h>
40 #include <sys/filedesc.h>
41 #include <sys/memfd.h>
42 #include <sys/mman.h>
43 #include <sys/syscallargs.h>
44 
45 #include <uvm/uvm_extern.h>
46 #include <uvm/uvm_object.h>
47 
/* Either of the seals that forbid writing to the memfd. */
#define	F_SEAL_ANY_WRITE	(F_SEAL_WRITE|F_SEAL_FUTURE_WRITE)

/* Every seal bit F_ADD_SEALS accepts; any other bit is rejected. */
#define	MFD_KNOWN_SEALS		(F_SEAL_SEAL|F_SEAL_SHRINK|F_SEAL_GROW \
				|F_SEAL_ANY_WRITE)

/* Placed in front of the caller-supplied name when mfd_name is built. */
static const char memfd_prefix[] = "memfd:";
53 
54 static int memfd_read(file_t *, off_t *, struct uio *, kauth_cred_t, int);
55 static int memfd_write(file_t *, off_t *, struct uio *, kauth_cred_t, int);
56 static int memfd_ioctl(file_t *, u_long, void *);
57 static int memfd_fcntl(file_t *, u_int, void *);
58 static int memfd_stat(file_t *, struct stat *);
59 static int memfd_close(file_t *);
60 static int memfd_mmap(file_t *, off_t *, size_t, int, int *, int *,
61     struct uvm_object **, int *);
62 static int memfd_seek(file_t *, off_t, int, off_t *, int);
63 static int memfd_truncate_locked(file_t *, off_t);
64 static int memfd_truncate(file_t *, off_t);
65 
66 static const struct fileops memfd_fileops = {
67           .fo_name = "memfd",
68           .fo_read = memfd_read,
69           .fo_write = memfd_write,
70           .fo_ioctl = memfd_ioctl,
71           .fo_fcntl = memfd_fcntl,
72           .fo_poll = fnullop_poll,
73           .fo_stat = memfd_stat,
74           .fo_close = memfd_close,
75           .fo_kqfilter = fnullop_kqfilter,
76           .fo_restart = fnullop_restart,
77           .fo_mmap = memfd_mmap,
78           .fo_seek = memfd_seek,
79           .fo_fpathconf = (void *)eopnotsupp,
80           .fo_posix_fadvise = (void *)eopnotsupp,
81           .fo_truncate = memfd_truncate,
82 };
83 
84 /*
85  * memfd_create(2).  Creat a file descriptor associated with anonymous
86  * memory.
87  */
88 int
sys_memfd_create(struct lwp * l,const struct sys_memfd_create_args * uap,register_t * retval)89 sys_memfd_create(struct lwp *l, const struct sys_memfd_create_args *uap,
90     register_t *retval)
91 {
92           /* {
93                     syscallarg(const char *) name;
94                     syscallarg(unsigned int) flags;
95           } */
96           int error, fd;
97           file_t *fp;
98           struct memfd *mfd;
99           struct proc *p = l->l_proc;
100           const unsigned int flags = SCARG(uap, flags);
101 
102           if (flags & ~(MFD_CLOEXEC|MFD_ALLOW_SEALING))
103                     return EINVAL;
104 
105           mfd = kmem_zalloc(sizeof(*mfd), KM_SLEEP);
106           mfd->mfd_size = 0;
107           mfd->mfd_uobj = uao_create(INT64_MAX - PAGE_SIZE, 0); /* same as tmpfs */
108 
109           CTASSERT(sizeof(memfd_prefix) < NAME_MAX); /* sanity check */
110           strcpy(mfd->mfd_name, memfd_prefix);
111           error = copyinstr(SCARG(uap, name),
112               &mfd->mfd_name[sizeof(memfd_prefix) - 1],
113               sizeof(mfd->mfd_name) - sizeof(memfd_prefix), NULL);
114           if (error != 0)
115                     goto leave;
116 
117           getnanotime(&mfd->mfd_btime);
118 
119           if ((flags & MFD_ALLOW_SEALING) == 0)
120                     mfd->mfd_seals |= F_SEAL_SEAL;
121 
122           error = fd_allocfile(&fp, &fd);
123           if (error != 0)
124                     goto leave;
125 
126           fp->f_flag = FREAD|FWRITE;
127           fp->f_type = DTYPE_MEMFD;
128           fp->f_ops = &memfd_fileops;
129           fp->f_memfd = mfd;
130           fd_set_exclose(l, fd, (flags & MFD_CLOEXEC) != 0);
131           fd_affix(p, fp, fd);
132 
133           *retval = fd;
134           return 0;
135 
136 leave:
137           uao_detach(mfd->mfd_uobj);
138           kmem_free(mfd, sizeof(*mfd));
139           return error;
140 }
141 
142 static int
memfd_read(file_t * fp,off_t * offp,struct uio * uio,kauth_cred_t cred,int flags)143 memfd_read(file_t *fp, off_t *offp, struct uio *uio, kauth_cred_t cred,
144     int flags)
145 {
146           int error;
147           vsize_t todo;
148           struct memfd *mfd = fp->f_memfd;
149 
150           mutex_enter(&fp->f_lock);
151 
152           if (*offp < 0) {
153                     error = EINVAL;
154                     goto leave;
155           }
156 
157           /* Trying to read past the end does nothing. */
158           if (*offp >= mfd->mfd_size) {
159                     error = 0;
160                     goto leave;
161           }
162 
163           uio->uio_offset = *offp;
164           todo = MIN(uio->uio_resid, mfd->mfd_size - *offp);
165           error = ubc_uiomove(mfd->mfd_uobj, uio, todo, UVM_ADV_SEQUENTIAL,
166               UBC_READ|UBC_PARTIALOK);
167           if (flags & FOF_UPDATE_OFFSET)
168                     *offp = uio->uio_offset;
169 
170 leave:
171           getnanotime(&mfd->mfd_atime);
172 
173 
174           mutex_exit(&fp->f_lock);
175 
176           return error;
177 }
178 
179 static int
memfd_write(file_t * fp,off_t * offp,struct uio * uio,kauth_cred_t cred,int flags)180 memfd_write(file_t *fp, off_t *offp, struct uio *uio, kauth_cred_t cred,
181     int flags)
182 {
183           int error;
184           vsize_t todo;
185           struct memfd *mfd = fp->f_memfd;
186 
187           mutex_enter(&fp->f_lock);
188 
189           if (mfd->mfd_seals & F_SEAL_ANY_WRITE) {
190                     error = EPERM;
191                     goto leave;
192           }
193 
194           if (*offp < 0) {
195                     error = EINVAL;
196                     goto leave;
197           }
198 
199           uio->uio_offset = *offp;
200           todo = uio->uio_resid;
201 
202           if (mfd->mfd_seals & F_SEAL_GROW) {
203                     if (*offp >= mfd->mfd_size) {
204                               error = EPERM;
205                               goto leave;
206                     }
207 
208                     /* Truncate the write to fit in mfd_size */
209                     if (*offp + uio->uio_resid >= mfd->mfd_size)
210                               todo = mfd->mfd_size - *offp;
211           } else if (*offp + uio->uio_resid >= mfd->mfd_size) {
212                     /* Grow to accommodate the write request. */
213                     error = memfd_truncate_locked(fp, *offp + uio->uio_resid);
214                     if (error != 0)
215                               goto leave;
216           }
217 
218           error = ubc_uiomove(mfd->mfd_uobj, uio, todo, UVM_ADV_SEQUENTIAL,
219               UBC_WRITE|UBC_PARTIALOK);
220           if (flags & FOF_UPDATE_OFFSET)
221                     *offp = uio->uio_offset;
222 
223           getnanotime(&mfd->mfd_mtime);
224 
225 leave:
226           mutex_exit(&fp->f_lock);
227 
228           return error;
229 }
230 
231 static int
memfd_ioctl(file_t * fp,u_long cmd,void * data)232 memfd_ioctl(file_t *fp, u_long cmd, void *data)
233 {
234 
235           return EINVAL;
236 }
237 
238 static int
memfd_fcntl(file_t * fp,u_int cmd,void * data)239 memfd_fcntl(file_t *fp, u_int cmd, void *data)
240 {
241           struct memfd *mfd = fp->f_memfd;
242           int error = 0;
243 
244           switch (cmd) {
245           case F_GETPATH:
246                     strncpy(data, mfd->mfd_name, MAXPATHLEN);
247                     return 0;
248 
249           case F_ADD_SEALS:
250                     mutex_enter(&fp->f_lock);
251 
252                     if (mfd->mfd_seals & F_SEAL_SEAL) {
253                             error = EPERM;
254                               goto leave_add_seals;
255                     }
256 
257                     if (*(int *)data & ~MFD_KNOWN_SEALS) {
258                             error = EINVAL;
259                               goto leave_add_seals;
260                     }
261 
262                     /*
263                      * Can only add F_SEAL_WRITE if there are no currently
264                      * open mmaps.
265                      *
266                      * XXX should only disallow if there are no currently
267                      * open mmaps with PROT_WRITE.
268                      */
269                     if ((mfd->mfd_seals & F_SEAL_WRITE) == 0 &&
270                         (*(int *)data & F_SEAL_WRITE) != 0 &&
271                         mfd->mfd_uobj->uo_refs > 1)
272                     {
273                               error = EBUSY;
274                               goto leave_add_seals;
275                     }
276 
277                     mfd->mfd_seals |= *(int *)data;
278 
279           leave_add_seals:
280                     mutex_exit(&fp->f_lock);
281                     return error;
282 
283           case F_GET_SEALS:
284                     mutex_enter(&fp->f_lock);
285                     *(int *)data = mfd->mfd_seals;
286                     mutex_exit(&fp->f_lock);
287                     return 0;
288 
289           default:
290                     return EINVAL;
291           }
292 }
293 
294 static int
memfd_stat(file_t * fp,struct stat * st)295 memfd_stat(file_t *fp, struct stat *st)
296 {
297           struct memfd *mfd = fp->f_memfd;
298 
299           mutex_enter(&fp->f_lock);
300 
301           memset(st, 0, sizeof(*st));
302           st->st_uid = kauth_cred_geteuid(fp->f_cred);
303           st->st_gid = kauth_cred_getegid(fp->f_cred);
304           st->st_size = mfd->mfd_size;
305 
306           st->st_mode = S_IREAD;
307           if ((mfd->mfd_seals & F_SEAL_ANY_WRITE) == 0)
308                     st->st_mode |= S_IWRITE;
309 
310           st->st_birthtimespec = mfd->mfd_btime;
311           st->st_ctimespec = mfd->mfd_mtime;
312           st->st_atimespec = mfd->mfd_atime;
313           st->st_mtimespec = mfd->mfd_mtime;
314 
315           mutex_exit(&fp->f_lock);
316 
317           return 0;
318 }
319 
320 static int
memfd_close(file_t * fp)321 memfd_close(file_t *fp)
322 {
323           struct memfd *mfd = fp->f_memfd;
324 
325           uao_detach(mfd->mfd_uobj);
326 
327           kmem_free(mfd, sizeof(*mfd));
328           fp->f_memfd = NULL;
329 
330           return 0;
331 }
332 
333 static int
memfd_mmap(file_t * fp,off_t * offp,size_t size,int prot,int * flagsp,int * advicep,struct uvm_object ** uobjp,int * maxprotp)334 memfd_mmap(file_t *fp, off_t *offp, size_t size, int prot, int *flagsp,
335     int *advicep, struct uvm_object **uobjp, int *maxprotp)
336 {
337           struct memfd *mfd = fp->f_memfd;
338           int error = 0;
339 
340           /* uvm_mmap guarantees page-aligned offset and size.  */
341           KASSERT(*offp == round_page(*offp));
342           KASSERT(size == round_page(size));
343           KASSERT(size > 0);
344 
345           mutex_enter(&fp->f_lock);
346 
347           if (*offp < 0) {
348                     error = EINVAL;
349                     goto leave;
350           }
351           if (*offp + size > mfd->mfd_size) {
352                     error = EINVAL;
353                     goto leave;
354           }
355 
356           if ((mfd->mfd_seals & F_SEAL_ANY_WRITE) &&
357               (prot & VM_PROT_WRITE) && (*flagsp & MAP_PRIVATE) == 0) {
358                     error = EPERM;
359                     goto leave;
360           }
361 
362           uao_reference(fp->f_memfd->mfd_uobj);
363           *uobjp = fp->f_memfd->mfd_uobj;
364 
365           *maxprotp = prot;
366           *advicep = UVM_ADV_RANDOM;
367 
368 leave:
369           mutex_exit(&fp->f_lock);
370 
371           return error;
372 }
373 
374 static int
memfd_seek(file_t * fp,off_t delta,int whence,off_t * newoffp,int flags)375 memfd_seek(file_t *fp, off_t delta, int whence, off_t *newoffp,
376     int flags)
377 {
378           off_t newoff;
379           int error = 0;
380 
381           mutex_enter(&fp->f_lock);
382 
383           switch (whence) {
384           case SEEK_CUR:
385                     newoff = fp->f_offset + delta;
386                     break;
387 
388           case SEEK_END:
389                     newoff = fp->f_memfd->mfd_size + delta;
390                     break;
391 
392           case SEEK_SET:
393                     newoff = delta;
394                     break;
395 
396           default:
397                     error = EINVAL;
398                     goto leave;
399           }
400 
401           if (newoffp)
402                     *newoffp = newoff;
403           if (flags & FOF_UPDATE_OFFSET)
404                     fp->f_offset = newoff;
405 
406 leave:
407           mutex_exit(&fp->f_lock);
408 
409           return error;
410 }
411 
412 static int
memfd_truncate_locked(file_t * fp,off_t length)413 memfd_truncate_locked(file_t *fp, off_t length)
414 {
415           struct memfd *mfd = fp->f_memfd;
416           voff_t start, end;
417           int error = 0;
418 
419           KASSERT(mutex_owned(&fp->f_lock));
420 
421           if (length < 0)
422                     return EINVAL;
423           if (length == mfd->mfd_size)
424                     return 0;
425 
426           if ((mfd->mfd_seals & F_SEAL_SHRINK) && length < mfd->mfd_size)
427                     return EPERM;
428           if ((mfd->mfd_seals & F_SEAL_GROW) && length > mfd->mfd_size)
429                     return EPERM;
430 
431           if (length > mfd->mfd_size)
432                     ubc_zerorange(mfd->mfd_uobj, mfd->mfd_size,
433                         length - mfd->mfd_size, 0);
434           else {
435                     /* length < mfd->mfd_size, so try to get rid of excess pages */
436                     start = round_page(length);
437                     end = round_page(mfd->mfd_size);
438 
439                     if (start < end) { /* we actually have pages to remove */
440                               rw_enter(mfd->mfd_uobj->vmobjlock, RW_WRITER);
441                               error = (*mfd->mfd_uobj->pgops->pgo_put)(mfd->mfd_uobj,
442                                   start, end, PGO_FREE);
443                               /* pgo_put drops vmobjlock */
444                     }
445           }
446 
447           getnanotime(&mfd->mfd_mtime);
448           mfd->mfd_size = length;
449 
450           return error;
451 }
452 
453 static int
memfd_truncate(file_t * fp,off_t length)454 memfd_truncate(file_t *fp, off_t length)
455 {
456           int error;
457 
458           mutex_enter(&fp->f_lock);
459           error = memfd_truncate_locked(fp, length);
460           mutex_exit(&fp->f_lock);
461           return error;
462 }
463