xref: /dragonfly/sys/vfs/devfs/devfs_vnops.c (revision cc8e70bd591c943565dd618d131dcee0027ded02)
1 /*
2  * (MPSAFE)
3  *
4  * Copyright (c) 2009 The DragonFly Project.  All rights reserved.
5  *
6  * This code is derived from software contributed to The DragonFly Project
7  * by Alex Hornung <ahornung@gmail.com>
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  *
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in
17  *    the documentation and/or other materials provided with the
18  *    distribution.
19  * 3. Neither the name of The DragonFly Project nor the names of its
20  *    contributors may be used to endorse or promote products derived
21  *    from this software without specific, prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
26  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
27  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
28  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
29  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
31  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
32  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
33  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  */
36 #include <sys/param.h>
37 #include <sys/systm.h>
38 #include <sys/time.h>
39 #include <sys/kernel.h>
40 #include <sys/lock.h>
41 #include <sys/fcntl.h>
42 #include <sys/proc.h>
43 #include <sys/caps.h>
44 #include <sys/signalvar.h>
45 #include <sys/vnode.h>
46 #include <sys/uio.h>
47 #include <sys/mount.h>
48 #include <sys/file.h>
49 #include <sys/dirent.h>
50 #include <sys/malloc.h>
51 #include <sys/stat.h>
52 #include <sys/reg.h>
53 #include <vm/vm_pager.h>
54 #include <vm/vm_zone.h>
55 #include <vm/vm_object.h>
56 #include <sys/filio.h>
57 #include <sys/ttycom.h>
58 #include <sys/tty.h>
59 #include <sys/diskslice.h>
60 #include <sys/sysctl.h>
61 #include <sys/devfs.h>
62 #include <sys/pioctl.h>
63 #include <vfs/fifofs/fifo.h>
64 
65 #include <machine/limits.h>
66 
67 #include <sys/buf2.h>
68 #include <vm/vm_page2.h>
69 
70 #ifndef SPEC_CHAIN_DEBUG
71 #define SPEC_CHAIN_DEBUG 0
72 #endif
73 
74 MALLOC_DECLARE(M_DEVFS);
75 #define DEVFS_BADOP (void *)devfs_vop_badop
76 
77 static int devfs_vop_badop(struct vop_generic_args *);
78 static int devfs_vop_access(struct vop_access_args *);
79 static int devfs_vop_inactive(struct vop_inactive_args *);
80 static int devfs_vop_reclaim(struct vop_reclaim_args *);
81 static int devfs_vop_readdir(struct vop_readdir_args *);
82 static int devfs_vop_getattr(struct vop_getattr_args *);
83 static int devfs_vop_setattr(struct vop_setattr_args *);
84 static int devfs_vop_readlink(struct vop_readlink_args *);
85 static int devfs_vop_print(struct vop_print_args *);
86 
87 static int devfs_vop_nresolve(struct vop_nresolve_args *);
88 static int devfs_vop_nlookupdotdot(struct vop_nlookupdotdot_args *);
89 static int devfs_vop_nmkdir(struct vop_nmkdir_args *);
90 static int devfs_vop_nsymlink(struct vop_nsymlink_args *);
91 static int devfs_vop_nrmdir(struct vop_nrmdir_args *);
92 static int devfs_vop_nremove(struct vop_nremove_args *);
93 
94 static int devfs_spec_open(struct vop_open_args *);
95 static int devfs_spec_close(struct vop_close_args *);
96 static int devfs_spec_fsync(struct vop_fsync_args *);
97 
98 static int devfs_spec_read(struct vop_read_args *);
99 static int devfs_spec_write(struct vop_write_args *);
100 static int devfs_spec_ioctl(struct vop_ioctl_args *);
101 static int devfs_spec_kqfilter(struct vop_kqfilter_args *);
102 static int devfs_spec_strategy(struct vop_strategy_args *);
103 static void devfs_spec_strategy_done(struct bio *);
104 static int devfs_spec_freeblks(struct vop_freeblks_args *);
105 static int devfs_spec_bmap(struct vop_bmap_args *);
106 static int devfs_spec_advlock(struct vop_advlock_args *);
107 static void devfs_spec_getpages_iodone(struct bio *);
108 static int devfs_spec_getpages(struct vop_getpages_args *);
109 
110 static int devfs_fo_close(struct file *);
111 static int devfs_fo_read(struct file *, struct uio *, struct ucred *, int);
112 static int devfs_fo_write(struct file *, struct uio *, struct ucred *, int);
113 static int devfs_fo_stat(struct file *, struct stat *, struct ucred *);
114 static int devfs_fo_kqfilter(struct file *, struct knote *);
115 static int devfs_fo_ioctl(struct file *, u_long, caddr_t,
116                                         struct ucred *, struct sysmsg *);
117 static int devfs_fo_seek(struct file *, off_t, int, off_t *);
118 static __inline int sequential_heuristic(struct uio *, struct file *);
119 
120 extern struct lock devfs_lock;
121 
122 /*
123  * devfs vnode operations for regular files.  All vnode ops are MPSAFE.
124  */
125 struct vop_ops devfs_vnode_norm_vops = {
126           .vop_default =                vop_defaultop,
127           .vop_access =                 devfs_vop_access,
128           .vop_advlock =                DEVFS_BADOP,
129           .vop_bmap =                   DEVFS_BADOP,
130           .vop_close =                  vop_stdclose,
131           .vop_getattr =                devfs_vop_getattr,
132           .vop_inactive =               devfs_vop_inactive,
133           .vop_ncreate =                DEVFS_BADOP,
134           .vop_nresolve =               devfs_vop_nresolve,
135           .vop_nlookupdotdot =          devfs_vop_nlookupdotdot,
136           .vop_nlink =                  DEVFS_BADOP,
137           .vop_nmkdir =                 devfs_vop_nmkdir,
138           .vop_nmknod =                 DEVFS_BADOP,
139           .vop_nremove =                devfs_vop_nremove,
140           .vop_nrename =                DEVFS_BADOP,
141           .vop_nrmdir =                 devfs_vop_nrmdir,
142           .vop_nsymlink =               devfs_vop_nsymlink,
143           .vop_open =                   vop_stdopen,
144           .vop_pathconf =               vop_stdpathconf,
145           .vop_print =                  devfs_vop_print,
146           .vop_read =                   DEVFS_BADOP,
147           .vop_readdir =                devfs_vop_readdir,
148           .vop_readlink =               devfs_vop_readlink,
149           .vop_reallocblks =  DEVFS_BADOP,
150           .vop_reclaim =                devfs_vop_reclaim,
151           .vop_setattr =                devfs_vop_setattr,
152           .vop_write =                  DEVFS_BADOP,
153           .vop_ioctl =                  DEVFS_BADOP
154 };
155 
156 /*
157  * devfs vnode operations for character devices.  All vnode ops are MPSAFE.
158  */
159 struct vop_ops devfs_vnode_dev_vops = {
160           .vop_default =                vop_defaultop,
161           .vop_access =                 devfs_vop_access,
162           .vop_advlock =                devfs_spec_advlock,
163           .vop_bmap =                   devfs_spec_bmap,
164           .vop_close =                  devfs_spec_close,
165           .vop_freeblks =               devfs_spec_freeblks,
166           .vop_fsync =                  devfs_spec_fsync,
167           .vop_getattr =                devfs_vop_getattr,
168           .vop_getpages =               devfs_spec_getpages,
169           .vop_inactive =               devfs_vop_inactive,
170           .vop_open =                   devfs_spec_open,
171           .vop_pathconf =               vop_stdpathconf,
172           .vop_print =                  devfs_vop_print,
173           .vop_kqfilter =               devfs_spec_kqfilter,
174           .vop_read =                   devfs_spec_read,
175           .vop_readdir =                DEVFS_BADOP,
176           .vop_readlink =               DEVFS_BADOP,
177           .vop_reallocblks =  DEVFS_BADOP,
178           .vop_reclaim =                devfs_vop_reclaim,
179           .vop_setattr =                devfs_vop_setattr,
180           .vop_strategy =               devfs_spec_strategy,
181           .vop_write =                  devfs_spec_write,
182           .vop_ioctl =                  devfs_spec_ioctl
183 };
184 
185 /*
186  * devfs file pointer operations.  All fileops are MPSAFE.
187  */
188 struct vop_ops *devfs_vnode_dev_vops_p = &devfs_vnode_dev_vops;
189 
190 struct fileops devfs_dev_fileops = {
191           .fo_read  = devfs_fo_read,
192           .fo_write = devfs_fo_write,
193           .fo_ioctl = devfs_fo_ioctl,
194           .fo_kqfilter        = devfs_fo_kqfilter,
195           .fo_stat  = devfs_fo_stat,
196           .fo_close = devfs_fo_close,
197           .fo_shutdown        = nofo_shutdown,
198           .fo_seek  = devfs_fo_seek
199 };
200 
201 /*
202  * These two functions are possibly temporary hacks for devices (aka
203  * the pty code) which want to control the node attributes themselves.
204  *
205  * XXX we may ultimately desire to simply remove the uid/gid/mode
206  * from the node entirely.
207  *
208  * MPSAFE - sorta.  Theoretically the overwrite can compete since they
209  *            are loading from the same fields.
210  */
211 static __inline void
node_sync_dev_get(struct devfs_node * node)212 node_sync_dev_get(struct devfs_node *node)
213 {
214           cdev_t dev;
215 
216           if ((dev = node->d_dev) && (dev->si_flags & SI_OVERRIDE)) {
217                     node->uid = dev->si_uid;
218                     node->gid = dev->si_gid;
219                     node->mode = dev->si_perms;
220           }
221 }
222 
223 static __inline void
node_sync_dev_set(struct devfs_node * node)224 node_sync_dev_set(struct devfs_node *node)
225 {
226           cdev_t dev;
227 
228           if ((dev = node->d_dev) && (dev->si_flags & SI_OVERRIDE)) {
229                     dev->si_uid = node->uid;
230                     dev->si_gid = node->gid;
231                     dev->si_perms = node->mode;
232           }
233 }
234 
/*
 * Generic entry point for unsupported operations.
 *
 * Installed (via DEVFS_BADOP) in the vnode operation tables for every
 * operation devfs does not implement.  The argument is ignored and the
 * call always fails with EIO.
 */
static int
devfs_vop_badop(struct vop_generic_args *ap)
{
	return EIO;
}
243 
244 
245 static int
devfs_vop_access(struct vop_access_args * ap)246 devfs_vop_access(struct vop_access_args *ap)
247 {
248           struct devfs_node *node = DEVFS_NODE(ap->a_vp);
249           int error;
250 
251           if (!devfs_node_is_accessible(node))
252                     return ENOENT;
253           node_sync_dev_get(node);
254           error = vop_helper_access(ap, node->uid, node->gid,
255                                           node->mode, node->flags);
256 
257           return error;
258 }
259 
260 
261 static int
devfs_vop_inactive(struct vop_inactive_args * ap)262 devfs_vop_inactive(struct vop_inactive_args *ap)
263 {
264           struct devfs_node *node = DEVFS_NODE(ap->a_vp);
265 
266           if (node == NULL || (node->flags & DEVFS_NODE_LINKED) == 0)
267                     vrecycle(ap->a_vp);
268           return 0;
269 }
270 
271 
272 static int
devfs_vop_reclaim(struct vop_reclaim_args * ap)273 devfs_vop_reclaim(struct vop_reclaim_args *ap)
274 {
275           struct devfs_node *node;
276           struct vnode *vp;
277           int locked;
278 
279           /*
280            * Check if it is locked already. if not, we acquire the devfs lock
281            */
282           if ((lockstatus(&devfs_lock, curthread)) != LK_EXCLUSIVE) {
283                     lockmgr(&devfs_lock, LK_EXCLUSIVE);
284                     locked = 1;
285           } else {
286                     locked = 0;
287           }
288 
289           /*
290            * Get rid of the devfs_node if it is no longer linked into the
291            * topology.  Interlocked by devfs_lock.  However, be careful
292            * interposing other operations between cleaning out v_data and
293            * devfs_freep() as the node is only protected by devfs_lock
294            * once the vnode is disassociated.
295            */
296           vp = ap->a_vp;
297           node = DEVFS_NODE(vp);
298 
299           if (node) {
300                     if (node->v_node != vp) {
301                               kprintf("NODE->V_NODE MISMATCH VP=%p NODEVP=%p\n",
302                                         vp, node->v_node);
303                     }
304                     vp->v_data = NULL;
305                     node->v_node = NULL;
306                     if ((node->flags & DEVFS_NODE_LINKED) == 0)
307                               devfs_freep(node);
308           }
309           v_release_rdev(vp);
310 
311           if (locked)
312                     lockmgr(&devfs_lock, LK_RELEASE);
313 
314           /*
315            * v_rdev needs to be properly released using v_release_rdev
316            * Make sure v_data is NULL as well.
317            */
318           return 0;
319 }
320 
321 
322 static int
devfs_vop_readdir(struct vop_readdir_args * ap)323 devfs_vop_readdir(struct vop_readdir_args *ap)
324 {
325           struct devfs_node *dnode = DEVFS_NODE(ap->a_vp);
326           struct devfs_node *node;
327           int cookie_index;
328           int ncookies;
329           int error2;
330           int error;
331           int r;
332           off_t *cookies;
333           off_t saveoff;
334 
335           devfs_debug(DEVFS_DEBUG_DEBUG, "devfs_readdir() called!\n");
336 
337           if (ap->a_uio->uio_offset < 0 || ap->a_uio->uio_offset > INT_MAX)
338                     return (EINVAL);
339           error = vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY | LK_FAILRECLAIM);
340           if (error)
341                     return (error);
342 
343           if (!devfs_node_is_accessible(dnode)) {
344                     vn_unlock(ap->a_vp);
345                     return ENOENT;
346           }
347 
348           lockmgr(&devfs_lock, LK_EXCLUSIVE);
349 
350           saveoff = ap->a_uio->uio_offset;
351 
352           if (ap->a_ncookies) {
353                     ncookies = ap->a_uio->uio_resid / 16 + 1; /* Why / 16 ?? */
354                     if (ncookies > 256)
355                               ncookies = 256;
356                     cookies = kmalloc(256 * sizeof(off_t), M_TEMP, M_WAITOK);
357                     cookie_index = 0;
358           } else {
359                     ncookies = -1;
360                     cookies = NULL;
361                     cookie_index = 0;
362           }
363 
364           vfs_timestamp(&dnode->atime);
365 
366           if (saveoff == 0) {
367                     r = vop_write_dirent(&error, ap->a_uio, dnode->d_dir.d_ino,
368                                              DT_DIR, 1, ".");
369                     if (r)
370                               goto done;
371                     if (cookies)
372                               cookies[cookie_index] = saveoff;
373                     saveoff++;
374                     cookie_index++;
375                     if (cookie_index == ncookies)
376                               goto done;
377           }
378 
379           if (saveoff == 1) {
380                     if (dnode->parent) {
381                               r = vop_write_dirent(&error, ap->a_uio,
382                                                        dnode->parent->d_dir.d_ino,
383                                                        DT_DIR, 2, "..");
384                     } else {
385                               r = vop_write_dirent(&error, ap->a_uio,
386                                                        dnode->d_dir.d_ino,
387                                                        DT_DIR, 2, "..");
388                     }
389                     if (r)
390                               goto done;
391                     if (cookies)
392                               cookies[cookie_index] = saveoff;
393                     saveoff++;
394                     cookie_index++;
395                     if (cookie_index == ncookies)
396                               goto done;
397           }
398 
399           TAILQ_FOREACH(node, DEVFS_DENODE_HEAD(dnode), link) {
400                     if ((node->flags & DEVFS_HIDDEN) ||
401                         (node->flags & DEVFS_INVISIBLE)) {
402                               continue;
403                     }
404 
405                     /*
406                      * If the node type is a valid devfs alias, then we make
407                      * sure that the target isn't hidden. If it is, we don't
408                      * show the link in the directory listing.
409                      */
410                     if ((node->node_type == Nlink) && (node->link_target != NULL) &&
411                               (node->link_target->flags & DEVFS_HIDDEN))
412                               continue;
413 
414                     if (node->cookie < saveoff)
415                               continue;
416 
417                     saveoff = node->cookie;
418 
419                     error2 = vop_write_dirent(&error, ap->a_uio, node->d_dir.d_ino,
420                                                     node->d_dir.d_type,
421                                                     node->d_dir.d_namlen,
422                                                     node->d_dir.d_name);
423 
424                     if (error2)
425                               break;
426 
427                     saveoff++;
428 
429                     if (cookies)
430                               cookies[cookie_index] = node->cookie;
431                     ++cookie_index;
432                     if (cookie_index == ncookies)
433                               break;
434           }
435 
436 done:
437           lockmgr(&devfs_lock, LK_RELEASE);
438           vn_unlock(ap->a_vp);
439 
440           ap->a_uio->uio_offset = saveoff;
441           if (error && cookie_index == 0) {
442                     if (cookies) {
443                               kfree(cookies, M_TEMP);
444                               *ap->a_ncookies = 0;
445                               *ap->a_cookies = NULL;
446                     }
447           } else {
448                     if (cookies) {
449                               *ap->a_ncookies = cookie_index;
450                               *ap->a_cookies = cookies;
451                     }
452           }
453           return (error);
454 }
455 
456 
457 static int
devfs_vop_nresolve(struct vop_nresolve_args * ap)458 devfs_vop_nresolve(struct vop_nresolve_args *ap)
459 {
460           struct devfs_node *dnode = DEVFS_NODE(ap->a_dvp);
461           struct devfs_node *node, *found = NULL;
462           struct namecache *ncp;
463           struct vnode *vp = NULL;
464           int error = 0;
465           int len;
466           int depth;
467 
468           ncp = ap->a_nch->ncp;
469           len = ncp->nc_nlen;
470 
471           if (!devfs_node_is_accessible(dnode))
472                     return ENOENT;
473 
474           lockmgr(&devfs_lock, LK_EXCLUSIVE);
475 
476           if ((dnode->node_type != Nroot) && (dnode->node_type != Ndir)) {
477                     error = ENOENT;
478                     cache_setvp(ap->a_nch, NULL);
479                     goto out;
480           }
481 
482           TAILQ_FOREACH(node, DEVFS_DENODE_HEAD(dnode), link) {
483                     if (len == node->d_dir.d_namlen) {
484                               if (!memcmp(ncp->nc_name, node->d_dir.d_name, len)) {
485                                         found = node;
486                                         break;
487                               }
488                     }
489           }
490 
491           if (found) {
492                     depth = 0;
493                     while ((found->node_type == Nlink) && (found->link_target)) {
494                               if (depth >= 8) {
495                                         devfs_debug(DEVFS_DEBUG_SHOW, "Recursive link or depth >= 8");
496                                         break;
497                               }
498 
499                               found = found->link_target;
500                               ++depth;
501                     }
502 
503                     if (!(found->flags & DEVFS_HIDDEN))
504                               devfs_allocv(/*ap->a_dvp->v_mount, */ &vp, found);
505           }
506 
507           if (vp == NULL) {
508                     error = ENOENT;
509                     cache_setvp(ap->a_nch, NULL);
510                     goto out;
511 
512           }
513           KKASSERT(vp);
514           vn_unlock(vp);
515           cache_setvp(ap->a_nch, vp);
516           vrele(vp);
517 out:
518           lockmgr(&devfs_lock, LK_RELEASE);
519 
520           return error;
521 }
522 
523 
524 static int
devfs_vop_nlookupdotdot(struct vop_nlookupdotdot_args * ap)525 devfs_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
526 {
527           struct devfs_node *dnode = DEVFS_NODE(ap->a_dvp);
528 
529           *ap->a_vpp = NULL;
530           if (!devfs_node_is_accessible(dnode))
531                     return ENOENT;
532 
533           lockmgr(&devfs_lock, LK_EXCLUSIVE);
534           if (dnode->parent != NULL) {
535                     devfs_allocv(ap->a_vpp, dnode->parent);
536                     vn_unlock(*ap->a_vpp);
537           }
538           lockmgr(&devfs_lock, LK_RELEASE);
539 
540           return ((*ap->a_vpp == NULL) ? ENOENT : 0);
541 }
542 
543 
544 /*
545  * getattr() - Does not need a lock since the vp is refd
546  */
547 static int
devfs_vop_getattr(struct vop_getattr_args * ap)548 devfs_vop_getattr(struct vop_getattr_args *ap)
549 {
550           struct devfs_node *node = DEVFS_NODE(ap->a_vp);
551           struct vattr *vap = ap->a_vap;
552           struct partinfo pinfo;
553           int error = 0;
554 
555 #if 0
556           if (!devfs_node_is_accessible(node))
557                     return ENOENT;
558 #endif
559 
560           /*
561            * XXX This is a temporary hack to prevent crashes when the device is
562            * being destroyed (and so the underlying node will be gone) while
563            * a userland program is blocked in a read().
564            */
565           if (node == NULL)
566                     return EIO;
567 
568           node_sync_dev_get(node);
569 
570           /* start by zeroing out the attributes */
571           VATTR_NULL(vap);
572 
573           /* next do all the common fields */
574           vap->va_type = ap->a_vp->v_type;
575           vap->va_mode = node->mode;
576           vap->va_fileid = DEVFS_NODE(ap->a_vp)->d_dir.d_ino ;
577           vap->va_flags = 0;
578           vap->va_blocksize = DEV_BSIZE;
579           vap->va_bytes = vap->va_size = 0;
580 
581           vap->va_fsid = ap->a_vp->v_mount->mnt_stat.f_fsid.val[0];
582 
583           vap->va_atime = node->atime;
584           vap->va_mtime = node->mtime;
585           vap->va_ctime = node->ctime;
586 
587           vap->va_nlink = 1; /* number of references to file */
588 
589           vap->va_uid = node->uid;
590           vap->va_gid = node->gid;
591 
592           vap->va_rmajor = 0;
593           vap->va_rminor = 0;
594 
595           if ((node->node_type == Ndev) && node->d_dev)  {
596                     reference_dev(node->d_dev);
597                     vap->va_rminor = node->d_dev->si_uminor;
598                     release_dev(node->d_dev);
599           }
600 
601           /* For a softlink the va_size is the length of the softlink */
602           if (node->symlink_name != 0) {
603                     vap->va_bytes = vap->va_size = node->symlink_namelen;
604           }
605 
606           /*
607            * For a disk-type device, va_size is the size of the underlying
608            * device, so that lseek() works properly.
609            */
610           if ((node->d_dev) && (dev_dflags(node->d_dev) & D_DISK)) {
611                     bzero(&pinfo, sizeof(pinfo));
612                     error = dev_dioctl(node->d_dev, DIOCGPART, (void *)&pinfo,
613                                            0, proc0.p_ucred, NULL, NULL);
614                     if ((error == 0) && (pinfo.media_blksize != 0)) {
615                               vap->va_size = pinfo.media_size;
616                     } else {
617                               vap->va_size = 0;
618                               error = 0;
619                     }
620           }
621 
622           return (error);
623 }
624 
625 static int
devfs_vop_setattr(struct vop_setattr_args * ap)626 devfs_vop_setattr(struct vop_setattr_args *ap)
627 {
628           struct devfs_node *node = DEVFS_NODE(ap->a_vp);
629           struct vattr *vap;
630           uid_t cur_uid;
631           gid_t cur_gid;
632           mode_t cur_mode;
633           int error = 0;
634 
635           if (!devfs_node_is_accessible(node))
636                     return ENOENT;
637           node_sync_dev_get(node);
638 
639           vap = ap->a_vap;
640 
641           if ((vap->va_uid != (uid_t)VNOVAL) || (vap->va_gid != (gid_t)VNOVAL)) {
642                     cur_uid = node->uid;
643                     cur_gid = node->gid;
644                     cur_mode = node->mode;
645                     error = vop_helper_chown(ap->a_vp, vap->va_uid, vap->va_gid,
646                         ap->a_cred, &cur_uid, &cur_gid, &cur_mode);
647                     if (error)
648                               goto out;
649 
650                     if (node->uid != cur_uid || node->gid != cur_gid) {
651                               node->uid = cur_uid;
652                               node->gid = cur_gid;
653                               node->mode = cur_mode;
654                     }
655           }
656 
657           if (vap->va_mode != (mode_t)VNOVAL) {
658                     cur_mode = node->mode;
659                     error = vop_helper_chmod(ap->a_vp, vap->va_mode, ap->a_cred,
660                         node->uid, node->gid, &cur_mode);
661                     if (error == 0 && node->mode != cur_mode) {
662                               node->mode = cur_mode;
663                     }
664           }
665 
666 out:
667           node_sync_dev_set(node);
668           vfs_timestamp(&node->ctime);
669 
670           return error;
671 }
672 
673 
674 static int
devfs_vop_readlink(struct vop_readlink_args * ap)675 devfs_vop_readlink(struct vop_readlink_args *ap)
676 {
677           struct devfs_node *node = DEVFS_NODE(ap->a_vp);
678           int ret;
679 
680           if (!devfs_node_is_accessible(node))
681                     return ENOENT;
682 
683           lockmgr(&devfs_lock, LK_SHARED);
684           ret = uiomove(node->symlink_name, node->symlink_namelen, ap->a_uio);
685           lockmgr(&devfs_lock, LK_RELEASE);
686 
687           return ret;
688 }
689 
690 
/*
 * Print hook for devfs vnodes.  Nothing is printed; the argument is
 * ignored and 0 is always returned.
 */
static int
devfs_vop_print(struct vop_print_args *ap)
{
	return 0;
}
696 
697 static int
devfs_vop_nmkdir(struct vop_nmkdir_args * ap)698 devfs_vop_nmkdir(struct vop_nmkdir_args *ap)
699 {
700           struct devfs_node *dnode = DEVFS_NODE(ap->a_dvp);
701           struct devfs_node *node;
702 
703           if (!devfs_node_is_accessible(dnode))
704                     return ENOENT;
705 
706           if ((dnode->node_type != Nroot) && (dnode->node_type != Ndir))
707                     goto out;
708 
709           lockmgr(&devfs_lock, LK_EXCLUSIVE);
710           devfs_allocvp(ap->a_dvp->v_mount, ap->a_vpp, Ndir,
711                           ap->a_nch->ncp->nc_name, dnode, NULL);
712 
713           if (*ap->a_vpp) {
714                     node = DEVFS_NODE(*ap->a_vpp);
715                     node->flags |= DEVFS_USER_CREATED;
716                     cache_setunresolved(ap->a_nch);
717                     cache_setvp(ap->a_nch, *ap->a_vpp);
718           }
719           lockmgr(&devfs_lock, LK_RELEASE);
720 out:
721           return ((*ap->a_vpp == NULL) ? ENOTDIR : 0);
722 }
723 
724 static int
devfs_vop_nsymlink(struct vop_nsymlink_args * ap)725 devfs_vop_nsymlink(struct vop_nsymlink_args *ap)
726 {
727           struct devfs_node *dnode = DEVFS_NODE(ap->a_dvp);
728           struct devfs_node *node;
729           size_t targetlen;
730 
731           if (!devfs_node_is_accessible(dnode))
732                     return ENOENT;
733 
734           ap->a_vap->va_type = VLNK;
735 
736           if ((dnode->node_type != Nroot) && (dnode->node_type != Ndir))
737                     goto out;
738 
739           lockmgr(&devfs_lock, LK_EXCLUSIVE);
740           devfs_allocvp(ap->a_dvp->v_mount, ap->a_vpp, Nlink,
741                           ap->a_nch->ncp->nc_name, dnode, NULL);
742 
743           targetlen = strlen(ap->a_target);
744           if (*ap->a_vpp) {
745                     node = DEVFS_NODE(*ap->a_vpp);
746                     node->flags |= DEVFS_USER_CREATED;
747                     node->symlink_namelen = targetlen;
748                     node->symlink_name = kmalloc(targetlen + 1, M_DEVFS, M_WAITOK);
749                     memcpy(node->symlink_name, ap->a_target, targetlen);
750                     node->symlink_name[targetlen] = '\0';
751                     cache_setunresolved(ap->a_nch);
752                     cache_setvp(ap->a_nch, *ap->a_vpp);
753           }
754           lockmgr(&devfs_lock, LK_RELEASE);
755 out:
756           return ((*ap->a_vpp == NULL) ? ENOTDIR : 0);
757 }
758 
759 static int
devfs_vop_nrmdir(struct vop_nrmdir_args * ap)760 devfs_vop_nrmdir(struct vop_nrmdir_args *ap)
761 {
762           struct devfs_node *dnode = DEVFS_NODE(ap->a_dvp);
763           struct devfs_node *node;
764           struct namecache *ncp;
765           int error = ENOENT;
766 
767           ncp = ap->a_nch->ncp;
768 
769           if (!devfs_node_is_accessible(dnode))
770                     return ENOENT;
771 
772           lockmgr(&devfs_lock, LK_EXCLUSIVE);
773 
774           if ((dnode->node_type != Nroot) && (dnode->node_type != Ndir))
775                     goto out;
776 
777           TAILQ_FOREACH(node, DEVFS_DENODE_HEAD(dnode), link) {
778                     if (ncp->nc_nlen != node->d_dir.d_namlen)
779                               continue;
780                     if (memcmp(ncp->nc_name, node->d_dir.d_name, ncp->nc_nlen))
781                               continue;
782 
783                     /*
784                      * only allow removal of user created dirs
785                      */
786                     if ((node->flags & DEVFS_USER_CREATED) == 0) {
787                               error = EPERM;
788                               goto out;
789                     } else if (node->node_type != Ndir) {
790                               error = ENOTDIR;
791                               goto out;
792                     } else if (node->nchildren > 2) {
793                               error = ENOTEMPTY;
794                               goto out;
795                     } else {
796                               if (node->v_node)
797                                         cache_inval_vp(node->v_node, CINV_DESTROY);
798                               devfs_unlinkp(node);
799                               error = 0;
800                               break;
801                     }
802           }
803 
804           cache_unlink(ap->a_nch);
805 out:
806           lockmgr(&devfs_lock, LK_RELEASE);
807           return error;
808 }
809 
810 static int
devfs_vop_nremove(struct vop_nremove_args * ap)811 devfs_vop_nremove(struct vop_nremove_args *ap)
812 {
813           struct devfs_node *dnode = DEVFS_NODE(ap->a_dvp);
814           struct devfs_node *node;
815           struct namecache *ncp;
816           int error = ENOENT;
817 
818           ncp = ap->a_nch->ncp;
819 
820           if (!devfs_node_is_accessible(dnode))
821                     return ENOENT;
822 
823           lockmgr(&devfs_lock, LK_EXCLUSIVE);
824 
825           if ((dnode->node_type != Nroot) && (dnode->node_type != Ndir))
826                     goto out;
827 
828           TAILQ_FOREACH(node, DEVFS_DENODE_HEAD(dnode), link) {
829                     if (ncp->nc_nlen != node->d_dir.d_namlen)
830                               continue;
831                     if (memcmp(ncp->nc_name, node->d_dir.d_name, ncp->nc_nlen))
832                               continue;
833 
834                     /*
835                      * only allow removal of user created stuff (e.g. symlinks)
836                      */
837                     if ((node->flags & DEVFS_USER_CREATED) == 0) {
838                               error = EPERM;
839                               goto out;
840                     } else if (node->node_type == Ndir) {
841                               error = EISDIR;
842                               goto out;
843                     } else {
844                               if (node->v_node)
845                                         cache_inval_vp(node->v_node, CINV_DESTROY);
846                               devfs_unlinkp(node);
847                               error = 0;
848                               break;
849                     }
850           }
851 
852           cache_unlink(ap->a_nch);
853 out:
854           lockmgr(&devfs_lock, LK_RELEASE);
855           return error;
856 }
857 
858 
/*
 * devfs_spec_open - open handler for a devfs device vnode.
 *
 * Outline of what the code below does:
 *  - Fails with ENXIO when the devfs node has no d_dev or the vnode has no
 *    v_rdev, and with ENOENT when the node is not accessible.
 *  - D_QUICK devices: dev_dopen() is called with the vnode unlocked, the
 *    vnode is re-locked shared, vop_stdopen() runs and control jumps to
 *    the fileops fixup at 'skip'.
 *  - Otherwise an LK_UPGRADE of the vnode lock is requested.  When both a
 *    devfs node and ap->a_fpp are present, devfs_clone() is consulted; if
 *    it returns a device, a node for it is created under the mount's root
 *    node and a vnode obtained with devfs_allocv() replaces ap->a_vp
 *    (the original is remembered in orig_vp).
 *  - dev_dopen() is then called with the vnode unlocked.  EALREADY is
 *    treated as "open fully handled by the device" and returns 0; any
 *    other error undoes the vnode substitution and is returned.
 *  - Write opens of D_DISK devices are refused depending on securelevel
 *    and mount state (EPERM / EBUSY).
 *  - On success *ap->a_fpp (if any) gets devfs_dev_fileops installed.
 */
859 static int
devfs_spec_open(struct vop_open_args * ap)860 devfs_spec_open(struct vop_open_args *ap)
861 {
862           struct vnode *vp = ap->a_vp;
863           struct vnode *orig_vp = NULL;
864           struct devfs_node *node = DEVFS_NODE(vp);
865           struct devfs_node *newnode;
866           cdev_t dev, ndev = NULL;
867           int error = 0;
868 
          /* A vnode without a devfs node is tolerated; only check it if present */
869           if (node) {
870                     if (node->d_dev == NULL)
871                               return ENXIO;
872                     if (!devfs_node_is_accessible(node))
873                               return ENOENT;
874           }
875 
876           if ((dev = vp->v_rdev) == NULL)
877                     return ENXIO;
878 
879           /*
880            * Simple (D_QUICK) devices that don't care.  The vnode lock is
           * dropped around dev_dopen() and re-acquired shared afterwards;
           * no lock upgrade and no clone handling is done on this path.
881            */
882           if (dev_dflags(dev) & D_QUICK) {
883                     vn_unlock(vp);
884                     error = dev_dopen(dev, ap->a_mode, S_IFCHR,
885                                           ap->a_cred, ap->a_fpp, vp);
886                     vn_lock(vp, LK_SHARED | LK_RETRY);
887                     if (error)
888                               return error;
889                     vop_stdopen(ap);
890                     goto skip;
891           }
892 
893           /*
894            * Slow code: request an upgrade of the vnode lock, then give
           * the device's clone handler a chance when we have both a devfs
           * node and a file pointer to work with.
895            */
896           vn_lock(vp, LK_UPGRADE | LK_RETRY);
897           if (node && ap->a_fpp) {
898                     int exists;
899 
900                     devfs_debug(DEVFS_DEBUG_DEBUG, "devfs_spec_open: -1.1-\n");
901                     lockmgr(&devfs_lock, LK_SHARED);
902 
903                     ndev = devfs_clone(dev, node->d_dir.d_name,
904                                            node->d_dir.d_namlen,
905                                            ap->a_mode, ap->a_cred);
906                     if (ndev != NULL) {
                              /*
                               * The shared devfs_lock is released and then
                               * taken exclusively before the new node is
                               * created (release + acquire, not an atomic
                               * upgrade).
                               */
907                               lockmgr(&devfs_lock, LK_RELEASE);
908                               lockmgr(&devfs_lock, LK_EXCLUSIVE);
909                               newnode = devfs_create_device_node(
910                                                   DEVFS_MNTDATA(vp->v_mount)->root_node,
911                                                   ndev, &exists, NULL, NULL);
912                               /* XXX: possibly destroy device if this happens */
913 
914                               if (newnode != NULL) {
915                                         dev = ndev;
916                                         if (exists == 0)
917                                                   devfs_link_dev(dev);
918 
919                                         devfs_debug(DEVFS_DEBUG_DEBUG,
920                                                             "parent here is: %s, node is: |%s|\n",
921                                                             ((node->parent->node_type == Nroot) ?
922                                                             "ROOT!" : node->parent->d_dir.d_name),
923                                                             newnode->d_dir.d_name);
924                                         devfs_debug(DEVFS_DEBUG_DEBUG,
925                                                             "test: %s\n",
926                                                             ((struct devfs_node *)(TAILQ_LAST(DEVFS_DENODE_HEAD(node->parent), devfs_node_head)))->d_dir.d_name);
927 
928                                         /*
929                                          * orig_vp is set to the original vp if we
930                                          * cloned.  From here on 'vp' and ap->a_vp
                                         * refer to the vnode of the cloned node.
931                                          */
932                                         /* node->flags |= DEVFS_CLONED; */
933                                         devfs_allocv(&vp, newnode);
934                                         orig_vp = ap->a_vp;
935                                         ap->a_vp = vp;
936                               }
937                     }
938                     lockmgr(&devfs_lock, LK_RELEASE);
939 
940                     /*
941                      * Synchronize devfs here to make sure that, if the cloned
942                      * device creates other device nodes in addition to the
943                      * cloned one, all of them are created by the time we return
944                      * from opening the cloned one.
945                      */
946                     if (ndev)
947                               devfs_config();
948           }
949 
950           devfs_debug(DEVFS_DEBUG_DEBUG,
951                         "devfs_spec_open() called on %s! \n",
952                         dev->si_name);
953 
954           /*
955            * Make this field valid before any I/O in ->d_open
956            *
957            * NOTE: Shared vnode lock probably held, but its ok as long
958            *         as assignments are consistent.
           *
           * NOTE(review): this path requested LK_UPGRADE above, so the
           *         "shared" wording may be stale -- confirm.
959            */
960           if (!dev->si_iosize_max)
961                     /* XXX: old DFLTPHYS == 64KB dependency */
962                     dev->si_iosize_max = min(MAXPHYS,64*1024);
963 
964           if (dev_dflags(dev) & D_TTY)
965                     vsetflags(vp, VISTTY);
966 
967           /*
968            * Open the underlying device.
969            *
970            * NOTE: If the dev open returns EALREADY it has completed the open
971            *         operation and is returning a fully initialized *a->a_fpp
972            *         (which it may also have replaced).  This includes issuing
973            *         any necessary VOP_OPEN().
974            *
975            *         Also, the returned ap->a_fpp might not be DTYPE_VNODE and
976            *         if it is might not be using the vp we supplied to it.
           *
           * The vnode is unlocked for the duration of dev_dopen() and is
           * re-locked exclusively afterwards.
977            */
978           vn_unlock(vp);
979           error = dev_dopen(dev, ap->a_mode, S_IFCHR,
980                                 ap->a_cred, ap->a_fpp, vp);
981           vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
982 
          /*
           * EALREADY: report success without running vop_stdopen() or the
           * fileops fixup below; only the cloned vp (if any) is vput().
           */
983           if (__predict_false(error == EALREADY)) {
984                     if (orig_vp)
985                               vput(vp);
986                     return 0;
987           }
988 
989           /*
990            * Clean up any cloned vp if we error out, and hand the original
           * vnode back to the caller through ap->a_vp.
991            */
992           if (__predict_false(error != 0)) {
993                     if (orig_vp) {
994                               vput(vp);
995                               ap->a_vp = orig_vp;
996                               /* orig_vp = NULL; */
997                     }
998                     return error;
999           }
1000 
1001           /*
1002            * This checks if the disk device is going to be opened for writing.
1003            * It will be only allowed in the cases where securelevel permits it
1004            * and it's not mounted R/W.
           *
           * NOTE(review): these early returns happen after dev_dopen() has
           * already succeeded, and unlike the error path above they neither
           * vput() a cloned vp nor restore ap->a_vp.  Verify whether that
           * is intentional.
1005            */
1006           if ((dev_dflags(dev) & D_DISK) && (ap->a_mode & FWRITE) &&
1007               (ap->a_cred != FSCRED)) {
1008 
1009                     /* Very secure mode. No open for writing allowed */
1010                     if (securelevel >= 2)
1011                               return EPERM;
1012 
1013                     /*
1014                      * If it is mounted R/W, do not allow to open for writing.
1015                      * In the case it's mounted read-only but securelevel
1016                      * is >= 1, then do not allow opening for writing either.
1017                      */
1018                     if (vfs_mountedon(vp)) {
1019                               if (!(dev->si_mountpoint->mnt_flag & MNT_RDONLY))
1020                                         return EBUSY;
1021                               else if (securelevel >= 1)
1022                                         return EPERM;
1023                     }
1024           }
1025 
1026           /*
1027            * Give ttys without a t_stop handler the nottystop default.
           *
           * NOTE(review): the vnode was re-locked with LK_EXCLUSIVE after
           *         dev_dopen() above; an older note here said it was
           *         "still locked shared", which no longer matches the code.
1029            */
1030           if (dev_dflags(dev) & D_TTY) {
1031                     if (dev->si_tty) {
1032                               struct tty *tp;
1033                               tp = dev->si_tty;
1034                               if (!tp->t_stop) {
1035                                         devfs_debug(DEVFS_DEBUG_DEBUG,
1036                                                       "devfs: no t_stop\n");
1037                                         tp->t_stop = nottystop;
1038                               }
1039                     }
1040           }
1041 
1042           /*
1043            * Disk vnodes get a default physical block size and a VM object.
           *
           * NOTE(review): no lock upgrade is performed here; the vnode lock
           *         was already re-taken LK_EXCLUSIVE after dev_dopen().
           *         The previous "still locked shared / upgrade to
           *         exclusive" wording did not match the code.
1046            */
1047           if (vn_isdisk(vp, NULL)) {
1048                     if (!dev->si_bsize_phys)
1049                               dev->si_bsize_phys = DEV_BSIZE;
1050                     vinitvmio(vp, IDX_TO_OFF(INT_MAX), PAGE_SIZE, -1);
1051           }
1052 
1053           vop_stdopen(ap);
1054 #if 0
1055           if (node)
1056                     vfs_timestamp(&node->atime);
1057 #endif
1058           /*
1059            * If we replaced the vp the vop_stdopen() call will have loaded
1060            * it into fp->f_data and vref()d the vp, giving us two refs.  So
1061            * instead of just unlocking it here we have to vput() it.
1062            */
1063           if (orig_vp)
1064                     vput(vp);
1065 
1066           /* Ugly pty magic, to make pty devices appear once they are opened */
1067           if (node && (node->flags & DEVFS_PTY) == DEVFS_PTY) {
1068                     if (node->flags & DEVFS_INVISIBLE)
1069                               node->flags &= ~DEVFS_INVISIBLE;
1070           }
1071 
          /*
           * Common exit for the quick and slow paths: when a file pointer
           * was supplied, sanity-check it and route its operations through
           * devfs_dev_fileops.
           */
1072 skip:
1073           if (ap->a_fpp) {
1074                     struct file *fp = *ap->a_fpp;
1075 
1076                     KKASSERT(fp->f_type == DTYPE_VNODE);
1077                     KKASSERT((fp->f_flag & FMASK) == (ap->a_mode & FMASK));
1078                     fp->f_ops = &devfs_dev_fileops;
1079                     KKASSERT(fp->f_data == (void *)vp);
1080           }
1081 
1082           return 0;
1083 }
1084 
/*
 * devfs_spec_close - close handler for a devfs device vnode.
 *
 * Outline of what the code below does:
 *  - D_QUICK devices: dev_dclose() is called (with the vnode unlocked and
 *    then re-locked shared) only when the vnode is VRECLAIMED, the device
 *    is D_TRACKCLOSE, or the computed opencount is exactly 1.
 *  - Other devices: an LK_UPGRADE of the vnode lock is requested, the
 *    device is referenced, the session's controlling-terminal vnode is
 *    cleared if this is it and v_opencount <= 1, and dev_dclose() is
 *    called under the same three conditions as above.
 *  - In both cases vop_stdclose() runs at the end if v_opencount > 0.
 *
 * Returns the result of dev_dclose(), or 0 when the device was not closed.
 */
1085 static int
devfs_spec_close(struct vop_close_args * ap)1086 devfs_spec_close(struct vop_close_args *ap)
1087 {
1088           struct devfs_node *node;
1089           struct proc *p = curproc;
1090           struct vnode *vp = ap->a_vp;
1091           cdev_t dev = vp->v_rdev;
1092           int error = 0;
1093           int needrelock;
1094           int opencount;
1095 
1096           /*
1097            * Devices flagged D_QUICK require no special handling: no lock
           * upgrade, no device reference, no controlling-tty processing.
1098            */
1099           if (dev && dev_dflags(dev) & D_QUICK) {
1100                     opencount = vp->v_opencount;
1101                     if (opencount <= 1)
1102                               opencount = count_dev(dev);   /* XXX NOT SMP SAFE */
1103                     if (((vp->v_flag & VRECLAIMED) ||
1104                         (dev_dflags(dev) & D_TRACKCLOSE) ||
1105                         (opencount == 1))) {
1106                               vn_unlock(vp);
1107                               error = dev_dclose(dev, ap->a_fflag, S_IFCHR, ap->a_fp);
1108                               vn_lock(vp, LK_SHARED | LK_RETRY);
1109                     }
1110                     goto skip;
1111           }
1112 
1113           /*
1114            * We do special tests on the opencount so unfortunately we need
1115            * an exclusive lock.
1116            */
1117           vn_lock(vp, LK_UPGRADE | LK_RETRY);
1118 
1119           if (dev)
1120                     devfs_debug(DEVFS_DEBUG_DEBUG,
1121                                   "devfs_spec_close() called on %s! \n",
1122                                   dev->si_name);
1123           else
1124                     devfs_debug(DEVFS_DEBUG_DEBUG,
1125                                   "devfs_spec_close() called, null vode!\n");
1126 
1127           /*
1128            * A couple of hacks for devices and tty devices.  The
1129            * vnode ref count cannot be used to figure out the
1130            * last close, but we can use v_opencount now that
1131            * revoke works properly.
1132            *
1133            * Detect the last close on a controlling terminal and clear
1134            * the session (half-close).
1135            *
1136            * XXX opencount is not SMP safe.  The vnode is locked but there
1137            *     may be multiple vnodes referencing the same device.
1138            */
1139           if (dev) {
1140                     /*
1141                      * NOTE: Try to avoid global tokens when testing opencount
1142                      * XXX hack, fixme. needs a struct lock and opencount in
1143                      * struct cdev itself.
                     *
                     * The reference taken here is dropped by the
                     * release_dev() call near the end of the function.
1144                      */
1145                     reference_dev(dev);
1146                     opencount = vp->v_opencount;
1147                     if (opencount <= 1)
1148                               opencount = count_dev(dev);   /* XXX NOT SMP SAFE */
1149           } else {
1150                     opencount = 0;
1151           }
1152 
          /*
           * Last close of the session's controlling terminal vnode: detach
           * it from the session and drop the reference the session held.
           */
1153           if (p && vp->v_opencount <= 1 && vp == p->p_session->s_ttyvp) {
1154                     p->p_session->s_ttyvp = NULL;
1155                     vrele(vp);
1156           }
1157 
1158           /*
1159            * Vnodes can be opened and closed multiple times.  Do not really
1160            * close the device unless (1) it is being closed forcibly,
1161            * (2) the device wants to track closes, or (3) this is the last
1162            * vnode doing its last close on the device.
1163            *
1164            * XXX the VXLOCK (force close) case can leave vnodes referencing
1165            * a closed device.  This might not occur now that our revoke is
1166            * fixed.  (The code below tests VRECLAIMED for case (1).)
1167            */
1168           devfs_debug(DEVFS_DEBUG_DEBUG, "devfs_spec_close() -1- \n");
1169           if (dev && ((vp->v_flag & VRECLAIMED) ||
1170                         (dev_dflags(dev) & D_TRACKCLOSE) ||
1171                         (opencount == 1))) {
1172                     /*
1173                      * Ugly pty magic, to make pty devices disappear again once
1174                      * they are closed.
1175                      */
1176                     node = DEVFS_NODE(ap->a_vp);
1177                     if (node && (node->flags & DEVFS_PTY))
1178                               node->flags |= DEVFS_INVISIBLE;
1179 
1180                     /*
1181                      * Unlock around dev_dclose(), unless the vnode is
1182                      * undergoing a vgone/reclaim (during umount).
                     * needrelock records whether we dropped the lock and
                     * therefore have to take it again afterwards.
1183                      */
1184                     needrelock = 0;
1185                     if ((vp->v_flag & VRECLAIMED) == 0 && vn_islocked(vp)) {
1186                               needrelock = 1;
1187                               vn_unlock(vp);
1188                     }
1189 
1190                     /*
1191                      * WARNING!  If the device destroys itself the devfs node
1192                      *             can disappear here.
1193                      *
1194                      * WARNING!  vn_lock() will fail if the vp is in a VRECLAIM,
1195                      *             which can occur during umount.
1196                      */
1197                     error = dev_dclose(dev, ap->a_fflag, S_IFCHR, ap->a_fp);
1198                     /* node is now stale */
1199 
                    /*
                     * LK_FAILRECLAIM makes vn_lock() return an error instead
                     * of succeeding on a reclaimed vnode; that is treated as
                     * fatal here.
                     */
1200                     if (needrelock) {
1201                               if (vn_lock(vp, LK_EXCLUSIVE |
1202                                                   LK_RETRY |
1203                                                   LK_FAILRECLAIM) != 0) {
1204                                         panic("devfs_spec_close: vnode %p "
1205                                               "unexpectedly could not be relocked",
1206                                               vp);
1207                               }
1208                     }
1209           } else {
1210                     error = 0;
1211           }
1212           devfs_debug(DEVFS_DEBUG_DEBUG, "devfs_spec_close() -2- \n");
1213 
1214           /*
1215            * Drop the device reference taken by reference_dev() above.
1219            */
1220           if (dev)
1221                     release_dev(dev);
1222 skip:
          /*
           * Track the actual opens and closes on the vnode.  The last close
           * disassociates the rdev.  If the rdev is already disassociated or
           * the opencount is already 0, the vnode might have been revoked
           * and no further opencount tracking occurs.
           */
1223           if (vp->v_opencount > 0)
1224                     vop_stdclose(ap);
1225           return(error);
1226 
1227 }
1228 
1229 
1230 static int
devfs_fo_close(struct file * fp)1231 devfs_fo_close(struct file *fp)
1232 {
1233           struct vnode *vp = (struct vnode *)fp->f_data;
1234           int error;
1235 
1236           fp->f_ops = &badfileops;
1237           error = vn_close(vp, fp->f_flag, fp);
1238           devfs_clear_cdevpriv(fp);
1239 
1240           return (error);
1241 }
1242 
1243 
1244 /*
1245  * Device-optimized file table vnode read routine.
1246  *
1247  * This bypasses the VOP table and talks directly to the device.  Most
1248  * filesystems just route to specfs and can make this optimization.
1249  */
1250 static int
devfs_fo_read(struct file * fp,struct uio * uio,struct ucred * cred,int flags)1251 devfs_fo_read(struct file *fp, struct uio *uio,
1252                      struct ucred *cred, int flags)
1253 {
1254           struct devfs_node *node;
1255           struct vnode *vp;
1256           int ioflag;
1257           int error;
1258           cdev_t dev;
1259 
1260           KASSERT(uio->uio_td == curthread,
1261                     ("uio_td %p is not td %p", uio->uio_td, curthread));
1262 
1263           if (uio->uio_resid == 0)
1264                     return 0;
1265 
1266           vp = (struct vnode *)fp->f_data;
1267           if (vp == NULL || vp->v_type == VBAD)
1268                     return EBADF;
1269 
1270           node = DEVFS_NODE(vp);
1271 
1272           if ((dev = vp->v_rdev) == NULL)
1273                     return EBADF;
1274 
1275           reference_dev(dev);
1276 
1277           if ((flags & O_FOFFSET) == 0)
1278                     uio->uio_offset = fp->f_offset;
1279 
1280           ioflag = 0;
1281           if (flags & O_FBLOCKING) {
1282                     /* ioflag &= ~IO_NDELAY; */
1283           } else if (flags & O_FNONBLOCKING) {
1284                     ioflag |= IO_NDELAY;
1285           } else if (fp->f_flag & FNONBLOCK) {
1286                     ioflag |= IO_NDELAY;
1287           }
1288           if (fp->f_flag & O_DIRECT) {
1289                     ioflag |= IO_DIRECT;
1290           }
1291           ioflag |= sequential_heuristic(uio, fp);
1292 
1293           error = dev_dread(dev, uio, ioflag, fp);
1294 
1295           release_dev(dev);
1296           if (node)
1297                     vfs_timestamp(&node->atime);
1298           if ((flags & O_FOFFSET) == 0)
1299                     fp->f_offset = uio->uio_offset;
1300           fp->f_nextoff = uio->uio_offset;
1301 
1302           return (error);
1303 }
1304 
1305 
1306 static int
devfs_fo_write(struct file * fp,struct uio * uio,struct ucred * cred,int flags)1307 devfs_fo_write(struct file *fp, struct uio *uio,
1308                       struct ucred *cred, int flags)
1309 {
1310           struct devfs_node *node;
1311           struct vnode *vp;
1312           int ioflag;
1313           int error;
1314           cdev_t dev;
1315 
1316           KASSERT(uio->uio_td == curthread,
1317                     ("uio_td %p is not p %p", uio->uio_td, curthread));
1318 
1319           vp = (struct vnode *)fp->f_data;
1320           if (vp == NULL || vp->v_type == VBAD)
1321                     return EBADF;
1322 
1323           node = DEVFS_NODE(vp);
1324 
1325           if (vp->v_type == VREG)
1326                     bwillwrite(uio->uio_resid);
1327 
1328           vp = (struct vnode *)fp->f_data;
1329 
1330           if ((dev = vp->v_rdev) == NULL)
1331                     return EBADF;
1332 
1333           reference_dev(dev);
1334 
1335           if ((flags & O_FOFFSET) == 0)
1336                     uio->uio_offset = fp->f_offset;
1337 
1338           ioflag = IO_UNIT;
1339           if (vp->v_type == VREG &&
1340              ((fp->f_flag & O_APPEND) || (flags & O_FAPPEND))) {
1341                     ioflag |= IO_APPEND;
1342           }
1343 
1344           if (flags & O_FBLOCKING) {
1345                     /* ioflag &= ~IO_NDELAY; */
1346           } else if (flags & O_FNONBLOCKING) {
1347                     ioflag |= IO_NDELAY;
1348           } else if (fp->f_flag & FNONBLOCK) {
1349                     ioflag |= IO_NDELAY;
1350           }
1351           if (fp->f_flag & O_DIRECT) {
1352                     ioflag |= IO_DIRECT;
1353           }
1354           if (flags & O_FASYNCWRITE) {
1355                     /* ioflag &= ~IO_SYNC; */
1356           } else if (flags & O_FSYNCWRITE) {
1357                     ioflag |= IO_SYNC;
1358           } else if (fp->f_flag & O_FSYNC) {
1359                     ioflag |= IO_SYNC;
1360           }
1361 
1362           if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS))
1363                     ioflag |= IO_SYNC;
1364           ioflag |= sequential_heuristic(uio, fp);
1365 
1366           error = dev_dwrite(dev, uio, ioflag, fp);
1367 
1368           release_dev(dev);
1369           if (node) {
1370                     vfs_timestamp(&node->atime);
1371                     vfs_timestamp(&node->mtime);
1372           }
1373 
1374           if ((flags & O_FOFFSET) == 0)
1375                     fp->f_offset = uio->uio_offset;
1376           fp->f_nextoff = uio->uio_offset;
1377 
1378           return (error);
1379 }
1380 
1381 
1382 static int
devfs_fo_stat(struct file * fp,struct stat * sb,struct ucred * cred)1383 devfs_fo_stat(struct file *fp, struct stat *sb, struct ucred *cred)
1384 {
1385           struct vnode *vp;
1386           struct vattr vattr;
1387           struct vattr *vap;
1388           u_short mode;
1389           cdev_t dev;
1390           int error;
1391 
1392           vp = (struct vnode *)fp->f_data;
1393           if (vp == NULL || vp->v_type == VBAD)
1394                     return EBADF;
1395 
1396           error = vn_stat(vp, sb, cred);
1397           if (error)
1398                     return (error);
1399 
1400           vap = &vattr;
1401           error = VOP_GETATTR(vp, vap);
1402           if (error)
1403                     return (error);
1404 
1405           /*
1406            * Zero the spare stat fields
1407            */
1408           sb->st_lspare = 0;
1409           sb->st_qspare2 = 0;
1410 
1411           /*
1412            * Copy from vattr table ... or not in case it's a cloned device
1413            */
1414           if (vap->va_fsid != VNOVAL)
1415                     sb->st_dev = vap->va_fsid;
1416           else
1417                     sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0];
1418 
1419           sb->st_ino = vap->va_fileid;
1420 
1421           mode = vap->va_mode;
1422           mode |= S_IFCHR;
1423           sb->st_mode = mode;
1424 
1425           if (vap->va_nlink > (nlink_t)-1)
1426                     sb->st_nlink = (nlink_t)-1;
1427           else
1428                     sb->st_nlink = vap->va_nlink;
1429 
1430           sb->st_uid = vap->va_uid;
1431           sb->st_gid = vap->va_gid;
1432           sb->st_rdev = devid_from_dev(DEVFS_NODE(vp)->d_dev);
1433           sb->st_size = vap->va_bytes;
1434           sb->st_atimespec = vap->va_atime;
1435           sb->st_mtimespec = vap->va_mtime;
1436           sb->st_ctimespec = vap->va_ctime;
1437 
1438           /*
1439            * A VCHR and VBLK device may track the last access and last modified
1440            * time independantly of the filesystem.  This is particularly true
1441            * because device read and write calls may bypass the filesystem.
1442            */
1443           if (vp->v_type == VCHR || vp->v_type == VBLK) {
1444                     dev = vp->v_rdev;
1445                     if (dev != NULL) {
1446                               if (dev->si_lastread) {
1447                                         sb->st_atimespec.tv_sec = time_second +
1448                                                                         (dev->si_lastread -
1449                                                                          time_uptime);
1450                                         sb->st_atimespec.tv_nsec = 0;
1451                               }
1452                               if (dev->si_lastwrite) {
1453                                         sb->st_mtimespec.tv_sec = time_second +
1454                                                                         (dev->si_lastwrite -
1455                                                                          time_uptime);
1456                                         sb->st_mtimespec.tv_nsec = 0;
1457                               }
1458                     }
1459           }
1460 
1461         /*
1462            * According to www.opengroup.org, the meaning of st_blksize is
1463            *   "a filesystem-specific preferred I/O block size for this
1464            *    object.  In some filesystem types, this may vary from file
1465            *    to file"
1466            * Default to PAGE_SIZE after much discussion.
1467            */
1468 
1469           sb->st_blksize = PAGE_SIZE;
1470 
1471           sb->st_flags = vap->va_flags;
1472 
1473           error = caps_priv_check(cred, SYSCAP_NOVFS_GENERATION);
1474           if (error)
1475                     sb->st_gen = 0;
1476           else
1477                     sb->st_gen = (u_int32_t)vap->va_gen;
1478 
1479           sb->st_blocks = vap->va_bytes / S_BLKSIZE;
1480 
1481           /*
1482            * This is for ABI compatibility <= 5.7 (for ABI change made in
1483            * 5.7 master).
1484            */
1485           sb->__old_st_blksize = sb->st_blksize;
1486 
1487           return (0);
1488 }
1489 
1490 
1491 static int
devfs_fo_kqfilter(struct file * fp,struct knote * kn)1492 devfs_fo_kqfilter(struct file *fp, struct knote *kn)
1493 {
1494           struct vnode *vp;
1495           int error;
1496           cdev_t dev;
1497 
1498           vp = (struct vnode *)fp->f_data;
1499           if (vp == NULL || vp->v_type == VBAD) {
1500                     error = EBADF;
1501                     goto done;
1502           }
1503           if ((dev = vp->v_rdev) == NULL) {
1504                     error = EBADF;
1505                     goto done;
1506           }
1507           reference_dev(dev);
1508 
1509           error = dev_dkqfilter(dev, kn, fp);
1510 
1511           release_dev(dev);
1512 
1513 done:
1514           return (error);
1515 }
1516 
1517 static int
devfs_fo_ioctl(struct file * fp,u_long com,caddr_t data,struct ucred * ucred,struct sysmsg * msg)1518 devfs_fo_ioctl(struct file *fp, u_long com, caddr_t data,
1519                       struct ucred *ucred, struct sysmsg *msg)
1520 {
1521 #if 0
1522           struct devfs_node *node;
1523 #endif
1524           struct vnode *vp;
1525           struct vnode *ovp;
1526           cdev_t    dev;
1527           int error;
1528           struct fiodname_args *name_args;
1529           size_t namlen;
1530           const char *name;
1531 
1532           vp = ((struct vnode *)fp->f_data);
1533 
1534           if ((dev = vp->v_rdev) == NULL)
1535                     return EBADF;                 /* device was revoked */
1536 
1537           reference_dev(dev);
1538 
1539 #if 0
1540           node = DEVFS_NODE(vp);
1541 #endif
1542 
1543           devfs_debug(DEVFS_DEBUG_DEBUG,
1544                         "devfs_fo_ioctl() called! for dev %s\n",
1545                         dev->si_name);
1546 
1547           if (com == FIODTYPE) {
1548                     *(int *)data = dev_dflags(dev) & D_TYPEMASK;
1549                     error = 0;
1550                     goto out;
1551           } else if (com == FIODNAME) {
1552                     name_args = (struct fiodname_args *)data;
1553                     name = dev->si_name;
1554                     namlen = strlen(name) + 1;
1555 
1556                     devfs_debug(DEVFS_DEBUG_DEBUG,
1557                                   "ioctl, got: FIODNAME for %s\n", name);
1558 
1559                     if (namlen <= name_args->len)
1560                               error = copyout(dev->si_name, name_args->name, namlen);
1561                     else
1562                               error = EINVAL;
1563 
1564                     devfs_debug(DEVFS_DEBUG_DEBUG,
1565                                   "ioctl stuff: error: %d\n", error);
1566                     goto out;
1567           }
1568 
1569           error = dev_dioctl(dev, com, data, fp->f_flag, ucred, msg, fp);
1570 
1571 #if 0
1572           if (node) {
1573                     vfs_timestamp(&node->atime);
1574                     vfs_timestamp(&node->mtime);
1575           }
1576 #endif
1577           if (com == TIOCSCTTY) {
1578                     devfs_debug(DEVFS_DEBUG_DEBUG,
1579                                   "devfs_fo_ioctl: got TIOCSCTTY on %s\n",
1580                                   dev->si_name);
1581           }
1582           if (error == 0 && com == TIOCSCTTY) {
1583                     struct proc *p = curthread->td_proc;
1584                     struct session *sess;
1585 
1586                     devfs_debug(DEVFS_DEBUG_DEBUG,
1587                                   "devfs_fo_ioctl: dealing with TIOCSCTTY on %s\n",
1588                                   dev->si_name);
1589                     if (p == NULL) {
1590                               error = ENOTTY;
1591                               goto out;
1592                     }
1593                     sess = p->p_session;
1594 
1595                     /*
1596                      * Do nothing if reassigning same control tty
1597                      */
1598                     if (sess->s_ttyvp == vp) {
1599                               error = 0;
1600                               goto out;
1601                     }
1602 
1603                     /*
1604                      * Get rid of reference to old control tty
1605                      */
1606                     ovp = sess->s_ttyvp;
1607                     vref(vp);
1608                     sess->s_ttyvp = vp;
1609                     if (ovp)
1610                               vrele(ovp);
1611           }
1612 
1613 out:
1614           release_dev(dev);
1615           devfs_debug(DEVFS_DEBUG_DEBUG, "devfs_fo_ioctl() finished! \n");
1616           return (error);
1617 }
1618 
1619 int
devfs_fo_seek(struct file * fp,off_t offset,int whence,off_t * res)1620 devfs_fo_seek(struct file *fp, off_t offset, int whence, off_t *res)
1621 {
1622           /*
1623            * NOTE: vnode_fileops uses exact same code
1624            */
1625           struct vnode *vp;
1626           struct vattr_lite lva;
1627           off_t new_offset;
1628           int error;
1629 
1630           vp = (struct vnode *)fp->f_data;
1631 
1632           switch (whence) {
1633           case L_INCR:
1634                     spin_lock(&fp->f_spin);
1635                     new_offset = fp->f_offset + offset;
1636                     error = 0;
1637                     break;
1638           case L_XTND:
1639                     error = VOP_GETATTR_LITE(vp, &lva);
1640                     spin_lock(&fp->f_spin);
1641                     new_offset = offset + lva.va_size;
1642                     break;
1643           case L_SET:
1644                     new_offset = offset;
1645                     error = 0;
1646                     spin_lock(&fp->f_spin);
1647                     break;
1648           default:
1649                     new_offset = 0;
1650                     error = EINVAL;
1651                     spin_lock(&fp->f_spin);
1652                     break;
1653           }
1654 
1655           /*
1656            * Validate the seek position.  Negative offsets are not allowed
1657            * for regular files or directories.
1658            *
1659            * Normally we would also not want to allow negative offsets for
1660            * character and block-special devices.  However kvm addresses
1661            * on 64 bit architectures might appear to be negative and must
1662            * be allowed.
1663            */
1664           if (error == 0) {
1665                     if (new_offset < 0 &&
1666                         (vp->v_type == VREG || vp->v_type == VDIR)) {
1667                               error = EINVAL;
1668                     } else {
1669                               fp->f_offset = new_offset;
1670                     }
1671           }
1672           *res = fp->f_offset;
1673           spin_unlock(&fp->f_spin);
1674 
1675           return (error);
1676 }
1677 
1678 static int
devfs_spec_fsync(struct vop_fsync_args * ap)1679 devfs_spec_fsync(struct vop_fsync_args *ap)
1680 {
1681           struct vnode *vp = ap->a_vp;
1682           int error;
1683 
1684           if (!vn_isdisk(vp, NULL))
1685                     return (0);
1686 
1687           /*
1688            * Flush all dirty buffers associated with a block device.
1689            */
1690           error = vfsync(vp, ap->a_waitfor, 10000, NULL, NULL);
1691           return (error);
1692 }
1693 
1694 static int
devfs_spec_read(struct vop_read_args * ap)1695 devfs_spec_read(struct vop_read_args *ap)
1696 {
1697           struct devfs_node *node;
1698           struct vnode *vp;
1699           struct uio *uio;
1700           cdev_t dev;
1701           int error;
1702 
1703           vp = ap->a_vp;
1704           dev = vp->v_rdev;
1705           uio = ap->a_uio;
1706           node = DEVFS_NODE(vp);
1707 
1708           if (dev == NULL)              /* device was revoked */
1709                     return (EBADF);
1710           if (uio->uio_resid == 0)
1711                     return (0);
1712 
1713           vn_unlock(vp);
1714           error = dev_dread(dev, uio, ap->a_ioflag, NULL);
1715           vn_lock(vp, LK_SHARED | LK_RETRY);
1716 
1717           if (node)
1718                     vfs_timestamp(&node->atime);
1719 
1720           return (error);
1721 }
1722 
1723 /*
1724  * Vnode op for write
1725  *
1726  * spec_write(struct vnode *a_vp, struct uio *a_uio, int a_ioflag,
1727  *              struct ucred *a_cred)
1728  */
1729 static int
devfs_spec_write(struct vop_write_args * ap)1730 devfs_spec_write(struct vop_write_args *ap)
1731 {
1732           struct devfs_node *node;
1733           struct vnode *vp;
1734           struct uio *uio;
1735           cdev_t dev;
1736           int error;
1737 
1738           vp = ap->a_vp;
1739           dev = vp->v_rdev;
1740           uio = ap->a_uio;
1741           node = DEVFS_NODE(vp);
1742 
1743           KKASSERT(uio->uio_segflg != UIO_NOCOPY);
1744 
1745           if (dev == NULL)              /* device was revoked */
1746                     return (EBADF);
1747 
1748           vn_unlock(vp);
1749           error = dev_dwrite(dev, uio, ap->a_ioflag, NULL);
1750           vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1751 
1752           if (node) {
1753                     vfs_timestamp(&node->atime);
1754                     vfs_timestamp(&node->mtime);
1755           }
1756 
1757           return (error);
1758 }
1759 
1760 /*
1761  * Device ioctl operation.
1762  *
1763  * spec_ioctl(struct vnode *a_vp, int a_command, caddr_t a_data,
1764  *              int a_fflag, struct ucred *a_cred, struct sysmsg *msg)
1765  */
1766 static int
devfs_spec_ioctl(struct vop_ioctl_args * ap)1767 devfs_spec_ioctl(struct vop_ioctl_args *ap)
1768 {
1769           struct vnode *vp = ap->a_vp;
1770 #if 0
1771           struct devfs_node *node;
1772 #endif
1773           cdev_t dev;
1774 
1775           if ((dev = vp->v_rdev) == NULL)
1776                     return (EBADF);               /* device was revoked */
1777 #if 0
1778           node = DEVFS_NODE(vp);
1779 
1780           if (node) {
1781                     vfs_timestamp(&node->atime);
1782                     vfs_timestamp(&node->mtime);
1783           }
1784 #endif
1785 
1786           return (dev_dioctl(dev, ap->a_command, ap->a_data, ap->a_fflag,
1787                                  ap->a_cred, ap->a_sysmsg, NULL));
1788 }
1789 
1790 /*
1791  * spec_kqfilter(struct vnode *a_vp, struct knote *a_kn)
1792  */
1793 /* ARGSUSED */
1794 static int
devfs_spec_kqfilter(struct vop_kqfilter_args * ap)1795 devfs_spec_kqfilter(struct vop_kqfilter_args *ap)
1796 {
1797           struct vnode *vp = ap->a_vp;
1798 #if 0
1799           struct devfs_node *node;
1800 #endif
1801           cdev_t dev;
1802 
1803           if ((dev = vp->v_rdev) == NULL)
1804                     return (EBADF);               /* device was revoked (EBADF) */
1805 #if 0
1806           node = DEVFS_NODE(vp);
1807 
1808           if (node)
1809                     vfs_timestamp(&node->atime);
1810 #endif
1811 
1812           return (dev_dkqfilter(dev, ap->a_kn, NULL));
1813 }
1814 
/*
 * Convert a vnode strategy call into a device strategy call.  Vnode strategy
 * calls are not limited to device DMA limits so we have to deal with the
 * case.
 *
 * spec_strategy(struct vnode *a_vp, struct bio *a_bio)
 *
 * Requests that fit within the device's si_iosize_max (MAXPHYS when the
 * device did not set one), and any request that is neither a read nor a
 * write, are handed to dev_dstrategy_chain() unchanged.  Larger reads and
 * writes are issued one chunk at a time through a temporary clone buffer
 * whose completion callback is devfs_spec_strategy_done().
 *
 * Both return statements in this function return 0.
 */
static int
devfs_spec_strategy(struct vop_strategy_args *ap)
{
	struct bio *bio = ap->a_bio;
	struct buf *bp = bio->bio_buf;
	struct buf *nbp;
	struct vnode *vp;
	struct mount *mp;
	int chunksize;
	int maxiosize;

	/* Non-read commands with a non-empty b_dep list get buf_start(). */
	if (bp->b_cmd != BUF_CMD_READ && LIST_FIRST(&bp->b_dep) != NULL)
		buf_start(bp);

	/*
	 * Collect statistics on synchronous and asynchronous read
	 * and write counts for disks that have associated filesystems.
	 *
	 * NOTE(review): BIO_SYNC is tested against bp->b_flags here, while
	 * devfs_spec_freeblks() in this file sets it in bio_flags.  This
	 * looks like a flag-namespace mismatch that would skew the
	 * sync/async counters - confirm against the buf/bio flag definitions.
	 */
	vp = ap->a_vp;
	KKASSERT(vp->v_rdev != NULL);	/* XXX */
	if (vn_isdisk(vp, NULL) && (mp = vp->v_rdev->si_mountpoint) != NULL) {
		if (bp->b_cmd == BUF_CMD_READ) {
			if (bp->b_flags & BIO_SYNC)
				mp->mnt_stat.f_syncreads++;
			else
				mp->mnt_stat.f_asyncreads++;
		} else {
			if (bp->b_flags & BIO_SYNC)
				mp->mnt_stat.f_syncwrites++;
			else
				mp->mnt_stat.f_asyncwrites++;
		}
	}

        /*
         * Device iosize limitations only apply to read and write.  Shortcut
         * the I/O if it fits.
         */
	if ((maxiosize = vp->v_rdev->si_iosize_max) == 0) {
		devfs_debug(DEVFS_DEBUG_DEBUG,
			    "%s: si_iosize_max not set!\n",
			    dev_dname(vp->v_rdev));
		maxiosize = MAXPHYS;
	}
#if SPEC_CHAIN_DEBUG & 2
	/* Debug aid: force a tiny limit so that chaining is exercised. */
	maxiosize = 4096;
#endif
        if (bp->b_bcount <= maxiosize ||
            (bp->b_cmd != BUF_CMD_READ && bp->b_cmd != BUF_CMD_WRITE)) {
                dev_dstrategy_chain(vp->v_rdev, bio);
                return (0);
        }

	/*
	 * Clone the buffer and set up an I/O chain to chunk up the I/O.
	 *
	 * The clone points at the original's data (b_data is copied, not
	 * the contents).  The original bio is stashed in bio_caller_info1
	 * so the completion callback can find it again.
	 */
	nbp = kmalloc(sizeof(*bp), M_DEVBUF, M_INTWAIT|M_ZERO);
	initbufbio(nbp);
	buf_dep_init(nbp);
	BUF_LOCK(nbp, LK_EXCLUSIVE);
	BUF_KERNPROC(nbp);
	nbp->b_vp = vp;
	nbp->b_flags = B_PAGING | B_KVABIO | (bp->b_flags & B_BNOCLIP);
	nbp->b_cpumask = bp->b_cpumask;
	nbp->b_data = bp->b_data;
	nbp->b_bio1.bio_done = devfs_spec_strategy_done;
	nbp->b_bio1.bio_offset = bio->bio_offset;
	nbp->b_bio1.bio_caller_info1.ptr = bio;

	/*
	 * Start the first transfer
	 *
	 * The chunk size is maxiosize rounded down to a multiple of the
	 * device's physical block size (DEV_BSIZE for non-disks).
	 */
	if (vn_isdisk(vp, NULL))
		chunksize = vp->v_rdev->si_bsize_phys;
	else
		chunksize = DEV_BSIZE;
	chunksize = rounddown(maxiosize, chunksize);
#if SPEC_CHAIN_DEBUG & 1
	devfs_debug(DEVFS_DEBUG_DEBUG,
		    "spec_strategy chained I/O chunksize=%d\n",
		    chunksize);
#endif
	nbp->b_cmd = bp->b_cmd;
	nbp->b_bcount = chunksize;
	nbp->b_bufsize = chunksize;	/* used to detect a short I/O */
	/* The completion callback reads the chunk size back from here. */
	nbp->b_bio1.bio_caller_info2.index = chunksize;

#if SPEC_CHAIN_DEBUG & 1
	devfs_debug(DEVFS_DEBUG_DEBUG,
		    "spec_strategy: chain %p offset %d/%d bcount %d\n",
		    bp, 0, bp->b_bcount, nbp->b_bcount);
#endif

	dev_dstrategy(vp->v_rdev, &nbp->b_bio1);

	/*
	 * Timestamps are only touched on this chunked path; the shortcut
	 * above returns before reaching this point.
	 */
	if (DEVFS_NODE(vp)) {
		vfs_timestamp(&DEVFS_NODE(vp)->atime);
		vfs_timestamp(&DEVFS_NODE(vp)->mtime);
	}

	return (0);
}
1924 
1925 /*
1926  * Chunked up transfer completion routine - chain transfers until done
1927  *
1928  * NOTE: MPSAFE callback.
1929  */
1930 static
1931 void
devfs_spec_strategy_done(struct bio * nbio)1932 devfs_spec_strategy_done(struct bio *nbio)
1933 {
1934           struct buf *nbp = nbio->bio_buf;
1935           struct bio *bio = nbio->bio_caller_info1.ptr;     /* original bio */
1936           struct buf *bp = bio->bio_buf;                              /* original bp */
1937           int chunksize = nbio->bio_caller_info2.index;     /* chunking */
1938           int boffset = nbp->b_data - bp->b_data;
1939 
1940           if (nbp->b_flags & B_ERROR) {
1941                     /*
1942                      * An error terminates the chain, propogate the error back
1943                      * to the original bp
1944                      */
1945                     bp->b_flags |= B_ERROR;
1946                     bp->b_error = nbp->b_error;
1947                     bp->b_resid = bp->b_bcount - boffset +
1948                                     (nbp->b_bcount - nbp->b_resid);
1949 #if SPEC_CHAIN_DEBUG & 1
1950                     devfs_debug(DEVFS_DEBUG_DEBUG,
1951                                   "spec_strategy: chain %p error %d bcount %d/%d\n",
1952                                   bp, bp->b_error, bp->b_bcount,
1953                                   bp->b_bcount - bp->b_resid);
1954 #endif
1955           } else if (nbp->b_resid) {
1956                     /*
1957                      * A short read or write terminates the chain
1958                      */
1959                     bp->b_error = nbp->b_error;
1960                     bp->b_resid = bp->b_bcount - boffset +
1961                                     (nbp->b_bcount - nbp->b_resid);
1962 #if SPEC_CHAIN_DEBUG & 1
1963                     devfs_debug(DEVFS_DEBUG_DEBUG,
1964                                   "spec_strategy: chain %p short read(1) "
1965                                   "bcount %d/%d\n",
1966                                   bp, bp->b_bcount - bp->b_resid, bp->b_bcount);
1967 #endif
1968           } else if (nbp->b_bcount != nbp->b_bufsize) {
1969                     /*
1970                      * A short read or write can also occur by truncating b_bcount
1971                      */
1972 #if SPEC_CHAIN_DEBUG & 1
1973                     devfs_debug(DEVFS_DEBUG_DEBUG,
1974                                   "spec_strategy: chain %p short read(2) "
1975                                   "bcount %d/%d\n",
1976                                   bp, nbp->b_bcount + boffset, bp->b_bcount);
1977 #endif
1978                     bp->b_error = 0;
1979                     bp->b_bcount = nbp->b_bcount + boffset;
1980                     bp->b_resid = nbp->b_resid;
1981           } else if (nbp->b_bcount + boffset == bp->b_bcount) {
1982                     /*
1983                      * No more data terminates the chain
1984                      */
1985 #if SPEC_CHAIN_DEBUG & 1
1986                     devfs_debug(DEVFS_DEBUG_DEBUG,
1987                                   "spec_strategy: chain %p finished bcount %d\n",
1988                                   bp, bp->b_bcount);
1989 #endif
1990                     bp->b_error = 0;
1991                     bp->b_resid = 0;
1992           } else {
1993                     /*
1994                      * Continue the chain
1995                      */
1996                     boffset += nbp->b_bcount;
1997                     nbp->b_data = bp->b_data + boffset;
1998                     nbp->b_bcount = bp->b_bcount - boffset;
1999                     if (nbp->b_bcount > chunksize)
2000                               nbp->b_bcount = chunksize;
2001                     nbp->b_bio1.bio_done = devfs_spec_strategy_done;
2002                     nbp->b_bio1.bio_offset = bio->bio_offset + boffset;
2003 
2004 #if SPEC_CHAIN_DEBUG & 1
2005                     devfs_debug(DEVFS_DEBUG_DEBUG,
2006                                   "spec_strategy: chain %p offset %d/%d bcount %d\n",
2007                                   bp, boffset, bp->b_bcount, nbp->b_bcount);
2008 #endif
2009 
2010                     dev_dstrategy(nbp->b_vp->v_rdev, &nbp->b_bio1);
2011                     return;
2012           }
2013 
2014           /*
2015            * Fall through to here on termination.  biodone(bp) and
2016            * clean up and free nbp.
2017            */
2018           biodone(bio);
2019           BUF_UNLOCK(nbp);
2020           uninitbufbio(nbp);
2021           kfree(nbp, M_DEVBUF);
2022 }
2023 
2024 /*
2025  * spec_freeblks(struct vnode *a_vp, daddr_t a_addr, daddr_t a_length)
2026  */
2027 static int
devfs_spec_freeblks(struct vop_freeblks_args * ap)2028 devfs_spec_freeblks(struct vop_freeblks_args *ap)
2029 {
2030           struct buf *bp;
2031 
2032           /*
2033            * Must be a synchronous operation
2034            */
2035           KKASSERT(ap->a_vp->v_rdev != NULL);
2036           if ((ap->a_vp->v_rdev->si_flags & SI_CANFREE) == 0)
2037                     return (0);
2038           bp = getpbuf(NULL);
2039           bp->b_cmd = BUF_CMD_FREEBLKS;
2040           bp->b_bio1.bio_flags |= BIO_SYNC;
2041           bp->b_bio1.bio_offset = ap->a_offset;
2042           bp->b_bio1.bio_done = biodone_sync;
2043           bp->b_bcount = ap->a_length;
2044           dev_dstrategy(ap->a_vp->v_rdev, &bp->b_bio1);
2045           biowait(&bp->b_bio1, "TRIM");
2046           relpbuf(bp, NULL);
2047 
2048           return (0);
2049 }
2050 
2051 /*
2052  * Implement degenerate case where the block requested is the block
2053  * returned, and assume that the entire device is contiguous in regards
2054  * to the contiguous block range (runp and runb).
2055  *
2056  * spec_bmap(struct vnode *a_vp, off_t a_loffset,
2057  *             off_t *a_doffsetp, int *a_runp, int *a_runb)
2058  */
2059 static int
devfs_spec_bmap(struct vop_bmap_args * ap)2060 devfs_spec_bmap(struct vop_bmap_args *ap)
2061 {
2062           if (ap->a_doffsetp != NULL)
2063                     *ap->a_doffsetp = ap->a_loffset;
2064           if (ap->a_runp != NULL)
2065                     *ap->a_runp = MAXBSIZE;
2066           if (ap->a_runb != NULL) {
2067                     if (ap->a_loffset < MAXBSIZE)
2068                               *ap->a_runb = (int)ap->a_loffset;
2069                     else
2070                               *ap->a_runb = MAXBSIZE;
2071           }
2072           return (0);
2073 }
2074 
2075 
2076 /*
2077  * Special device advisory byte-level locks.
2078  *
2079  * spec_advlock(struct vnode *a_vp, caddr_t a_id, int a_op,
2080  *                  struct flock *a_fl, int a_flags)
2081  */
2082 /* ARGSUSED */
2083 static int
devfs_spec_advlock(struct vop_advlock_args * ap)2084 devfs_spec_advlock(struct vop_advlock_args *ap)
2085 {
2086           return ((ap->a_flags & F_POSIX) ? EINVAL : EOPNOTSUPP);
2087 }
2088 
2089 /*
2090  * NOTE: MPSAFE callback.
2091  */
2092 static void
devfs_spec_getpages_iodone(struct bio * bio)2093 devfs_spec_getpages_iodone(struct bio *bio)
2094 {
2095           bio->bio_buf->b_cmd = BUF_CMD_DONE;
2096           wakeup(bio->bio_buf);
2097 }
2098 
/*
 * spec_getpages() - get pages associated with device vnode.
 *
 * Note that spec_read and spec_write do not use the buffer cache, so we
 * must fully implement getpages here.
 *
 * The pages in ap->a_m are mapped into the kva of a pbuf, a single read
 * covering ap->a_count bytes (rounded up to the device block size) is
 * issued through vn_strategy(), and the caller sleeps until
 * devfs_spec_getpages_iodone() marks the buffer done.  Afterwards each
 * page's valid/dirty state is set from the number of bytes actually read.
 *
 * Returns VM_PAGER_OK when the requested page (ap->a_reqpage) ended up
 * with any valid bits, VM_PAGER_ERROR otherwise.
 */
static int
devfs_spec_getpages(struct vop_getpages_args *ap)
{
	vm_offset_t kva;
	int error;
	int i, pcount, size;
	struct buf *bp;
	vm_page_t m;
	vm_ooffset_t offset;
	int toff, nextoff, nread;
	struct vnode *vp = ap->a_vp;
	int blksiz;
	int gotreqpage;

	error = 0;
	/* Number of pages spanned by the request. */
	pcount = round_page(ap->a_count) / PAGE_SIZE;

	/*
	 * Calculate the offset of the transfer and do sanity check.
	 */
	offset = IDX_TO_OFF(ap->a_m[0]->pindex) + ap->a_offset;

	/*
	 * Round up physical size for real devices.  We cannot round using
	 * v_mount's block size data because v_mount has nothing to do with
	 * the device.  i.e. it's usually '/dev'.  We need the physical block
	 * size for the device itself.
	 *
	 * We can't use v_rdev->si_mountpoint because it only exists when the
	 * block device is mounted.  However, we can use v_rdev.
	 */
	if (vn_isdisk(vp, NULL))
		blksiz = vp->v_rdev->si_bsize_phys;
	else
		blksiz = DEV_BSIZE;

	size = roundup2(ap->a_count, blksiz);

	bp = getpbuf_kva(NULL);
	kva = (vm_offset_t)bp->b_data;

	/*
	 * Map the pages to be read into the kva.
	 */
	pmap_qenter_noinval(kva, ap->a_m, pcount);

	/* Build a minimal buffer header. */
	bp->b_cmd = BUF_CMD_READ;
	bp->b_flags |= B_KVABIO;
	bp->b_bcount = size;
	bp->b_resid = 0;
	bsetrunningbufspace(bp, size);

	bp->b_bio1.bio_offset = offset;
	bp->b_bio1.bio_done = devfs_spec_getpages_iodone;

	mycpu->gd_cnt.v_vnodein++;
	mycpu->gd_cnt.v_vnodepgsin += pcount;

	/* Do the input. */
	vn_strategy(ap->a_vp, &bp->b_bio1);

	crit_enter();

	/*
	 * We definitely need to be at splbio here.
	 *
	 * The completion callback sets b_cmd to BUF_CMD_DONE and issues a
	 * wakeup on bp.
	 */
	while (bp->b_cmd != BUF_CMD_DONE)
		tsleep(bp, 0, "spread", 0);

	crit_exit();

	/* A flagged error without an error code is reported as EIO. */
	if (bp->b_flags & B_ERROR) {
		if (bp->b_error)
			error = bp->b_error;
		else
			error = EIO;
	}

	/*
	 * If EOF is encountered we must zero-extend the result in order
	 * to ensure that the page does not contain garbage.  When no
	 * error occurs, an early EOF is indicated if b_bcount got truncated.
	 * b_resid is relative to b_bcount and should be 0, but some devices
	 * might indicate an EOF with b_resid instead of truncating b_bcount.
	 */
	nread = bp->b_bcount - bp->b_resid;
	if (nread < ap->a_count) {
		bkvasync(bp);
		bzero((caddr_t)kva + nread, ap->a_count - nread);
	}
	pmap_qremove_noinval(kva, pcount);

	/*
	 * Walk the pages; [toff, nextoff) is the byte range of page i
	 * within the transfer.
	 */
	gotreqpage = 0;
	for (i = 0, toff = 0; i < pcount; i++, toff = nextoff) {
		nextoff = toff + PAGE_SIZE;
		m = ap->a_m[i];

		/*
		 * NOTE: vm_page_undirty/clear_dirty etc do not clear the
		 *	 pmap modified bit.  pmap modified bit should have
		 *	 already been cleared.
		 */
		if (nextoff <= nread) {
			/* Page lies entirely within the data read. */
			m->valid = VM_PAGE_BITS_ALL;
			vm_page_undirty(m);
		} else if (toff < nread) {
			/*
			 * Since this is a VM request, we have to supply the
			 * unaligned offset to allow vm_page_set_valid()
			 * to zero sub-DEV_BSIZE'd portions of the page.
			 */
			vm_page_set_valid(m, 0, nread - toff);
			vm_page_clear_dirty_end_nonincl(m, 0, nread - toff);
		} else {
			/* Page lies entirely beyond the data read. */
			m->valid = 0;
			vm_page_undirty(m);
		}

		if (i != ap->a_reqpage) {
			/*
			 * Just in case someone was asking for this page we
			 * now tell them that it is ok to use.
			 *
			 * Pages other than the requested one are either
			 * queued (activated or deactivated) and woken up,
			 * or freed when they carry no valid data or an
			 * error left them only partially valid.
			 */
			if (!error || (m->valid == VM_PAGE_BITS_ALL)) {
				if (m->valid) {
					if (m->flags & PG_REFERENCED) {
						vm_page_activate(m);
					} else {
						vm_page_deactivate(m);
					}
					vm_page_wakeup(m);
				} else {
					vm_page_free(m);
				}
			} else {
				vm_page_free(m);
			}
		} else if (m->valid) {
			gotreqpage = 1;
			/*
			 * Since this is a VM request, we need to make the
			 * entire page presentable by zeroing invalid sections.
			 */
			if (m->valid != VM_PAGE_BITS_ALL)
			    vm_page_zero_invalid(m, FALSE);
		}
	}
	if (!gotreqpage) {
		/* The requested page got no valid data: log and fail. */
		m = ap->a_m[ap->a_reqpage];
		devfs_debug(DEVFS_DEBUG_WARNING,
	    "spec_getpages:(%s) I/O read failure: (error=%d) bp %p vp %p\n",
			devtoname(vp->v_rdev), error, bp, bp->b_vp);
		devfs_debug(DEVFS_DEBUG_WARNING,
	    "               size: %d, resid: %d, a_count: %d, valid: 0x%x\n",
		    size, bp->b_resid, ap->a_count, m->valid);
		devfs_debug(DEVFS_DEBUG_WARNING,
	    "               nread: %d, reqpage: %d, pindex: %lu, pcount: %d\n",
		    nread, ap->a_reqpage, (u_long)m->pindex, pcount);
		/*
		 * Free the buffer header back to the swap buffer pool.
		 */
		relpbuf(bp, NULL);
		return VM_PAGER_ERROR;
	}
	/*
	 * Free the buffer header back to the swap buffer pool.
	 */
	relpbuf(bp, NULL);
	/*
	 * NOTE(review): this is a read path, yet it is the node's mtime
	 * (not atime) that gets stamped - confirm that this is intended.
	 */
	if (DEVFS_NODE(ap->a_vp))
		vfs_timestamp(&DEVFS_NODE(ap->a_vp)->mtime);
	return VM_PAGER_OK;
}
2276 
2277 static __inline
2278 int
sequential_heuristic(struct uio * uio,struct file * fp)2279 sequential_heuristic(struct uio *uio, struct file *fp)
2280 {
2281           /*
2282            * Sequential heuristic - detect sequential operation
2283            */
2284           if ((uio->uio_offset == 0 && fp->f_seqcount > 0) ||
2285               uio->uio_offset == fp->f_nextoff) {
2286                     /*
2287                      * XXX we assume that the filesystem block size is
2288                      * the default.  Not true, but still gives us a pretty
2289                      * good indicator of how sequential the read operations
2290                      * are.
2291                      */
2292                     int tmpseq = fp->f_seqcount;
2293 
2294                     tmpseq += howmany(uio->uio_resid, MAXBSIZE);
2295                     if (tmpseq > IO_SEQMAX)
2296                               tmpseq = IO_SEQMAX;
2297                     fp->f_seqcount = tmpseq;
2298                     return(fp->f_seqcount << IO_SEQSHIFT);
2299           }
2300 
2301           /*
2302            * Not sequential, quick draw-down of seqcount
2303            */
2304           if (fp->f_seqcount > 1)
2305                     fp->f_seqcount = 1;
2306           else
2307                     fp->f_seqcount = 0;
2308           return(0);
2309 }
2310