1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 2002 Poul-Henning Kamp
5  * Copyright (c) 2002 Networks Associates Technology, Inc.
6  * Copyright (c) 2013 The FreeBSD Foundation
7  * All rights reserved.
8  *
9  * This software was developed for the FreeBSD Project by Poul-Henning Kamp
10  * and NAI Labs, the Security Research Division of Network Associates, Inc.
11  * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the
12  * DARPA CHATS research program.
13  *
14  * Portions of this software were developed by Konstantin Belousov
15  * under sponsorship from the FreeBSD Foundation.
16  *
17  * Redistribution and use in source and binary forms, with or without
18  * modification, are permitted provided that the following conditions
19  * are met:
20  * 1. Redistributions of source code must retain the above copyright
21  *    notice, this list of conditions and the following disclaimer.
22  * 2. Redistributions in binary form must reproduce the above copyright
23  *    notice, this list of conditions and the following disclaimer in the
24  *    documentation and/or other materials provided with the distribution.
25  * 3. The names of the authors may not be used to endorse or promote
26  *    products derived from this software without specific prior written
27  *    permission.
28  *
29  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
30  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
31  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
32  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
33  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
34  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
35  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
36  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
37  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
38  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
39  * SUCH DAMAGE.
40  */
41 
42 #include <sys/cdefs.h>
43 __FBSDID("$FreeBSD: stable/12/sys/geom/geom_io.c 373214 2023-09-20 07:11:23Z dim $");
44 
45 #include <sys/param.h>
46 #include <sys/systm.h>
47 #include <sys/kernel.h>
48 #include <sys/malloc.h>
49 #include <sys/bio.h>
50 #include <sys/ktr.h>
51 #include <sys/proc.h>
52 #include <sys/stack.h>
53 #include <sys/sysctl.h>
54 #include <sys/vmem.h>
55 
56 #include <sys/errno.h>
57 #include <geom/geom.h>
58 #include <geom/geom_int.h>
59 #include <sys/devicestat.h>
60 
61 #include <vm/uma.h>
62 #include <vm/vm.h>
63 #include <vm/vm_param.h>
64 #include <vm/vm_kern.h>
65 #include <vm/vm_page.h>
66 #include <vm/vm_object.h>
67 #include <vm/vm_extern.h>
68 #include <vm/vm_map.h>
69 
70 #define KTR_GEOM_ENABLED \
71     ((KTR_COMPILE & KTR_GEOM) != 0 && (ktr_mask & KTR_GEOM) != 0)
72 
73 static int	g_io_transient_map_bio(struct bio *bp);
74 
75 static struct g_bioq g_bio_run_down;
76 static struct g_bioq g_bio_run_up;
77 
78 /*
79  * Pace is a hint that we've had some trouble recently allocating
80  * bios, so we should back off trying to send I/O down the stack
81  * a bit to let the problem resolve. When pacing, we also turn
82  * off direct dispatch to also reduce memory pressure from I/Os
83  * there, at the expense of some added latency while the memory
84  * pressures exist. See g_io_schedule_down() for more details
85  * and limitations.
86  */
87 static volatile u_int __read_mostly pace;
88 
89 static uma_zone_t __read_mostly biozone;
90 
91 /*
92  * The head of the list of classifiers used in g_io_request.
93  * Use g_register_classifier() and g_unregister_classifier()
94  * to add/remove entries to the list.
95  * Classifiers are invoked in registration order.
96  */
97 static TAILQ_HEAD(, g_classifier_hook) g_classifier_tailq __read_mostly =
98     TAILQ_HEAD_INITIALIZER(g_classifier_tailq);
99 
100 #include <machine/atomic.h>
101 
102 static void
g_bioq_lock(struct g_bioq * bq)103 g_bioq_lock(struct g_bioq *bq)
104 {
105 
106 	mtx_lock(&bq->bio_queue_lock);
107 }
108 
109 static void
g_bioq_unlock(struct g_bioq * bq)110 g_bioq_unlock(struct g_bioq *bq)
111 {
112 
113 	mtx_unlock(&bq->bio_queue_lock);
114 }
115 
#if 0
/*
 * Destroy the mutex of a bio queue.  Currently compiled out.
 */
static void
g_bioq_destroy(struct g_bioq *bq)
{

	mtx_destroy(&bq->bio_queue_lock);
}
#endif
124 
125 static void
g_bioq_init(struct g_bioq * bq)126 g_bioq_init(struct g_bioq *bq)
127 {
128 
129 	TAILQ_INIT(&bq->bio_queue);
130 	mtx_init(&bq->bio_queue_lock, "bio queue", NULL, MTX_DEF);
131 }
132 
133 static struct bio *
g_bioq_first(struct g_bioq * bq)134 g_bioq_first(struct g_bioq *bq)
135 {
136 	struct bio *bp;
137 
138 	bp = TAILQ_FIRST(&bq->bio_queue);
139 	if (bp != NULL) {
140 		KASSERT((bp->bio_flags & BIO_ONQUEUE),
141 		    ("Bio not on queue bp=%p target %p", bp, bq));
142 		bp->bio_flags &= ~BIO_ONQUEUE;
143 		TAILQ_REMOVE(&bq->bio_queue, bp, bio_queue);
144 		bq->bio_queue_length--;
145 	}
146 	return (bp);
147 }
148 
149 struct bio *
g_new_bio(void)150 g_new_bio(void)
151 {
152 	struct bio *bp;
153 
154 	bp = uma_zalloc(biozone, M_NOWAIT | M_ZERO);
155 #ifdef KTR
156 	if (KTR_GEOM_ENABLED) {
157 		struct stack st;
158 
159 		CTR1(KTR_GEOM, "g_new_bio(): %p", bp);
160 		stack_save(&st);
161 		CTRSTACK(KTR_GEOM, &st, 3, 0);
162 	}
163 #endif
164 	return (bp);
165 }
166 
167 struct bio *
g_alloc_bio(void)168 g_alloc_bio(void)
169 {
170 	struct bio *bp;
171 
172 	bp = uma_zalloc(biozone, M_WAITOK | M_ZERO);
173 #ifdef KTR
174 	if (KTR_GEOM_ENABLED) {
175 		struct stack st;
176 
177 		CTR1(KTR_GEOM, "g_alloc_bio(): %p", bp);
178 		stack_save(&st);
179 		CTRSTACK(KTR_GEOM, &st, 3, 0);
180 	}
181 #endif
182 	return (bp);
183 }
184 
185 void
g_destroy_bio(struct bio * bp)186 g_destroy_bio(struct bio *bp)
187 {
188 #ifdef KTR
189 	if (KTR_GEOM_ENABLED) {
190 		struct stack st;
191 
192 		CTR1(KTR_GEOM, "g_destroy_bio(): %p", bp);
193 		stack_save(&st);
194 		CTRSTACK(KTR_GEOM, &st, 3, 0);
195 	}
196 #endif
197 	uma_zfree(biozone, bp);
198 }
199 
200 struct bio *
g_clone_bio(struct bio * bp)201 g_clone_bio(struct bio *bp)
202 {
203 	struct bio *bp2;
204 
205 	bp2 = uma_zalloc(biozone, M_NOWAIT | M_ZERO);
206 	if (bp2 != NULL) {
207 		bp2->bio_parent = bp;
208 		bp2->bio_cmd = bp->bio_cmd;
209 		/*
210 		 *  BIO_ORDERED flag may be used by disk drivers to enforce
211 		 *  ordering restrictions, so this flag needs to be cloned.
212 		 *  BIO_UNMAPPED and BIO_VLIST should be inherited, to properly
213 		 *  indicate which way the buffer is passed.
214 		 *  Other bio flags are not suitable for cloning.
215 		 */
216 		bp2->bio_flags = bp->bio_flags &
217 		    (BIO_ORDERED | BIO_UNMAPPED | BIO_VLIST);
218 		bp2->bio_length = bp->bio_length;
219 		bp2->bio_offset = bp->bio_offset;
220 		bp2->bio_data = bp->bio_data;
221 		bp2->bio_ma = bp->bio_ma;
222 		bp2->bio_ma_n = bp->bio_ma_n;
223 		bp2->bio_ma_offset = bp->bio_ma_offset;
224 		bp2->bio_attribute = bp->bio_attribute;
225 		if (bp->bio_cmd == BIO_ZONE)
226 			bcopy(&bp->bio_zone, &bp2->bio_zone,
227 			    sizeof(bp->bio_zone));
228 		/* Inherit classification info from the parent */
229 		bp2->bio_classifier1 = bp->bio_classifier1;
230 		bp2->bio_classifier2 = bp->bio_classifier2;
231 #if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING)
232 		bp2->bio_track_bp = bp->bio_track_bp;
233 #endif
234 		bp->bio_children++;
235 	}
236 #ifdef KTR
237 	if (KTR_GEOM_ENABLED) {
238 		struct stack st;
239 
240 		CTR2(KTR_GEOM, "g_clone_bio(%p): %p", bp, bp2);
241 		stack_save(&st);
242 		CTRSTACK(KTR_GEOM, &st, 3, 0);
243 	}
244 #endif
245 	return(bp2);
246 }
247 
248 struct bio *
g_duplicate_bio(struct bio * bp)249 g_duplicate_bio(struct bio *bp)
250 {
251 	struct bio *bp2;
252 
253 	bp2 = uma_zalloc(biozone, M_WAITOK | M_ZERO);
254 	bp2->bio_flags = bp->bio_flags & (BIO_UNMAPPED | BIO_VLIST);
255 	bp2->bio_parent = bp;
256 	bp2->bio_cmd = bp->bio_cmd;
257 	bp2->bio_length = bp->bio_length;
258 	bp2->bio_offset = bp->bio_offset;
259 	bp2->bio_data = bp->bio_data;
260 	bp2->bio_ma = bp->bio_ma;
261 	bp2->bio_ma_n = bp->bio_ma_n;
262 	bp2->bio_ma_offset = bp->bio_ma_offset;
263 	bp2->bio_attribute = bp->bio_attribute;
264 	bp->bio_children++;
265 #ifdef KTR
266 	if (KTR_GEOM_ENABLED) {
267 		struct stack st;
268 
269 		CTR2(KTR_GEOM, "g_duplicate_bio(%p): %p", bp, bp2);
270 		stack_save(&st);
271 		CTRSTACK(KTR_GEOM, &st, 3, 0);
272 	}
273 #endif
274 	return(bp2);
275 }
276 
277 void
g_reset_bio(struct bio * bp)278 g_reset_bio(struct bio *bp)
279 {
280 
281 	bzero(bp, sizeof(*bp));
282 }
283 
284 void
g_io_init(void)285 g_io_init(void)
286 {
287 
288 	g_bioq_init(&g_bio_run_down);
289 	g_bioq_init(&g_bio_run_up);
290 	biozone = uma_zcreate("g_bio", sizeof (struct bio),
291 	    NULL, NULL,
292 	    NULL, NULL,
293 	    0, 0);
294 }
295 
296 int
g_io_getattr(const char * attr,struct g_consumer * cp,int * len,void * ptr)297 g_io_getattr(const char *attr, struct g_consumer *cp, int *len, void *ptr)
298 {
299 	struct bio *bp;
300 	int error;
301 
302 	g_trace(G_T_BIO, "bio_getattr(%s)", attr);
303 	bp = g_alloc_bio();
304 	bp->bio_cmd = BIO_GETATTR;
305 	bp->bio_done = NULL;
306 	bp->bio_attribute = attr;
307 	bp->bio_length = *len;
308 	bp->bio_data = ptr;
309 	g_io_request(bp, cp);
310 	error = biowait(bp, "ggetattr");
311 	*len = bp->bio_completed;
312 	g_destroy_bio(bp);
313 	return (error);
314 }
315 
316 int
g_io_zonecmd(struct disk_zone_args * zone_args,struct g_consumer * cp)317 g_io_zonecmd(struct disk_zone_args *zone_args, struct g_consumer *cp)
318 {
319 	struct bio *bp;
320 	int error;
321 
322 	g_trace(G_T_BIO, "bio_zone(%d)", zone_args->zone_cmd);
323 	bp = g_alloc_bio();
324 	bp->bio_cmd = BIO_ZONE;
325 	bp->bio_done = NULL;
326 	/*
327 	 * XXX KDM need to handle report zone data.
328 	 */
329 	bcopy(zone_args, &bp->bio_zone, sizeof(*zone_args));
330 	if (zone_args->zone_cmd == DISK_ZONE_REPORT_ZONES)
331 		bp->bio_length =
332 		    zone_args->zone_params.report.entries_allocated *
333 		    sizeof(struct disk_zone_rep_entry);
334 	else
335 		bp->bio_length = 0;
336 
337 	g_io_request(bp, cp);
338 	error = biowait(bp, "gzone");
339 	bcopy(&bp->bio_zone, zone_args, sizeof(*zone_args));
340 	g_destroy_bio(bp);
341 	return (error);
342 }
343 
344 int
g_io_flush(struct g_consumer * cp)345 g_io_flush(struct g_consumer *cp)
346 {
347 	struct bio *bp;
348 	int error;
349 
350 	g_trace(G_T_BIO, "bio_flush(%s)", cp->provider->name);
351 	bp = g_alloc_bio();
352 	bp->bio_cmd = BIO_FLUSH;
353 	bp->bio_flags |= BIO_ORDERED;
354 	bp->bio_done = NULL;
355 	bp->bio_attribute = NULL;
356 	bp->bio_offset = cp->provider->mediasize;
357 	bp->bio_length = 0;
358 	bp->bio_data = NULL;
359 	g_io_request(bp, cp);
360 	error = biowait(bp, "gflush");
361 	g_destroy_bio(bp);
362 	return (error);
363 }
364 
365 static int
g_io_check(struct bio * bp)366 g_io_check(struct bio *bp)
367 {
368 	struct g_consumer *cp;
369 	struct g_provider *pp;
370 	off_t excess;
371 	int error;
372 
373 	biotrack(bp, __func__);
374 
375 	cp = bp->bio_from;
376 	pp = bp->bio_to;
377 
378 	/* Fail if access counters dont allow the operation */
379 	switch(bp->bio_cmd) {
380 	case BIO_READ:
381 	case BIO_GETATTR:
382 		if (cp->acr == 0)
383 			return (EPERM);
384 		break;
385 	case BIO_WRITE:
386 	case BIO_DELETE:
387 	case BIO_FLUSH:
388 		if (cp->acw == 0)
389 			return (EPERM);
390 		break;
391 	case BIO_ZONE:
392 		if ((bp->bio_zone.zone_cmd == DISK_ZONE_REPORT_ZONES) ||
393 		    (bp->bio_zone.zone_cmd == DISK_ZONE_GET_PARAMS)) {
394 			if (cp->acr == 0)
395 				return (EPERM);
396 		} else if (cp->acw == 0)
397 			return (EPERM);
398 		break;
399 	default:
400 		return (EPERM);
401 	}
402 	/* if provider is marked for error, don't disturb. */
403 	if (pp->error)
404 		return (pp->error);
405 	if (cp->flags & G_CF_ORPHAN)
406 		return (ENXIO);
407 
408 	switch(bp->bio_cmd) {
409 	case BIO_READ:
410 	case BIO_WRITE:
411 	case BIO_DELETE:
412 		/* Zero sectorsize or mediasize is probably a lack of media. */
413 		if (pp->sectorsize == 0 || pp->mediasize == 0)
414 			return (ENXIO);
415 		/* Reject I/O not on sector boundary */
416 		if (bp->bio_offset % pp->sectorsize)
417 			return (EINVAL);
418 		/* Reject I/O not integral sector long */
419 		if (bp->bio_length % pp->sectorsize)
420 			return (EINVAL);
421 		/* Reject requests before or past the end of media. */
422 		if (bp->bio_offset < 0)
423 			return (EIO);
424 		if (bp->bio_offset > pp->mediasize)
425 			return (EIO);
426 
427 		/* Truncate requests to the end of providers media. */
428 		excess = bp->bio_offset + bp->bio_length;
429 		if (excess > bp->bio_to->mediasize) {
430 			KASSERT((bp->bio_flags & BIO_UNMAPPED) == 0 ||
431 			    round_page(bp->bio_ma_offset +
432 			    bp->bio_length) / PAGE_SIZE == bp->bio_ma_n,
433 			    ("excess bio %p too short", bp));
434 			excess -= bp->bio_to->mediasize;
435 			bp->bio_length -= excess;
436 			if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
437 				bp->bio_ma_n = round_page(bp->bio_ma_offset +
438 				    bp->bio_length) / PAGE_SIZE;
439 			}
440 			if (excess > 0)
441 				CTR3(KTR_GEOM, "g_down truncated bio "
442 				    "%p provider %s by %d", bp,
443 				    bp->bio_to->name, excess);
444 		}
445 
446 		/* Deliver zero length transfers right here. */
447 		if (bp->bio_length == 0) {
448 			CTR2(KTR_GEOM, "g_down terminated 0-length "
449 			    "bp %p provider %s", bp, bp->bio_to->name);
450 			return (0);
451 		}
452 
453 		if ((bp->bio_flags & BIO_UNMAPPED) != 0 &&
454 		    (bp->bio_to->flags & G_PF_ACCEPT_UNMAPPED) == 0 &&
455 		    (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE)) {
456 			if ((error = g_io_transient_map_bio(bp)) >= 0)
457 				return (error);
458 		}
459 		break;
460 	default:
461 		break;
462 	}
463 	return (EJUSTRETURN);
464 }
465 
466 /*
467  * bio classification support.
468  *
469  * g_register_classifier() and g_unregister_classifier()
470  * are used to add/remove a classifier from the list.
471  * The list is protected using the g_bio_run_down lock,
472  * because the classifiers are called in this path.
473  *
474  * g_io_request() passes bio's that are not already classified
475  * (i.e. those with bio_classifier1 == NULL) to g_run_classifiers().
476  * Classifiers can store their result in the two fields
477  * bio_classifier1 and bio_classifier2.
478  * A classifier that updates one of the fields should
479  * return a non-zero value.
480  * If no classifier updates the field, g_run_classifiers() sets
481  * bio_classifier1 = BIO_NOTCLASSIFIED to avoid further calls.
482  */
483 
484 int
g_register_classifier(struct g_classifier_hook * hook)485 g_register_classifier(struct g_classifier_hook *hook)
486 {
487 
488 	g_bioq_lock(&g_bio_run_down);
489 	TAILQ_INSERT_TAIL(&g_classifier_tailq, hook, link);
490 	g_bioq_unlock(&g_bio_run_down);
491 
492 	return (0);
493 }
494 
495 void
g_unregister_classifier(struct g_classifier_hook * hook)496 g_unregister_classifier(struct g_classifier_hook *hook)
497 {
498 	struct g_classifier_hook *entry;
499 
500 	g_bioq_lock(&g_bio_run_down);
501 	TAILQ_FOREACH(entry, &g_classifier_tailq, link) {
502 		if (entry == hook) {
503 			TAILQ_REMOVE(&g_classifier_tailq, hook, link);
504 			break;
505 		}
506 	}
507 	g_bioq_unlock(&g_bio_run_down);
508 }
509 
510 static void
g_run_classifiers(struct bio * bp)511 g_run_classifiers(struct bio *bp)
512 {
513 	struct g_classifier_hook *hook;
514 	int classified = 0;
515 
516 	biotrack(bp, __func__);
517 
518 	TAILQ_FOREACH(hook, &g_classifier_tailq, link)
519 		classified |= hook->func(hook->arg, bp);
520 
521 	if (!classified)
522 		bp->bio_classifier1 = BIO_NOTCLASSIFIED;
523 }
524 
525 void
g_io_request(struct bio * bp,struct g_consumer * cp)526 g_io_request(struct bio *bp, struct g_consumer *cp)
527 {
528 	struct g_provider *pp;
529 	struct mtx *mtxp;
530 	int direct, error, first;
531 	uint8_t cmd;
532 
533 	biotrack(bp, __func__);
534 
535 	KASSERT(cp != NULL, ("NULL cp in g_io_request"));
536 	KASSERT(bp != NULL, ("NULL bp in g_io_request"));
537 	pp = cp->provider;
538 	KASSERT(pp != NULL, ("consumer not attached in g_io_request"));
539 #ifdef DIAGNOSTIC
540 	KASSERT(bp->bio_driver1 == NULL,
541 	    ("bio_driver1 used by the consumer (geom %s)", cp->geom->name));
542 	KASSERT(bp->bio_driver2 == NULL,
543 	    ("bio_driver2 used by the consumer (geom %s)", cp->geom->name));
544 	KASSERT(bp->bio_pflags == 0,
545 	    ("bio_pflags used by the consumer (geom %s)", cp->geom->name));
546 	/*
547 	 * Remember consumer's private fields, so we can detect if they were
548 	 * modified by the provider.
549 	 */
550 	bp->_bio_caller1 = bp->bio_caller1;
551 	bp->_bio_caller2 = bp->bio_caller2;
552 	bp->_bio_cflags = bp->bio_cflags;
553 #endif
554 
555 	cmd = bp->bio_cmd;
556 	if (cmd == BIO_READ || cmd == BIO_WRITE || cmd == BIO_GETATTR) {
557 		KASSERT(bp->bio_data != NULL,
558 		    ("NULL bp->data in g_io_request(cmd=%hu)", bp->bio_cmd));
559 	}
560 	if (cmd == BIO_DELETE || cmd == BIO_FLUSH) {
561 		KASSERT(bp->bio_data == NULL,
562 		    ("non-NULL bp->data in g_io_request(cmd=%hu)",
563 		    bp->bio_cmd));
564 	}
565 	if (cmd == BIO_READ || cmd == BIO_WRITE || cmd == BIO_DELETE) {
566 		KASSERT(bp->bio_offset % cp->provider->sectorsize == 0,
567 		    ("wrong offset %jd for sectorsize %u",
568 		    bp->bio_offset, cp->provider->sectorsize));
569 		KASSERT(bp->bio_length % cp->provider->sectorsize == 0,
570 		    ("wrong length %jd for sectorsize %u",
571 		    bp->bio_length, cp->provider->sectorsize));
572 	}
573 
574 	g_trace(G_T_BIO, "bio_request(%p) from %p(%s) to %p(%s) cmd %d",
575 	    bp, cp, cp->geom->name, pp, pp->name, bp->bio_cmd);
576 
577 	bp->bio_from = cp;
578 	bp->bio_to = pp;
579 	bp->bio_error = 0;
580 	bp->bio_completed = 0;
581 
582 	KASSERT(!(bp->bio_flags & BIO_ONQUEUE),
583 	    ("Bio already on queue bp=%p", bp));
584 	if ((g_collectstats & G_STATS_CONSUMERS) != 0 ||
585 	    ((g_collectstats & G_STATS_PROVIDERS) != 0 && pp->stat != NULL))
586 		binuptime(&bp->bio_t0);
587 	else
588 		getbinuptime(&bp->bio_t0);
589 
590 	direct = (cp->flags & G_CF_DIRECT_SEND) != 0 &&
591 	    (pp->flags & G_PF_DIRECT_RECEIVE) != 0 &&
592 	    !g_is_geom_thread(curthread) &&
593 	    ((pp->flags & G_PF_ACCEPT_UNMAPPED) != 0 ||
594 	    (bp->bio_flags & BIO_UNMAPPED) == 0 || THREAD_CAN_SLEEP()) &&
595 	    pace == 0;
596 	if (direct) {
597 		/* Block direct execution if less then half of stack left. */
598 		size_t	st, su;
599 		GET_STACK_USAGE(st, su);
600 		if (su * 2 > st)
601 			direct = 0;
602 	}
603 
604 	if (!TAILQ_EMPTY(&g_classifier_tailq) && !bp->bio_classifier1) {
605 		g_bioq_lock(&g_bio_run_down);
606 		g_run_classifiers(bp);
607 		g_bioq_unlock(&g_bio_run_down);
608 	}
609 
610 	/*
611 	 * The statistics collection is lockless, as such, but we
612 	 * can not update one instance of the statistics from more
613 	 * than one thread at a time, so grab the lock first.
614 	 */
615 	mtxp = mtx_pool_find(mtxpool_sleep, pp);
616 	mtx_lock(mtxp);
617 	if (g_collectstats & G_STATS_PROVIDERS)
618 		devstat_start_transaction_bio_t0(pp->stat, bp);
619 	if (g_collectstats & G_STATS_CONSUMERS)
620 		devstat_start_transaction_bio_t0(cp->stat, bp);
621 	pp->nstart++;
622 	cp->nstart++;
623 	mtx_unlock(mtxp);
624 
625 	if (direct) {
626 		error = g_io_check(bp);
627 		if (error >= 0) {
628 			CTR3(KTR_GEOM, "g_io_request g_io_check on bp %p "
629 			    "provider %s returned %d", bp, bp->bio_to->name,
630 			    error);
631 			g_io_deliver(bp, error);
632 			return;
633 		}
634 		bp->bio_to->geom->start(bp);
635 	} else {
636 		g_bioq_lock(&g_bio_run_down);
637 		first = TAILQ_EMPTY(&g_bio_run_down.bio_queue);
638 		TAILQ_INSERT_TAIL(&g_bio_run_down.bio_queue, bp, bio_queue);
639 		bp->bio_flags |= BIO_ONQUEUE;
640 		g_bio_run_down.bio_queue_length++;
641 		g_bioq_unlock(&g_bio_run_down);
642 		/* Pass it on down. */
643 		if (first)
644 			wakeup(&g_wait_down);
645 	}
646 }
647 
648 void
g_io_deliver(struct bio * bp,int error)649 g_io_deliver(struct bio *bp, int error)
650 {
651 	struct bintime now;
652 	struct g_consumer *cp;
653 	struct g_provider *pp;
654 	struct mtx *mtxp;
655 	int direct, first;
656 
657 	biotrack(bp, __func__);
658 
659 	KASSERT(bp != NULL, ("NULL bp in g_io_deliver"));
660 	pp = bp->bio_to;
661 	KASSERT(pp != NULL, ("NULL bio_to in g_io_deliver"));
662 	cp = bp->bio_from;
663 	if (cp == NULL) {
664 		bp->bio_error = error;
665 		bp->bio_done(bp);
666 		return;
667 	}
668 	KASSERT(cp != NULL, ("NULL bio_from in g_io_deliver"));
669 	KASSERT(cp->geom != NULL, ("NULL bio_from->geom in g_io_deliver"));
670 #ifdef DIAGNOSTIC
671 	/*
672 	 * Some classes - GJournal in particular - can modify bio's
673 	 * private fields while the bio is in transit; G_GEOM_VOLATILE_BIO
674 	 * flag means it's an expected behaviour for that particular geom.
675 	 */
676 	if ((cp->geom->flags & G_GEOM_VOLATILE_BIO) == 0) {
677 		KASSERT(bp->bio_caller1 == bp->_bio_caller1,
678 		    ("bio_caller1 used by the provider %s", pp->name));
679 		KASSERT(bp->bio_caller2 == bp->_bio_caller2,
680 		    ("bio_caller2 used by the provider %s", pp->name));
681 		KASSERT(bp->bio_cflags == bp->_bio_cflags,
682 		    ("bio_cflags used by the provider %s", pp->name));
683 	}
684 #endif
685 	KASSERT(bp->bio_completed >= 0, ("bio_completed can't be less than 0"));
686 	KASSERT(bp->bio_completed <= bp->bio_length,
687 	    ("bio_completed can't be greater than bio_length"));
688 
689 	g_trace(G_T_BIO,
690 "g_io_deliver(%p) from %p(%s) to %p(%s) cmd %d error %d off %jd len %jd",
691 	    bp, cp, cp->geom->name, pp, pp->name, bp->bio_cmd, error,
692 	    (intmax_t)bp->bio_offset, (intmax_t)bp->bio_length);
693 
694 	KASSERT(!(bp->bio_flags & BIO_ONQUEUE),
695 	    ("Bio already on queue bp=%p", bp));
696 
697 	/*
698 	 * XXX: next two doesn't belong here
699 	 */
700 	bp->bio_bcount = bp->bio_length;
701 	bp->bio_resid = bp->bio_bcount - bp->bio_completed;
702 
703 	direct = (pp->flags & G_PF_DIRECT_SEND) &&
704 		 (cp->flags & G_CF_DIRECT_RECEIVE) &&
705 		 !g_is_geom_thread(curthread);
706 	if (direct) {
707 		/* Block direct execution if less then half of stack left. */
708 		size_t	st, su;
709 		GET_STACK_USAGE(st, su);
710 		if (su * 2 > st)
711 			direct = 0;
712 	}
713 
714 	/*
715 	 * The statistics collection is lockless, as such, but we
716 	 * can not update one instance of the statistics from more
717 	 * than one thread at a time, so grab the lock first.
718 	 */
719 	if ((g_collectstats & G_STATS_CONSUMERS) != 0 ||
720 	    ((g_collectstats & G_STATS_PROVIDERS) != 0 && pp->stat != NULL))
721 		binuptime(&now);
722 	mtxp = mtx_pool_find(mtxpool_sleep, cp);
723 	mtx_lock(mtxp);
724 	if (g_collectstats & G_STATS_PROVIDERS)
725 		devstat_end_transaction_bio_bt(pp->stat, bp, &now);
726 	if (g_collectstats & G_STATS_CONSUMERS)
727 		devstat_end_transaction_bio_bt(cp->stat, bp, &now);
728 	cp->nend++;
729 	pp->nend++;
730 	mtx_unlock(mtxp);
731 
732 	if (error != ENOMEM) {
733 		bp->bio_error = error;
734 		if (direct) {
735 			biodone(bp);
736 		} else {
737 			g_bioq_lock(&g_bio_run_up);
738 			first = TAILQ_EMPTY(&g_bio_run_up.bio_queue);
739 			TAILQ_INSERT_TAIL(&g_bio_run_up.bio_queue, bp, bio_queue);
740 			bp->bio_flags |= BIO_ONQUEUE;
741 			g_bio_run_up.bio_queue_length++;
742 			g_bioq_unlock(&g_bio_run_up);
743 			if (first)
744 				wakeup(&g_wait_up);
745 		}
746 		return;
747 	}
748 
749 	if (bootverbose)
750 		printf("ENOMEM %p on %p(%s)\n", bp, pp, pp->name);
751 	bp->bio_children = 0;
752 	bp->bio_inbed = 0;
753 	bp->bio_driver1 = NULL;
754 	bp->bio_driver2 = NULL;
755 	bp->bio_pflags = 0;
756 	g_io_request(bp, cp);
757 	pace = 1;
758 	return;
759 }
760 
761 SYSCTL_DECL(_kern_geom);
762 
763 static long transient_maps;
764 SYSCTL_LONG(_kern_geom, OID_AUTO, transient_maps, CTLFLAG_RD,
765     &transient_maps, 0,
766     "Total count of the transient mapping requests");
767 u_int transient_map_retries = 10;
768 SYSCTL_UINT(_kern_geom, OID_AUTO, transient_map_retries, CTLFLAG_RW,
769     &transient_map_retries, 0,
770     "Max count of retries used before giving up on creating transient map");
771 int transient_map_hard_failures;
772 SYSCTL_INT(_kern_geom, OID_AUTO, transient_map_hard_failures, CTLFLAG_RD,
773     &transient_map_hard_failures, 0,
774     "Failures to establish the transient mapping due to retry attempts "
775     "exhausted");
776 int transient_map_soft_failures;
777 SYSCTL_INT(_kern_geom, OID_AUTO, transient_map_soft_failures, CTLFLAG_RD,
778     &transient_map_soft_failures, 0,
779     "Count of retried failures to establish the transient mapping");
780 int inflight_transient_maps;
781 SYSCTL_INT(_kern_geom, OID_AUTO, inflight_transient_maps, CTLFLAG_RD,
782     &inflight_transient_maps, 0,
783     "Current count of the active transient maps");
784 
785 static int
g_io_transient_map_bio(struct bio * bp)786 g_io_transient_map_bio(struct bio *bp)
787 {
788 	vm_offset_t addr;
789 	long size;
790 	u_int retried;
791 
792 	KASSERT(unmapped_buf_allowed, ("unmapped disabled"));
793 
794 	size = round_page(bp->bio_ma_offset + bp->bio_length);
795 	KASSERT(size / PAGE_SIZE == bp->bio_ma_n, ("Bio too short %p", bp));
796 	addr = 0;
797 	retried = 0;
798 	atomic_add_long(&transient_maps, 1);
799 retry:
800 	if (vmem_alloc(transient_arena, size, M_BESTFIT | M_NOWAIT, &addr)) {
801 		if (transient_map_retries != 0 &&
802 		    retried >= transient_map_retries) {
803 			CTR2(KTR_GEOM, "g_down cannot map bp %p provider %s",
804 			    bp, bp->bio_to->name);
805 			atomic_add_int(&transient_map_hard_failures, 1);
806 			return (EDEADLK/* XXXKIB */);
807 		} else {
808 			/*
809 			 * Naive attempt to quisce the I/O to get more
810 			 * in-flight requests completed and defragment
811 			 * the transient_arena.
812 			 */
813 			CTR3(KTR_GEOM, "g_down retrymap bp %p provider %s r %d",
814 			    bp, bp->bio_to->name, retried);
815 			pause("g_d_tra", hz / 10);
816 			retried++;
817 			atomic_add_int(&transient_map_soft_failures, 1);
818 			goto retry;
819 		}
820 	}
821 	atomic_add_int(&inflight_transient_maps, 1);
822 	pmap_qenter((vm_offset_t)addr, bp->bio_ma, OFF_TO_IDX(size));
823 	bp->bio_data = (caddr_t)addr + bp->bio_ma_offset;
824 	bp->bio_flags |= BIO_TRANSIENT_MAPPING;
825 	bp->bio_flags &= ~BIO_UNMAPPED;
826 	return (EJUSTRETURN);
827 }
828 
829 void
g_io_schedule_down(struct thread * tp __unused)830 g_io_schedule_down(struct thread *tp __unused)
831 {
832 	struct bio *bp;
833 	int error;
834 
835 	for(;;) {
836 		g_bioq_lock(&g_bio_run_down);
837 		bp = g_bioq_first(&g_bio_run_down);
838 		if (bp == NULL) {
839 			CTR0(KTR_GEOM, "g_down going to sleep");
840 			msleep(&g_wait_down, &g_bio_run_down.bio_queue_lock,
841 			    PRIBIO | PDROP, "-", 0);
842 			continue;
843 		}
844 		CTR0(KTR_GEOM, "g_down has work to do");
845 		g_bioq_unlock(&g_bio_run_down);
846 		biotrack(bp, __func__);
847 		if (pace != 0) {
848 			/*
849 			 * There has been at least one memory allocation
850 			 * failure since the last I/O completed. Pause 1ms to
851 			 * give the system a chance to free up memory. We only
852 			 * do this once because a large number of allocations
853 			 * can fail in the direct dispatch case and there's no
854 			 * relationship between the number of these failures and
855 			 * the length of the outage. If there's still an outage,
856 			 * we'll pause again and again until it's
857 			 * resolved. Older versions paused longer and once per
858 			 * allocation failure. This was OK for a single threaded
859 			 * g_down, but with direct dispatch would lead to max of
860 			 * 10 IOPs for minutes at a time when transient memory
861 			 * issues prevented allocation for a batch of requests
862 			 * from the upper layers.
863 			 *
864 			 * XXX This pacing is really lame. It needs to be solved
865 			 * by other methods. This is OK only because the worst
866 			 * case scenario is so rare. In the worst case scenario
867 			 * all memory is tied up waiting for I/O to complete
868 			 * which can never happen since we can't allocate bios
869 			 * for that I/O.
870 			 */
871 			CTR0(KTR_GEOM, "g_down pacing self");
872 			pause("g_down", min(hz/1000, 1));
873 			pace = 0;
874 		}
875 		CTR2(KTR_GEOM, "g_down processing bp %p provider %s", bp,
876 		    bp->bio_to->name);
877 		error = g_io_check(bp);
878 		if (error >= 0) {
879 			CTR3(KTR_GEOM, "g_down g_io_check on bp %p provider "
880 			    "%s returned %d", bp, bp->bio_to->name, error);
881 			g_io_deliver(bp, error);
882 			continue;
883 		}
884 		THREAD_NO_SLEEPING();
885 		CTR4(KTR_GEOM, "g_down starting bp %p provider %s off %ld "
886 		    "len %ld", bp, bp->bio_to->name, bp->bio_offset,
887 		    bp->bio_length);
888 		bp->bio_to->geom->start(bp);
889 		THREAD_SLEEPING_OK();
890 	}
891 }
892 
893 void
g_io_schedule_up(struct thread * tp __unused)894 g_io_schedule_up(struct thread *tp __unused)
895 {
896 	struct bio *bp;
897 
898 	for(;;) {
899 		g_bioq_lock(&g_bio_run_up);
900 		bp = g_bioq_first(&g_bio_run_up);
901 		if (bp == NULL) {
902 			CTR0(KTR_GEOM, "g_up going to sleep");
903 			msleep(&g_wait_up, &g_bio_run_up.bio_queue_lock,
904 			    PRIBIO | PDROP, "-", 0);
905 			continue;
906 		}
907 		g_bioq_unlock(&g_bio_run_up);
908 		THREAD_NO_SLEEPING();
909 		CTR4(KTR_GEOM, "g_up biodone bp %p provider %s off "
910 		    "%jd len %ld", bp, bp->bio_to->name,
911 		    bp->bio_offset, bp->bio_length);
912 		biodone(bp);
913 		THREAD_SLEEPING_OK();
914 	}
915 }
916 
917 void *
g_read_data(struct g_consumer * cp,off_t offset,off_t length,int * error)918 g_read_data(struct g_consumer *cp, off_t offset, off_t length, int *error)
919 {
920 	struct bio *bp;
921 	void *ptr;
922 	int errorc;
923 
924 	KASSERT(length > 0 && length >= cp->provider->sectorsize &&
925 	    length <= MAXPHYS, ("g_read_data(): invalid length %jd",
926 	    (intmax_t)length));
927 
928 	bp = g_alloc_bio();
929 	bp->bio_cmd = BIO_READ;
930 	bp->bio_done = NULL;
931 	bp->bio_offset = offset;
932 	bp->bio_length = length;
933 	ptr = g_malloc(length, M_WAITOK);
934 	bp->bio_data = ptr;
935 	g_io_request(bp, cp);
936 	errorc = biowait(bp, "gread");
937 	if (error != NULL)
938 		*error = errorc;
939 	g_destroy_bio(bp);
940 	if (errorc) {
941 		g_free(ptr);
942 		ptr = NULL;
943 	}
944 	return (ptr);
945 }
946 
947 /*
948  * A read function for use by ffs_sbget when used by GEOM-layer routines.
949  */
950 int
g_use_g_read_data(void * devfd,off_t loc,void ** bufp,int size)951 g_use_g_read_data(void *devfd, off_t loc, void **bufp, int size)
952 {
953 	struct g_consumer *cp;
954 
955 	KASSERT(*bufp == NULL,
956 	    ("g_use_g_read_data: non-NULL *bufp %p\n", *bufp));
957 
958 	cp = (struct g_consumer *)devfd;
959 	/*
960 	 * Take care not to issue an invalid I/O request. The offset of
961 	 * the superblock candidate must be multiples of the provider's
962 	 * sector size, otherwise an FFS can't exist on the provider
963 	 * anyway.
964 	 */
965 	if (loc % cp->provider->sectorsize != 0)
966 		return (ENOENT);
967 	*bufp = g_read_data(cp, loc, size, NULL);
968 	if (*bufp == NULL)
969 		return (ENOENT);
970 	return (0);
971 }
972 
973 int
g_write_data(struct g_consumer * cp,off_t offset,void * ptr,off_t length)974 g_write_data(struct g_consumer *cp, off_t offset, void *ptr, off_t length)
975 {
976 	struct bio *bp;
977 	int error;
978 
979 	KASSERT(length > 0 && length >= cp->provider->sectorsize &&
980 	    length <= MAXPHYS, ("g_write_data(): invalid length %jd",
981 	    (intmax_t)length));
982 
983 	bp = g_alloc_bio();
984 	bp->bio_cmd = BIO_WRITE;
985 	bp->bio_done = NULL;
986 	bp->bio_offset = offset;
987 	bp->bio_length = length;
988 	bp->bio_data = ptr;
989 	g_io_request(bp, cp);
990 	error = biowait(bp, "gwrite");
991 	g_destroy_bio(bp);
992 	return (error);
993 }
994 
995 /*
996  * A write function for use by ffs_sbput when used by GEOM-layer routines.
997  */
998 int
g_use_g_write_data(void * devfd,off_t loc,void * buf,int size)999 g_use_g_write_data(void *devfd, off_t loc, void *buf, int size)
1000 {
1001 
1002 	return (g_write_data((struct g_consumer *)devfd, loc, buf, size));
1003 }
1004 
1005 int
g_delete_data(struct g_consumer * cp,off_t offset,off_t length)1006 g_delete_data(struct g_consumer *cp, off_t offset, off_t length)
1007 {
1008 	struct bio *bp;
1009 	int error;
1010 
1011 	KASSERT(length > 0 && length >= cp->provider->sectorsize,
1012 	    ("g_delete_data(): invalid length %jd", (intmax_t)length));
1013 
1014 	bp = g_alloc_bio();
1015 	bp->bio_cmd = BIO_DELETE;
1016 	bp->bio_done = NULL;
1017 	bp->bio_offset = offset;
1018 	bp->bio_length = length;
1019 	bp->bio_data = NULL;
1020 	g_io_request(bp, cp);
1021 	error = biowait(bp, "gdelete");
1022 	g_destroy_bio(bp);
1023 	return (error);
1024 }
1025 
1026 void
g_print_bio(struct bio * bp)1027 g_print_bio(struct bio *bp)
1028 {
1029 	const char *pname, *cmd = NULL;
1030 
1031 	if (bp->bio_to != NULL)
1032 		pname = bp->bio_to->name;
1033 	else
1034 		pname = "[unknown]";
1035 
1036 	switch (bp->bio_cmd) {
1037 	case BIO_GETATTR:
1038 		cmd = "GETATTR";
1039 		printf("%s[%s(attr=%s)]", pname, cmd, bp->bio_attribute);
1040 		return;
1041 	case BIO_FLUSH:
1042 		cmd = "FLUSH";
1043 		printf("%s[%s]", pname, cmd);
1044 		return;
1045 	case BIO_ZONE: {
1046 		char *subcmd = NULL;
1047 		cmd = "ZONE";
1048 		switch (bp->bio_zone.zone_cmd) {
1049 		case DISK_ZONE_OPEN:
1050 			subcmd = "OPEN";
1051 			break;
1052 		case DISK_ZONE_CLOSE:
1053 			subcmd = "CLOSE";
1054 			break;
1055 		case DISK_ZONE_FINISH:
1056 			subcmd = "FINISH";
1057 			break;
1058 		case DISK_ZONE_RWP:
1059 			subcmd = "RWP";
1060 			break;
1061 		case DISK_ZONE_REPORT_ZONES:
1062 			subcmd = "REPORT ZONES";
1063 			break;
1064 		case DISK_ZONE_GET_PARAMS:
1065 			subcmd = "GET PARAMS";
1066 			break;
1067 		default:
1068 			subcmd = "UNKNOWN";
1069 			break;
1070 		}
1071 		printf("%s[%s,%s]", pname, cmd, subcmd);
1072 		return;
1073 	}
1074 	case BIO_READ:
1075 		cmd = "READ";
1076 		break;
1077 	case BIO_WRITE:
1078 		cmd = "WRITE";
1079 		break;
1080 	case BIO_DELETE:
1081 		cmd = "DELETE";
1082 		break;
1083 	default:
1084 		cmd = "UNKNOWN";
1085 		printf("%s[%s()]", pname, cmd);
1086 		return;
1087 	}
1088 	printf("%s[%s(offset=%jd, length=%jd)]", pname, cmd,
1089 	    (intmax_t)bp->bio_offset, (intmax_t)bp->bio_length);
1090 }
1091