1 /*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 2002 Poul-Henning Kamp
5 * Copyright (c) 2002 Networks Associates Technology, Inc.
6 * Copyright (c) 2013 The FreeBSD Foundation
7 * All rights reserved.
8 *
9 * This software was developed for the FreeBSD Project by Poul-Henning Kamp
10 * and NAI Labs, the Security Research Division of Network Associates, Inc.
11 * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the
12 * DARPA CHATS research program.
13 *
14 * Portions of this software were developed by Konstantin Belousov
15 * under sponsorship from the FreeBSD Foundation.
16 *
17 * Redistribution and use in source and binary forms, with or without
18 * modification, are permitted provided that the following conditions
19 * are met:
20 * 1. Redistributions of source code must retain the above copyright
21 * notice, this list of conditions and the following disclaimer.
22 * 2. Redistributions in binary form must reproduce the above copyright
23 * notice, this list of conditions and the following disclaimer in the
24 * documentation and/or other materials provided with the distribution.
25 * 3. The names of the authors may not be used to endorse or promote
26 * products derived from this software without specific prior written
27 * permission.
28 *
29 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
30 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
31 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
32 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
33 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
34 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
35 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
36 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
37 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
38 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
39 * SUCH DAMAGE.
40 */
41
42 #include <sys/cdefs.h>
43 __FBSDID("$FreeBSD: stable/12/sys/geom/geom_io.c 373214 2023-09-20 07:11:23Z dim $");
44
45 #include <sys/param.h>
46 #include <sys/systm.h>
47 #include <sys/kernel.h>
48 #include <sys/malloc.h>
49 #include <sys/bio.h>
50 #include <sys/ktr.h>
51 #include <sys/proc.h>
52 #include <sys/stack.h>
53 #include <sys/sysctl.h>
54 #include <sys/vmem.h>
55
56 #include <sys/errno.h>
57 #include <geom/geom.h>
58 #include <geom/geom_int.h>
59 #include <sys/devicestat.h>
60
61 #include <vm/uma.h>
62 #include <vm/vm.h>
63 #include <vm/vm_param.h>
64 #include <vm/vm_kern.h>
65 #include <vm/vm_page.h>
66 #include <vm/vm_object.h>
67 #include <vm/vm_extern.h>
68 #include <vm/vm_map.h>
69
/*
 * Evaluates to true only when the KTR_GEOM bit is set in both
 * KTR_COMPILE and ktr_mask.
 */
#define	KTR_GEOM_ENABLED						\
	((KTR_GEOM & KTR_COMPILE) != 0 && (KTR_GEOM & ktr_mask) != 0)
72
73 static int g_io_transient_map_bio(struct bio *bp);
74
75 static struct g_bioq g_bio_run_down;
76 static struct g_bioq g_bio_run_up;
77
/*
 * Pace is a hint that we've had some trouble recently allocating
 * bios, so we should back off trying to send I/O down the stack
 * a bit to let the problem resolve.  When pacing, we also turn
 * off direct dispatch to also reduce memory pressure from I/Os
 * there, at the expense of some added latency while the memory
 * pressures exist.  See g_io_schedule_down() for more details
 * and limitations.
 */
87 static volatile u_int __read_mostly pace;
88
89 static uma_zone_t __read_mostly biozone;
90
91 /*
92 * The head of the list of classifiers used in g_io_request.
93 * Use g_register_classifier() and g_unregister_classifier()
94 * to add/remove entries to the list.
95 * Classifiers are invoked in registration order.
96 */
97 static TAILQ_HEAD(, g_classifier_hook) g_classifier_tailq __read_mostly =
98 TAILQ_HEAD_INITIALIZER(g_classifier_tailq);
99
100 #include <machine/atomic.h>
101
102 static void
g_bioq_lock(struct g_bioq * bq)103 g_bioq_lock(struct g_bioq *bq)
104 {
105
106 mtx_lock(&bq->bio_queue_lock);
107 }
108
109 static void
g_bioq_unlock(struct g_bioq * bq)110 g_bioq_unlock(struct g_bioq *bq)
111 {
112
113 mtx_unlock(&bq->bio_queue_lock);
114 }
115
#if 0
/* Compiled out: destroy the mutex belonging to bio queue bq. */
static void
g_bioq_destroy(struct g_bioq *bq)
{
	struct mtx *qlock;

	qlock = &bq->bio_queue_lock;
	mtx_destroy(qlock);
}
#endif
124
125 static void
g_bioq_init(struct g_bioq * bq)126 g_bioq_init(struct g_bioq *bq)
127 {
128
129 TAILQ_INIT(&bq->bio_queue);
130 mtx_init(&bq->bio_queue_lock, "bio queue", NULL, MTX_DEF);
131 }
132
133 static struct bio *
g_bioq_first(struct g_bioq * bq)134 g_bioq_first(struct g_bioq *bq)
135 {
136 struct bio *bp;
137
138 bp = TAILQ_FIRST(&bq->bio_queue);
139 if (bp != NULL) {
140 KASSERT((bp->bio_flags & BIO_ONQUEUE),
141 ("Bio not on queue bp=%p target %p", bp, bq));
142 bp->bio_flags &= ~BIO_ONQUEUE;
143 TAILQ_REMOVE(&bq->bio_queue, bp, bio_queue);
144 bq->bio_queue_length--;
145 }
146 return (bp);
147 }
148
149 struct bio *
g_new_bio(void)150 g_new_bio(void)
151 {
152 struct bio *bp;
153
154 bp = uma_zalloc(biozone, M_NOWAIT | M_ZERO);
155 #ifdef KTR
156 if (KTR_GEOM_ENABLED) {
157 struct stack st;
158
159 CTR1(KTR_GEOM, "g_new_bio(): %p", bp);
160 stack_save(&st);
161 CTRSTACK(KTR_GEOM, &st, 3, 0);
162 }
163 #endif
164 return (bp);
165 }
166
167 struct bio *
g_alloc_bio(void)168 g_alloc_bio(void)
169 {
170 struct bio *bp;
171
172 bp = uma_zalloc(biozone, M_WAITOK | M_ZERO);
173 #ifdef KTR
174 if (KTR_GEOM_ENABLED) {
175 struct stack st;
176
177 CTR1(KTR_GEOM, "g_alloc_bio(): %p", bp);
178 stack_save(&st);
179 CTRSTACK(KTR_GEOM, &st, 3, 0);
180 }
181 #endif
182 return (bp);
183 }
184
185 void
g_destroy_bio(struct bio * bp)186 g_destroy_bio(struct bio *bp)
187 {
188 #ifdef KTR
189 if (KTR_GEOM_ENABLED) {
190 struct stack st;
191
192 CTR1(KTR_GEOM, "g_destroy_bio(): %p", bp);
193 stack_save(&st);
194 CTRSTACK(KTR_GEOM, &st, 3, 0);
195 }
196 #endif
197 uma_zfree(biozone, bp);
198 }
199
200 struct bio *
g_clone_bio(struct bio * bp)201 g_clone_bio(struct bio *bp)
202 {
203 struct bio *bp2;
204
205 bp2 = uma_zalloc(biozone, M_NOWAIT | M_ZERO);
206 if (bp2 != NULL) {
207 bp2->bio_parent = bp;
208 bp2->bio_cmd = bp->bio_cmd;
209 /*
210 * BIO_ORDERED flag may be used by disk drivers to enforce
211 * ordering restrictions, so this flag needs to be cloned.
212 * BIO_UNMAPPED and BIO_VLIST should be inherited, to properly
213 * indicate which way the buffer is passed.
214 * Other bio flags are not suitable for cloning.
215 */
216 bp2->bio_flags = bp->bio_flags &
217 (BIO_ORDERED | BIO_UNMAPPED | BIO_VLIST);
218 bp2->bio_length = bp->bio_length;
219 bp2->bio_offset = bp->bio_offset;
220 bp2->bio_data = bp->bio_data;
221 bp2->bio_ma = bp->bio_ma;
222 bp2->bio_ma_n = bp->bio_ma_n;
223 bp2->bio_ma_offset = bp->bio_ma_offset;
224 bp2->bio_attribute = bp->bio_attribute;
225 if (bp->bio_cmd == BIO_ZONE)
226 bcopy(&bp->bio_zone, &bp2->bio_zone,
227 sizeof(bp->bio_zone));
228 /* Inherit classification info from the parent */
229 bp2->bio_classifier1 = bp->bio_classifier1;
230 bp2->bio_classifier2 = bp->bio_classifier2;
231 #if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING)
232 bp2->bio_track_bp = bp->bio_track_bp;
233 #endif
234 bp->bio_children++;
235 }
236 #ifdef KTR
237 if (KTR_GEOM_ENABLED) {
238 struct stack st;
239
240 CTR2(KTR_GEOM, "g_clone_bio(%p): %p", bp, bp2);
241 stack_save(&st);
242 CTRSTACK(KTR_GEOM, &st, 3, 0);
243 }
244 #endif
245 return(bp2);
246 }
247
248 struct bio *
g_duplicate_bio(struct bio * bp)249 g_duplicate_bio(struct bio *bp)
250 {
251 struct bio *bp2;
252
253 bp2 = uma_zalloc(biozone, M_WAITOK | M_ZERO);
254 bp2->bio_flags = bp->bio_flags & (BIO_UNMAPPED | BIO_VLIST);
255 bp2->bio_parent = bp;
256 bp2->bio_cmd = bp->bio_cmd;
257 bp2->bio_length = bp->bio_length;
258 bp2->bio_offset = bp->bio_offset;
259 bp2->bio_data = bp->bio_data;
260 bp2->bio_ma = bp->bio_ma;
261 bp2->bio_ma_n = bp->bio_ma_n;
262 bp2->bio_ma_offset = bp->bio_ma_offset;
263 bp2->bio_attribute = bp->bio_attribute;
264 bp->bio_children++;
265 #ifdef KTR
266 if (KTR_GEOM_ENABLED) {
267 struct stack st;
268
269 CTR2(KTR_GEOM, "g_duplicate_bio(%p): %p", bp, bp2);
270 stack_save(&st);
271 CTRSTACK(KTR_GEOM, &st, 3, 0);
272 }
273 #endif
274 return(bp2);
275 }
276
277 void
g_reset_bio(struct bio * bp)278 g_reset_bio(struct bio *bp)
279 {
280
281 bzero(bp, sizeof(*bp));
282 }
283
284 void
g_io_init(void)285 g_io_init(void)
286 {
287
288 g_bioq_init(&g_bio_run_down);
289 g_bioq_init(&g_bio_run_up);
290 biozone = uma_zcreate("g_bio", sizeof (struct bio),
291 NULL, NULL,
292 NULL, NULL,
293 0, 0);
294 }
295
296 int
g_io_getattr(const char * attr,struct g_consumer * cp,int * len,void * ptr)297 g_io_getattr(const char *attr, struct g_consumer *cp, int *len, void *ptr)
298 {
299 struct bio *bp;
300 int error;
301
302 g_trace(G_T_BIO, "bio_getattr(%s)", attr);
303 bp = g_alloc_bio();
304 bp->bio_cmd = BIO_GETATTR;
305 bp->bio_done = NULL;
306 bp->bio_attribute = attr;
307 bp->bio_length = *len;
308 bp->bio_data = ptr;
309 g_io_request(bp, cp);
310 error = biowait(bp, "ggetattr");
311 *len = bp->bio_completed;
312 g_destroy_bio(bp);
313 return (error);
314 }
315
316 int
g_io_zonecmd(struct disk_zone_args * zone_args,struct g_consumer * cp)317 g_io_zonecmd(struct disk_zone_args *zone_args, struct g_consumer *cp)
318 {
319 struct bio *bp;
320 int error;
321
322 g_trace(G_T_BIO, "bio_zone(%d)", zone_args->zone_cmd);
323 bp = g_alloc_bio();
324 bp->bio_cmd = BIO_ZONE;
325 bp->bio_done = NULL;
326 /*
327 * XXX KDM need to handle report zone data.
328 */
329 bcopy(zone_args, &bp->bio_zone, sizeof(*zone_args));
330 if (zone_args->zone_cmd == DISK_ZONE_REPORT_ZONES)
331 bp->bio_length =
332 zone_args->zone_params.report.entries_allocated *
333 sizeof(struct disk_zone_rep_entry);
334 else
335 bp->bio_length = 0;
336
337 g_io_request(bp, cp);
338 error = biowait(bp, "gzone");
339 bcopy(&bp->bio_zone, zone_args, sizeof(*zone_args));
340 g_destroy_bio(bp);
341 return (error);
342 }
343
344 int
g_io_flush(struct g_consumer * cp)345 g_io_flush(struct g_consumer *cp)
346 {
347 struct bio *bp;
348 int error;
349
350 g_trace(G_T_BIO, "bio_flush(%s)", cp->provider->name);
351 bp = g_alloc_bio();
352 bp->bio_cmd = BIO_FLUSH;
353 bp->bio_flags |= BIO_ORDERED;
354 bp->bio_done = NULL;
355 bp->bio_attribute = NULL;
356 bp->bio_offset = cp->provider->mediasize;
357 bp->bio_length = 0;
358 bp->bio_data = NULL;
359 g_io_request(bp, cp);
360 error = biowait(bp, "gflush");
361 g_destroy_bio(bp);
362 return (error);
363 }
364
365 static int
g_io_check(struct bio * bp)366 g_io_check(struct bio *bp)
367 {
368 struct g_consumer *cp;
369 struct g_provider *pp;
370 off_t excess;
371 int error;
372
373 biotrack(bp, __func__);
374
375 cp = bp->bio_from;
376 pp = bp->bio_to;
377
378 /* Fail if access counters dont allow the operation */
379 switch(bp->bio_cmd) {
380 case BIO_READ:
381 case BIO_GETATTR:
382 if (cp->acr == 0)
383 return (EPERM);
384 break;
385 case BIO_WRITE:
386 case BIO_DELETE:
387 case BIO_FLUSH:
388 if (cp->acw == 0)
389 return (EPERM);
390 break;
391 case BIO_ZONE:
392 if ((bp->bio_zone.zone_cmd == DISK_ZONE_REPORT_ZONES) ||
393 (bp->bio_zone.zone_cmd == DISK_ZONE_GET_PARAMS)) {
394 if (cp->acr == 0)
395 return (EPERM);
396 } else if (cp->acw == 0)
397 return (EPERM);
398 break;
399 default:
400 return (EPERM);
401 }
402 /* if provider is marked for error, don't disturb. */
403 if (pp->error)
404 return (pp->error);
405 if (cp->flags & G_CF_ORPHAN)
406 return (ENXIO);
407
408 switch(bp->bio_cmd) {
409 case BIO_READ:
410 case BIO_WRITE:
411 case BIO_DELETE:
412 /* Zero sectorsize or mediasize is probably a lack of media. */
413 if (pp->sectorsize == 0 || pp->mediasize == 0)
414 return (ENXIO);
415 /* Reject I/O not on sector boundary */
416 if (bp->bio_offset % pp->sectorsize)
417 return (EINVAL);
418 /* Reject I/O not integral sector long */
419 if (bp->bio_length % pp->sectorsize)
420 return (EINVAL);
421 /* Reject requests before or past the end of media. */
422 if (bp->bio_offset < 0)
423 return (EIO);
424 if (bp->bio_offset > pp->mediasize)
425 return (EIO);
426
427 /* Truncate requests to the end of providers media. */
428 excess = bp->bio_offset + bp->bio_length;
429 if (excess > bp->bio_to->mediasize) {
430 KASSERT((bp->bio_flags & BIO_UNMAPPED) == 0 ||
431 round_page(bp->bio_ma_offset +
432 bp->bio_length) / PAGE_SIZE == bp->bio_ma_n,
433 ("excess bio %p too short", bp));
434 excess -= bp->bio_to->mediasize;
435 bp->bio_length -= excess;
436 if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
437 bp->bio_ma_n = round_page(bp->bio_ma_offset +
438 bp->bio_length) / PAGE_SIZE;
439 }
440 if (excess > 0)
441 CTR3(KTR_GEOM, "g_down truncated bio "
442 "%p provider %s by %d", bp,
443 bp->bio_to->name, excess);
444 }
445
446 /* Deliver zero length transfers right here. */
447 if (bp->bio_length == 0) {
448 CTR2(KTR_GEOM, "g_down terminated 0-length "
449 "bp %p provider %s", bp, bp->bio_to->name);
450 return (0);
451 }
452
453 if ((bp->bio_flags & BIO_UNMAPPED) != 0 &&
454 (bp->bio_to->flags & G_PF_ACCEPT_UNMAPPED) == 0 &&
455 (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE)) {
456 if ((error = g_io_transient_map_bio(bp)) >= 0)
457 return (error);
458 }
459 break;
460 default:
461 break;
462 }
463 return (EJUSTRETURN);
464 }
465
466 /*
467 * bio classification support.
468 *
469 * g_register_classifier() and g_unregister_classifier()
470 * are used to add/remove a classifier from the list.
471 * The list is protected using the g_bio_run_down lock,
472 * because the classifiers are called in this path.
473 *
474 * g_io_request() passes bio's that are not already classified
475 * (i.e. those with bio_classifier1 == NULL) to g_run_classifiers().
476 * Classifiers can store their result in the two fields
477 * bio_classifier1 and bio_classifier2.
478 * A classifier that updates one of the fields should
479 * return a non-zero value.
480 * If no classifier updates the field, g_run_classifiers() sets
481 * bio_classifier1 = BIO_NOTCLASSIFIED to avoid further calls.
482 */
483
484 int
g_register_classifier(struct g_classifier_hook * hook)485 g_register_classifier(struct g_classifier_hook *hook)
486 {
487
488 g_bioq_lock(&g_bio_run_down);
489 TAILQ_INSERT_TAIL(&g_classifier_tailq, hook, link);
490 g_bioq_unlock(&g_bio_run_down);
491
492 return (0);
493 }
494
495 void
g_unregister_classifier(struct g_classifier_hook * hook)496 g_unregister_classifier(struct g_classifier_hook *hook)
497 {
498 struct g_classifier_hook *entry;
499
500 g_bioq_lock(&g_bio_run_down);
501 TAILQ_FOREACH(entry, &g_classifier_tailq, link) {
502 if (entry == hook) {
503 TAILQ_REMOVE(&g_classifier_tailq, hook, link);
504 break;
505 }
506 }
507 g_bioq_unlock(&g_bio_run_down);
508 }
509
510 static void
g_run_classifiers(struct bio * bp)511 g_run_classifiers(struct bio *bp)
512 {
513 struct g_classifier_hook *hook;
514 int classified = 0;
515
516 biotrack(bp, __func__);
517
518 TAILQ_FOREACH(hook, &g_classifier_tailq, link)
519 classified |= hook->func(hook->arg, bp);
520
521 if (!classified)
522 bp->bio_classifier1 = BIO_NOTCLASSIFIED;
523 }
524
525 void
g_io_request(struct bio * bp,struct g_consumer * cp)526 g_io_request(struct bio *bp, struct g_consumer *cp)
527 {
528 struct g_provider *pp;
529 struct mtx *mtxp;
530 int direct, error, first;
531 uint8_t cmd;
532
533 biotrack(bp, __func__);
534
535 KASSERT(cp != NULL, ("NULL cp in g_io_request"));
536 KASSERT(bp != NULL, ("NULL bp in g_io_request"));
537 pp = cp->provider;
538 KASSERT(pp != NULL, ("consumer not attached in g_io_request"));
539 #ifdef DIAGNOSTIC
540 KASSERT(bp->bio_driver1 == NULL,
541 ("bio_driver1 used by the consumer (geom %s)", cp->geom->name));
542 KASSERT(bp->bio_driver2 == NULL,
543 ("bio_driver2 used by the consumer (geom %s)", cp->geom->name));
544 KASSERT(bp->bio_pflags == 0,
545 ("bio_pflags used by the consumer (geom %s)", cp->geom->name));
546 /*
547 * Remember consumer's private fields, so we can detect if they were
548 * modified by the provider.
549 */
550 bp->_bio_caller1 = bp->bio_caller1;
551 bp->_bio_caller2 = bp->bio_caller2;
552 bp->_bio_cflags = bp->bio_cflags;
553 #endif
554
555 cmd = bp->bio_cmd;
556 if (cmd == BIO_READ || cmd == BIO_WRITE || cmd == BIO_GETATTR) {
557 KASSERT(bp->bio_data != NULL,
558 ("NULL bp->data in g_io_request(cmd=%hu)", bp->bio_cmd));
559 }
560 if (cmd == BIO_DELETE || cmd == BIO_FLUSH) {
561 KASSERT(bp->bio_data == NULL,
562 ("non-NULL bp->data in g_io_request(cmd=%hu)",
563 bp->bio_cmd));
564 }
565 if (cmd == BIO_READ || cmd == BIO_WRITE || cmd == BIO_DELETE) {
566 KASSERT(bp->bio_offset % cp->provider->sectorsize == 0,
567 ("wrong offset %jd for sectorsize %u",
568 bp->bio_offset, cp->provider->sectorsize));
569 KASSERT(bp->bio_length % cp->provider->sectorsize == 0,
570 ("wrong length %jd for sectorsize %u",
571 bp->bio_length, cp->provider->sectorsize));
572 }
573
574 g_trace(G_T_BIO, "bio_request(%p) from %p(%s) to %p(%s) cmd %d",
575 bp, cp, cp->geom->name, pp, pp->name, bp->bio_cmd);
576
577 bp->bio_from = cp;
578 bp->bio_to = pp;
579 bp->bio_error = 0;
580 bp->bio_completed = 0;
581
582 KASSERT(!(bp->bio_flags & BIO_ONQUEUE),
583 ("Bio already on queue bp=%p", bp));
584 if ((g_collectstats & G_STATS_CONSUMERS) != 0 ||
585 ((g_collectstats & G_STATS_PROVIDERS) != 0 && pp->stat != NULL))
586 binuptime(&bp->bio_t0);
587 else
588 getbinuptime(&bp->bio_t0);
589
590 direct = (cp->flags & G_CF_DIRECT_SEND) != 0 &&
591 (pp->flags & G_PF_DIRECT_RECEIVE) != 0 &&
592 !g_is_geom_thread(curthread) &&
593 ((pp->flags & G_PF_ACCEPT_UNMAPPED) != 0 ||
594 (bp->bio_flags & BIO_UNMAPPED) == 0 || THREAD_CAN_SLEEP()) &&
595 pace == 0;
596 if (direct) {
597 /* Block direct execution if less then half of stack left. */
598 size_t st, su;
599 GET_STACK_USAGE(st, su);
600 if (su * 2 > st)
601 direct = 0;
602 }
603
604 if (!TAILQ_EMPTY(&g_classifier_tailq) && !bp->bio_classifier1) {
605 g_bioq_lock(&g_bio_run_down);
606 g_run_classifiers(bp);
607 g_bioq_unlock(&g_bio_run_down);
608 }
609
610 /*
611 * The statistics collection is lockless, as such, but we
612 * can not update one instance of the statistics from more
613 * than one thread at a time, so grab the lock first.
614 */
615 mtxp = mtx_pool_find(mtxpool_sleep, pp);
616 mtx_lock(mtxp);
617 if (g_collectstats & G_STATS_PROVIDERS)
618 devstat_start_transaction_bio_t0(pp->stat, bp);
619 if (g_collectstats & G_STATS_CONSUMERS)
620 devstat_start_transaction_bio_t0(cp->stat, bp);
621 pp->nstart++;
622 cp->nstart++;
623 mtx_unlock(mtxp);
624
625 if (direct) {
626 error = g_io_check(bp);
627 if (error >= 0) {
628 CTR3(KTR_GEOM, "g_io_request g_io_check on bp %p "
629 "provider %s returned %d", bp, bp->bio_to->name,
630 error);
631 g_io_deliver(bp, error);
632 return;
633 }
634 bp->bio_to->geom->start(bp);
635 } else {
636 g_bioq_lock(&g_bio_run_down);
637 first = TAILQ_EMPTY(&g_bio_run_down.bio_queue);
638 TAILQ_INSERT_TAIL(&g_bio_run_down.bio_queue, bp, bio_queue);
639 bp->bio_flags |= BIO_ONQUEUE;
640 g_bio_run_down.bio_queue_length++;
641 g_bioq_unlock(&g_bio_run_down);
642 /* Pass it on down. */
643 if (first)
644 wakeup(&g_wait_down);
645 }
646 }
647
648 void
g_io_deliver(struct bio * bp,int error)649 g_io_deliver(struct bio *bp, int error)
650 {
651 struct bintime now;
652 struct g_consumer *cp;
653 struct g_provider *pp;
654 struct mtx *mtxp;
655 int direct, first;
656
657 biotrack(bp, __func__);
658
659 KASSERT(bp != NULL, ("NULL bp in g_io_deliver"));
660 pp = bp->bio_to;
661 KASSERT(pp != NULL, ("NULL bio_to in g_io_deliver"));
662 cp = bp->bio_from;
663 if (cp == NULL) {
664 bp->bio_error = error;
665 bp->bio_done(bp);
666 return;
667 }
668 KASSERT(cp != NULL, ("NULL bio_from in g_io_deliver"));
669 KASSERT(cp->geom != NULL, ("NULL bio_from->geom in g_io_deliver"));
670 #ifdef DIAGNOSTIC
671 /*
672 * Some classes - GJournal in particular - can modify bio's
673 * private fields while the bio is in transit; G_GEOM_VOLATILE_BIO
674 * flag means it's an expected behaviour for that particular geom.
675 */
676 if ((cp->geom->flags & G_GEOM_VOLATILE_BIO) == 0) {
677 KASSERT(bp->bio_caller1 == bp->_bio_caller1,
678 ("bio_caller1 used by the provider %s", pp->name));
679 KASSERT(bp->bio_caller2 == bp->_bio_caller2,
680 ("bio_caller2 used by the provider %s", pp->name));
681 KASSERT(bp->bio_cflags == bp->_bio_cflags,
682 ("bio_cflags used by the provider %s", pp->name));
683 }
684 #endif
685 KASSERT(bp->bio_completed >= 0, ("bio_completed can't be less than 0"));
686 KASSERT(bp->bio_completed <= bp->bio_length,
687 ("bio_completed can't be greater than bio_length"));
688
689 g_trace(G_T_BIO,
690 "g_io_deliver(%p) from %p(%s) to %p(%s) cmd %d error %d off %jd len %jd",
691 bp, cp, cp->geom->name, pp, pp->name, bp->bio_cmd, error,
692 (intmax_t)bp->bio_offset, (intmax_t)bp->bio_length);
693
694 KASSERT(!(bp->bio_flags & BIO_ONQUEUE),
695 ("Bio already on queue bp=%p", bp));
696
697 /*
698 * XXX: next two doesn't belong here
699 */
700 bp->bio_bcount = bp->bio_length;
701 bp->bio_resid = bp->bio_bcount - bp->bio_completed;
702
703 direct = (pp->flags & G_PF_DIRECT_SEND) &&
704 (cp->flags & G_CF_DIRECT_RECEIVE) &&
705 !g_is_geom_thread(curthread);
706 if (direct) {
707 /* Block direct execution if less then half of stack left. */
708 size_t st, su;
709 GET_STACK_USAGE(st, su);
710 if (su * 2 > st)
711 direct = 0;
712 }
713
714 /*
715 * The statistics collection is lockless, as such, but we
716 * can not update one instance of the statistics from more
717 * than one thread at a time, so grab the lock first.
718 */
719 if ((g_collectstats & G_STATS_CONSUMERS) != 0 ||
720 ((g_collectstats & G_STATS_PROVIDERS) != 0 && pp->stat != NULL))
721 binuptime(&now);
722 mtxp = mtx_pool_find(mtxpool_sleep, cp);
723 mtx_lock(mtxp);
724 if (g_collectstats & G_STATS_PROVIDERS)
725 devstat_end_transaction_bio_bt(pp->stat, bp, &now);
726 if (g_collectstats & G_STATS_CONSUMERS)
727 devstat_end_transaction_bio_bt(cp->stat, bp, &now);
728 cp->nend++;
729 pp->nend++;
730 mtx_unlock(mtxp);
731
732 if (error != ENOMEM) {
733 bp->bio_error = error;
734 if (direct) {
735 biodone(bp);
736 } else {
737 g_bioq_lock(&g_bio_run_up);
738 first = TAILQ_EMPTY(&g_bio_run_up.bio_queue);
739 TAILQ_INSERT_TAIL(&g_bio_run_up.bio_queue, bp, bio_queue);
740 bp->bio_flags |= BIO_ONQUEUE;
741 g_bio_run_up.bio_queue_length++;
742 g_bioq_unlock(&g_bio_run_up);
743 if (first)
744 wakeup(&g_wait_up);
745 }
746 return;
747 }
748
749 if (bootverbose)
750 printf("ENOMEM %p on %p(%s)\n", bp, pp, pp->name);
751 bp->bio_children = 0;
752 bp->bio_inbed = 0;
753 bp->bio_driver1 = NULL;
754 bp->bio_driver2 = NULL;
755 bp->bio_pflags = 0;
756 g_io_request(bp, cp);
757 pace = 1;
758 return;
759 }
760
761 SYSCTL_DECL(_kern_geom);
762
763 static long transient_maps;
764 SYSCTL_LONG(_kern_geom, OID_AUTO, transient_maps, CTLFLAG_RD,
765 &transient_maps, 0,
766 "Total count of the transient mapping requests");
767 u_int transient_map_retries = 10;
768 SYSCTL_UINT(_kern_geom, OID_AUTO, transient_map_retries, CTLFLAG_RW,
769 &transient_map_retries, 0,
770 "Max count of retries used before giving up on creating transient map");
771 int transient_map_hard_failures;
772 SYSCTL_INT(_kern_geom, OID_AUTO, transient_map_hard_failures, CTLFLAG_RD,
773 &transient_map_hard_failures, 0,
774 "Failures to establish the transient mapping due to retry attempts "
775 "exhausted");
776 int transient_map_soft_failures;
777 SYSCTL_INT(_kern_geom, OID_AUTO, transient_map_soft_failures, CTLFLAG_RD,
778 &transient_map_soft_failures, 0,
779 "Count of retried failures to establish the transient mapping");
780 int inflight_transient_maps;
781 SYSCTL_INT(_kern_geom, OID_AUTO, inflight_transient_maps, CTLFLAG_RD,
782 &inflight_transient_maps, 0,
783 "Current count of the active transient maps");
784
785 static int
g_io_transient_map_bio(struct bio * bp)786 g_io_transient_map_bio(struct bio *bp)
787 {
788 vm_offset_t addr;
789 long size;
790 u_int retried;
791
792 KASSERT(unmapped_buf_allowed, ("unmapped disabled"));
793
794 size = round_page(bp->bio_ma_offset + bp->bio_length);
795 KASSERT(size / PAGE_SIZE == bp->bio_ma_n, ("Bio too short %p", bp));
796 addr = 0;
797 retried = 0;
798 atomic_add_long(&transient_maps, 1);
799 retry:
800 if (vmem_alloc(transient_arena, size, M_BESTFIT | M_NOWAIT, &addr)) {
801 if (transient_map_retries != 0 &&
802 retried >= transient_map_retries) {
803 CTR2(KTR_GEOM, "g_down cannot map bp %p provider %s",
804 bp, bp->bio_to->name);
805 atomic_add_int(&transient_map_hard_failures, 1);
806 return (EDEADLK/* XXXKIB */);
807 } else {
808 /*
809 * Naive attempt to quisce the I/O to get more
810 * in-flight requests completed and defragment
811 * the transient_arena.
812 */
813 CTR3(KTR_GEOM, "g_down retrymap bp %p provider %s r %d",
814 bp, bp->bio_to->name, retried);
815 pause("g_d_tra", hz / 10);
816 retried++;
817 atomic_add_int(&transient_map_soft_failures, 1);
818 goto retry;
819 }
820 }
821 atomic_add_int(&inflight_transient_maps, 1);
822 pmap_qenter((vm_offset_t)addr, bp->bio_ma, OFF_TO_IDX(size));
823 bp->bio_data = (caddr_t)addr + bp->bio_ma_offset;
824 bp->bio_flags |= BIO_TRANSIENT_MAPPING;
825 bp->bio_flags &= ~BIO_UNMAPPED;
826 return (EJUSTRETURN);
827 }
828
829 void
g_io_schedule_down(struct thread * tp __unused)830 g_io_schedule_down(struct thread *tp __unused)
831 {
832 struct bio *bp;
833 int error;
834
835 for(;;) {
836 g_bioq_lock(&g_bio_run_down);
837 bp = g_bioq_first(&g_bio_run_down);
838 if (bp == NULL) {
839 CTR0(KTR_GEOM, "g_down going to sleep");
840 msleep(&g_wait_down, &g_bio_run_down.bio_queue_lock,
841 PRIBIO | PDROP, "-", 0);
842 continue;
843 }
844 CTR0(KTR_GEOM, "g_down has work to do");
845 g_bioq_unlock(&g_bio_run_down);
846 biotrack(bp, __func__);
847 if (pace != 0) {
848 /*
849 * There has been at least one memory allocation
850 * failure since the last I/O completed. Pause 1ms to
851 * give the system a chance to free up memory. We only
852 * do this once because a large number of allocations
853 * can fail in the direct dispatch case and there's no
854 * relationship between the number of these failures and
855 * the length of the outage. If there's still an outage,
856 * we'll pause again and again until it's
857 * resolved. Older versions paused longer and once per
858 * allocation failure. This was OK for a single threaded
859 * g_down, but with direct dispatch would lead to max of
860 * 10 IOPs for minutes at a time when transient memory
861 * issues prevented allocation for a batch of requests
862 * from the upper layers.
863 *
864 * XXX This pacing is really lame. It needs to be solved
865 * by other methods. This is OK only because the worst
866 * case scenario is so rare. In the worst case scenario
867 * all memory is tied up waiting for I/O to complete
868 * which can never happen since we can't allocate bios
869 * for that I/O.
870 */
871 CTR0(KTR_GEOM, "g_down pacing self");
872 pause("g_down", min(hz/1000, 1));
873 pace = 0;
874 }
875 CTR2(KTR_GEOM, "g_down processing bp %p provider %s", bp,
876 bp->bio_to->name);
877 error = g_io_check(bp);
878 if (error >= 0) {
879 CTR3(KTR_GEOM, "g_down g_io_check on bp %p provider "
880 "%s returned %d", bp, bp->bio_to->name, error);
881 g_io_deliver(bp, error);
882 continue;
883 }
884 THREAD_NO_SLEEPING();
885 CTR4(KTR_GEOM, "g_down starting bp %p provider %s off %ld "
886 "len %ld", bp, bp->bio_to->name, bp->bio_offset,
887 bp->bio_length);
888 bp->bio_to->geom->start(bp);
889 THREAD_SLEEPING_OK();
890 }
891 }
892
893 void
g_io_schedule_up(struct thread * tp __unused)894 g_io_schedule_up(struct thread *tp __unused)
895 {
896 struct bio *bp;
897
898 for(;;) {
899 g_bioq_lock(&g_bio_run_up);
900 bp = g_bioq_first(&g_bio_run_up);
901 if (bp == NULL) {
902 CTR0(KTR_GEOM, "g_up going to sleep");
903 msleep(&g_wait_up, &g_bio_run_up.bio_queue_lock,
904 PRIBIO | PDROP, "-", 0);
905 continue;
906 }
907 g_bioq_unlock(&g_bio_run_up);
908 THREAD_NO_SLEEPING();
909 CTR4(KTR_GEOM, "g_up biodone bp %p provider %s off "
910 "%jd len %ld", bp, bp->bio_to->name,
911 bp->bio_offset, bp->bio_length);
912 biodone(bp);
913 THREAD_SLEEPING_OK();
914 }
915 }
916
917 void *
g_read_data(struct g_consumer * cp,off_t offset,off_t length,int * error)918 g_read_data(struct g_consumer *cp, off_t offset, off_t length, int *error)
919 {
920 struct bio *bp;
921 void *ptr;
922 int errorc;
923
924 KASSERT(length > 0 && length >= cp->provider->sectorsize &&
925 length <= MAXPHYS, ("g_read_data(): invalid length %jd",
926 (intmax_t)length));
927
928 bp = g_alloc_bio();
929 bp->bio_cmd = BIO_READ;
930 bp->bio_done = NULL;
931 bp->bio_offset = offset;
932 bp->bio_length = length;
933 ptr = g_malloc(length, M_WAITOK);
934 bp->bio_data = ptr;
935 g_io_request(bp, cp);
936 errorc = biowait(bp, "gread");
937 if (error != NULL)
938 *error = errorc;
939 g_destroy_bio(bp);
940 if (errorc) {
941 g_free(ptr);
942 ptr = NULL;
943 }
944 return (ptr);
945 }
946
947 /*
948 * A read function for use by ffs_sbget when used by GEOM-layer routines.
949 */
950 int
g_use_g_read_data(void * devfd,off_t loc,void ** bufp,int size)951 g_use_g_read_data(void *devfd, off_t loc, void **bufp, int size)
952 {
953 struct g_consumer *cp;
954
955 KASSERT(*bufp == NULL,
956 ("g_use_g_read_data: non-NULL *bufp %p\n", *bufp));
957
958 cp = (struct g_consumer *)devfd;
959 /*
960 * Take care not to issue an invalid I/O request. The offset of
961 * the superblock candidate must be multiples of the provider's
962 * sector size, otherwise an FFS can't exist on the provider
963 * anyway.
964 */
965 if (loc % cp->provider->sectorsize != 0)
966 return (ENOENT);
967 *bufp = g_read_data(cp, loc, size, NULL);
968 if (*bufp == NULL)
969 return (ENOENT);
970 return (0);
971 }
972
973 int
g_write_data(struct g_consumer * cp,off_t offset,void * ptr,off_t length)974 g_write_data(struct g_consumer *cp, off_t offset, void *ptr, off_t length)
975 {
976 struct bio *bp;
977 int error;
978
979 KASSERT(length > 0 && length >= cp->provider->sectorsize &&
980 length <= MAXPHYS, ("g_write_data(): invalid length %jd",
981 (intmax_t)length));
982
983 bp = g_alloc_bio();
984 bp->bio_cmd = BIO_WRITE;
985 bp->bio_done = NULL;
986 bp->bio_offset = offset;
987 bp->bio_length = length;
988 bp->bio_data = ptr;
989 g_io_request(bp, cp);
990 error = biowait(bp, "gwrite");
991 g_destroy_bio(bp);
992 return (error);
993 }
994
995 /*
996 * A write function for use by ffs_sbput when used by GEOM-layer routines.
997 */
998 int
g_use_g_write_data(void * devfd,off_t loc,void * buf,int size)999 g_use_g_write_data(void *devfd, off_t loc, void *buf, int size)
1000 {
1001
1002 return (g_write_data((struct g_consumer *)devfd, loc, buf, size));
1003 }
1004
1005 int
g_delete_data(struct g_consumer * cp,off_t offset,off_t length)1006 g_delete_data(struct g_consumer *cp, off_t offset, off_t length)
1007 {
1008 struct bio *bp;
1009 int error;
1010
1011 KASSERT(length > 0 && length >= cp->provider->sectorsize,
1012 ("g_delete_data(): invalid length %jd", (intmax_t)length));
1013
1014 bp = g_alloc_bio();
1015 bp->bio_cmd = BIO_DELETE;
1016 bp->bio_done = NULL;
1017 bp->bio_offset = offset;
1018 bp->bio_length = length;
1019 bp->bio_data = NULL;
1020 g_io_request(bp, cp);
1021 error = biowait(bp, "gdelete");
1022 g_destroy_bio(bp);
1023 return (error);
1024 }
1025
1026 void
g_print_bio(struct bio * bp)1027 g_print_bio(struct bio *bp)
1028 {
1029 const char *pname, *cmd = NULL;
1030
1031 if (bp->bio_to != NULL)
1032 pname = bp->bio_to->name;
1033 else
1034 pname = "[unknown]";
1035
1036 switch (bp->bio_cmd) {
1037 case BIO_GETATTR:
1038 cmd = "GETATTR";
1039 printf("%s[%s(attr=%s)]", pname, cmd, bp->bio_attribute);
1040 return;
1041 case BIO_FLUSH:
1042 cmd = "FLUSH";
1043 printf("%s[%s]", pname, cmd);
1044 return;
1045 case BIO_ZONE: {
1046 char *subcmd = NULL;
1047 cmd = "ZONE";
1048 switch (bp->bio_zone.zone_cmd) {
1049 case DISK_ZONE_OPEN:
1050 subcmd = "OPEN";
1051 break;
1052 case DISK_ZONE_CLOSE:
1053 subcmd = "CLOSE";
1054 break;
1055 case DISK_ZONE_FINISH:
1056 subcmd = "FINISH";
1057 break;
1058 case DISK_ZONE_RWP:
1059 subcmd = "RWP";
1060 break;
1061 case DISK_ZONE_REPORT_ZONES:
1062 subcmd = "REPORT ZONES";
1063 break;
1064 case DISK_ZONE_GET_PARAMS:
1065 subcmd = "GET PARAMS";
1066 break;
1067 default:
1068 subcmd = "UNKNOWN";
1069 break;
1070 }
1071 printf("%s[%s,%s]", pname, cmd, subcmd);
1072 return;
1073 }
1074 case BIO_READ:
1075 cmd = "READ";
1076 break;
1077 case BIO_WRITE:
1078 cmd = "WRITE";
1079 break;
1080 case BIO_DELETE:
1081 cmd = "DELETE";
1082 break;
1083 default:
1084 cmd = "UNKNOWN";
1085 printf("%s[%s()]", pname, cmd);
1086 return;
1087 }
1088 printf("%s[%s(offset=%jd, length=%jd)]", pname, cmd,
1089 (intmax_t)bp->bio_offset, (intmax_t)bp->bio_length);
1090 }
1091