1 /*-
2 * Copyright (c) 2009-2010 Fabio Checconi
3 * Copyright (c) 2009-2010 Luigi Rizzo, Universita` di Pisa
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25 * SUCH DAMAGE.
26 */
27
28 /*
29 * $Id$
30 * $FreeBSD: stable/10/sys/geom/sched/g_sched.c 243333 2012-11-20 12:32:18Z jh $
31 *
32 * Main control module for geom-based disk schedulers ('sched').
33 *
34 * USER VIEW
35 * A 'sched' node is typically inserted transparently between
36 * an existing provider pp and its original geom gp
37 *
38 * [pp --> gp ..]
39 *
40 * using the command "geom sched insert <provider>" and
41 * resulting in the following topology
42 *
43 * [pp --> sched_gp --> cp] [new_pp --> gp ... ]
44 *
45 * Deletion "geom sched destroy <provider>.sched." restores the
46 * original chain. The normal "geom sched create <provider>"
47 * is also supported.
48 *
49 * INTERNALS
50 * Internally, the 'sched' uses the following data structures
51 *
52 * geom{} g_sched_softc{} g_gsched{}
53 * +----------+ +---------------+ +-------------+
54 * | softc *-|--->| sc_gsched *-|-->| gs_init |
55 * | ... | | | | gs_fini |
56 * | | | [ hash table] | | gs_start |
57 * +----------+ | | | ... |
58 * | | +-------------+
59 * | |
60 * | | g_*_softc{}
61 * | | +-------------+
62 * | sc_data *-|-->| |
63 * +---------------+ | algorithm- |
64 * | specific |
65 * +-------------+
66 *
67 * A g_sched_softc{} is created with a "geom sched insert" call.
68 * In turn this instantiates a specific scheduling algorithm,
69 * which sets sc_gsched to point to the algorithm callbacks,
70 * and calls gs_init() to create the g_*_softc{} .
71 * The other callbacks (gs_start, gs_next, ...) are invoked
72 * as needed
73 *
74 * g_sched_softc{} is defined in g_sched.h and mostly used here;
75 * g_gsched{}, and the gs_callbacks, are documented in gs_scheduler.h;
76 * g_*_softc{} is defined/implemented by each algorithm (gs_*.c)
77 *
78 * DATA MOVING
79 * When a bio is received on the provider, it goes to the
80 * g_sched_start() which calls gs_start() to initially queue it;
81 * then we call g_sched_dispatch() that loops around gs_next()
82 * to select zero or more bio's to be sent downstream.
83 *
84 * g_sched_dispatch() can also be called as a result of a timeout,
85 * e.g. when doing anticipation or pacing requests.
86 *
87 * When a bio comes back, it goes to g_sched_done() which in turn
88 * calls gs_done(). The latter does any necessary housekeeping in
89 * the scheduling algorithm, and may decide to call g_sched_dispatch()
90 * to send more bio's downstream.
91 *
92 * If an algorithm needs per-flow queues, these are created
93 * calling gs_init_class() and destroyed with gs_fini_class(),
94 * and they are also inserted in the hash table implemented in
95 * the g_sched_softc{}
96 *
97 * If an algorithm is replaced, or a transparently-inserted node is
98 * removed with "geom sched destroy", we need to remove all references
99 * to the g_*_softc{} and g_sched_softc from the bio's still in
100 * the scheduler. g_sched_forced_dispatch() helps doing this.
101 * XXX need to explain better.
102 */
103
104 #include <sys/cdefs.h>
105 #include <sys/param.h>
106 #include <sys/systm.h>
107 #include <sys/kernel.h>
108 #include <sys/module.h>
109 #include <sys/lock.h>
110 #include <sys/mutex.h>
111 #include <sys/bio.h>
112 #include <sys/limits.h>
113 #include <sys/hash.h>
114 #include <sys/sbuf.h>
115 #include <sys/sysctl.h>
116 #include <sys/malloc.h>
117 #include <sys/proc.h> /* we access curthread */
118 #include <geom/geom.h>
119 #include "gs_scheduler.h"
120 #include "g_sched.h" /* geom hooks */
121
122 /*
123 * Size of the per-geom hash table storing traffic classes.
124 * We may decide to change it at a later time, it has no ABI
125 * implications as it is only used for run-time allocations.
126 */
127 #define G_SCHED_HASH_SIZE 32
128
129 static int g_sched_destroy(struct g_geom *gp, boolean_t force);
130 static int g_sched_destroy_geom(struct gctl_req *req,
131 struct g_class *mp, struct g_geom *gp);
132 static void g_sched_config(struct gctl_req *req, struct g_class *mp,
133 const char *verb);
134 static struct g_geom *g_sched_taste(struct g_class *mp,
135 struct g_provider *pp, int flags __unused);
136 static void g_sched_dumpconf(struct sbuf *sb, const char *indent,
137 struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp);
138 static void g_sched_init(struct g_class *mp);
139 static void g_sched_fini(struct g_class *mp);
140 static int g_sched_ioctl(struct g_provider *pp, u_long cmd, void *data,
141 int fflag, struct thread *td);
142
143 struct g_class g_sched_class = {
144 .name = G_SCHED_CLASS_NAME,
145 .version = G_VERSION,
146 .ctlreq = g_sched_config,
147 .taste = g_sched_taste,
148 .destroy_geom = g_sched_destroy_geom,
149 .init = g_sched_init,
150 .ioctl = g_sched_ioctl,
151 .fini = g_sched_fini
152 };
153
154 MALLOC_DEFINE(M_GEOM_SCHED, "GEOM_SCHED", "Geom schedulers data structures");
155
156 /*
157 * Global variables describing the state of the geom_sched module.
158 * There is only one static instance of this structure.
159 */
160 LIST_HEAD(gs_list, g_gsched); /* type, link field */
161 struct geom_sched_vars {
162 struct mtx gs_mtx;
163 struct gs_list gs_scheds; /* list of algorithms */
164 u_int gs_debug;
165 u_int gs_sched_count; /* how many algorithms ? */
166 u_int gs_patched; /* g_io_request was patched */
167
168 u_int gs_initialized;
169 u_int gs_expire_secs; /* expiration of hash entries */
170
171 struct bio_queue_head gs_pending;
172 u_int gs_npending;
173
174 /* The following are for stats, usually protected by gs_mtx. */
175 u_long gs_requests; /* total requests */
176 u_long gs_done; /* total done */
177 u_int gs_in_flight; /* requests in flight */
178 u_int gs_writes_in_flight;
179 u_int gs_bytes_in_flight;
180 u_int gs_write_bytes_in_flight;
181
182 char gs_names[256]; /* names of schedulers */
183 };
184
185 static struct geom_sched_vars me = {
186 .gs_expire_secs = 10,
187 };
188
189 SYSCTL_DECL(_kern_geom);
190 SYSCTL_NODE(_kern_geom, OID_AUTO, sched, CTLFLAG_RW, 0,
191 "GEOM_SCHED stuff");
192
193 SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight_wb, CTLFLAG_RD,
194 &me.gs_write_bytes_in_flight, 0, "Write bytes in flight");
195
196 SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight_b, CTLFLAG_RD,
197 &me.gs_bytes_in_flight, 0, "Bytes in flight");
198
199 SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight_w, CTLFLAG_RD,
200 &me.gs_writes_in_flight, 0, "Write Requests in flight");
201
202 SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight, CTLFLAG_RD,
203 &me.gs_in_flight, 0, "Requests in flight");
204
205 SYSCTL_ULONG(_kern_geom_sched, OID_AUTO, done, CTLFLAG_RD,
206 &me.gs_done, 0, "Total done");
207
208 SYSCTL_ULONG(_kern_geom_sched, OID_AUTO, requests, CTLFLAG_RD,
209 &me.gs_requests, 0, "Total requests");
210
211 SYSCTL_STRING(_kern_geom_sched, OID_AUTO, algorithms, CTLFLAG_RD,
212 &me.gs_names, 0, "Algorithm names");
213
214 SYSCTL_UINT(_kern_geom_sched, OID_AUTO, alg_count, CTLFLAG_RD,
215 &me.gs_sched_count, 0, "Number of algorithms");
216
217 SYSCTL_UINT(_kern_geom_sched, OID_AUTO, debug, CTLFLAG_RW,
218 &me.gs_debug, 0, "Debug level");
219
220 SYSCTL_UINT(_kern_geom_sched, OID_AUTO, expire_secs, CTLFLAG_RW,
221 &me.gs_expire_secs, 0, "Expire time in seconds");
222
223 /*
224 * g_sched calls the scheduler algorithms with this lock held.
225 * The locking functions are exposed so the scheduler algorithms can also
226 * protect themselves e.g. when running a callout handler.
227 */
228 void
g_sched_lock(struct g_geom * gp)229 g_sched_lock(struct g_geom *gp)
230 {
231 struct g_sched_softc *sc = gp->softc;
232
233 mtx_lock(&sc->sc_mtx);
234 }
235
236 void
g_sched_unlock(struct g_geom * gp)237 g_sched_unlock(struct g_geom *gp)
238 {
239 struct g_sched_softc *sc = gp->softc;
240
241 mtx_unlock(&sc->sc_mtx);
242 }
243
244 /*
245 * Support functions to handle references to the module,
246 * which are coming from devices using this scheduler.
247 */
248 static inline void
g_gsched_ref(struct g_gsched * gsp)249 g_gsched_ref(struct g_gsched *gsp)
250 {
251
252 atomic_add_int(&gsp->gs_refs, 1);
253 }
254
255 static inline void
g_gsched_unref(struct g_gsched * gsp)256 g_gsched_unref(struct g_gsched *gsp)
257 {
258
259 atomic_add_int(&gsp->gs_refs, -1);
260 }
261
262 /*
263 * Update the stats when this request is done.
264 */
265 static void
g_sched_update_stats(struct bio * bio)266 g_sched_update_stats(struct bio *bio)
267 {
268
269 me.gs_done++;
270 me.gs_in_flight--;
271 me.gs_bytes_in_flight -= bio->bio_length;
272 if (bio->bio_cmd & BIO_WRITE) {
273 me.gs_writes_in_flight--;
274 me.gs_write_bytes_in_flight -= bio->bio_length;
275 }
276 }
277
278 /*
279 * Dispatch any pending request.
280 */
281 static void
g_sched_forced_dispatch(struct g_geom * gp)282 g_sched_forced_dispatch(struct g_geom *gp)
283 {
284 struct g_sched_softc *sc = gp->softc;
285 struct g_gsched *gsp = sc->sc_gsched;
286 struct bio *bp;
287
288 KASSERT(mtx_owned(&sc->sc_mtx),
289 ("sc_mtx not owned during forced dispatch"));
290
291 while ((bp = gsp->gs_next(sc->sc_data, 1)) != NULL)
292 g_io_request(bp, LIST_FIRST(&gp->consumer));
293 }
294
295 /*
296 * The main dispatch loop, called either here after the start
297 * routine, or by scheduling algorithms when they receive a timeout
298 * or a 'done' notification. Does not share code with the forced
299 * dispatch path, since the gs_done() callback can call us.
300 */
301 void
g_sched_dispatch(struct g_geom * gp)302 g_sched_dispatch(struct g_geom *gp)
303 {
304 struct g_sched_softc *sc = gp->softc;
305 struct g_gsched *gsp = sc->sc_gsched;
306 struct bio *bp;
307
308 KASSERT(mtx_owned(&sc->sc_mtx), ("sc_mtx not owned during dispatch"));
309
310 if ((sc->sc_flags & G_SCHED_FLUSHING))
311 return;
312
313 while ((bp = gsp->gs_next(sc->sc_data, 0)) != NULL)
314 g_io_request(bp, LIST_FIRST(&gp->consumer));
315 }
316
317 /*
318 * Recent (8.0 and above) versions of FreeBSD have support to
319 * register classifiers of disk requests. The classifier is
320 * invoked by g_io_request(), and stores the information into
321 * bp->bio_classifier1.
322 *
323 * Support for older versions, which is left here only for
324 * documentation purposes, relies on two hacks:
325 * 1. classification info is written into the bio_caller1
326 * field of the topmost node in the bio chain. This field
327 * is rarely used, but this module is incompatible with
328 * those that use bio_caller1 for other purposes,
329 * such as ZFS and gjournal;
330 * 2. g_io_request() is patched in-memory when the module is
331 * loaded, so that the function calls a classifier as its
332 * first thing. g_io_request() is restored when the module
333 * is unloaded. This functionality is only supported for
334 * x86 and amd64, other architectures need source code changes.
335 */
336
337 /*
338 * Lookup the identity of the issuer of the original request.
339 * In the current implementation we use the curthread of the
340 * issuer, but different mechanisms may be implemented later
341 * so we do not make assumptions on the return value which for
342 * us is just an opaque identifier.
343 */
344
345 static inline u_long
g_sched_classify(struct bio * bp)346 g_sched_classify(struct bio *bp)
347 {
348
349 #if __FreeBSD_version > 800098
350 /* we have classifier fields in the struct bio */
351 #define HAVE_BIO_CLASSIFIER
352 return ((u_long)bp->bio_classifier1);
353 #else
354 #warning old version!!!
355 while (bp->bio_parent != NULL)
356 bp = bp->bio_parent;
357
358 return ((u_long)bp->bio_caller1);
359 #endif
360 }
361
362 /* Return the hash chain for the given key. */
363 static inline struct g_hash *
g_sched_hash(struct g_sched_softc * sc,u_long key)364 g_sched_hash(struct g_sched_softc *sc, u_long key)
365 {
366
367 return (&sc->sc_hash[key & sc->sc_mask]);
368 }
369
370 /*
371 * Helper function for the children classes, which takes
372 * a geom and a bio and returns the private descriptor
373 * associated to the request. This involves fetching
374 * the classification field and [al]locating the
375 * corresponding entry in the hash table.
376 */
377 void *
g_sched_get_class(struct g_geom * gp,struct bio * bp)378 g_sched_get_class(struct g_geom *gp, struct bio *bp)
379 {
380 struct g_sched_softc *sc;
381 struct g_sched_class *gsc;
382 struct g_gsched *gsp;
383 struct g_hash *bucket;
384 u_long key;
385
386 sc = gp->softc;
387 key = g_sched_classify(bp);
388 bucket = g_sched_hash(sc, key);
389 LIST_FOREACH(gsc, bucket, gsc_clist) {
390 if (key == gsc->gsc_key) {
391 gsc->gsc_refs++;
392 return (gsc->gsc_priv);
393 }
394 }
395
396 gsp = sc->sc_gsched;
397 gsc = malloc(sizeof(*gsc) + gsp->gs_priv_size,
398 M_GEOM_SCHED, M_NOWAIT | M_ZERO);
399 if (!gsc)
400 return (NULL);
401
402 if (gsp->gs_init_class(sc->sc_data, gsc->gsc_priv)) {
403 free(gsc, M_GEOM_SCHED);
404 return (NULL);
405 }
406
407 gsc->gsc_refs = 2; /* 1 for the hash table, 1 for the caller. */
408 gsc->gsc_key = key;
409 LIST_INSERT_HEAD(bucket, gsc, gsc_clist);
410
411 gsc->gsc_expire = ticks + me.gs_expire_secs * hz;
412
413 return (gsc->gsc_priv);
414 }
415
416 /*
417 * Release a reference to the per-client descriptor,
418 */
419 void
g_sched_put_class(struct g_geom * gp,void * priv)420 g_sched_put_class(struct g_geom *gp, void *priv)
421 {
422 struct g_sched_class *gsc;
423 struct g_sched_softc *sc;
424
425 gsc = g_sched_priv2class(priv);
426 gsc->gsc_expire = ticks + me.gs_expire_secs * hz;
427
428 if (--gsc->gsc_refs > 0)
429 return;
430
431 sc = gp->softc;
432 sc->sc_gsched->gs_fini_class(sc->sc_data, priv);
433
434 LIST_REMOVE(gsc, gsc_clist);
435 free(gsc, M_GEOM_SCHED);
436 }
437
438 static void
g_sched_hash_fini(struct g_geom * gp,struct g_hash * hp,u_long mask,struct g_gsched * gsp,void * data)439 g_sched_hash_fini(struct g_geom *gp, struct g_hash *hp, u_long mask,
440 struct g_gsched *gsp, void *data)
441 {
442 struct g_sched_class *cp, *cp2;
443 int i;
444
445 if (!hp)
446 return;
447
448 if (data && gsp->gs_hash_unref)
449 gsp->gs_hash_unref(data);
450
451 for (i = 0; i < G_SCHED_HASH_SIZE; i++) {
452 LIST_FOREACH_SAFE(cp, &hp[i], gsc_clist, cp2)
453 g_sched_put_class(gp, cp->gsc_priv);
454 }
455
456 hashdestroy(hp, M_GEOM_SCHED, mask);
457 }
458
459 static struct g_hash *
g_sched_hash_init(struct g_gsched * gsp,u_long * mask,int flags)460 g_sched_hash_init(struct g_gsched *gsp, u_long *mask, int flags)
461 {
462 struct g_hash *hash;
463
464 if (gsp->gs_priv_size == 0)
465 return (NULL);
466
467 hash = hashinit_flags(G_SCHED_HASH_SIZE, M_GEOM_SCHED, mask, flags);
468
469 return (hash);
470 }
471
472 static void
g_sched_flush_classes(struct g_geom * gp)473 g_sched_flush_classes(struct g_geom *gp)
474 {
475 struct g_sched_softc *sc;
476 struct g_sched_class *cp, *cp2;
477 int i;
478
479 sc = gp->softc;
480
481 if (!sc->sc_hash || ticks - sc->sc_flush_ticks <= 0)
482 return;
483
484 for (i = 0; i < G_SCHED_HASH_SIZE; i++) {
485 LIST_FOREACH_SAFE(cp, &sc->sc_hash[i], gsc_clist, cp2) {
486 if (cp->gsc_refs == 1 && ticks - cp->gsc_expire > 0)
487 g_sched_put_class(gp, cp->gsc_priv);
488 }
489 }
490
491 sc->sc_flush_ticks = ticks + me.gs_expire_secs * hz;
492 }
493
494 /*
495 * Wait for the completion of any outstanding request. To ensure
496 * that this does not take forever the caller has to make sure that
497 * no new request enter the scehduler before calling us.
498 *
499 * Must be called with the gp mutex held and topology locked.
500 */
501 static int
g_sched_wait_pending(struct g_geom * gp)502 g_sched_wait_pending(struct g_geom *gp)
503 {
504 struct g_sched_softc *sc = gp->softc;
505 int endticks = ticks + hz;
506
507 g_topology_assert();
508
509 while (sc->sc_pending && endticks - ticks >= 0)
510 msleep(gp, &sc->sc_mtx, 0, "sched_wait_pending", hz / 4);
511
512 return (sc->sc_pending ? ETIMEDOUT : 0);
513 }
514
515 static int
g_sched_remove_locked(struct g_geom * gp,struct g_gsched * gsp)516 g_sched_remove_locked(struct g_geom *gp, struct g_gsched *gsp)
517 {
518 struct g_sched_softc *sc = gp->softc;
519 int error;
520
521 /* Set the flushing flag: new bios will not enter the scheduler. */
522 sc->sc_flags |= G_SCHED_FLUSHING;
523
524 g_sched_forced_dispatch(gp);
525 error = g_sched_wait_pending(gp);
526 if (error)
527 goto failed;
528
529 /* No more requests pending or in flight from the old gsp. */
530
531 g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask, gsp, sc->sc_data);
532 sc->sc_hash = NULL;
533
534 /*
535 * Avoid deadlock here by releasing the gp mutex and reacquiring
536 * it once done. It should be safe, since no reconfiguration or
537 * destruction can take place due to the geom topology lock; no
538 * new request can use the current sc_data since we flagged the
539 * geom as being flushed.
540 */
541 g_sched_unlock(gp);
542 gsp->gs_fini(sc->sc_data);
543 g_sched_lock(gp);
544
545 sc->sc_gsched = NULL;
546 sc->sc_data = NULL;
547 g_gsched_unref(gsp);
548
549 failed:
550 sc->sc_flags &= ~G_SCHED_FLUSHING;
551
552 return (error);
553 }
554
/*
 * Unlocked wrapper around g_sched_remove_locked(): takes the gp mutex,
 * removes the algorithm and releases the mutex.  gsp must not be NULL.
 */
static int
g_sched_remove(struct g_geom *gp, struct g_gsched *gsp)
{
	int rv;

	g_sched_lock(gp);
	rv = g_sched_remove_locked(gp, gsp);
	g_sched_unlock(gp);

	return (rv);
}
566
567 /*
568 * Support function for create/taste -- locate the desired
569 * algorithm and grab a reference to it.
570 */
571 static struct g_gsched *
g_gsched_find(const char * name)572 g_gsched_find(const char *name)
573 {
574 struct g_gsched *gsp = NULL;
575
576 mtx_lock(&me.gs_mtx);
577 LIST_FOREACH(gsp, &me.gs_scheds, glist) {
578 if (strcmp(name, gsp->gs_name) == 0) {
579 g_gsched_ref(gsp);
580 break;
581 }
582 }
583 mtx_unlock(&me.gs_mtx);
584
585 return (gsp);
586 }
587
588 /*
589 * Rebuild the list of scheduler names.
590 * To be called with me.gs_mtx lock held.
591 */
592 static void
g_gsched_build_names(struct g_gsched * gsp)593 g_gsched_build_names(struct g_gsched *gsp)
594 {
595 int pos, l;
596 struct g_gsched *cur;
597
598 pos = 0;
599 LIST_FOREACH(cur, &me.gs_scheds, glist) {
600 l = strlen(cur->gs_name);
601 if (l + pos + 1 + 1 < sizeof(me.gs_names)) {
602 if (pos != 0)
603 me.gs_names[pos++] = ' ';
604 strcpy(me.gs_names + pos, cur->gs_name);
605 pos += l;
606 }
607 }
608 me.gs_names[pos] = '\0';
609 }
610
611 /*
612 * Register or unregister individual scheduling algorithms.
613 */
614 static int
g_gsched_register(struct g_gsched * gsp)615 g_gsched_register(struct g_gsched *gsp)
616 {
617 struct g_gsched *cur;
618 int error = 0;
619
620 mtx_lock(&me.gs_mtx);
621 LIST_FOREACH(cur, &me.gs_scheds, glist) {
622 if (strcmp(gsp->gs_name, cur->gs_name) == 0)
623 break;
624 }
625 if (cur != NULL) {
626 G_SCHED_DEBUG(0, "A scheduler named %s already"
627 "exists.", gsp->gs_name);
628 error = EEXIST;
629 } else {
630 LIST_INSERT_HEAD(&me.gs_scheds, gsp, glist);
631 gsp->gs_refs = 1;
632 me.gs_sched_count++;
633 g_gsched_build_names(gsp);
634 }
635 mtx_unlock(&me.gs_mtx);
636
637 return (error);
638 }
639
/*
 * Argument block passed to g_gsched_unregister() through the GEOM
 * event queue: the algorithm to remove, and the slot where the
 * outcome is reported back to g_gsched_modevent().
 */
struct g_gsched_unregparm {
	struct g_gsched	*gup_gsp;	/* in: algorithm to unregister */
	int		gup_error;	/* out: 0 or errno */
};
644
645 static void
g_gsched_unregister(void * arg,int flag)646 g_gsched_unregister(void *arg, int flag)
647 {
648 struct g_gsched_unregparm *parm = arg;
649 struct g_gsched *gsp = parm->gup_gsp, *cur, *tmp;
650 struct g_sched_softc *sc;
651 struct g_geom *gp, *gp_tmp;
652 int error;
653
654 parm->gup_error = 0;
655
656 g_topology_assert();
657
658 if (flag == EV_CANCEL)
659 return;
660
661 mtx_lock(&me.gs_mtx);
662
663 LIST_FOREACH_SAFE(gp, &g_sched_class.geom, geom, gp_tmp) {
664 if (gp->class != &g_sched_class)
665 continue; /* Should not happen. */
666
667 sc = gp->softc;
668 if (sc->sc_gsched == gsp) {
669 error = g_sched_remove(gp, gsp);
670 if (error)
671 goto failed;
672 }
673 }
674
675 LIST_FOREACH_SAFE(cur, &me.gs_scheds, glist, tmp) {
676 if (cur != gsp)
677 continue;
678
679 if (gsp->gs_refs != 1) {
680 G_SCHED_DEBUG(0, "%s still in use.",
681 gsp->gs_name);
682 parm->gup_error = EBUSY;
683 } else {
684 LIST_REMOVE(gsp, glist);
685 me.gs_sched_count--;
686 g_gsched_build_names(gsp);
687 }
688 break;
689 }
690
691 if (cur == NULL) {
692 G_SCHED_DEBUG(0, "%s not registered.", gsp->gs_name);
693 parm->gup_error = ENOENT;
694 }
695
696 failed:
697 mtx_unlock(&me.gs_mtx);
698 }
699
700 static inline void
g_gsched_global_init(void)701 g_gsched_global_init(void)
702 {
703
704 if (!me.gs_initialized) {
705 G_SCHED_DEBUG(0, "Initializing global data.");
706 mtx_init(&me.gs_mtx, "gsched", NULL, MTX_DEF);
707 LIST_INIT(&me.gs_scheds);
708 gs_bioq_init(&me.gs_pending);
709 me.gs_initialized = 1;
710 }
711 }
712
713 /*
714 * Module event called when a scheduling algorithm module is loaded or
715 * unloaded.
716 */
717 int
g_gsched_modevent(module_t mod,int cmd,void * arg)718 g_gsched_modevent(module_t mod, int cmd, void *arg)
719 {
720 struct g_gsched *gsp = arg;
721 struct g_gsched_unregparm parm;
722 int error;
723
724 G_SCHED_DEBUG(0, "Modevent %d.", cmd);
725
726 /*
727 * If the module is loaded at boot, the geom thread that calls
728 * g_sched_init() might actually run after g_gsched_modevent(),
729 * so make sure that the module is properly initialized.
730 */
731 g_gsched_global_init();
732
733 error = EOPNOTSUPP;
734 switch (cmd) {
735 case MOD_LOAD:
736 error = g_gsched_register(gsp);
737 G_SCHED_DEBUG(0, "Loaded module %s error %d.",
738 gsp->gs_name, error);
739 if (error == 0)
740 g_retaste(&g_sched_class);
741 break;
742
743 case MOD_UNLOAD:
744 parm.gup_gsp = gsp;
745 parm.gup_error = 0;
746
747 error = g_waitfor_event(g_gsched_unregister,
748 &parm, M_WAITOK, NULL);
749 if (error == 0)
750 error = parm.gup_error;
751 G_SCHED_DEBUG(0, "Unloaded module %s error %d.",
752 gsp->gs_name, error);
753 break;
754 };
755
756 return (error);
757 }
758
759 #ifdef KTR
760 #define TRC_BIO_EVENT(e, bp) g_sched_trace_bio_ ## e (bp)
761
762 static inline char
g_sched_type(struct bio * bp)763 g_sched_type(struct bio *bp)
764 {
765
766 if (0 != (bp->bio_cmd & BIO_READ))
767 return ('R');
768 else if (0 != (bp->bio_cmd & BIO_WRITE))
769 return ('W');
770 return ('U');
771 }
772
773 static inline void
g_sched_trace_bio_START(struct bio * bp)774 g_sched_trace_bio_START(struct bio *bp)
775 {
776
777 CTR5(KTR_GSCHED, "S %lu %c %lu/%lu %lu", g_sched_classify(bp),
778 g_sched_type(bp), bp->bio_offset / ULONG_MAX,
779 bp->bio_offset, bp->bio_length);
780 }
781
782 static inline void
g_sched_trace_bio_DONE(struct bio * bp)783 g_sched_trace_bio_DONE(struct bio *bp)
784 {
785
786 CTR5(KTR_GSCHED, "D %lu %c %lu/%lu %lu", g_sched_classify(bp),
787 g_sched_type(bp), bp->bio_offset / ULONG_MAX,
788 bp->bio_offset, bp->bio_length);
789 }
790 #else /* !KTR */
791 #define TRC_BIO_EVENT(e, bp)
792 #endif /* !KTR */
793
794 /*
795 * g_sched_done() and g_sched_start() dispatch the geom requests to
796 * the scheduling algorithm in use.
797 */
798 static void
g_sched_done(struct bio * bio)799 g_sched_done(struct bio *bio)
800 {
801 struct g_geom *gp = bio->bio_caller2;
802 struct g_sched_softc *sc = gp->softc;
803
804 TRC_BIO_EVENT(DONE, bio);
805
806 KASSERT(bio->bio_caller1, ("null bio_caller1 in g_sched_done"));
807
808 g_sched_lock(gp);
809
810 g_sched_update_stats(bio);
811 sc->sc_gsched->gs_done(sc->sc_data, bio);
812 if (!--sc->sc_pending)
813 wakeup(gp);
814
815 g_sched_flush_classes(gp);
816 g_sched_unlock(gp);
817
818 g_std_done(bio);
819 }
820
821 static void
g_sched_start(struct bio * bp)822 g_sched_start(struct bio *bp)
823 {
824 struct g_geom *gp = bp->bio_to->geom;
825 struct g_sched_softc *sc = gp->softc;
826 struct bio *cbp;
827
828 TRC_BIO_EVENT(START, bp);
829 G_SCHED_LOGREQ(bp, "Request received.");
830
831 cbp = g_clone_bio(bp);
832 if (cbp == NULL) {
833 g_io_deliver(bp, ENOMEM);
834 return;
835 }
836 cbp->bio_done = g_sched_done;
837 cbp->bio_to = LIST_FIRST(&gp->provider);
838 KASSERT(cbp->bio_to != NULL, ("NULL provider"));
839
840 /* We only schedule reads and writes. */
841 if (0 == (bp->bio_cmd & (BIO_READ | BIO_WRITE)))
842 goto bypass;
843
844 G_SCHED_LOGREQ(cbp, "Sending request.");
845
846 g_sched_lock(gp);
847 /*
848 * Call the algorithm's gs_start to queue the request in the
849 * scheduler. If gs_start fails then pass the request down,
850 * otherwise call g_sched_dispatch() which tries to push
851 * one or more requests down.
852 */
853 if (!sc->sc_gsched || (sc->sc_flags & G_SCHED_FLUSHING) ||
854 sc->sc_gsched->gs_start(sc->sc_data, cbp)) {
855 g_sched_unlock(gp);
856 goto bypass;
857 }
858 /*
859 * We use bio_caller1 to mark requests that are scheduled
860 * so make sure it is not NULL.
861 */
862 if (cbp->bio_caller1 == NULL)
863 cbp->bio_caller1 = &me; /* anything not NULL */
864
865 cbp->bio_caller2 = gp;
866 sc->sc_pending++;
867
868 /* Update general stats. */
869 me.gs_in_flight++;
870 me.gs_requests++;
871 me.gs_bytes_in_flight += bp->bio_length;
872 if (bp->bio_cmd & BIO_WRITE) {
873 me.gs_writes_in_flight++;
874 me.gs_write_bytes_in_flight += bp->bio_length;
875 }
876 g_sched_dispatch(gp);
877 g_sched_unlock(gp);
878 return;
879
880 bypass:
881 cbp->bio_done = g_std_done;
882 cbp->bio_caller1 = NULL; /* not scheduled */
883 g_io_request(cbp, LIST_FIRST(&gp->consumer));
884 }
885
886 /*
887 * The next few functions are the geom glue.
888 */
889 static void
g_sched_orphan(struct g_consumer * cp)890 g_sched_orphan(struct g_consumer *cp)
891 {
892
893 g_topology_assert();
894 g_sched_destroy(cp->geom, 1);
895 }
896
897 static int
g_sched_access(struct g_provider * pp,int dr,int dw,int de)898 g_sched_access(struct g_provider *pp, int dr, int dw, int de)
899 {
900 struct g_geom *gp;
901 struct g_consumer *cp;
902 int error;
903
904 gp = pp->geom;
905 cp = LIST_FIRST(&gp->consumer);
906 error = g_access(cp, dr, dw, de);
907
908 return (error);
909 }
910
911 static void
g_sched_temporary_start(struct bio * bio)912 g_sched_temporary_start(struct bio *bio)
913 {
914
915 mtx_lock(&me.gs_mtx);
916 me.gs_npending++;
917 gs_bioq_disksort(&me.gs_pending, bio);
918 mtx_unlock(&me.gs_mtx);
919 }
920
921 static void
g_sched_flush_pending(g_start_t * start)922 g_sched_flush_pending(g_start_t *start)
923 {
924 struct bio *bp;
925
926 while ((bp = gs_bioq_takefirst(&me.gs_pending)))
927 start(bp);
928 }
929
930 static int
g_insert_proxy(struct g_geom * gp,struct g_provider * newpp,struct g_geom * dstgp,struct g_provider * pp,struct g_consumer * cp)931 g_insert_proxy(struct g_geom *gp, struct g_provider *newpp,
932 struct g_geom *dstgp, struct g_provider *pp, struct g_consumer *cp)
933 {
934 struct g_sched_softc *sc = gp->softc;
935 g_start_t *saved_start, *flush = g_sched_start;
936 int error = 0, endticks = ticks + hz;
937
938 g_cancel_event(newpp); /* prevent taste() */
939 /* copy private fields */
940 newpp->private = pp->private;
941 newpp->index = pp->index;
942
943 /* Queue all the early requests coming for us. */
944 me.gs_npending = 0;
945 saved_start = pp->geom->start;
946 dstgp->start = g_sched_temporary_start;
947
948 while (pp->nstart - pp->nend != me.gs_npending &&
949 endticks - ticks >= 0)
950 tsleep(pp, PRIBIO, "-", hz/10);
951
952 if (pp->nstart - pp->nend != me.gs_npending) {
953 flush = saved_start;
954 error = ETIMEDOUT;
955 goto fail;
956 }
957
958 /* link pp to this geom */
959 LIST_REMOVE(pp, provider);
960 pp->geom = gp;
961 LIST_INSERT_HEAD(&gp->provider, pp, provider);
962
963 /*
964 * replicate the counts from the parent in the
965 * new provider and consumer nodes
966 */
967 cp->acr = newpp->acr = pp->acr;
968 cp->acw = newpp->acw = pp->acw;
969 cp->ace = newpp->ace = pp->ace;
970 sc->sc_flags |= G_SCHED_PROXYING;
971
972 fail:
973 dstgp->start = saved_start;
974
975 g_sched_flush_pending(flush);
976
977 return (error);
978 }
979
980 /*
981 * Create a geom node for the device passed as *pp.
982 * If successful, add a reference to this gsp.
983 */
984 static int
g_sched_create(struct gctl_req * req,struct g_class * mp,struct g_provider * pp,struct g_gsched * gsp,int proxy)985 g_sched_create(struct gctl_req *req, struct g_class *mp,
986 struct g_provider *pp, struct g_gsched *gsp, int proxy)
987 {
988 struct g_sched_softc *sc = NULL;
989 struct g_geom *gp, *dstgp;
990 struct g_provider *newpp = NULL;
991 struct g_consumer *cp = NULL;
992 char name[64];
993 int error;
994
995 g_topology_assert();
996
997 snprintf(name, sizeof(name), "%s%s", pp->name, G_SCHED_SUFFIX);
998 LIST_FOREACH(gp, &mp->geom, geom) {
999 if (strcmp(gp->name, name) == 0) {
1000 gctl_error(req, "Geom %s already exists.",
1001 name);
1002 return (EEXIST);
1003 }
1004 }
1005
1006 gp = g_new_geomf(mp, "%s", name);
1007 dstgp = proxy ? pp->geom : gp; /* where do we link the provider */
1008
1009 sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO);
1010 sc->sc_gsched = gsp;
1011 sc->sc_data = gsp->gs_init(gp);
1012 if (sc->sc_data == NULL) {
1013 error = ENOMEM;
1014 goto fail;
1015 }
1016
1017 sc->sc_hash = g_sched_hash_init(gsp, &sc->sc_mask, HASH_WAITOK);
1018
1019 /*
1020 * Do not initialize the flush mechanism, will be initialized
1021 * on the first insertion on the hash table.
1022 */
1023
1024 mtx_init(&sc->sc_mtx, "g_sched_mtx", NULL, MTX_DEF);
1025
1026 gp->softc = sc;
1027 gp->start = g_sched_start;
1028 gp->orphan = g_sched_orphan;
1029 gp->access = g_sched_access;
1030 gp->dumpconf = g_sched_dumpconf;
1031
1032 newpp = g_new_providerf(dstgp, "%s", gp->name);
1033 newpp->mediasize = pp->mediasize;
1034 newpp->sectorsize = pp->sectorsize;
1035
1036 cp = g_new_consumer(gp);
1037 error = g_attach(cp, proxy ? newpp : pp);
1038 if (error != 0) {
1039 gctl_error(req, "Cannot attach to provider %s.",
1040 pp->name);
1041 goto fail;
1042 }
1043
1044 g_error_provider(newpp, 0);
1045 if (proxy) {
1046 error = g_insert_proxy(gp, newpp, dstgp, pp, cp);
1047 if (error)
1048 goto fail;
1049 }
1050 G_SCHED_DEBUG(0, "Device %s created.", gp->name);
1051
1052 g_gsched_ref(gsp);
1053
1054 return (0);
1055
1056 fail:
1057 if (cp != NULL) {
1058 if (cp->provider != NULL)
1059 g_detach(cp);
1060 g_destroy_consumer(cp);
1061 }
1062 if (newpp != NULL)
1063 g_destroy_provider(newpp);
1064 if (sc->sc_hash)
1065 g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask,
1066 gsp, sc->sc_data);
1067 if (sc->sc_data)
1068 gsp->gs_fini(sc->sc_data);
1069 g_free(gp->softc);
1070 g_destroy_geom(gp);
1071
1072 return (error);
1073 }
1074
1075 /*
1076 * Support for dynamic switching of scheduling algorithms.
1077 * First initialize the data structures for the new algorithm,
1078 * then call g_sched_remove_locked() to flush all references
1079 * to the old one, finally link the new algorithm.
1080 */
1081 static int
g_sched_change_algo(struct gctl_req * req,struct g_class * mp,struct g_provider * pp,struct g_gsched * gsp)1082 g_sched_change_algo(struct gctl_req *req, struct g_class *mp,
1083 struct g_provider *pp, struct g_gsched *gsp)
1084 {
1085 struct g_sched_softc *sc;
1086 struct g_geom *gp;
1087 struct g_hash *newh;
1088 void *data;
1089 u_long mask;
1090 int error = 0;
1091
1092 gp = pp->geom;
1093 sc = gp->softc;
1094
1095 data = gsp->gs_init(gp);
1096 if (data == NULL)
1097 return (ENOMEM);
1098
1099 newh = g_sched_hash_init(gsp, &mask, HASH_WAITOK);
1100 if (gsp->gs_priv_size && !newh) {
1101 error = ENOMEM;
1102 goto fail;
1103 }
1104
1105 g_sched_lock(gp);
1106 if (sc->sc_gsched) { /* can be NULL in some cases */
1107 error = g_sched_remove_locked(gp, sc->sc_gsched);
1108 if (error)
1109 goto fail;
1110 }
1111
1112 g_gsched_ref(gsp);
1113 sc->sc_gsched = gsp;
1114 sc->sc_data = data;
1115 sc->sc_hash = newh;
1116 sc->sc_mask = mask;
1117
1118 g_sched_unlock(gp);
1119
1120 return (0);
1121
1122 fail:
1123 if (newh)
1124 g_sched_hash_fini(gp, newh, mask, gsp, data);
1125
1126 if (data)
1127 gsp->gs_fini(data);
1128
1129 g_sched_unlock(gp);
1130
1131 return (error);
1132 }
1133
1134 /*
1135 * Stop the request flow directed to the proxy, redirecting the new
1136 * requests to the me.gs_pending queue.
1137 */
1138 static struct g_provider *
g_detach_proxy(struct g_geom * gp)1139 g_detach_proxy(struct g_geom *gp)
1140 {
1141 struct g_consumer *cp;
1142 struct g_provider *pp, *newpp;
1143
1144 do {
1145 pp = LIST_FIRST(&gp->provider);
1146 if (pp == NULL)
1147 break;
1148 cp = LIST_FIRST(&gp->consumer);
1149 if (cp == NULL)
1150 break;
1151 newpp = cp->provider;
1152 if (newpp == NULL)
1153 break;
1154
1155 me.gs_npending = 0;
1156 pp->geom->start = g_sched_temporary_start;
1157
1158 return (pp);
1159 } while (0);
1160 printf("%s error detaching proxy %s\n", __FUNCTION__, gp->name);
1161
1162 return (NULL);
1163 }
1164
1165 static void
g_sched_blackhole(struct bio * bp)1166 g_sched_blackhole(struct bio *bp)
1167 {
1168
1169 g_io_deliver(bp, ENXIO);
1170 }
1171
1172 static inline void
g_reparent_provider(struct g_provider * pp,struct g_geom * gp,struct g_provider * newpp)1173 g_reparent_provider(struct g_provider *pp, struct g_geom *gp,
1174 struct g_provider *newpp)
1175 {
1176
1177 LIST_REMOVE(pp, provider);
1178 if (newpp) {
1179 pp->private = newpp->private;
1180 pp->index = newpp->index;
1181 }
1182 pp->geom = gp;
1183 LIST_INSERT_HEAD(&gp->provider, pp, provider);
1184 }
1185
1186 static inline void
g_unproxy_provider(struct g_provider * oldpp,struct g_provider * newpp)1187 g_unproxy_provider(struct g_provider *oldpp, struct g_provider *newpp)
1188 {
1189 struct g_geom *gp = oldpp->geom;
1190
1191 g_reparent_provider(oldpp, newpp->geom, newpp);
1192
1193 /*
1194 * Hackish: let the system destroy the old provider for us, just
1195 * in case someone attached a consumer to it, in which case a
1196 * direct call to g_destroy_provider() would not work.
1197 */
1198 g_reparent_provider(newpp, gp, NULL);
1199 }
1200
1201 /*
1202 * Complete the proxy destruction, linking the old provider to its
1203 * original geom, and destroying the proxy provider. Also take care
1204 * of issuing the pending requests collected in me.gs_pending (if any).
1205 */
1206 static int
g_destroy_proxy(struct g_geom * gp,struct g_provider * oldpp)1207 g_destroy_proxy(struct g_geom *gp, struct g_provider *oldpp)
1208 {
1209 struct g_consumer *cp;
1210 struct g_provider *newpp;
1211
1212 do {
1213 cp = LIST_FIRST(&gp->consumer);
1214 if (cp == NULL)
1215 break;
1216 newpp = cp->provider;
1217 if (newpp == NULL)
1218 break;
1219
1220 /* Relink the provider to its original geom. */
1221 g_unproxy_provider(oldpp, newpp);
1222
1223 /* Detach consumer from provider, and destroy provider. */
1224 cp->acr = newpp->acr = 0;
1225 cp->acw = newpp->acw = 0;
1226 cp->ace = newpp->ace = 0;
1227 g_detach(cp);
1228
1229 /* Send the pending bios through the right start function. */
1230 g_sched_flush_pending(oldpp->geom->start);
1231
1232 return (0);
1233 } while (0);
1234 printf("%s error destroying proxy %s\n", __FUNCTION__, gp->name);
1235
1236 /* We cannot send the pending bios anywhere... */
1237 g_sched_flush_pending(g_sched_blackhole);
1238
1239 return (EINVAL);
1240 }
1241
1242 static int
g_sched_destroy(struct g_geom * gp,boolean_t force)1243 g_sched_destroy(struct g_geom *gp, boolean_t force)
1244 {
1245 struct g_provider *pp, *oldpp = NULL;
1246 struct g_sched_softc *sc;
1247 struct g_gsched *gsp;
1248 int error;
1249
1250 g_topology_assert();
1251 sc = gp->softc;
1252 if (sc == NULL)
1253 return (ENXIO);
1254 if (!(sc->sc_flags & G_SCHED_PROXYING)) {
1255 pp = LIST_FIRST(&gp->provider);
1256 if (pp && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) {
1257 const char *msg = force ?
1258 "but we force removal" : "cannot remove";
1259
1260 G_SCHED_DEBUG(!force,
1261 "Device %s is still open (r%dw%de%d), %s.",
1262 pp->name, pp->acr, pp->acw, pp->ace, msg);
1263 if (!force)
1264 return (EBUSY);
1265 } else {
1266 G_SCHED_DEBUG(0, "Device %s removed.", gp->name);
1267 }
1268 } else
1269 oldpp = g_detach_proxy(gp);
1270
1271 gsp = sc->sc_gsched;
1272 if (gsp) {
1273 /*
1274 * XXX bad hack here: force a dispatch to release
1275 * any reference to the hash table still held by
1276 * the scheduler.
1277 */
1278 g_sched_lock(gp);
1279 /*
1280 * We are dying here, no new requests should enter
1281 * the scheduler. This is granted by the topolgy,
1282 * either in case we were proxying (new bios are
1283 * being redirected) or not (see the access check
1284 * above).
1285 */
1286 g_sched_forced_dispatch(gp);
1287 error = g_sched_wait_pending(gp);
1288
1289 if (error) {
1290 /*
1291 * Not all the requests came home: this might happen
1292 * under heavy load, or if we were waiting for any
1293 * bio which is served in the event path (see
1294 * geom_slice.c for an example of how this can
1295 * happen). Try to restore a working configuration
1296 * if we can fail.
1297 */
1298 if ((sc->sc_flags & G_SCHED_PROXYING) && oldpp) {
1299 g_sched_flush_pending(force ?
1300 g_sched_blackhole : g_sched_start);
1301 }
1302
1303 /*
1304 * In the forced destroy case there is not so much
1305 * we can do, we have pending bios that will call
1306 * g_sched_done() somehow, and we don't want them
1307 * to crash the system using freed memory. We tell
1308 * the user that something went wrong, and leak some
1309 * memory here.
1310 * Note: the callers using force = 1 ignore the
1311 * return value.
1312 */
1313 if (force) {
1314 G_SCHED_DEBUG(0, "Pending requests while "
1315 " destroying geom, some memory leaked.");
1316 }
1317
1318 return (error);
1319 }
1320
1321 g_sched_unlock(gp);
1322 g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask,
1323 gsp, sc->sc_data);
1324 sc->sc_hash = NULL;
1325 gsp->gs_fini(sc->sc_data);
1326 g_gsched_unref(gsp);
1327 sc->sc_gsched = NULL;
1328 }
1329
1330 if ((sc->sc_flags & G_SCHED_PROXYING) && oldpp) {
1331 error = g_destroy_proxy(gp, oldpp);
1332
1333 if (error) {
1334 if (force) {
1335 G_SCHED_DEBUG(0, "Unrecoverable error while "
1336 "destroying a proxy geom, leaking some "
1337 " memory.");
1338 }
1339
1340 return (error);
1341 }
1342 }
1343
1344 mtx_destroy(&sc->sc_mtx);
1345
1346 g_free(gp->softc);
1347 gp->softc = NULL;
1348 g_wither_geom(gp, ENXIO);
1349
1350 return (error);
1351 }
1352
1353 static int
g_sched_destroy_geom(struct gctl_req * req,struct g_class * mp,struct g_geom * gp)1354 g_sched_destroy_geom(struct gctl_req *req, struct g_class *mp,
1355 struct g_geom *gp)
1356 {
1357
1358 return (g_sched_destroy(gp, 0));
1359 }
1360
1361 /*
1362 * Functions related to the classification of requests.
1363 *
1364 * On recent FreeBSD versions (8.0 and above), we store a reference
1365 * to the issuer of a request in bp->bio_classifier1 as soon
1366 * as the bio is posted to the geom queue (and not later, because
1367 * requests are managed by the g_down thread afterwards).
1368 *
1369 * On older versions of the system (but this code is not used
1370 * in any existing release), we [ab]use the caller1 field in the
1371 * root element of the bio tree to store the classification info.
1372 * The marking is done at the beginning of g_io_request()
1373 * and only if we find that the field is NULL.
1374 *
1375 * To avoid rebuilding the kernel, this module will patch the
1376 * initial part of g_io_request() so it jumps to some hand-coded
1377 * assembly that does the marking and then executes the original
1378 * body of g_io_request().
1379 *
1380 * fake_ioreq[] is architecture-specific machine code
1381 * that implements the above. CODE_SIZE, STORE_SIZE etc.
1382 * are constants used in the patching routine. Look at the
1383 * code in g_ioreq_patch() for the details.
1384 */
1385
1386 #ifndef HAVE_BIO_CLASSIFIER
1387 /*
1388 * Support for old FreeBSD versions
1389 */
1390 #if defined(__i386__)
1391 #define CODE_SIZE 29
1392 #define STORE_SIZE 5
1393 #define EPILOGUE 5
1394 #define SIZE (CODE_SIZE + STORE_SIZE + EPILOGUE)
1395
1396 static u_char fake_ioreq[SIZE] = {
1397 0x8b, 0x44, 0x24, 0x04, /* mov bp, %eax */
1398 /* 1: */
1399 0x89, 0xc2, /* mov %eax, %edx # edx = bp */
1400 0x8b, 0x40, 0x64, /* mov bp->bio_parent, %eax */
1401 0x85, 0xc0, /* test %eax, %eax */
1402 0x75, 0xf7, /* jne 1b */
1403 0x8b, 0x42, 0x30, /* mov bp->bp_caller1, %eax */
1404 0x85, 0xc0, /* test %eax, %eax */
1405 0x75, 0x09, /* jne 2f */
1406 0x64, 0xa1, 0x00, 0x00, /* mov %fs:0, %eax */
1407 0x00, 0x00,
1408 0x89, 0x42, 0x30, /* mov %eax, bp->bio_caller1 */
1409 /* 2: */
1410 0x55, 0x89, 0xe5, 0x57, 0x56,
1411 0xe9, 0x00, 0x00, 0x00, 0x00, /* jmp back... */
1412 };
1413 #elif defined(__amd64)
1414 #define CODE_SIZE 38
1415 #define STORE_SIZE 6
1416 #define EPILOGUE 5
1417 #define SIZE (CODE_SIZE + STORE_SIZE + EPILOGUE)
1418
1419 static u_char fake_ioreq[SIZE] = {
1420 0x48, 0x89, 0xf8, /* mov bp, %rax */
1421 /* 1: */
1422 0x48, 0x89, 0xc2, /* mov %rax, %rdx # rdx = bp */
1423 0x48, 0x8b, 0x82, 0xa8, /* mov bp->bio_parent, %rax */
1424 0x00, 0x00, 0x00,
1425 0x48, 0x85, 0xc0, /* test %rax, %rax */
1426 0x75, 0xf1, /* jne 1b */
1427 0x48, 0x83, 0x7a, 0x58, /* cmp $0, bp->bp_caller1 */
1428 0x00,
1429 0x75, 0x0d, /* jne 2f */
1430 0x65, 0x48, 0x8b, 0x04, /* mov %gs:0, %rax */
1431 0x25, 0x00, 0x00, 0x00,
1432 0x00,
1433 0x48, 0x89, 0x42, 0x58, /* mov %rax, bp->bio_caller1 */
1434 /* 2: */
1435 0x55, 0x48, 0x89, 0xe5, 0x41, 0x56,
1436 0xe9, 0x00, 0x00, 0x00, 0x00, /* jmp back... */
1437 };
1438 #else /* neither x86 nor amd64 */
1439 static void
g_new_io_request(struct bio * bp,struct g_consumer * cp)1440 g_new_io_request(struct bio *bp, struct g_consumer *cp)
1441 {
1442 struct bio *top = bp;
1443
1444 /*
1445 * bio classification: if bio_caller1 is available in the
1446 * root of the 'struct bio' tree, store there the thread id
1447 * of the thread that originated the request.
1448 * More sophisticated classification schemes can be used.
1449 */
1450 while (top->bio_parent)
1451 top = top->bio_parent;
1452
1453 if (top->bio_caller1 == NULL)
1454 top->bio_caller1 = curthread;
1455 }
1456
1457 #error please add the code above in g_new_io_request() to the beginning of \
1458 /sys/geom/geom_io.c::g_io_request(), and remove this line.
1459 #endif /* end of arch-specific code */
1460
1461 static int
g_ioreq_patch(void)1462 g_ioreq_patch(void)
1463 {
1464 u_char *original;
1465 u_long ofs;
1466 int found;
1467
1468 if (me.gs_patched)
1469 return (-1);
1470
1471 original = (u_char *)g_io_request;
1472
1473 found = !bcmp(original, fake_ioreq + CODE_SIZE, STORE_SIZE);
1474 if (!found)
1475 return (-1);
1476
1477 /* Jump back to the original + STORE_SIZE. */
1478 ofs = (original + STORE_SIZE) - (fake_ioreq + SIZE);
1479 bcopy(&ofs, fake_ioreq + CODE_SIZE + STORE_SIZE + 1, 4);
1480
1481 /* Patch the original address with a jump to the trampoline. */
1482 *original = 0xe9; /* jump opcode */
1483 ofs = fake_ioreq - (original + 5);
1484 bcopy(&ofs, original + 1, 4);
1485
1486 me.gs_patched = 1;
1487
1488 return (0);
1489 }
1490
1491 /*
1492 * Restore the original code, this is easy.
1493 */
1494 static void
g_ioreq_restore(void)1495 g_ioreq_restore(void)
1496 {
1497 u_char *original;
1498
1499 if (me.gs_patched) {
1500 original = (u_char *)g_io_request;
1501 bcopy(fake_ioreq + CODE_SIZE, original, STORE_SIZE);
1502 me.gs_patched = 0;
1503 }
1504 }
1505
/*
 * Enable request classification by patching g_io_request().  The result
 * of g_ioreq_patch() is deliberately discarded.
 */
static inline void
g_classifier_ini(void)
{

	(void)g_ioreq_patch();
}
1512
/*
 * Disable request classification by undoing the g_io_request() patch.
 */
static inline void
g_classifier_fini(void)
{

	g_ioreq_restore();
}
1519
1520 /*--- end of support code for older FreeBSD versions */
1521
1522 #else /* HAVE_BIO_CLASSIFIER */
1523
1524 /*
1525 * Classifier support for recent FreeBSD versions: we use
1526 * a very simple classifier, only use curthread to tag a request.
1527 * The classifier is registered at module load, and unregistered
1528 * at module unload.
1529 */
1530 static int
g_sched_tag(void * arg,struct bio * bp)1531 g_sched_tag(void *arg, struct bio *bp)
1532 {
1533
1534 bp->bio_classifier1 = curthread;
1535 return (1);
1536 }
1537
1538 static struct g_classifier_hook g_sched_classifier = {
1539 .func = g_sched_tag,
1540 };
1541
1542 static inline void
g_classifier_ini(void)1543 g_classifier_ini(void)
1544 {
1545
1546 g_register_classifier(&g_sched_classifier);
1547 }
1548
1549 static inline void
g_classifier_fini(void)1550 g_classifier_fini(void)
1551 {
1552
1553 g_unregister_classifier(&g_sched_classifier);
1554 }
1555 #endif /* HAVE_BIO_CLASSIFIER */
1556
1557 static void
g_sched_init(struct g_class * mp)1558 g_sched_init(struct g_class *mp)
1559 {
1560
1561 g_gsched_global_init();
1562
1563 G_SCHED_DEBUG(0, "Loading: mp = %p, g_sched_class = %p.",
1564 mp, &g_sched_class);
1565
1566 /* Patch g_io_request to store classification info in the bio. */
1567 g_classifier_ini();
1568 }
1569
1570 static void
g_sched_fini(struct g_class * mp)1571 g_sched_fini(struct g_class *mp)
1572 {
1573
1574 g_classifier_fini();
1575
1576 G_SCHED_DEBUG(0, "Unloading...");
1577
1578 KASSERT(LIST_EMPTY(&me.gs_scheds), ("still registered schedulers"));
1579 mtx_destroy(&me.gs_mtx);
1580 }
1581
1582 static int
g_sched_ioctl(struct g_provider * pp,u_long cmd,void * data,int fflag,struct thread * td)1583 g_sched_ioctl(struct g_provider *pp, u_long cmd, void *data, int fflag,
1584 struct thread *td)
1585 {
1586 struct g_consumer *cp;
1587 struct g_geom *gp;
1588
1589 cp = LIST_FIRST(&pp->geom->consumer);
1590 if (cp == NULL)
1591 return (ENOIOCTL);
1592 gp = cp->provider->geom;
1593 if (gp->ioctl == NULL)
1594 return (ENOIOCTL);
1595 return (gp->ioctl(cp->provider, cmd, data, fflag, td));
1596 }
1597
/*
 * Read the i-th argument for a request, skipping the /dev/
 * prefix if present.
 *
 * Returns the (possibly advanced) parameter string, or NULL after
 * reporting an error on req when the "arg<i>" parameter is missing.
 */
static const char *
g_sched_argi(struct gctl_req *req, int i)
{
	static const char *dev_prefix = "/dev/";
	const char *value;
	char key[16];
	int prefix_len;

	prefix_len = strlen(dev_prefix);

	snprintf(key, sizeof(key), "arg%d", i);
	value = gctl_get_asciiparam(req, key);
	if (value == NULL) {
		gctl_error(req, "No 'arg%d' argument", i);
		return (NULL);
	}

	if (strncmp(value, dev_prefix, prefix_len) == 0)
		value += prefix_len;

	return (value);
}
1618
1619 /*
1620 * Fetch nargs and do appropriate checks.
1621 */
1622 static int
g_sched_get_nargs(struct gctl_req * req)1623 g_sched_get_nargs(struct gctl_req *req)
1624 {
1625 int *nargs;
1626
1627 nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
1628 if (nargs == NULL) {
1629 gctl_error(req, "No 'nargs' argument");
1630 return (0);
1631 }
1632 if (*nargs <= 0)
1633 gctl_error(req, "Missing device(s).");
1634 return (*nargs);
1635 }
1636
1637 /*
1638 * Check whether we should add the class on certain volumes when
1639 * this geom is created. Right now this is under control of a kenv
1640 * variable containing the names of all devices that we care about.
1641 * Probably we should only support transparent insertion as the
1642 * preferred mode of operation.
1643 */
1644 static struct g_geom *
g_sched_taste(struct g_class * mp,struct g_provider * pp,int flags __unused)1645 g_sched_taste(struct g_class *mp, struct g_provider *pp,
1646 int flags __unused)
1647 {
1648 struct g_gsched *gsp = NULL; /* the . algorithm we want */
1649 const char *s; /* generic string pointer */
1650 const char *taste_names; /* devices we like */
1651 int l;
1652
1653 g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__,
1654 mp->name, pp->name);
1655 g_topology_assert();
1656
1657 G_SCHED_DEBUG(2, "Tasting %s.", pp->name);
1658
1659 do {
1660 /* do not taste on ourselves */
1661 if (pp->geom->class == mp)
1662 break;
1663
1664 taste_names = getenv("geom.sched.taste");
1665 if (taste_names == NULL)
1666 break;
1667
1668 l = strlen(pp->name);
1669 for (s = taste_names; *s &&
1670 (s = strstr(s, pp->name)); s++) {
1671 /* further checks for an exact match */
1672 if ( (s == taste_names || s[-1] == ' ') &&
1673 (s[l] == '\0' || s[l] == ' ') )
1674 break;
1675 }
1676 if (s == NULL)
1677 break;
1678 G_SCHED_DEBUG(0, "Attach device %s match [%s]\n",
1679 pp->name, s);
1680
1681 /* look up the provider name in the list */
1682 s = getenv("geom.sched.algo");
1683 if (s == NULL)
1684 s = "rr";
1685
1686 gsp = g_gsched_find(s); /* also get a reference */
1687 if (gsp == NULL) {
1688 G_SCHED_DEBUG(0, "Bad '%s' algorithm.", s);
1689 break;
1690 }
1691
1692 /* XXX create with 1 as last argument ? */
1693 g_sched_create(NULL, mp, pp, gsp, 0);
1694 g_gsched_unref(gsp);
1695 } while (0);
1696 return NULL;
1697 }
1698
1699 static void
g_sched_ctl_create(struct gctl_req * req,struct g_class * mp,int proxy)1700 g_sched_ctl_create(struct gctl_req *req, struct g_class *mp, int proxy)
1701 {
1702 struct g_provider *pp;
1703 struct g_gsched *gsp;
1704 const char *name;
1705 int i, nargs;
1706
1707 g_topology_assert();
1708
1709 name = gctl_get_asciiparam(req, "algo");
1710 if (name == NULL) {
1711 gctl_error(req, "No '%s' argument", "algo");
1712 return;
1713 }
1714
1715 gsp = g_gsched_find(name); /* also get a reference */
1716 if (gsp == NULL) {
1717 gctl_error(req, "Bad algorithm '%s'", name);
1718 return;
1719 }
1720
1721 nargs = g_sched_get_nargs(req);
1722
1723 /*
1724 * Run on the arguments, and break on any error.
1725 * We look for a device name, but skip the /dev/ prefix if any.
1726 */
1727 for (i = 0; i < nargs; i++) {
1728 name = g_sched_argi(req, i);
1729 if (name == NULL)
1730 break;
1731 pp = g_provider_by_name(name);
1732 if (pp == NULL) {
1733 G_SCHED_DEBUG(1, "Provider %s is invalid.", name);
1734 gctl_error(req, "Provider %s is invalid.", name);
1735 break;
1736 }
1737 if (g_sched_create(req, mp, pp, gsp, proxy) != 0)
1738 break;
1739 }
1740
1741 g_gsched_unref(gsp);
1742 }
1743
1744 static void
g_sched_ctl_configure(struct gctl_req * req,struct g_class * mp)1745 g_sched_ctl_configure(struct gctl_req *req, struct g_class *mp)
1746 {
1747 struct g_provider *pp;
1748 struct g_gsched *gsp;
1749 const char *name;
1750 int i, nargs;
1751
1752 g_topology_assert();
1753
1754 name = gctl_get_asciiparam(req, "algo");
1755 if (name == NULL) {
1756 gctl_error(req, "No '%s' argument", "algo");
1757 return;
1758 }
1759
1760 gsp = g_gsched_find(name); /* also get a reference */
1761 if (gsp == NULL) {
1762 gctl_error(req, "Bad algorithm '%s'", name);
1763 return;
1764 }
1765
1766 nargs = g_sched_get_nargs(req);
1767
1768 /*
1769 * Run on the arguments, and break on any error.
1770 * We look for a device name, but skip the /dev/ prefix if any.
1771 */
1772 for (i = 0; i < nargs; i++) {
1773 name = g_sched_argi(req, i);
1774 if (name == NULL)
1775 break;
1776 pp = g_provider_by_name(name);
1777 if (pp == NULL || pp->geom->class != mp) {
1778 G_SCHED_DEBUG(1, "Provider %s is invalid.", name);
1779 gctl_error(req, "Provider %s is invalid.", name);
1780 break;
1781 }
1782 if (g_sched_change_algo(req, mp, pp, gsp) != 0)
1783 break;
1784 }
1785
1786 g_gsched_unref(gsp);
1787 }
1788
1789 static struct g_geom *
g_sched_find_geom(struct g_class * mp,const char * name)1790 g_sched_find_geom(struct g_class *mp, const char *name)
1791 {
1792 struct g_geom *gp;
1793
1794 LIST_FOREACH(gp, &mp->geom, geom) {
1795 if (strcmp(gp->name, name) == 0)
1796 return (gp);
1797 }
1798 return (NULL);
1799 }
1800
1801 static void
g_sched_ctl_destroy(struct gctl_req * req,struct g_class * mp)1802 g_sched_ctl_destroy(struct gctl_req *req, struct g_class *mp)
1803 {
1804 int nargs, *force, error, i;
1805 struct g_geom *gp;
1806 const char *name;
1807
1808 g_topology_assert();
1809
1810 nargs = g_sched_get_nargs(req);
1811
1812 force = gctl_get_paraml(req, "force", sizeof(*force));
1813 if (force == NULL) {
1814 gctl_error(req, "No 'force' argument");
1815 return;
1816 }
1817
1818 for (i = 0; i < nargs; i++) {
1819 name = g_sched_argi(req, i);
1820 if (name == NULL)
1821 break;
1822
1823 gp = g_sched_find_geom(mp, name);
1824 if (gp == NULL) {
1825 G_SCHED_DEBUG(1, "Device %s is invalid.", name);
1826 gctl_error(req, "Device %s is invalid.", name);
1827 break;
1828 }
1829
1830 error = g_sched_destroy(gp, *force);
1831 if (error != 0) {
1832 gctl_error(req, "Cannot destroy device %s (error=%d).",
1833 gp->name, error);
1834 break;
1835 }
1836 }
1837 }
1838
1839 static void
g_sched_config(struct gctl_req * req,struct g_class * mp,const char * verb)1840 g_sched_config(struct gctl_req *req, struct g_class *mp, const char *verb)
1841 {
1842 uint32_t *version;
1843
1844 g_topology_assert();
1845
1846 version = gctl_get_paraml(req, "version", sizeof(*version));
1847 if (version == NULL) {
1848 gctl_error(req, "No '%s' argument.", "version");
1849 return;
1850 }
1851
1852 if (*version != G_SCHED_VERSION) {
1853 gctl_error(req, "Userland and kernel parts are "
1854 "out of sync.");
1855 return;
1856 }
1857
1858 if (strcmp(verb, "create") == 0) {
1859 g_sched_ctl_create(req, mp, 0);
1860 return;
1861 } else if (strcmp(verb, "insert") == 0) {
1862 g_sched_ctl_create(req, mp, 1);
1863 return;
1864 } else if (strcmp(verb, "configure") == 0) {
1865 g_sched_ctl_configure(req, mp);
1866 return;
1867 } else if (strcmp(verb, "destroy") == 0) {
1868 g_sched_ctl_destroy(req, mp);
1869 return;
1870 }
1871
1872 gctl_error(req, "Unknown verb.");
1873 }
1874
1875 static void
g_sched_dumpconf(struct sbuf * sb,const char * indent,struct g_geom * gp,struct g_consumer * cp,struct g_provider * pp)1876 g_sched_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
1877 struct g_consumer *cp, struct g_provider *pp)
1878 {
1879 struct g_sched_softc *sc = gp->softc;
1880 struct g_gsched *gsp = sc->sc_gsched;
1881 if (indent == NULL) { /* plaintext */
1882 sbuf_printf(sb, " algo %s", gsp ? gsp->gs_name : "--");
1883 }
1884 if (gsp != NULL && gsp->gs_dumpconf)
1885 gsp->gs_dumpconf(sb, indent, gp, cp, pp);
1886 }
1887
1888 DECLARE_GEOM_CLASS(g_sched_class, g_sched);
1889 MODULE_VERSION(geom_sched, 0);
1890