1 /*-
2  * Copyright (c) 2009-2010 Fabio Checconi
3  * Copyright (c) 2009-2010 Luigi Rizzo, Universita` di Pisa
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  */
27 
28 /*
29  * $Id$
30  * $FreeBSD: stable/10/sys/geom/sched/g_sched.c 243333 2012-11-20 12:32:18Z jh $
31  *
32  * Main control module for geom-based disk schedulers ('sched').
33  *
34  * USER VIEW
35  * A 'sched' node is typically inserted transparently between
36  * an existing provider pp and its original geom gp
37  *
38  *	[pp --> gp  ..]
39  *
40  * using the command "geom sched insert <provider>" and
41  * resulting in the following topology
42  *
43  *	[pp --> sched_gp --> cp]   [new_pp --> gp ... ]
44  *
 * Deletion "geom sched destroy <provider>.sched." restores the
 * original chain. The normal "geom sched create <provider>"
 * is also supported.
48  *
49  * INTERNALS
50  * Internally, the 'sched' uses the following data structures
51  *
52  *   geom{}         g_sched_softc{}      g_gsched{}
53  * +----------+    +---------------+   +-------------+
54  * |  softc *-|--->| sc_gsched   *-|-->|  gs_init    |
55  * |  ...     |    |               |   |  gs_fini    |
56  * |          |    | [ hash table] |   |  gs_start   |
57  * +----------+    |               |   |  ...        |
58  *                 |               |   +-------------+
59  *                 |               |
60  *                 |               |     g_*_softc{}
61  *                 |               |   +-------------+
62  *                 | sc_data     *-|-->|             |
63  *                 +---------------+   |  algorithm- |
64  *                                     |  specific   |
65  *                                     +-------------+
66  *
67  * A g_sched_softc{} is created with a "geom sched insert" call.
68  * In turn this instantiates a specific scheduling algorithm,
69  * which sets sc_gsched to point to the algorithm callbacks,
70  * and calls gs_init() to create the g_*_softc{} .
71  * The other callbacks (gs_start, gs_next, ...) are invoked
72  * as needed
73  *
74  * g_sched_softc{} is defined in g_sched.h and mostly used here;
75  * g_gsched{}, and the gs_callbacks, are documented in gs_scheduler.h;
76  * g_*_softc{} is defined/implemented by each algorithm (gs_*.c)
77  *
78  * DATA MOVING
79  * When a bio is received on the provider, it goes to the
80  * g_sched_start() which calls gs_start() to initially queue it;
81  * then we call g_sched_dispatch() that loops around gs_next()
82  * to select zero or more bio's to be sent downstream.
83  *
84  * g_sched_dispatch() can also be called as a result of a timeout,
85  * e.g. when doing anticipation or pacing requests.
86  *
87  * When a bio comes back, it goes to g_sched_done() which in turn
88  * calls gs_done(). The latter does any necessary housekeeping in
89  * the scheduling algorithm, and may decide to call g_sched_dispatch()
90  * to send more bio's downstream.
91  *
92  * If an algorithm needs per-flow queues, these are created
93  * calling gs_init_class() and destroyed with gs_fini_class(),
94  * and they are also inserted in the hash table implemented in
95  * the g_sched_softc{}
96  *
 * If an algorithm is replaced, or a transparently-inserted node is
 * removed with "geom sched destroy", we need to remove all references
 * to the g_*_softc{} and g_sched_softc{} from the bios still in
 * the scheduler. g_sched_forced_dispatch() helps to do this.
 * XXX need to explain better.
102  */
103 
104 #include <sys/cdefs.h>
105 #include <sys/param.h>
106 #include <sys/systm.h>
107 #include <sys/kernel.h>
108 #include <sys/module.h>
109 #include <sys/lock.h>
110 #include <sys/mutex.h>
111 #include <sys/bio.h>
112 #include <sys/limits.h>
113 #include <sys/hash.h>
114 #include <sys/sbuf.h>
115 #include <sys/sysctl.h>
116 #include <sys/malloc.h>
117 #include <sys/proc.h>		/* we access curthread */
118 #include <geom/geom.h>
119 #include "gs_scheduler.h"
120 #include "g_sched.h"		/* geom hooks */
121 
122 /*
123  * Size of the per-geom hash table storing traffic classes.
124  * We may decide to change it at a later time, it has no ABI
125  * implications as it is only used for run-time allocations.
126  */
127 #define G_SCHED_HASH_SIZE	32
128 
129 static int g_sched_destroy(struct g_geom *gp, boolean_t force);
130 static int g_sched_destroy_geom(struct gctl_req *req,
131     struct g_class *mp, struct g_geom *gp);
132 static void g_sched_config(struct gctl_req *req, struct g_class *mp,
133     const char *verb);
134 static struct g_geom *g_sched_taste(struct g_class *mp,
135     struct g_provider *pp, int flags __unused);
136 static void g_sched_dumpconf(struct sbuf *sb, const char *indent,
137     struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp);
138 static void g_sched_init(struct g_class *mp);
139 static void g_sched_fini(struct g_class *mp);
140 static int g_sched_ioctl(struct g_provider *pp, u_long cmd, void *data,
141     int fflag, struct thread *td);
142 
143 struct g_class g_sched_class = {
144 	.name = G_SCHED_CLASS_NAME,
145 	.version = G_VERSION,
146 	.ctlreq = g_sched_config,
147 	.taste = g_sched_taste,
148 	.destroy_geom = g_sched_destroy_geom,
149 	.init = g_sched_init,
150 	.ioctl = g_sched_ioctl,
151 	.fini = g_sched_fini
152 };
153 
154 MALLOC_DEFINE(M_GEOM_SCHED, "GEOM_SCHED", "Geom schedulers data structures");
155 
156 /*
157  * Global variables describing the state of the geom_sched module.
158  * There is only one static instance of this structure.
159  */
160 LIST_HEAD(gs_list, g_gsched);	/* type, link field */
161 struct geom_sched_vars {
162 	struct mtx	gs_mtx;
163 	struct gs_list	gs_scheds;	/* list of algorithms */
164 	u_int		gs_debug;
165 	u_int		gs_sched_count;	/* how many algorithms ? */
166 	u_int 		gs_patched;	/* g_io_request was patched */
167 
168 	u_int		gs_initialized;
169 	u_int		gs_expire_secs;	/* expiration of hash entries */
170 
171 	struct bio_queue_head gs_pending;
172 	u_int		gs_npending;
173 
174 	/* The following are for stats, usually protected by gs_mtx. */
175 	u_long		gs_requests;	/* total requests */
176 	u_long		gs_done;	/* total done */
177 	u_int 		gs_in_flight;	/* requests in flight */
178 	u_int 		gs_writes_in_flight;
179 	u_int 		gs_bytes_in_flight;
180 	u_int 		gs_write_bytes_in_flight;
181 
182 	char		gs_names[256];	/* names of schedulers */
183 };
184 
185 static struct geom_sched_vars me = {
186 	.gs_expire_secs = 10,
187 };
188 
189 SYSCTL_DECL(_kern_geom);
190 SYSCTL_NODE(_kern_geom, OID_AUTO, sched, CTLFLAG_RW, 0,
191     "GEOM_SCHED stuff");
192 
193 SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight_wb, CTLFLAG_RD,
194     &me.gs_write_bytes_in_flight, 0, "Write bytes in flight");
195 
196 SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight_b, CTLFLAG_RD,
197     &me.gs_bytes_in_flight, 0, "Bytes in flight");
198 
199 SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight_w, CTLFLAG_RD,
200     &me.gs_writes_in_flight, 0, "Write Requests in flight");
201 
202 SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight, CTLFLAG_RD,
203     &me.gs_in_flight, 0, "Requests in flight");
204 
205 SYSCTL_ULONG(_kern_geom_sched, OID_AUTO, done, CTLFLAG_RD,
206     &me.gs_done, 0, "Total done");
207 
208 SYSCTL_ULONG(_kern_geom_sched, OID_AUTO, requests, CTLFLAG_RD,
209     &me.gs_requests, 0, "Total requests");
210 
211 SYSCTL_STRING(_kern_geom_sched, OID_AUTO, algorithms, CTLFLAG_RD,
212     &me.gs_names, 0, "Algorithm names");
213 
214 SYSCTL_UINT(_kern_geom_sched, OID_AUTO, alg_count, CTLFLAG_RD,
215     &me.gs_sched_count, 0, "Number of algorithms");
216 
217 SYSCTL_UINT(_kern_geom_sched, OID_AUTO, debug, CTLFLAG_RW,
218     &me.gs_debug, 0, "Debug level");
219 
220 SYSCTL_UINT(_kern_geom_sched, OID_AUTO, expire_secs, CTLFLAG_RW,
221     &me.gs_expire_secs, 0, "Expire time in seconds");
222 
223 /*
224  * g_sched calls the scheduler algorithms with this lock held.
225  * The locking functions are exposed so the scheduler algorithms can also
226  * protect themselves e.g. when running a callout handler.
227  */
228 void
g_sched_lock(struct g_geom * gp)229 g_sched_lock(struct g_geom *gp)
230 {
231 	struct g_sched_softc *sc = gp->softc;
232 
233 	mtx_lock(&sc->sc_mtx);
234 }
235 
236 void
g_sched_unlock(struct g_geom * gp)237 g_sched_unlock(struct g_geom *gp)
238 {
239 	struct g_sched_softc *sc = gp->softc;
240 
241 	mtx_unlock(&sc->sc_mtx);
242 }
243 
244 /*
245  * Support functions to handle references to the module,
246  * which are coming from devices using this scheduler.
247  */
248 static inline void
g_gsched_ref(struct g_gsched * gsp)249 g_gsched_ref(struct g_gsched *gsp)
250 {
251 
252 	atomic_add_int(&gsp->gs_refs, 1);
253 }
254 
255 static inline void
g_gsched_unref(struct g_gsched * gsp)256 g_gsched_unref(struct g_gsched *gsp)
257 {
258 
259 	atomic_add_int(&gsp->gs_refs, -1);
260 }
261 
262 /*
263  * Update the stats when this request is done.
264  */
265 static void
g_sched_update_stats(struct bio * bio)266 g_sched_update_stats(struct bio *bio)
267 {
268 
269 	me.gs_done++;
270 	me.gs_in_flight--;
271 	me.gs_bytes_in_flight -= bio->bio_length;
272 	if (bio->bio_cmd & BIO_WRITE) {
273 		me.gs_writes_in_flight--;
274 		me.gs_write_bytes_in_flight -= bio->bio_length;
275 	}
276 }
277 
278 /*
279  * Dispatch any pending request.
280  */
281 static void
g_sched_forced_dispatch(struct g_geom * gp)282 g_sched_forced_dispatch(struct g_geom *gp)
283 {
284 	struct g_sched_softc *sc = gp->softc;
285 	struct g_gsched *gsp = sc->sc_gsched;
286 	struct bio *bp;
287 
288 	KASSERT(mtx_owned(&sc->sc_mtx),
289 	    ("sc_mtx not owned during forced dispatch"));
290 
291 	while ((bp = gsp->gs_next(sc->sc_data, 1)) != NULL)
292 		g_io_request(bp, LIST_FIRST(&gp->consumer));
293 }
294 
295 /*
296  * The main dispatch loop, called either here after the start
297  * routine, or by scheduling algorithms when they receive a timeout
298  * or a 'done' notification.  Does not share code with the forced
299  * dispatch path, since the gs_done() callback can call us.
300  */
301 void
g_sched_dispatch(struct g_geom * gp)302 g_sched_dispatch(struct g_geom *gp)
303 {
304 	struct g_sched_softc *sc = gp->softc;
305 	struct g_gsched *gsp = sc->sc_gsched;
306 	struct bio *bp;
307 
308 	KASSERT(mtx_owned(&sc->sc_mtx), ("sc_mtx not owned during dispatch"));
309 
310 	if ((sc->sc_flags & G_SCHED_FLUSHING))
311 		return;
312 
313 	while ((bp = gsp->gs_next(sc->sc_data, 0)) != NULL)
314 		g_io_request(bp, LIST_FIRST(&gp->consumer));
315 }
316 
317 /*
318  * Recent (8.0 and above) versions of FreeBSD have support to
319  * register classifiers of disk requests. The classifier is
320  * invoked by g_io_request(), and stores the information into
321  * bp->bio_classifier1.
322  *
323  * Support for older versions, which is left here only for
324  * documentation purposes, relies on two hacks:
325  * 1. classification info is written into the bio_caller1
326  *    field of the topmost node in the bio chain. This field
327  *    is rarely used, but this module is incompatible with
328  *    those that use bio_caller1 for other purposes,
329  *    such as ZFS and gjournal;
330  * 2. g_io_request() is patched in-memory when the module is
331  *    loaded, so that the function calls a classifier as its
332  *    first thing. g_io_request() is restored when the module
333  *    is unloaded. This functionality is only supported for
334  *    x86 and amd64, other architectures need source code changes.
335  */
336 
337 /*
338  * Lookup the identity of the issuer of the original request.
339  * In the current implementation we use the curthread of the
340  * issuer, but different mechanisms may be implemented later
341  * so we do not make assumptions on the return value which for
342  * us is just an opaque identifier.
343  */
344 
345 static inline u_long
g_sched_classify(struct bio * bp)346 g_sched_classify(struct bio *bp)
347 {
348 
349 #if __FreeBSD_version > 800098
350 	/* we have classifier fields in the struct bio */
351 #define HAVE_BIO_CLASSIFIER
352 	return ((u_long)bp->bio_classifier1);
353 #else
354 #warning old version!!!
355 	while (bp->bio_parent != NULL)
356 		bp = bp->bio_parent;
357 
358 	return ((u_long)bp->bio_caller1);
359 #endif
360 }
361 
362 /* Return the hash chain for the given key. */
363 static inline struct g_hash *
g_sched_hash(struct g_sched_softc * sc,u_long key)364 g_sched_hash(struct g_sched_softc *sc, u_long key)
365 {
366 
367 	return (&sc->sc_hash[key & sc->sc_mask]);
368 }
369 
370 /*
371  * Helper function for the children classes, which takes
372  * a geom and a bio and returns the private descriptor
373  * associated to the request.  This involves fetching
374  * the classification field and [al]locating the
375  * corresponding entry in the hash table.
376  */
377 void *
g_sched_get_class(struct g_geom * gp,struct bio * bp)378 g_sched_get_class(struct g_geom *gp, struct bio *bp)
379 {
380 	struct g_sched_softc *sc;
381 	struct g_sched_class *gsc;
382 	struct g_gsched *gsp;
383 	struct g_hash *bucket;
384 	u_long key;
385 
386 	sc = gp->softc;
387 	key = g_sched_classify(bp);
388 	bucket = g_sched_hash(sc, key);
389 	LIST_FOREACH(gsc, bucket, gsc_clist) {
390 		if (key == gsc->gsc_key) {
391 			gsc->gsc_refs++;
392 			return (gsc->gsc_priv);
393 		}
394 	}
395 
396 	gsp = sc->sc_gsched;
397 	gsc = malloc(sizeof(*gsc) + gsp->gs_priv_size,
398 	    M_GEOM_SCHED, M_NOWAIT | M_ZERO);
399 	if (!gsc)
400 		return (NULL);
401 
402 	if (gsp->gs_init_class(sc->sc_data, gsc->gsc_priv)) {
403 		free(gsc, M_GEOM_SCHED);
404 		return (NULL);
405 	}
406 
407 	gsc->gsc_refs = 2;	/* 1 for the hash table, 1 for the caller. */
408 	gsc->gsc_key = key;
409 	LIST_INSERT_HEAD(bucket, gsc, gsc_clist);
410 
411 	gsc->gsc_expire = ticks + me.gs_expire_secs * hz;
412 
413 	return (gsc->gsc_priv);
414 }
415 
416 /*
417  * Release a reference to the per-client descriptor,
418  */
419 void
g_sched_put_class(struct g_geom * gp,void * priv)420 g_sched_put_class(struct g_geom *gp, void *priv)
421 {
422 	struct g_sched_class *gsc;
423 	struct g_sched_softc *sc;
424 
425 	gsc = g_sched_priv2class(priv);
426 	gsc->gsc_expire = ticks + me.gs_expire_secs * hz;
427 
428 	if (--gsc->gsc_refs > 0)
429 		return;
430 
431 	sc = gp->softc;
432 	sc->sc_gsched->gs_fini_class(sc->sc_data, priv);
433 
434 	LIST_REMOVE(gsc, gsc_clist);
435 	free(gsc, M_GEOM_SCHED);
436 }
437 
438 static void
g_sched_hash_fini(struct g_geom * gp,struct g_hash * hp,u_long mask,struct g_gsched * gsp,void * data)439 g_sched_hash_fini(struct g_geom *gp, struct g_hash *hp, u_long mask,
440     struct g_gsched *gsp, void *data)
441 {
442 	struct g_sched_class *cp, *cp2;
443 	int i;
444 
445 	if (!hp)
446 		return;
447 
448 	if (data && gsp->gs_hash_unref)
449 		gsp->gs_hash_unref(data);
450 
451 	for (i = 0; i < G_SCHED_HASH_SIZE; i++) {
452 		LIST_FOREACH_SAFE(cp, &hp[i], gsc_clist, cp2)
453 			g_sched_put_class(gp, cp->gsc_priv);
454 	}
455 
456 	hashdestroy(hp, M_GEOM_SCHED, mask);
457 }
458 
459 static struct g_hash *
g_sched_hash_init(struct g_gsched * gsp,u_long * mask,int flags)460 g_sched_hash_init(struct g_gsched *gsp, u_long *mask, int flags)
461 {
462 	struct g_hash *hash;
463 
464 	if (gsp->gs_priv_size == 0)
465 		return (NULL);
466 
467 	hash = hashinit_flags(G_SCHED_HASH_SIZE, M_GEOM_SCHED, mask, flags);
468 
469 	return (hash);
470 }
471 
472 static void
g_sched_flush_classes(struct g_geom * gp)473 g_sched_flush_classes(struct g_geom *gp)
474 {
475 	struct g_sched_softc *sc;
476 	struct g_sched_class *cp, *cp2;
477 	int i;
478 
479 	sc = gp->softc;
480 
481 	if (!sc->sc_hash || ticks - sc->sc_flush_ticks <= 0)
482 		return;
483 
484 	for (i = 0; i < G_SCHED_HASH_SIZE; i++) {
485 		LIST_FOREACH_SAFE(cp, &sc->sc_hash[i], gsc_clist, cp2) {
486 			if (cp->gsc_refs == 1 && ticks - cp->gsc_expire > 0)
487 				g_sched_put_class(gp, cp->gsc_priv);
488 		}
489 	}
490 
491 	sc->sc_flush_ticks = ticks + me.gs_expire_secs * hz;
492 }
493 
494 /*
495  * Wait for the completion of any outstanding request.  To ensure
496  * that this does not take forever the caller has to make sure that
497  * no new request enter the scehduler before calling us.
498  *
499  * Must be called with the gp mutex held and topology locked.
500  */
501 static int
g_sched_wait_pending(struct g_geom * gp)502 g_sched_wait_pending(struct g_geom *gp)
503 {
504 	struct g_sched_softc *sc = gp->softc;
505 	int endticks = ticks + hz;
506 
507 	g_topology_assert();
508 
509 	while (sc->sc_pending && endticks - ticks >= 0)
510 		msleep(gp, &sc->sc_mtx, 0, "sched_wait_pending", hz / 4);
511 
512 	return (sc->sc_pending ? ETIMEDOUT : 0);
513 }
514 
515 static int
g_sched_remove_locked(struct g_geom * gp,struct g_gsched * gsp)516 g_sched_remove_locked(struct g_geom *gp, struct g_gsched *gsp)
517 {
518 	struct g_sched_softc *sc = gp->softc;
519 	int error;
520 
521 	/* Set the flushing flag: new bios will not enter the scheduler. */
522 	sc->sc_flags |= G_SCHED_FLUSHING;
523 
524 	g_sched_forced_dispatch(gp);
525 	error = g_sched_wait_pending(gp);
526 	if (error)
527 		goto failed;
528 
529 	/* No more requests pending or in flight from the old gsp. */
530 
531 	g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask, gsp, sc->sc_data);
532 	sc->sc_hash = NULL;
533 
534 	/*
535 	 * Avoid deadlock here by releasing the gp mutex and reacquiring
536 	 * it once done.  It should be safe, since no reconfiguration or
537 	 * destruction can take place due to the geom topology lock; no
538 	 * new request can use the current sc_data since we flagged the
539 	 * geom as being flushed.
540 	 */
541 	g_sched_unlock(gp);
542 	gsp->gs_fini(sc->sc_data);
543 	g_sched_lock(gp);
544 
545 	sc->sc_gsched = NULL;
546 	sc->sc_data = NULL;
547 	g_gsched_unref(gsp);
548 
549 failed:
550 	sc->sc_flags &= ~G_SCHED_FLUSHING;
551 
552 	return (error);
553 }
554 
/*
 * Locking wrapper around g_sched_remove_locked(): takes the gp mutex,
 * removes algorithm gsp (which must be non-NULL) and returns the
 * result.
 */
static int
g_sched_remove(struct g_geom *gp, struct g_gsched *gsp)
{
	int rv;

	g_sched_lock(gp);
	rv = g_sched_remove_locked(gp, gsp);
	g_sched_unlock(gp);

	return (rv);
}
566 
567 /*
568  * Support function for create/taste -- locate the desired
569  * algorithm and grab a reference to it.
570  */
571 static struct g_gsched *
g_gsched_find(const char * name)572 g_gsched_find(const char *name)
573 {
574 	struct g_gsched *gsp = NULL;
575 
576 	mtx_lock(&me.gs_mtx);
577 	LIST_FOREACH(gsp, &me.gs_scheds, glist) {
578 		if (strcmp(name, gsp->gs_name) == 0) {
579 			g_gsched_ref(gsp);
580 			break;
581 		}
582 	}
583 	mtx_unlock(&me.gs_mtx);
584 
585 	return (gsp);
586 }
587 
588 /*
589  * Rebuild the list of scheduler names.
590  * To be called with me.gs_mtx lock held.
591  */
592 static void
g_gsched_build_names(struct g_gsched * gsp)593 g_gsched_build_names(struct g_gsched *gsp)
594 {
595 	int pos, l;
596 	struct g_gsched *cur;
597 
598 	pos = 0;
599 	LIST_FOREACH(cur, &me.gs_scheds, glist) {
600 		l = strlen(cur->gs_name);
601 		if (l + pos + 1 + 1 < sizeof(me.gs_names)) {
602 			if (pos != 0)
603 				me.gs_names[pos++] = ' ';
604 			strcpy(me.gs_names + pos, cur->gs_name);
605 			pos += l;
606 		}
607 	}
608 	me.gs_names[pos] = '\0';
609 }
610 
611 /*
612  * Register or unregister individual scheduling algorithms.
613  */
614 static int
g_gsched_register(struct g_gsched * gsp)615 g_gsched_register(struct g_gsched *gsp)
616 {
617 	struct g_gsched *cur;
618 	int error = 0;
619 
620 	mtx_lock(&me.gs_mtx);
621 	LIST_FOREACH(cur, &me.gs_scheds, glist) {
622 		if (strcmp(gsp->gs_name, cur->gs_name) == 0)
623 			break;
624 	}
625 	if (cur != NULL) {
626 		G_SCHED_DEBUG(0, "A scheduler named %s already"
627 		    "exists.", gsp->gs_name);
628 		error = EEXIST;
629 	} else {
630 		LIST_INSERT_HEAD(&me.gs_scheds, gsp, glist);
631 		gsp->gs_refs = 1;
632 		me.gs_sched_count++;
633 		g_gsched_build_names(gsp);
634 	}
635 	mtx_unlock(&me.gs_mtx);
636 
637 	return (error);
638 }
639 
/* Argument block handed to g_gsched_unregister() through the event. */
struct g_gsched_unregparm {
	struct g_gsched	*gup_gsp;	/* in: algorithm to unregister */
	int		gup_error;	/* out: result of the operation */
};
644 
645 static void
g_gsched_unregister(void * arg,int flag)646 g_gsched_unregister(void *arg, int flag)
647 {
648 	struct g_gsched_unregparm *parm = arg;
649 	struct g_gsched *gsp = parm->gup_gsp, *cur, *tmp;
650 	struct g_sched_softc *sc;
651 	struct g_geom *gp, *gp_tmp;
652 	int error;
653 
654 	parm->gup_error = 0;
655 
656 	g_topology_assert();
657 
658 	if (flag == EV_CANCEL)
659 		return;
660 
661 	mtx_lock(&me.gs_mtx);
662 
663 	LIST_FOREACH_SAFE(gp, &g_sched_class.geom, geom, gp_tmp) {
664 		if (gp->class != &g_sched_class)
665 			continue;	/* Should not happen. */
666 
667 		sc = gp->softc;
668 		if (sc->sc_gsched == gsp) {
669 			error = g_sched_remove(gp, gsp);
670 			if (error)
671 				goto failed;
672 		}
673 	}
674 
675 	LIST_FOREACH_SAFE(cur, &me.gs_scheds, glist, tmp) {
676 		if (cur != gsp)
677 			continue;
678 
679 		if (gsp->gs_refs != 1) {
680 			G_SCHED_DEBUG(0, "%s still in use.",
681 			    gsp->gs_name);
682 			parm->gup_error = EBUSY;
683 		} else {
684 			LIST_REMOVE(gsp, glist);
685 			me.gs_sched_count--;
686 			g_gsched_build_names(gsp);
687 		}
688 		break;
689 	}
690 
691 	if (cur == NULL) {
692 		G_SCHED_DEBUG(0, "%s not registered.", gsp->gs_name);
693 		parm->gup_error = ENOENT;
694 	}
695 
696 failed:
697 	mtx_unlock(&me.gs_mtx);
698 }
699 
700 static inline void
g_gsched_global_init(void)701 g_gsched_global_init(void)
702 {
703 
704 	if (!me.gs_initialized) {
705 		G_SCHED_DEBUG(0, "Initializing global data.");
706 		mtx_init(&me.gs_mtx, "gsched", NULL, MTX_DEF);
707 		LIST_INIT(&me.gs_scheds);
708 		gs_bioq_init(&me.gs_pending);
709 		me.gs_initialized = 1;
710 	}
711 }
712 
713 /*
714  * Module event called when a scheduling algorithm module is loaded or
715  * unloaded.
716  */
717 int
g_gsched_modevent(module_t mod,int cmd,void * arg)718 g_gsched_modevent(module_t mod, int cmd, void *arg)
719 {
720 	struct g_gsched *gsp = arg;
721 	struct g_gsched_unregparm parm;
722 	int error;
723 
724 	G_SCHED_DEBUG(0, "Modevent %d.", cmd);
725 
726 	/*
727 	 * If the module is loaded at boot, the geom thread that calls
728 	 * g_sched_init() might actually run after g_gsched_modevent(),
729 	 * so make sure that the module is properly initialized.
730 	 */
731 	g_gsched_global_init();
732 
733 	error = EOPNOTSUPP;
734 	switch (cmd) {
735 	case MOD_LOAD:
736 		error = g_gsched_register(gsp);
737 		G_SCHED_DEBUG(0, "Loaded module %s error %d.",
738 		    gsp->gs_name, error);
739 		if (error == 0)
740 			g_retaste(&g_sched_class);
741 		break;
742 
743 	case MOD_UNLOAD:
744 		parm.gup_gsp = gsp;
745 		parm.gup_error = 0;
746 
747 		error = g_waitfor_event(g_gsched_unregister,
748 		    &parm, M_WAITOK, NULL);
749 		if (error == 0)
750 			error = parm.gup_error;
751 		G_SCHED_DEBUG(0, "Unloaded module %s error %d.",
752 		    gsp->gs_name, error);
753 		break;
754 	};
755 
756 	return (error);
757 }
758 
759 #ifdef KTR
760 #define	TRC_BIO_EVENT(e, bp)	g_sched_trace_bio_ ## e (bp)
761 
762 static inline char
g_sched_type(struct bio * bp)763 g_sched_type(struct bio *bp)
764 {
765 
766 	if (0 != (bp->bio_cmd & BIO_READ))
767 		return ('R');
768 	else if (0 != (bp->bio_cmd & BIO_WRITE))
769 		return ('W');
770 	return ('U');
771 }
772 
773 static inline void
g_sched_trace_bio_START(struct bio * bp)774 g_sched_trace_bio_START(struct bio *bp)
775 {
776 
777 	CTR5(KTR_GSCHED, "S %lu %c %lu/%lu %lu", g_sched_classify(bp),
778 	    g_sched_type(bp), bp->bio_offset / ULONG_MAX,
779 	    bp->bio_offset, bp->bio_length);
780 }
781 
782 static inline void
g_sched_trace_bio_DONE(struct bio * bp)783 g_sched_trace_bio_DONE(struct bio *bp)
784 {
785 
786 	CTR5(KTR_GSCHED, "D %lu %c %lu/%lu %lu", g_sched_classify(bp),
787 	    g_sched_type(bp), bp->bio_offset / ULONG_MAX,
788 	    bp->bio_offset, bp->bio_length);
789 }
790 #else /* !KTR */
791 #define	TRC_BIO_EVENT(e, bp)
792 #endif /* !KTR */
793 
794 /*
795  * g_sched_done() and g_sched_start() dispatch the geom requests to
796  * the scheduling algorithm in use.
797  */
798 static void
g_sched_done(struct bio * bio)799 g_sched_done(struct bio *bio)
800 {
801 	struct g_geom *gp = bio->bio_caller2;
802 	struct g_sched_softc *sc = gp->softc;
803 
804 	TRC_BIO_EVENT(DONE, bio);
805 
806 	KASSERT(bio->bio_caller1, ("null bio_caller1 in g_sched_done"));
807 
808 	g_sched_lock(gp);
809 
810 	g_sched_update_stats(bio);
811 	sc->sc_gsched->gs_done(sc->sc_data, bio);
812 	if (!--sc->sc_pending)
813 		wakeup(gp);
814 
815 	g_sched_flush_classes(gp);
816 	g_sched_unlock(gp);
817 
818 	g_std_done(bio);
819 }
820 
821 static void
g_sched_start(struct bio * bp)822 g_sched_start(struct bio *bp)
823 {
824 	struct g_geom *gp = bp->bio_to->geom;
825 	struct g_sched_softc *sc = gp->softc;
826 	struct bio *cbp;
827 
828 	TRC_BIO_EVENT(START, bp);
829 	G_SCHED_LOGREQ(bp, "Request received.");
830 
831 	cbp = g_clone_bio(bp);
832 	if (cbp == NULL) {
833 		g_io_deliver(bp, ENOMEM);
834 		return;
835 	}
836 	cbp->bio_done = g_sched_done;
837 	cbp->bio_to = LIST_FIRST(&gp->provider);
838 	KASSERT(cbp->bio_to != NULL, ("NULL provider"));
839 
840 	/* We only schedule reads and writes. */
841 	if (0 == (bp->bio_cmd & (BIO_READ | BIO_WRITE)))
842 		goto bypass;
843 
844 	G_SCHED_LOGREQ(cbp, "Sending request.");
845 
846 	g_sched_lock(gp);
847 	/*
848 	 * Call the algorithm's gs_start to queue the request in the
849 	 * scheduler. If gs_start fails then pass the request down,
850 	 * otherwise call g_sched_dispatch() which tries to push
851 	 * one or more requests down.
852 	 */
853 	if (!sc->sc_gsched || (sc->sc_flags & G_SCHED_FLUSHING) ||
854 	    sc->sc_gsched->gs_start(sc->sc_data, cbp)) {
855 		g_sched_unlock(gp);
856 		goto bypass;
857 	}
858 	/*
859 	 * We use bio_caller1 to mark requests that are scheduled
860 	 * so make sure it is not NULL.
861 	 */
862 	if (cbp->bio_caller1 == NULL)
863 		cbp->bio_caller1 = &me;	/* anything not NULL */
864 
865 	cbp->bio_caller2 = gp;
866 	sc->sc_pending++;
867 
868 	/* Update general stats. */
869 	me.gs_in_flight++;
870 	me.gs_requests++;
871 	me.gs_bytes_in_flight += bp->bio_length;
872 	if (bp->bio_cmd & BIO_WRITE) {
873 		me.gs_writes_in_flight++;
874 		me.gs_write_bytes_in_flight += bp->bio_length;
875 	}
876 	g_sched_dispatch(gp);
877 	g_sched_unlock(gp);
878 	return;
879 
880 bypass:
881 	cbp->bio_done = g_std_done;
882 	cbp->bio_caller1 = NULL; /* not scheduled */
883 	g_io_request(cbp, LIST_FIRST(&gp->consumer));
884 }
885 
886 /*
887  * The next few functions are the geom glue.
888  */
889 static void
g_sched_orphan(struct g_consumer * cp)890 g_sched_orphan(struct g_consumer *cp)
891 {
892 
893 	g_topology_assert();
894 	g_sched_destroy(cp->geom, 1);
895 }
896 
897 static int
g_sched_access(struct g_provider * pp,int dr,int dw,int de)898 g_sched_access(struct g_provider *pp, int dr, int dw, int de)
899 {
900 	struct g_geom *gp;
901 	struct g_consumer *cp;
902 	int error;
903 
904 	gp = pp->geom;
905 	cp = LIST_FIRST(&gp->consumer);
906 	error = g_access(cp, dr, dw, de);
907 
908 	return (error);
909 }
910 
911 static void
g_sched_temporary_start(struct bio * bio)912 g_sched_temporary_start(struct bio *bio)
913 {
914 
915 	mtx_lock(&me.gs_mtx);
916 	me.gs_npending++;
917 	gs_bioq_disksort(&me.gs_pending, bio);
918 	mtx_unlock(&me.gs_mtx);
919 }
920 
921 static void
g_sched_flush_pending(g_start_t * start)922 g_sched_flush_pending(g_start_t *start)
923 {
924 	struct bio *bp;
925 
926 	while ((bp = gs_bioq_takefirst(&me.gs_pending)))
927 		start(bp);
928 }
929 
930 static int
g_insert_proxy(struct g_geom * gp,struct g_provider * newpp,struct g_geom * dstgp,struct g_provider * pp,struct g_consumer * cp)931 g_insert_proxy(struct g_geom *gp, struct g_provider *newpp,
932     struct g_geom *dstgp, struct g_provider *pp, struct g_consumer *cp)
933 {
934 	struct g_sched_softc *sc = gp->softc;
935 	g_start_t *saved_start, *flush = g_sched_start;
936 	int error = 0, endticks = ticks + hz;
937 
938 	g_cancel_event(newpp);	/* prevent taste() */
939 	/* copy private fields */
940 	newpp->private = pp->private;
941 	newpp->index = pp->index;
942 
943 	/* Queue all the early requests coming for us. */
944 	me.gs_npending = 0;
945 	saved_start = pp->geom->start;
946 	dstgp->start = g_sched_temporary_start;
947 
948 	while (pp->nstart - pp->nend != me.gs_npending &&
949 	    endticks - ticks >= 0)
950 		tsleep(pp, PRIBIO, "-", hz/10);
951 
952 	if (pp->nstart - pp->nend != me.gs_npending) {
953 		flush = saved_start;
954 		error = ETIMEDOUT;
955 		goto fail;
956 	}
957 
958 	/* link pp to this geom */
959 	LIST_REMOVE(pp, provider);
960 	pp->geom = gp;
961 	LIST_INSERT_HEAD(&gp->provider, pp, provider);
962 
963 	/*
964 	 * replicate the counts from the parent in the
965 	 * new provider and consumer nodes
966 	 */
967 	cp->acr = newpp->acr = pp->acr;
968 	cp->acw = newpp->acw = pp->acw;
969 	cp->ace = newpp->ace = pp->ace;
970 	sc->sc_flags |= G_SCHED_PROXYING;
971 
972 fail:
973 	dstgp->start = saved_start;
974 
975 	g_sched_flush_pending(flush);
976 
977 	return (error);
978 }
979 
980 /*
981  * Create a geom node for the device passed as *pp.
982  * If successful, add a reference to this gsp.
983  */
984 static int
g_sched_create(struct gctl_req * req,struct g_class * mp,struct g_provider * pp,struct g_gsched * gsp,int proxy)985 g_sched_create(struct gctl_req *req, struct g_class *mp,
986     struct g_provider *pp, struct g_gsched *gsp, int proxy)
987 {
988 	struct g_sched_softc *sc = NULL;
989 	struct g_geom *gp, *dstgp;
990 	struct g_provider *newpp = NULL;
991 	struct g_consumer *cp = NULL;
992 	char name[64];
993 	int error;
994 
995 	g_topology_assert();
996 
997 	snprintf(name, sizeof(name), "%s%s", pp->name, G_SCHED_SUFFIX);
998 	LIST_FOREACH(gp, &mp->geom, geom) {
999 		if (strcmp(gp->name, name) == 0) {
1000 			gctl_error(req, "Geom %s already exists.",
1001 			    name);
1002 			return (EEXIST);
1003 		}
1004 	}
1005 
1006 	gp = g_new_geomf(mp, "%s", name);
1007 	dstgp = proxy ? pp->geom : gp; /* where do we link the provider */
1008 
1009 	sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO);
1010 	sc->sc_gsched = gsp;
1011 	sc->sc_data = gsp->gs_init(gp);
1012 	if (sc->sc_data == NULL) {
1013 		error = ENOMEM;
1014 		goto fail;
1015 	}
1016 
1017 	sc->sc_hash = g_sched_hash_init(gsp, &sc->sc_mask, HASH_WAITOK);
1018 
1019 	/*
1020 	 * Do not initialize the flush mechanism, will be initialized
1021 	 * on the first insertion on the hash table.
1022 	 */
1023 
1024 	mtx_init(&sc->sc_mtx, "g_sched_mtx", NULL, MTX_DEF);
1025 
1026 	gp->softc = sc;
1027 	gp->start = g_sched_start;
1028 	gp->orphan = g_sched_orphan;
1029 	gp->access = g_sched_access;
1030 	gp->dumpconf = g_sched_dumpconf;
1031 
1032 	newpp = g_new_providerf(dstgp, "%s", gp->name);
1033 	newpp->mediasize = pp->mediasize;
1034 	newpp->sectorsize = pp->sectorsize;
1035 
1036 	cp = g_new_consumer(gp);
1037 	error = g_attach(cp, proxy ? newpp : pp);
1038 	if (error != 0) {
1039 		gctl_error(req, "Cannot attach to provider %s.",
1040 		    pp->name);
1041 		goto fail;
1042 	}
1043 
1044 	g_error_provider(newpp, 0);
1045 	if (proxy) {
1046 		error = g_insert_proxy(gp, newpp, dstgp, pp, cp);
1047 		if (error)
1048 			goto fail;
1049 	}
1050 	G_SCHED_DEBUG(0, "Device %s created.", gp->name);
1051 
1052 	g_gsched_ref(gsp);
1053 
1054 	return (0);
1055 
1056 fail:
1057 	if (cp != NULL) {
1058 		if (cp->provider != NULL)
1059 			g_detach(cp);
1060 		g_destroy_consumer(cp);
1061 	}
1062 	if (newpp != NULL)
1063 		g_destroy_provider(newpp);
1064 	if (sc->sc_hash)
1065 		g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask,
1066 		    gsp, sc->sc_data);
1067 	if (sc->sc_data)
1068 		gsp->gs_fini(sc->sc_data);
1069 	g_free(gp->softc);
1070 	g_destroy_geom(gp);
1071 
1072 	return (error);
1073 }
1074 
1075 /*
1076  * Support for dynamic switching of scheduling algorithms.
1077  * First initialize the data structures for the new algorithm,
1078  * then call g_sched_remove_locked() to flush all references
1079  * to the old one, finally link the new algorithm.
1080  */
1081 static int
g_sched_change_algo(struct gctl_req * req,struct g_class * mp,struct g_provider * pp,struct g_gsched * gsp)1082 g_sched_change_algo(struct gctl_req *req, struct g_class *mp,
1083     struct g_provider *pp, struct g_gsched *gsp)
1084 {
1085 	struct g_sched_softc *sc;
1086 	struct g_geom *gp;
1087 	struct g_hash *newh;
1088 	void *data;
1089 	u_long mask;
1090 	int error = 0;
1091 
1092 	gp = pp->geom;
1093 	sc = gp->softc;
1094 
1095 	data = gsp->gs_init(gp);
1096 	if (data == NULL)
1097 		return (ENOMEM);
1098 
1099 	newh = g_sched_hash_init(gsp, &mask, HASH_WAITOK);
1100 	if (gsp->gs_priv_size && !newh) {
1101 		error = ENOMEM;
1102 		goto fail;
1103 	}
1104 
1105 	g_sched_lock(gp);
1106 	if (sc->sc_gsched) {	/* can be NULL in some cases */
1107 		error = g_sched_remove_locked(gp, sc->sc_gsched);
1108 		if (error)
1109 			goto fail;
1110 	}
1111 
1112 	g_gsched_ref(gsp);
1113 	sc->sc_gsched = gsp;
1114 	sc->sc_data = data;
1115 	sc->sc_hash = newh;
1116 	sc->sc_mask = mask;
1117 
1118 	g_sched_unlock(gp);
1119 
1120 	return (0);
1121 
1122 fail:
1123 	if (newh)
1124 		g_sched_hash_fini(gp, newh, mask, gsp, data);
1125 
1126 	if (data)
1127 		gsp->gs_fini(data);
1128 
1129 	g_sched_unlock(gp);
1130 
1131 	return (error);
1132 }
1133 
1134 /*
1135  * Stop the request flow directed to the proxy, redirecting the new
1136  * requests to the me.gs_pending queue.
1137  */
1138 static struct g_provider *
g_detach_proxy(struct g_geom * gp)1139 g_detach_proxy(struct g_geom *gp)
1140 {
1141 	struct g_consumer *cp;
1142 	struct g_provider *pp, *newpp;
1143 
1144 	do {
1145 		pp = LIST_FIRST(&gp->provider);
1146 		if (pp == NULL)
1147 			break;
1148 		cp = LIST_FIRST(&gp->consumer);
1149 		if (cp == NULL)
1150 			break;
1151 		newpp = cp->provider;
1152 		if (newpp == NULL)
1153 			break;
1154 
1155 		me.gs_npending = 0;
1156 		pp->geom->start = g_sched_temporary_start;
1157 
1158 		return (pp);
1159 	} while (0);
1160 	printf("%s error detaching proxy %s\n", __FUNCTION__, gp->name);
1161 
1162 	return (NULL);
1163 }
1164 
1165 static void
g_sched_blackhole(struct bio * bp)1166 g_sched_blackhole(struct bio *bp)
1167 {
1168 
1169 	g_io_deliver(bp, ENXIO);
1170 }
1171 
1172 static inline void
g_reparent_provider(struct g_provider * pp,struct g_geom * gp,struct g_provider * newpp)1173 g_reparent_provider(struct g_provider *pp, struct g_geom *gp,
1174     struct g_provider *newpp)
1175 {
1176 
1177 	LIST_REMOVE(pp, provider);
1178 	if (newpp) {
1179 		pp->private = newpp->private;
1180 		pp->index = newpp->index;
1181 	}
1182 	pp->geom = gp;
1183 	LIST_INSERT_HEAD(&gp->provider, pp, provider);
1184 }
1185 
1186 static inline void
g_unproxy_provider(struct g_provider * oldpp,struct g_provider * newpp)1187 g_unproxy_provider(struct g_provider *oldpp, struct g_provider *newpp)
1188 {
1189 	struct g_geom *gp = oldpp->geom;
1190 
1191 	g_reparent_provider(oldpp, newpp->geom, newpp);
1192 
1193 	/*
1194 	 * Hackish: let the system destroy the old provider for us, just
1195 	 * in case someone attached a consumer to it, in which case a
1196 	 * direct call to g_destroy_provider() would not work.
1197 	 */
1198 	g_reparent_provider(newpp, gp, NULL);
1199 }
1200 
1201 /*
1202  * Complete the proxy destruction, linking the old provider to its
1203  * original geom, and destroying the proxy provider.  Also take care
1204  * of issuing the pending requests collected in me.gs_pending (if any).
1205  */
1206 static int
g_destroy_proxy(struct g_geom * gp,struct g_provider * oldpp)1207 g_destroy_proxy(struct g_geom *gp, struct g_provider *oldpp)
1208 {
1209 	struct g_consumer *cp;
1210 	struct g_provider *newpp;
1211 
1212 	do {
1213 		cp = LIST_FIRST(&gp->consumer);
1214 		if (cp == NULL)
1215 			break;
1216 		newpp = cp->provider;
1217 		if (newpp == NULL)
1218 			break;
1219 
1220 		/* Relink the provider to its original geom. */
1221 		g_unproxy_provider(oldpp, newpp);
1222 
1223 		/* Detach consumer from provider, and destroy provider. */
1224 		cp->acr = newpp->acr = 0;
1225 		cp->acw = newpp->acw = 0;
1226 		cp->ace = newpp->ace = 0;
1227 		g_detach(cp);
1228 
1229 		/* Send the pending bios through the right start function. */
1230 		g_sched_flush_pending(oldpp->geom->start);
1231 
1232 		return (0);
1233 	} while (0);
1234 	printf("%s error destroying proxy %s\n", __FUNCTION__, gp->name);
1235 
1236 	/* We cannot send the pending bios anywhere... */
1237 	g_sched_flush_pending(g_sched_blackhole);
1238 
1239 	return (EINVAL);
1240 }
1241 
1242 static int
g_sched_destroy(struct g_geom * gp,boolean_t force)1243 g_sched_destroy(struct g_geom *gp, boolean_t force)
1244 {
1245 	struct g_provider *pp, *oldpp = NULL;
1246 	struct g_sched_softc *sc;
1247 	struct g_gsched *gsp;
1248 	int error;
1249 
1250 	g_topology_assert();
1251 	sc = gp->softc;
1252 	if (sc == NULL)
1253 		return (ENXIO);
1254 	if (!(sc->sc_flags & G_SCHED_PROXYING)) {
1255 		pp = LIST_FIRST(&gp->provider);
1256 		if (pp && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) {
1257 			const char *msg = force ?
1258 				"but we force removal" : "cannot remove";
1259 
1260 			G_SCHED_DEBUG(!force,
1261 			    "Device %s is still open (r%dw%de%d), %s.",
1262 			    pp->name, pp->acr, pp->acw, pp->ace, msg);
1263 			if (!force)
1264 				return (EBUSY);
1265 		} else {
1266 			G_SCHED_DEBUG(0, "Device %s removed.", gp->name);
1267 		}
1268 	} else
1269 		oldpp = g_detach_proxy(gp);
1270 
1271 	gsp = sc->sc_gsched;
1272 	if (gsp) {
1273 		/*
1274 		 * XXX bad hack here: force a dispatch to release
1275 		 * any reference to the hash table still held by
1276 		 * the scheduler.
1277 		 */
1278 		g_sched_lock(gp);
1279 		/*
1280 		 * We are dying here, no new requests should enter
1281 		 * the scheduler.  This is granted by the topolgy,
1282 		 * either in case we were proxying (new bios are
1283 		 * being redirected) or not (see the access check
1284 		 * above).
1285 		 */
1286 		g_sched_forced_dispatch(gp);
1287 		error = g_sched_wait_pending(gp);
1288 
1289 		if (error) {
1290 			/*
1291 			 * Not all the requests came home: this might happen
1292 			 * under heavy load, or if we were waiting for any
1293 			 * bio which is served in the event path (see
1294 			 * geom_slice.c for an example of how this can
1295 			 * happen).  Try to restore a working configuration
1296 			 * if we can fail.
1297 			 */
1298 			if ((sc->sc_flags & G_SCHED_PROXYING) && oldpp) {
1299 				g_sched_flush_pending(force ?
1300 				    g_sched_blackhole : g_sched_start);
1301 			}
1302 
1303 			/*
1304 			 * In the forced destroy case there is not so much
1305 			 * we can do, we have pending bios that will call
1306 			 * g_sched_done() somehow, and we don't want them
1307 			 * to crash the system using freed memory.  We tell
1308 			 * the user that something went wrong, and leak some
1309 			 * memory here.
1310 			 * Note: the callers using force = 1 ignore the
1311 			 * return value.
1312 			 */
1313 			if (force) {
1314 				G_SCHED_DEBUG(0, "Pending requests while "
1315 				    " destroying geom, some memory leaked.");
1316 			}
1317 
1318 			return (error);
1319 		}
1320 
1321 		g_sched_unlock(gp);
1322 		g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask,
1323 		    gsp, sc->sc_data);
1324 		sc->sc_hash = NULL;
1325 		gsp->gs_fini(sc->sc_data);
1326 		g_gsched_unref(gsp);
1327 		sc->sc_gsched = NULL;
1328 	}
1329 
1330 	if ((sc->sc_flags & G_SCHED_PROXYING) && oldpp) {
1331 		error = g_destroy_proxy(gp, oldpp);
1332 
1333 		if (error) {
1334 			if (force) {
1335 				G_SCHED_DEBUG(0, "Unrecoverable error while "
1336 				    "destroying a proxy geom, leaking some "
1337 				    " memory.");
1338 			}
1339 
1340 			return (error);
1341 		}
1342 	}
1343 
1344 	mtx_destroy(&sc->sc_mtx);
1345 
1346 	g_free(gp->softc);
1347 	gp->softc = NULL;
1348 	g_wither_geom(gp, ENXIO);
1349 
1350 	return (error);
1351 }
1352 
/*
 * Non-forced destroy of gp; req and mp are not used.
 * Returns whatever g_sched_destroy() returns.
 */
static int
g_sched_destroy_geom(struct gctl_req *req, struct g_class *mp,
    struct g_geom *gp)
{
	int error;

	error = g_sched_destroy(gp, 0);
	return (error);
}
1360 
1361 /*
1362  * Functions related to the classification of requests.
1363  *
1364  * On recent FreeBSD versions (8.0 and above), we store a reference
1365  * to the issuer of a request in bp->bio_classifier1 as soon
1366  * as the bio is posted to the geom queue (and not later, because
1367  * requests are managed by the g_down thread afterwards).
1368  *
1369  * On older versions of the system (but this code is not used
1370  * in any existing release), we [ab]use the caller1 field in the
1371  * root element of the bio tree to store the classification info.
1372  * The marking is done at the beginning of g_io_request()
1373  * and only if we find that the field is NULL.
1374  *
1375  * To avoid rebuilding the kernel, this module will patch the
1376  * initial part of g_io_request() so it jumps to some hand-coded
1377  * assembly that does the marking and then executes the original
1378  * body of g_io_request().
1379  *
1380  * fake_ioreq[] is architecture-specific machine code
1381  * that implements the above. CODE_SIZE, STORE_SIZE etc.
1382  * are constants used in the patching routine. Look at the
1383  * code in g_ioreq_patch() for the details.
1384  */
1385 
1386 #ifndef HAVE_BIO_CLASSIFIER
1387 /*
1388  * Support for old FreeBSD versions
1389  */
1390 #if defined(__i386__)
/*
 * Layout of fake_ioreq[] on i386:
 *   CODE_SIZE   bytes of marking code,
 *   STORE_SIZE  bytes that g_ioreq_patch() compares against the first
 *               bytes of g_io_request() and g_ioreq_restore() copies
 *               back over them,
 *   EPILOGUE    bytes for the final jmp (opcode plus a 4-byte
 *               displacement filled in by g_ioreq_patch()).
 */
#define	CODE_SIZE	29
#define	STORE_SIZE	5
#define	EPILOGUE	5
#define	SIZE		(CODE_SIZE + STORE_SIZE + EPILOGUE)

static u_char fake_ioreq[SIZE] = {
	0x8b, 0x44, 0x24, 0x04,		/* mov bp, %eax */
	/* 1: */
	0x89, 0xc2,			/* mov %eax, %edx # edx = bp */
	0x8b, 0x40, 0x64,		/* mov bp->bio_parent, %eax */
	0x85, 0xc0,			/* test %eax, %eax */
	0x75, 0xf7,			/* jne 1b */
	0x8b, 0x42, 0x30,		/* mov bp->bio_caller1, %eax */
	0x85, 0xc0,			/* test %eax, %eax */
	0x75, 0x09,			/* jne 2f */
	0x64, 0xa1, 0x00, 0x00,		/* mov %fs:0, %eax */
	0x00, 0x00,
	0x89, 0x42, 0x30,		/* mov %eax, bp->bio_caller1 */
	/* 2: */
	/*
	 * STORE_SIZE bytes expected at the start of g_io_request():
	 * push %ebp; mov %esp, %ebp; push %edi; push %esi
	 */
        0x55, 0x89, 0xe5, 0x57, 0x56,
	0xe9, 0x00, 0x00, 0x00, 0x00,	/* jmp back... */
};
1413 #elif defined(__amd64)
/*
 * Layout of fake_ioreq[] on amd64:
 *   CODE_SIZE   bytes of marking code,
 *   STORE_SIZE  bytes that g_ioreq_patch() compares against the first
 *               bytes of g_io_request() and g_ioreq_restore() copies
 *               back over them,
 *   EPILOGUE    bytes for the final jmp (opcode plus a 4-byte
 *               displacement filled in by g_ioreq_patch()).
 */
#define	CODE_SIZE	38
#define	STORE_SIZE	6
#define	EPILOGUE	5
#define	SIZE		(CODE_SIZE + STORE_SIZE + EPILOGUE)

static u_char fake_ioreq[SIZE] = {
	0x48, 0x89, 0xf8,		/* mov bp, %rax */
	/* 1: */
	0x48, 0x89, 0xc2,		/* mov %rax, %rdx # rdx = bp */
	0x48, 0x8b, 0x82, 0xa8,		/* mov bp->bio_parent, %rax */
	0x00, 0x00, 0x00,
	0x48, 0x85, 0xc0,		/* test %rax, %rax */
	0x75, 0xf1,			/* jne 1b */
	0x48, 0x83, 0x7a, 0x58,		/* cmp $0, bp->bio_caller1 */
	0x00,
	0x75, 0x0d,			/* jne 2f */
	0x65, 0x48, 0x8b, 0x04,		/* mov %gs:0, %rax */
	0x25, 0x00, 0x00, 0x00,
	0x00,
	0x48, 0x89, 0x42, 0x58,		/* mov %rax, bp->bio_caller1 */
	/* 2: */
	/*
	 * STORE_SIZE bytes expected at the start of g_io_request():
	 * push %rbp; mov %rsp, %rbp; push %r14
	 */
	0x55, 0x48, 0x89, 0xe5, 0x41, 0x56,
	0xe9, 0x00, 0x00, 0x00, 0x00,	/* jmp back... */
};
1438 #else /* neither x86 nor amd64 */
1439 static void
g_new_io_request(struct bio * bp,struct g_consumer * cp)1440 g_new_io_request(struct bio *bp, struct g_consumer *cp)
1441 {
1442 	struct bio *top = bp;
1443 
1444         /*
1445          * bio classification: if bio_caller1 is available in the
1446          * root of the 'struct bio' tree, store there the thread id
1447          * of the thread that originated the request.
1448          * More sophisticated classification schemes can be used.
1449          */
1450 	while (top->bio_parent)
1451 		top = top->bio_parent;
1452 
1453 	if (top->bio_caller1 == NULL)
1454 		top->bio_caller1 = curthread;
1455 }
1456 
1457 #error please add the code above in g_new_io_request() to the beginning of \
1458 	/sys/geom/geom_io.c::g_io_request(), and remove this line.
1459 #endif /* end of arch-specific code */
1460 
1461 static int
g_ioreq_patch(void)1462 g_ioreq_patch(void)
1463 {
1464 	u_char *original;
1465 	u_long ofs;
1466 	int found;
1467 
1468 	if (me.gs_patched)
1469 		return (-1);
1470 
1471 	original = (u_char *)g_io_request;
1472 
1473 	found = !bcmp(original, fake_ioreq + CODE_SIZE, STORE_SIZE);
1474 	if (!found)
1475 		return (-1);
1476 
1477 	/* Jump back to the original + STORE_SIZE. */
1478 	ofs = (original + STORE_SIZE) - (fake_ioreq + SIZE);
1479 	bcopy(&ofs, fake_ioreq + CODE_SIZE + STORE_SIZE + 1, 4);
1480 
1481 	/* Patch the original address with a jump to the trampoline. */
1482 	*original = 0xe9;     /* jump opcode */
1483 	ofs = fake_ioreq - (original + 5);
1484 	bcopy(&ofs, original + 1, 4);
1485 
1486 	me.gs_patched = 1;
1487 
1488 	return (0);
1489 }
1490 
1491 /*
1492  * Restore the original code, this is easy.
1493  */
1494 static void
g_ioreq_restore(void)1495 g_ioreq_restore(void)
1496 {
1497 	u_char *original;
1498 
1499 	if (me.gs_patched) {
1500 		original = (u_char *)g_io_request;
1501 		bcopy(fake_ioreq + CODE_SIZE, original, STORE_SIZE);
1502 		me.gs_patched = 0;
1503 	}
1504 }
1505 
/*
 * Enable request classification on old kernels by patching
 * g_io_request(); the outcome of the patch attempt is ignored.
 */
static inline void
g_classifier_ini(void)
{

	(void)g_ioreq_patch();
}
1512 
/*
 * Disable request classification on old kernels: restore
 * g_io_request() if it was patched.
 */
static inline void
g_classifier_fini(void)
{

	g_ioreq_restore();
}
1519 
1520 /*--- end of support code for older FreeBSD versions */
1521 
1522 #else /* HAVE_BIO_CLASSIFIER */
1523 
1524 /*
1525  * Classifier support for recent FreeBSD versions: we use
1526  * a very simple classifier, only use curthread to tag a request.
1527  * The classifier is registered at module load, and unregistered
1528  * at module unload.
1529  */
1530 static int
g_sched_tag(void * arg,struct bio * bp)1531 g_sched_tag(void *arg, struct bio *bp)
1532 {
1533 
1534 	bp->bio_classifier1 = curthread;
1535 	return (1);
1536 }
1537 
1538 static struct g_classifier_hook g_sched_classifier = {
1539 	.func =	g_sched_tag,
1540 };
1541 
1542 static inline void
g_classifier_ini(void)1543 g_classifier_ini(void)
1544 {
1545 
1546 	g_register_classifier(&g_sched_classifier);
1547 }
1548 
1549 static inline void
g_classifier_fini(void)1550 g_classifier_fini(void)
1551 {
1552 
1553 	g_unregister_classifier(&g_sched_classifier);
1554 }
1555 #endif /* HAVE_BIO_CLASSIFIER */
1556 
1557 static void
g_sched_init(struct g_class * mp)1558 g_sched_init(struct g_class *mp)
1559 {
1560 
1561 	g_gsched_global_init();
1562 
1563 	G_SCHED_DEBUG(0, "Loading: mp = %p, g_sched_class = %p.",
1564 	    mp, &g_sched_class);
1565 
1566 	/* Patch g_io_request to store classification info in the bio. */
1567 	g_classifier_ini();
1568 }
1569 
1570 static void
g_sched_fini(struct g_class * mp)1571 g_sched_fini(struct g_class *mp)
1572 {
1573 
1574 	g_classifier_fini();
1575 
1576 	G_SCHED_DEBUG(0, "Unloading...");
1577 
1578 	KASSERT(LIST_EMPTY(&me.gs_scheds), ("still registered schedulers"));
1579 	mtx_destroy(&me.gs_mtx);
1580 }
1581 
1582 static int
g_sched_ioctl(struct g_provider * pp,u_long cmd,void * data,int fflag,struct thread * td)1583 g_sched_ioctl(struct g_provider *pp, u_long cmd, void *data, int fflag,
1584     struct thread *td)
1585 {
1586 	struct g_consumer *cp;
1587 	struct g_geom *gp;
1588 
1589 	cp = LIST_FIRST(&pp->geom->consumer);
1590 	if (cp == NULL)
1591 		return (ENOIOCTL);
1592 	gp = cp->provider->geom;
1593 	if (gp->ioctl == NULL)
1594 		return (ENOIOCTL);
1595 	return (gp->ioctl(cp->provider, cmd, data, fflag, td));
1596 }
1597 
1598 /*
1599  * Read the i-th argument for a request, skipping the /dev/
1600  * prefix if present.
1601  */
1602 static const char *
g_sched_argi(struct gctl_req * req,int i)1603 g_sched_argi(struct gctl_req *req, int i)
1604 {
1605 	static const char *dev_prefix = "/dev/";
1606 	const char *name;
1607 	char param[16];
1608 	int l = strlen(dev_prefix);
1609 
1610 	snprintf(param, sizeof(param), "arg%d", i);
1611 	name = gctl_get_asciiparam(req, param);
1612 	if (name == NULL)
1613 		gctl_error(req, "No 'arg%d' argument", i);
1614 	else if (strncmp(name, dev_prefix, l) == 0)
1615 		name += l;
1616 	return (name);
1617 }
1618 
1619 /*
1620  * Fetch nargs and do appropriate checks.
1621  */
1622 static int
g_sched_get_nargs(struct gctl_req * req)1623 g_sched_get_nargs(struct gctl_req *req)
1624 {
1625 	int *nargs;
1626 
1627 	nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
1628 	if (nargs == NULL) {
1629 		gctl_error(req, "No 'nargs' argument");
1630 		return (0);
1631 	}
1632 	if (*nargs <= 0)
1633 		gctl_error(req, "Missing device(s).");
1634 	return (*nargs);
1635 }
1636 
1637 /*
1638  * Check whether we should add the class on certain volumes when
1639  * this geom is created. Right now this is under control of a kenv
1640  * variable containing the names of all devices that we care about.
1641  * Probably we should only support transparent insertion as the
1642  * preferred mode of operation.
1643  */
1644 static struct g_geom *
g_sched_taste(struct g_class * mp,struct g_provider * pp,int flags __unused)1645 g_sched_taste(struct g_class *mp, struct g_provider *pp,
1646 		int flags __unused)
1647 {
1648 	struct g_gsched *gsp = NULL;	/* the . algorithm we want */
1649 	const char *s;			/* generic string pointer */
1650 	const char *taste_names;	/* devices we like */
1651 	int l;
1652 
1653         g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__,
1654 	    mp->name, pp->name);
1655         g_topology_assert();
1656 
1657         G_SCHED_DEBUG(2, "Tasting %s.", pp->name);
1658 
1659 	do {
1660 		/* do not taste on ourselves */
1661 		if (pp->geom->class == mp)
1662                 	break;
1663 
1664 		taste_names = getenv("geom.sched.taste");
1665 		if (taste_names == NULL)
1666 			break;
1667 
1668 		l = strlen(pp->name);
1669 		for (s = taste_names; *s &&
1670 		    (s = strstr(s, pp->name)); s++) {
1671 			/* further checks for an exact match */
1672 			if ( (s == taste_names || s[-1] == ' ') &&
1673 			     (s[l] == '\0' || s[l] == ' ') )
1674 				break;
1675 		}
1676 		if (s == NULL)
1677 			break;
1678 		G_SCHED_DEBUG(0, "Attach device %s match [%s]\n",
1679 		    pp->name, s);
1680 
1681 		/* look up the provider name in the list */
1682 		s = getenv("geom.sched.algo");
1683 		if (s == NULL)
1684 			s = "rr";
1685 
1686 		gsp = g_gsched_find(s);	/* also get a reference */
1687 		if (gsp == NULL) {
1688 			G_SCHED_DEBUG(0, "Bad '%s' algorithm.", s);
1689 			break;
1690 		}
1691 
1692 		/* XXX create with 1 as last argument ? */
1693 		g_sched_create(NULL, mp, pp, gsp, 0);
1694 		g_gsched_unref(gsp);
1695 	} while (0);
1696 	return NULL;
1697 }
1698 
1699 static void
g_sched_ctl_create(struct gctl_req * req,struct g_class * mp,int proxy)1700 g_sched_ctl_create(struct gctl_req *req, struct g_class *mp, int proxy)
1701 {
1702 	struct g_provider *pp;
1703 	struct g_gsched *gsp;
1704 	const char *name;
1705 	int i, nargs;
1706 
1707 	g_topology_assert();
1708 
1709 	name = gctl_get_asciiparam(req, "algo");
1710 	if (name == NULL) {
1711 		gctl_error(req, "No '%s' argument", "algo");
1712 		return;
1713 	}
1714 
1715 	gsp = g_gsched_find(name);	/* also get a reference */
1716 	if (gsp == NULL) {
1717 		gctl_error(req, "Bad algorithm '%s'", name);
1718 		return;
1719 	}
1720 
1721 	nargs = g_sched_get_nargs(req);
1722 
1723 	/*
1724 	 * Run on the arguments, and break on any error.
1725 	 * We look for a device name, but skip the /dev/ prefix if any.
1726 	 */
1727 	for (i = 0; i < nargs; i++) {
1728 		name = g_sched_argi(req, i);
1729 		if (name == NULL)
1730 			break;
1731 		pp = g_provider_by_name(name);
1732 		if (pp == NULL) {
1733 			G_SCHED_DEBUG(1, "Provider %s is invalid.", name);
1734 			gctl_error(req, "Provider %s is invalid.", name);
1735 			break;
1736 		}
1737 		if (g_sched_create(req, mp, pp, gsp, proxy) != 0)
1738 			break;
1739 	}
1740 
1741 	g_gsched_unref(gsp);
1742 }
1743 
1744 static void
g_sched_ctl_configure(struct gctl_req * req,struct g_class * mp)1745 g_sched_ctl_configure(struct gctl_req *req, struct g_class *mp)
1746 {
1747 	struct g_provider *pp;
1748 	struct g_gsched *gsp;
1749 	const char *name;
1750 	int i, nargs;
1751 
1752 	g_topology_assert();
1753 
1754 	name = gctl_get_asciiparam(req, "algo");
1755 	if (name == NULL) {
1756 		gctl_error(req, "No '%s' argument", "algo");
1757 		return;
1758 	}
1759 
1760 	gsp = g_gsched_find(name);	/* also get a reference */
1761 	if (gsp == NULL) {
1762 		gctl_error(req, "Bad algorithm '%s'", name);
1763 		return;
1764 	}
1765 
1766 	nargs = g_sched_get_nargs(req);
1767 
1768 	/*
1769 	 * Run on the arguments, and break on any error.
1770 	 * We look for a device name, but skip the /dev/ prefix if any.
1771 	 */
1772 	for (i = 0; i < nargs; i++) {
1773 		name = g_sched_argi(req, i);
1774 		if (name == NULL)
1775 			break;
1776 		pp = g_provider_by_name(name);
1777 		if (pp == NULL || pp->geom->class != mp) {
1778 			G_SCHED_DEBUG(1, "Provider %s is invalid.", name);
1779 			gctl_error(req, "Provider %s is invalid.", name);
1780 			break;
1781 		}
1782 		if (g_sched_change_algo(req, mp, pp, gsp) != 0)
1783 			break;
1784 	}
1785 
1786 	g_gsched_unref(gsp);
1787 }
1788 
1789 static struct g_geom *
g_sched_find_geom(struct g_class * mp,const char * name)1790 g_sched_find_geom(struct g_class *mp, const char *name)
1791 {
1792 	struct g_geom *gp;
1793 
1794 	LIST_FOREACH(gp, &mp->geom, geom) {
1795 		if (strcmp(gp->name, name) == 0)
1796 			return (gp);
1797 	}
1798 	return (NULL);
1799 }
1800 
1801 static void
g_sched_ctl_destroy(struct gctl_req * req,struct g_class * mp)1802 g_sched_ctl_destroy(struct gctl_req *req, struct g_class *mp)
1803 {
1804 	int nargs, *force, error, i;
1805 	struct g_geom *gp;
1806 	const char *name;
1807 
1808 	g_topology_assert();
1809 
1810 	nargs = g_sched_get_nargs(req);
1811 
1812 	force = gctl_get_paraml(req, "force", sizeof(*force));
1813 	if (force == NULL) {
1814 		gctl_error(req, "No 'force' argument");
1815 		return;
1816 	}
1817 
1818 	for (i = 0; i < nargs; i++) {
1819 		name = g_sched_argi(req, i);
1820 		if (name == NULL)
1821 			break;
1822 
1823 		gp = g_sched_find_geom(mp, name);
1824 		if (gp == NULL) {
1825 			G_SCHED_DEBUG(1, "Device %s is invalid.", name);
1826 			gctl_error(req, "Device %s is invalid.", name);
1827 			break;
1828 		}
1829 
1830 		error = g_sched_destroy(gp, *force);
1831 		if (error != 0) {
1832 			gctl_error(req, "Cannot destroy device %s (error=%d).",
1833 			    gp->name, error);
1834 			break;
1835 		}
1836 	}
1837 }
1838 
1839 static void
g_sched_config(struct gctl_req * req,struct g_class * mp,const char * verb)1840 g_sched_config(struct gctl_req *req, struct g_class *mp, const char *verb)
1841 {
1842 	uint32_t *version;
1843 
1844 	g_topology_assert();
1845 
1846 	version = gctl_get_paraml(req, "version", sizeof(*version));
1847 	if (version == NULL) {
1848 		gctl_error(req, "No '%s' argument.", "version");
1849 		return;
1850 	}
1851 
1852 	if (*version != G_SCHED_VERSION) {
1853 		gctl_error(req, "Userland and kernel parts are "
1854 		    "out of sync.");
1855 		return;
1856 	}
1857 
1858 	if (strcmp(verb, "create") == 0) {
1859 		g_sched_ctl_create(req, mp, 0);
1860 		return;
1861 	} else if (strcmp(verb, "insert") == 0) {
1862 		g_sched_ctl_create(req, mp, 1);
1863 		return;
1864 	} else if (strcmp(verb, "configure") == 0) {
1865 		g_sched_ctl_configure(req, mp);
1866 		return;
1867 	} else if (strcmp(verb, "destroy") == 0) {
1868 		g_sched_ctl_destroy(req, mp);
1869 		return;
1870 	}
1871 
1872 	gctl_error(req, "Unknown verb.");
1873 }
1874 
1875 static void
g_sched_dumpconf(struct sbuf * sb,const char * indent,struct g_geom * gp,struct g_consumer * cp,struct g_provider * pp)1876 g_sched_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
1877     struct g_consumer *cp, struct g_provider *pp)
1878 {
1879 	struct g_sched_softc *sc = gp->softc;
1880 	struct g_gsched *gsp = sc->sc_gsched;
1881 	if (indent == NULL) {	/* plaintext */
1882 		sbuf_printf(sb, " algo %s", gsp ? gsp->gs_name : "--");
1883 	}
1884 	if (gsp != NULL && gsp->gs_dumpconf)
1885 		gsp->gs_dumpconf(sb, indent, gp, cp, pp);
1886 }
1887 
/* Declare the g_sched GEOM class and version 0 of the geom_sched module. */
DECLARE_GEOM_CLASS(g_sched_class, g_sched);
MODULE_VERSION(geom_sched, 0);
1890