geom_sched/000755 000423 000000 00000000000 11360647061 013415 5ustar00luigiwheel000000 000000 geom_sched/sys/000755 000423 000000 00000000000 11154477443 014241 5ustar00luigiwheel000000 000000 geom_sched/geom_sched.c000644 000423 000000 00000006607 11360617522 015666 0ustar00luigiwheel000000 000000 /*- * Copyright (c) 2009 Fabio Checconi, Luigi Rizzo * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * $Id$ * $FreeBSD$ * * This file implements the userspace library used by the 'geom' * command to load and manipulate disk schedulers. */ #include #include #include #include #include #include #include #include "core/geom.h" #include "misc/subr.h" #define G_SCHED_VERSION 0 uint32_t lib_version = G_LIB_VERSION; uint32_t version = G_SCHED_VERSION; /* * storage for parameters used by this geom class. * Right now only the scheduler name is used. */ static char algo[] = "rr"; /* default scheduler */ /* * Adapt to differences in geom library. * in V1 struct g_command misses gc_argname, eld, and G_BOOL is undefined */ #if G_LIB_VERSION == 1 #define G_ARGNAME #define G_TYPE_BOOL G_TYPE_NUMBER #else #define G_ARGNAME NULL, #endif static void gcmd_createinsert(struct gctl_req *req, unsigned flags __unused) { const char *reqalgo; char name[64]; if (gctl_has_param(req, "algo")) reqalgo = gctl_get_ascii(req, "algo"); else reqalgo = algo; snprintf(name, sizeof(name), "gsched_%s", reqalgo); /* * Do not complain about errors here, gctl_issue() * will fail anyway. */ if (modfind(name) < 0) kldload(name); gctl_issue(req); } struct g_command class_commands[] = { { "create", G_FLAG_VERBOSE | G_FLAG_LOADKLD, gcmd_createinsert, { { 'a', "algo", algo, G_TYPE_STRING }, G_OPT_SENTINEL }, G_ARGNAME "[-v] [-a algorithm_name] dev ..." }, { "insert", G_FLAG_VERBOSE | G_FLAG_LOADKLD, gcmd_createinsert, { { 'a', "algo", algo, G_TYPE_STRING }, G_OPT_SENTINEL }, G_ARGNAME "[-v] [-a algorithm_name] dev ..." }, { "configure", G_FLAG_VERBOSE, NULL, { { 'a', "algo", algo, G_TYPE_STRING }, G_OPT_SENTINEL }, G_ARGNAME "[-v] [-a algorithm_name] prov ..." }, { "destroy", G_FLAG_VERBOSE, NULL, { { 'f', "force", NULL, G_TYPE_BOOL }, G_OPT_SENTINEL }, G_ARGNAME "[-fv] prov ..." }, { "reset", G_FLAG_VERBOSE, NULL, G_NULL_OPTS, G_ARGNAME "[-v] prov ..." 
}, G_CMD_SENTINEL }; geom_sched/gsched.8000644 000423 000000 00000011056 11360647320 014744 0ustar00luigiwheel000000 000000 .\" Copyright (c) 2009-2010 Fabio Checconi, Luigi Rizzo .\" All rights reserved. .\" $FreeBSD$ .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" .\" THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" .Dd March 6, 2009 .Dt GSCHED 8 .Os .Sh NAME .Nm gsched .Nd "control utility for disk scheduler GEOM class" .Sh SYNOPSIS .Nm .Cm create .Op Fl v .Op Fl a Ar algorithm .Ar provider ... .Nm .Cm insert .Op Fl v .Op Fl a Ar algorithm .Ar provider ... .Nm .Cm configure .Op Fl v .Op Fl a Ar algorithm .Ar node ... .Nm .Cm destroy .Op Fl fv .Ar node ... .Nm .Cm reset .Op Fl v .Ar node ... .Nm .Cm { list | status | load | unload } .Sh DESCRIPTION The .Nm utility (also callable as .Nm geom sched ... ) changes the scheduling policy of the requests going to a provider. .Pp The first argument to .Nm indicates an action to be performed: .Bl -tag -width ".Cm configure" .It Cm create Create a new provider and geom node using the specified scheduling algorithm. .Ar algorithm is the name of the scheduling algorithm used for the provider. Available algorithms include: .Ar rr , which implements anticipatory scheduling with round robin service among clients; .Ar as , which implements a simple form of anticipatory scheduling with no per-client queue. .Pp If the operation succeeds, the new provider should appear with name .Pa /dev/ Ns Ao Ar dev Ac Ns Pa .sched. . The kernel module .Pa geom_sched.ko will be loaded if it is not loaded already. .It Cm insert Operates as "create", but the insertion is "transparent", i.e. the existing provider is rerouted to the newly created geom, which in turn forwards requests to the existing geom. This operation allows one to start/stop a scheduling service on an already existing provider. .Pp A subsequent 'destroy' will remove the newly created geom and hook the provider back to the original geom. .Ar algorithm .It Cm configure Configure existing scheduling provider. It supports the same options as the .Nm create command. .It Cm destroy Destroy the geom specified in the parameter. .It Cm reset Do nothing. .It Cm list | status | load | unload See .Xr geom 8 . .El .Pp Additional options: .Bl -tag -width ".Fl f" .It Fl f Force the removal of the specified provider. 
.It Fl v Be more verbose. .El .Sh SYSCTL VARIABLES The following .Xr sysctl 8 variables can be used to control the behavior of the .Nm SCHED GEOM class. The default value is shown next to each variable. .Bl -tag -width indent .It Va kern.geom.sched.debug : No 0 Debug level of the .Nm SCHED GEOM class. This can be set to a number between 0 and 2 inclusive. If set to 0 minimal debug information is printed, and if set to 2 the maximum amount of debug information is printed. .El .Sh EXIT STATUS Exit status is 0 on success, and 1 if the command fails. .Sh EXAMPLES The following example shows how to create a scheduling provider for disk .Pa /dev/ad0 , and how to destroy it. .Bd -literal -offset indent # Load the geom_sched module: kldload geom_sched # Load some scheduler classes used by geom_sched: kldload gsched_rr gsched_as # Configure device ad0 to use scheduler 'rr': geom sched insert -a rr ad0 # Now provider ad0 uses the 'rr' algorithm; # the new geom is ad0.sched. # Remove the scheduler on the device: geom sched destroy -v ad0.sched. .Ed .Pp .Sh SEE ALSO .Xr geom 4 , .Xr geom 8 .Sh HISTORY The .Nm utility appeared in April 2010. .Sh AUTHORS .An Fabio Checconi Aq fabio@FreeBSD.org .An Luigi Rizzo Aq luigi@FreeBSD.org geom_sched/Makefile000644 000423 000000 00000000471 11154477443 015065 0ustar00luigiwheel000000 000000 # GEOM_LIBRARY_PATH # $FreeBSD: $ .PATH: /usr/src/sbin/geom/misc CFLAGS += -I/usr/src/sbin/geom CLASS=sched WARNS?= 6 CLASS_DIR?=/lib/geom SHLIBDIR?=${CLASS_DIR} SHLIB_NAME?=geom_${CLASS}.so LINKS= ${BINDIR}/geom ${BINDIR}/g${CLASS} MAN= g${CLASS}.8 SRCS+= geom_${CLASS}.c subr.c .include geom_sched/sys/geom/000755 000423 000000 00000000000 11222101576 015153 5ustar00luigiwheel000000 000000 geom_sched/sys/modules/000755 000423 000000 00000000000 11154477443 015711 5ustar00luigiwheel000000 000000 geom_sched/sys/modules/geom/000755 000423 000000 00000000000 11222101576 016623 5ustar00luigiwheel000000 000000 geom_sched/sys/modules/geom/geom_sched/000755 000423 000000 00000000000 11247450343 020726 5ustar00luigiwheel000000 000000 geom_sched/sys/modules/geom/geom_sched/gs_sched/000755 000423 000000 00000000000 11360647061 022506 5ustar00luigiwheel000000 000000 geom_sched/sys/modules/geom/geom_sched/gsched_as/000755 000423 000000 00000000000 11360647061 022647 5ustar00luigiwheel000000 000000 geom_sched/sys/modules/geom/geom_sched/gsched_rr/000755 000423 000000 00000000000 11360647061 022667 5ustar00luigiwheel000000 000000 geom_sched/sys/modules/geom/geom_sched/gsched_ssd/000755 000423 000000 00000000000 11360647247 023043 5ustar00luigiwheel000000 000000 geom_sched/sys/modules/geom/geom_sched/gsched_bfq/000755 000423 000000 00000000000 11360647061 023014 5ustar00luigiwheel000000 000000 geom_sched/sys/modules/geom/geom_sched/Makefile.inc000644 000423 000000 00000000253 11154477443 023145 0ustar00luigiwheel000000 000000 # included by geom_sched children .PATH: ${.CURDIR}/../../../../geom/sched # 6.x needs this path CFLAGS += -I${.CURDIR}/../../../../geom/sched # .include geom_sched/sys/modules/geom/geom_sched/Makefile000644 000423 000000 00000000300 11360647272 022364 0ustar00luigiwheel000000 000000 # $FreeBSD: $ SUBDIR= gs_sched gsched_rr #SUBDIR += gsched_as .if defined(WITH_BFQ) SUBDIR += gsched_bfq .endif .if defined(WITH_SSD) SUBDIR += gsched_ssd .endif .include geom_sched/sys/modules/geom/geom_sched/gsched_bfq/Makefile000644 000423 000000 00000000264 11247450342 024454 0ustar00luigiwheel000000 000000 # $FreeBSD: $ KMOD= gsched_bfq SRCS= gs_bfq.c # hash.h on 6.x has a (char *)
cast on a const pointer #CWARNFLAGS= # ../Makefile.inc automatically included .include geom_sched/sys/modules/geom/geom_sched/gsched_ssd/Makefile000644 000423 000000 00000000264 11154477443 024505 0ustar00luigiwheel000000 000000 # $FreeBSD: $ KMOD= gsched_ssd SRCS= gs_ssd.c # hash.h on 6.x has a (char *) cast on a const pointer #CWARNFLAGS= # ../Makefile.inc automatically included .include geom_sched/sys/modules/geom/geom_sched/gsched_rr/Makefile000644 000423 000000 00000000262 11154477443 024335 0ustar00luigiwheel000000 000000 # $FreeBSD: $ KMOD= gsched_rr SRCS= gs_rr.c # hash.h on 6.x has a (char *) cast on a const pointer #CWARNFLAGS= # ../Makefile.inc automatically included .include geom_sched/sys/modules/geom/geom_sched/gsched_as/Makefile000644 000423 000000 00000000156 11154477443 024317 0ustar00luigiwheel000000 000000 # $FreeBSD: $ KMOD= gsched_as SRCS= gs_as.c # ../Makefile.inc automatically included .include geom_sched/sys/modules/geom/geom_sched/gs_sched/Makefile000644 000423 000000 00000000200 11163114112 024121 0ustar00luigiwheel000000 000000 # $FreeBSD: $ KMOD= geom_sched SRCS= g_sched.c subr_disk.c # ../Makefile.inc automatically included .include geom_sched/sys/geom/sched/000755 000423 000000 00000000000 11247450342 016246 5ustar00luigiwheel000000 000000 geom_sched/sys/geom/sched/gs_scheduler.h000644 000423 000000 00000016307 11360626514 021077 0ustar00luigiwheel000000 000000 /*- * Copyright (c) 2009-2010 Fabio Checconi, Luigi Rizzo * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * $Id$ * $FreeBSD$ * * Prototypes for GEOM-based disk scheduling algorithms. * See g_sched.c for generic documentation. * * This file is used by the kernel modules implementing the various * scheduling algorithms. They should provide all the methods * defined in struct g_gsched, and also invoke the macro * DECLARE_GSCHED_MODULE * which registers the scheduling algorithm with the geom_sched module. * * The various scheduling algorithms do not need to know anything * about geom, they only need to handle the 'bio' requests they * receive, pass them down when needed, and use the locking interface * defined below. 
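 *
 * As an illustration only (a minimal sketch: the "foo" name and the
 * g_foo_* callbacks are hypothetical placeholders), a scheduler module
 * typically ends with its descriptor and the registration macro, in the
 * same way gs_ssd.c does for the "ssd" algorithm:
 *
 *	static struct g_gsched g_foo = {
 *		.gs_name = "foo",
 *		.gs_priv_size = sizeof(struct g_foo_queue),
 *		.gs_init = g_foo_init,
 *		.gs_fini = g_foo_fini,
 *		.gs_start = g_foo_start,
 *		.gs_done = g_foo_done,
 *		.gs_next = g_foo_next,
 *		.gs_init_class = g_foo_init_class,
 *		.gs_fini_class = g_foo_fini_class,
 *	};
 *	DECLARE_GSCHED_MODULE(foo, &g_foo);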
*/ #ifndef _G_GSCHED_H_ #define _G_GSCHED_H_ #ifdef _KERNEL #include #include #include #include #include #include #include "g_sched.h" /* * This is the interface exported to scheduling modules. * * gs_init() is called when our scheduling algorithm * starts being used by a geom 'sched' * * gs_fini() is called when the algorithm is released. * * gs_start() is called when a new request comes in. It should * enqueue the request and return 0 if success, or return non-zero * in case of failure (meaning the request is passed down). * The scheduler can use bio->bio_caller1 to store a non-null * pointer meaning the request is under its control. * * gs_next() is called in a loop by g_sched_dispatch(), right after * gs_start(), or on timeouts or 'done' events. It should return * immediately, either a pointer to the bio to be served or NULL * if no bio should be served now. If force is specified, a * work-conserving behavior is expected. * * gs_done() is called when a request under service completes. * In turn the scheduler may decide to call the dispatch loop * to serve other pending requests (or make sure there is a pending * timeout to avoid stalls). * * gs_init_class() is called when a new client (as determined by * the classifier) starts being used. * * gs_hash_unref() is called right before the class hashtable is * destroyed; after this call, the scheduler is supposed to hold no * more references to the elements in the table. */ /* Forward declarations for prototypes. */ struct g_geom; struct g_sched_class; typedef void *gs_init_t (struct g_geom *geom); typedef void gs_fini_t (void *data); typedef int gs_start_t (void *data, struct bio *bio); typedef void gs_done_t (void *data, struct bio *bio); typedef struct bio *gs_next_t (void *data, int force); typedef int gs_init_class_t (void *data, void *priv); typedef void gs_fini_class_t (void *data, void *priv); typedef void gs_hash_unref_t (void *data); struct g_gsched { const char *gs_name; int gs_refs; int gs_priv_size; gs_init_t *gs_init; gs_fini_t *gs_fini; gs_start_t *gs_start; gs_done_t *gs_done; gs_next_t *gs_next; g_dumpconf_t *gs_dumpconf; gs_init_class_t *gs_init_class; gs_fini_class_t *gs_fini_class; gs_hash_unref_t *gs_hash_unref; LIST_ENTRY(g_gsched) glist; }; #define KTR_GSCHED KTR_SPARE4 MALLOC_DECLARE(M_GEOM_SCHED); /* * Basic classification mechanism. Each request is associated to * a g_sched_class, and each scheduler has the opportunity to set * its own private data for the given (class, geom) pair. The * private data have a base type of g_sched_private, and are * extended at the end with the actual private fields of each * scheduler. */ struct g_sched_class { int gsc_refs; int gsc_expire; u_long gsc_key; LIST_ENTRY(g_sched_class) gsc_clist; void *gsc_priv[0]; }; /* * Manipulate the classifier's data. g_sched_get_class() gets a reference * to the the class corresponding to bp in gp, allocating and initializing * it if necessary. g_sched_put_class() releases the reference. * The returned value points to the private data for the class. */ void *g_sched_get_class(struct g_geom *gp, struct bio *bp); void g_sched_put_class(struct g_geom *gp, void *priv); static inline struct g_sched_class * g_sched_priv2class(void *priv) { return ((struct g_sched_class *)((u_long)priv - offsetof(struct g_sched_class, gsc_priv))); } static inline void g_sched_priv_ref(void *priv) { struct g_sched_class *gsc; gsc = g_sched_priv2class(priv); gsc->gsc_refs++; } /* * Locking interface. 
When each operation registered with the * scheduler is invoked, a per-instance lock is taken to protect * the data associated with it. If the scheduler needs something * else to access the same data (e.g., a callout) it must use * these functions. */ void g_sched_lock(struct g_geom *gp); void g_sched_unlock(struct g_geom *gp); /* * Restart request dispatching. Must be called with the per-instance * mutex held. */ void g_sched_dispatch(struct g_geom *geom); /* * Simple gathering of statistical data, used by schedulers to collect * info on process history. Just keep an exponential average of the * samples, with some extra bits of precision. */ struct g_savg { uint64_t gs_avg; unsigned int gs_smpl; }; static inline void g_savg_add_sample(struct g_savg *ss, uint64_t sample) { /* EMA with alpha = 0.125, fixed point, 3 bits of precision. */ ss->gs_avg = sample + ss->gs_avg - (ss->gs_avg >> 3); ss->gs_smpl = 1 + ss->gs_smpl - (ss->gs_smpl >> 3); } static inline int g_savg_valid(struct g_savg *ss) { /* We want at least 8 samples to deem an average as valid. */ return (ss->gs_smpl > 7); } static inline uint64_t g_savg_read(struct g_savg *ss) { return (ss->gs_avg / ss->gs_smpl); } /* * Declaration of a scheduler module. */ int g_gsched_modevent(module_t mod, int cmd, void *arg); #define DECLARE_GSCHED_MODULE(name, gsched) \ static moduledata_t name##_mod = { \ #name, \ g_gsched_modevent, \ gsched, \ }; \ DECLARE_MODULE(name, name##_mod, SI_SUB_DRIVERS, SI_ORDER_MIDDLE); \ MODULE_DEPEND(name, geom_sched, 0, 0, 0); #endif /* _KERNEL */ #endif /* _G_GSCHED_H_ */ geom_sched/sys/geom/sched/g_sched.h000644 000423 000000 00000011456 11360626233 020022 0ustar00luigiwheel000000 000000 /*- * Copyright (c) 2009-2010 Fabio Checconi, Luigi Rizzo * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef _G_SCHED_H_ #define _G_SCHED_H_ /* * $Id$ * $FreeBSD$ * * Header for the geom_sched class (userland library and kernel part). * See g_sched.c for documentation. * The userland code only needs the three G_SCHED_* values below. */ #define G_SCHED_CLASS_NAME "SCHED" #define G_SCHED_VERSION 0 #define G_SCHED_SUFFIX ".sched." #ifdef _KERNEL #define G_SCHED_DEBUG(lvl, ...) 
do { \ if (me.gs_debug >= (lvl)) { \ printf("GEOM_SCHED"); \ if (me.gs_debug > 0) \ printf("[%u]", lvl); \ printf(": "); \ printf(__VA_ARGS__); \ printf("\n"); \ } \ } while (0) #define G_SCHED_LOGREQ(bp, ...) do { \ if (me.gs_debug >= 2) { \ printf("GEOM_SCHED[2]: "); \ printf(__VA_ARGS__); \ printf(" "); \ g_print_bio(bp); \ printf("\n"); \ } \ } while (0) LIST_HEAD(g_hash, g_sched_class); /* * Descriptor of a scheduler. * In addition to the obvious fields, sc_flushing and sc_pending * support dynamic switching of scheduling algorithm. * Normally, sc_flushing is 0, and requests that are scheduled are * also added to the sc_pending queue, and removed when we receive * the 'done' event. * * When we are transparently inserted on an existing provider, * sc_proxying is set. The detach procedure is slightly different. * * When switching schedulers, sc_flushing is set so requests bypass us, * and at the same time we update the pointer in the pending bios * to ignore us when they return up. * XXX it would be more efficient to implement sc_pending with * a generation number: the softc generation is increased when * we change scheduling algorithm, we store the current generation * number in the pending bios, and when they come back we ignore * the done() call if the generation number do not match. */ struct g_sched_softc { /* * Generic fields used by any scheduling algorithm: * a mutex, the class descriptor, flags, list of pending * requests (used when flushing the module) and support * for hash tables where we store per-flow queues. */ struct mtx sc_mtx; struct g_gsched *sc_gsched; /* Scheduler descriptor. */ int sc_pending; /* Pending requests. */ int sc_flags; /* Various flags. */ /* * Hash tables to store per-flow queues are generally useful * so we handle them in the common code. * sc_hash and sc_mask are parameters of the hash table, * the last two fields are used to periodically remove * expired items from the hash table. */ struct g_hash *sc_hash; u_long sc_mask; int sc_flush_ticks; /* Next tick for a flush. */ int sc_flush_bucket; /* Next bucket to flush. */ /* * Pointer to the algorithm's private data, which is the value * returned by sc_gsched->gs_init() . A NULL here means failure. * XXX intptr_t might be more appropriate. */ void *sc_data; }; #define G_SCHED_PROXYING 1 #define G_SCHED_FLUSHING 2 /* * Temporary- our own version of the disksort, because the * version in 7.x and 8.x before march 2009 is buggy. */ void gs_bioq_init(struct bio_queue_head *); void gs_bioq_remove(struct bio_queue_head *, struct bio *); void gs_bioq_flush(struct bio_queue_head *, struct devstat *, int); void gs_bioq_insert_head(struct bio_queue_head *, struct bio *); void gs_bioq_insert_tail(struct bio_queue_head *, struct bio *); struct bio *gs_bioq_first(struct bio_queue_head *); struct bio *gs_bioq_takefirst(struct bio_queue_head *); void gs_bioq_disksort(struct bio_queue_head *, struct bio *); #endif /* _KERNEL */ #endif /* _G_SCHED_H_ */ geom_sched/sys/geom/sched/gs_ssd.c000644 000423 000000 00000032373 11360626542 017707 0ustar00luigiwheel000000 000000 /*- * Copyright (c) 2009-2010 Fabio Checconi, Luigi Rizzo, Paolo Valente * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * $Id$ * $FreeBSD$ * * This code implements a round-robin anticipatory scheduler, with * per-client queues. * The goal of this scheduler is to give weighted-fair service to * devices where seek times are not relevant (e.g. flash memories). * In these environments, and in presence of synchronous requests * as typically issued by disk clients, we might be unable to achieve * weighted-fair service because flows are never backlogged after * their request has been served. * We use anticipation to simulate an arrival rate that approximates the * weight of each flow. In detail: * * The system is organized in rounds, during which each flow can receive * an amount of service proportional to its weight. * For each flow we keep the amount of service received so far, * and also the 'round' in which it should be served first. * * The system maintains a 'current round' index, and three round-robin * lists of flows: * CURR: flows with pending requests for the current round, * NEXT: flows with pending requests that cannot be served in this round; * STAGING: flows which received service in the current round but have * no pending requests. * The scheduler has three states: READY, BUSY, IDLING. * * Initially, all flows start idle and with a service=0. * * When a request arrives for an idle flow, it is put at the end of * CURR or NEXT depending on the amount of service * received so far. Requests for busy flows are simply appended to the * list of pending requests for the same flow. * We then generate a 'dispatch' event. * * On a dispatch event: * if state == BUSY, just return; otherwise, * if CURR is not empty, serve the first request from it; * if CURR is empty, state is READY (i.e. not IDLING), and NEXT * contains entries, the current round number is incremented, and * we call dispatch again; * otherwise, just return. * * When a flow is served, the scheduler goes into BUSY state, stops a * timer if any, charges the flow for the request, and depending on * residual requests and the amount of work received, the flow * - remains at the head of CURR if it has pending requests * and is still within the budget; * - is moved to the tail of the NEXT queue if it has pending * requests but it has exhausted its budget; * - is moved to the STAGING queue if it has no more pending requests. 
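 *
 * (Illustrative example based on the defaults below: with slice_kb at
 * its default of 8192 the per-flow budget is 8 MiB per round, so a flow
 * that is charged 8 MiB of service in the current round and still has
 * pending requests moves to the tail of NEXT and is served again only
 * after the current round number is incremented.)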
* * When a service request is completed: * if CURR is not empty, or both CURR and STAGING are empty, * change state to READY and call dispatch; * otherwise (CURR is empty but STAGING is not empty), set state to IDLING, * start a timer and return; * * When a timer expires, drain the STAGING queue, change state to READY * and call dispatch * */ #include #include #include #include #include #include #include #include #include #include #include #include "gs_scheduler.h" struct g_ssd_softc; /* * Per client queue structure. Each client in the system is * represented by this structure where we store the client's * requests. */ struct g_ssd_queue { struct g_ssd_softc *q_sc; /* reference to the parent */ struct bio_queue_head q_bioq; unsigned int q_service; unsigned int q_budget; int q_round; int q_wait_end; /* idling window end, in ticks */ LIST_ENTRY(g_ssd_queue) q_staging; /* staging list link field */ TAILQ_ENTRY(g_ssd_queue) q_tailq; /* RR list link field */ }; /* List types. */ TAILQ_HEAD(g_ssd_tailq, g_ssd_queue); LIST_HEAD(g_ssd_staging, g_ssd_queue); /* Default slice for RR between queues. */ #define G_SSD_DEFAULT_BUDGET 0x00800000 /* * Per device descriptor, holding the Round Robin list of queues * accessing the disk, a reference to the geom, the timer * and the hash table where we store the existing entries. */ struct g_ssd_softc { struct g_geom *sc_geom; int sc_nqueues; /* Number of active queues. */ struct callout sc_wait; /* Timer for sc_active. */ struct g_ssd_tailq sc_ssd_tailq; /* The round-robin list. */ struct g_ssd_staging sc_staging; /* List of staging queues. */ int sc_round; int sc_next_round; int sc_in_flight; /* Requests in the driver. */ int sc_wait_ticks; int sc_waiting; }; /* Descriptor for bounded values. */ struct x_bound { int x_min; int x_cur; int x_max; }; /* * Parameters, config and stats. */ struct g_ssd_params { int queue_depth; /* Max nr. of parallel requests. */ int units; /* How many instances. */ int queues; /* Total number of queues. */ int qrefs; /* Total number of refs to queues. */ struct x_bound wait_ms; /* Wait time in milliseconds. */ struct x_bound slice_kb; /* slice size in Kb (1024 bytes) */ }; static struct g_ssd_params me = { .queue_depth = 8, .wait_ms = { 1, 10, 30}, .slice_kb = { 16, 8192, 65536}, }; SYSCTL_DECL(_kern_geom_sched); SYSCTL_NODE(_kern_geom_sched, OID_AUTO, ssd, CTLFLAG_RW, 0, "GEOM_SCHED ANTICIPATORY SSD stuff"); SYSCTL_UINT(_kern_geom_sched_ssd, OID_AUTO, units, CTLFLAG_RD, &me.units, 0, "Scheduler instances"); SYSCTL_UINT(_kern_geom_sched_ssd, OID_AUTO, queues, CTLFLAG_RD, &me.queues, 0, "Total ssd queues"); SYSCTL_UINT(_kern_geom_sched_ssd, OID_AUTO, wait_ms, CTLFLAG_RW, &me.wait_ms.x_cur, 0, "Wait time milliseconds"); SYSCTL_UINT(_kern_geom_sched_ssd, OID_AUTO, slice_kb, CTLFLAG_RW, &me.slice_kb.x_cur, 0, "Slice size Kbytes"); SYSCTL_UINT(_kern_geom_sched_ssd, OID_AUTO, queue_depth, CTLFLAG_RW, &me.queue_depth, 0, "Maximum simultaneous requests"); /* * Get a bounded value, optionally convert to a min of t_min ticks. */ static int get_bounded(struct x_bound *v, int t_min) { int x; x = v->x_cur; if (x < v->x_min) x = v->x_min; else if (x > v->x_max) x = v->x_max; if (t_min) { x = x * hz / 1000; /* convert to ticks */ if (x < t_min) x = t_min; } return (x); } /* * Get a reference to the queue that holds requests for tp, allocating * it if necessary. 
*/ static int g_ssd_init_class(void *data, void *priv, struct thread *tp) { struct g_ssd_softc *sc = data; struct g_ssd_queue *qp = priv; qp->q_sc = sc; gs_bioq_init(&qp->q_bioq); /* compute the slice size in bytes */ qp->q_budget = 1024 * get_bounded(&me.slice_kb, 0); qp->q_sc->sc_nqueues++; me.queues++; return (0); } static struct g_ssd_queue * g_ssd_queue_get(struct g_ssd_softc *sc, struct bio *bp) { return (g_sched_get_class(sc->sc_geom, bp)); } /* * Release a reference to the queue. */ static void g_ssd_queue_put(struct g_ssd_queue *qp) { g_sched_put_class(qp->q_sc->sc_geom, qp); } /* * Release a reference to the queue. */ static void g_ssd_fini_class(void *data, void *priv) { struct g_ssd_queue *qp = priv; KASSERT(gs_bioq_first(&qp->q_bioq) == NULL, ("released nonempty queue")); qp->q_sc->sc_nqueues--; me.queues--; } static inline int g_ssd_queue_expired(struct g_ssd_queue *qp) { return (qp->q_service >= qp->q_budget); } static inline int g_ssd_next_round(struct g_ssd_softc *sc, struct g_ssd_queue *qp) { return (qp->q_round == sc->sc_round + 1); } static void g_ssd_cleanup_staging(struct g_ssd_softc *sc, int force) { struct g_ssd_queue *qp, *qp2; LIST_FOREACH_SAFE(qp, &sc->sc_staging, q_staging, qp2) { if ((ticks - qp->q_wait_end) >= 0 || force) { qp->q_service = 0; LIST_REMOVE(qp, q_staging); g_ssd_queue_put(qp); } } } /* * Callout executed when a queue times out waiting for a new request. */ static void g_ssd_wait_timeout(void *data) { struct g_ssd_softc *sc = data; struct g_geom *geom = sc->sc_geom; g_sched_lock(geom); sc->sc_waiting = 0; g_ssd_cleanup_staging(sc, 1); g_sched_dispatch(geom); g_sched_unlock(geom); } /* * called on a request arrival, timeout or completion. * Try to serve a request among those queued. */ static struct bio * g_ssd_next(void *data, int force) { struct g_ssd_softc *sc = data; struct g_ssd_queue *qp; struct bio *bp, *next; g_ssd_cleanup_staging(sc, force); if (!force && sc->sc_in_flight >= me.queue_depth) return (NULL); qp = TAILQ_FIRST(&sc->sc_ssd_tailq); if (qp == NULL) return (NULL); if (!LIST_EMPTY(&sc->sc_staging) && g_ssd_next_round(sc, qp)) { if (!sc->sc_waiting) { callout_reset(&sc->sc_wait, sc->sc_wait_ticks, g_ssd_wait_timeout, sc); sc->sc_waiting = 1; } return (NULL); } /* Select the new queue for service. */ sc->sc_round = qp->q_round; bp = gs_bioq_takefirst(&qp->q_bioq); qp->q_service += bp->bio_length; next = gs_bioq_first(&qp->q_bioq); if (g_ssd_queue_expired(qp)) { TAILQ_REMOVE(&sc->sc_ssd_tailq, qp, q_tailq); qp->q_service = 0; if (next) { qp->q_round++; sc->sc_next_round = qp->q_round; TAILQ_INSERT_TAIL(&sc->sc_ssd_tailq, qp, q_tailq); } else g_ssd_queue_put(qp); } else if (!next) { TAILQ_REMOVE(&sc->sc_ssd_tailq, qp, q_tailq); if (!force) { qp->q_wait_end = ticks + 30 * sc->sc_wait_ticks; LIST_INSERT_HEAD(&sc->sc_staging, qp, q_staging); } else g_ssd_queue_put(qp); } sc->sc_in_flight++; return (bp); } /* * Called when a real request for disk I/O arrives. * Locate the queue associated with the client, and dispatch * immediately if it cannot be found. * If the queue is the one we are anticipating for, reset its timeout; * if the queue is not in the round robin list, insert it in the list. * Finally, call dispatch. */ static int g_ssd_start(void *data, struct bio *bp) { struct g_ssd_softc *sc = data; struct g_ssd_queue *qp; /* Get the queue for the thread that issued the request. 
*/ qp = g_ssd_queue_get(sc, bp); if (qp == NULL) return (-1); /* allocation failed, tell upstream */ if (gs_bioq_first(&qp->q_bioq) == NULL) { /* * We are inserting into an empty queue; check whether * this is the one for which we are doing anticipation, * in which case stop the timer. * Otherwise insert the queue in the rr list. */ if (sc->sc_round == qp->q_round && qp->q_service != 0) { LIST_REMOVE(qp, q_staging); callout_stop(&sc->sc_wait); sc->sc_waiting = 0; TAILQ_INSERT_HEAD(&sc->sc_ssd_tailq, qp, q_tailq); } else { /* * ... this is the first request, we need to * activate the queue. */ g_sched_priv_ref(qp); qp->q_round = sc->sc_next_round; TAILQ_INSERT_TAIL(&sc->sc_ssd_tailq, qp, q_tailq); KASSERT(qp->q_service == 0, ("invalid service")); } } /* * Each request holds a reference to the queue containing it: * inherit the "caller" one. */ bp->bio_caller1 = qp; gs_bioq_disksort(&qp->q_bioq, bp); return (0); } /* * Module glue -- allocate descriptor, initialize the hash table and * the callout structure. */ static void * g_ssd_init(struct g_geom *geom) { struct g_ssd_softc *sc; sc = malloc(sizeof *sc, M_GEOM_SCHED, M_WAITOK | M_ZERO); sc->sc_geom = geom; TAILQ_INIT(&sc->sc_ssd_tailq); LIST_INIT(&sc->sc_staging); callout_init(&sc->sc_wait, CALLOUT_MPSAFE); sc->sc_wait_ticks = get_bounded(&me.wait_ms, 2); me.units++; return (sc); } /* * Module glue -- drain the callout structure, destroy the * hash table and its element, and free the descriptor. */ static void g_ssd_fini(void *data) { struct g_ssd_softc *sc = data; callout_drain(&sc->sc_wait); g_ssd_cleanup_staging(sc, 1); KASSERT(TAILQ_EMPTY(&sc->sc_ssd_tailq), ("still scheduled queues")); KASSERT(LIST_EMPTY(&sc->sc_staging), ("still staging queues")); me.units--; free(sc, M_GEOM_SCHED); } /* * Called when the request under service terminates. */ static void g_ssd_done(void *data, struct bio *bp) { struct g_ssd_softc *sc; struct g_ssd_queue *qp; sc = data; sc->sc_in_flight--; qp = bp->bio_caller1; qp->q_wait_end = ticks + sc->sc_wait_ticks; g_ssd_queue_put(qp); if (!sc->sc_in_flight) g_sched_dispatch(sc->sc_geom); } static struct g_gsched g_ssd = { .gs_name = "ssd", .gs_priv_size = sizeof(struct g_ssd_queue), .gs_init = g_ssd_init, .gs_fini = g_ssd_fini, .gs_start = g_ssd_start, .gs_done = g_ssd_done, .gs_next = g_ssd_next, .gs_init_class = g_ssd_init_class, .gs_fini_class = g_ssd_fini_class, }; DECLARE_GSCHED_MODULE(ssd, &g_ssd); geom_sched/sys/geom/sched/g_sched.c000644 000423 000000 00000131777 11360634237 020031 0ustar00luigiwheel000000 000000 /*- * Copyright (c) 2009-2010 Fabio Checconi, Luigi Rizzo * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * $Id$ * $FreeBSD$ * * Main control module for geom-based disk schedulers ('sched'). * * USER VIEW * A 'sched' node is typically inserted transparently between * an existing provider pp and its original geom gp * * [pp --> gp ..] * * using the command "geom sched insert " and * resulting in the following topology * * [pp --> sched_gp --> cp] [new_pp --> gp ... ] * * Deletion "geom sched destroy .sched." restores the * original chain. The normal "geom sched create " * is also supported. * * INTERNALS * Internally, the 'sched' uses the following data structures * * geom{} g_sched_softc{} g_gsched{} * +----------+ +---------------+ +-------------+ * | softc *-|--->| sc_gsched *-|-->| gs_init | * | ... | | | | gs_fini | * | | | [ hash table] | | gs_start | * +----------+ | | | ... | * | | +-------------+ * | | * | | g_*_softc{} * | | +-------------+ * | sc_data *-|-->| | * +---------------+ | algorithm- | * | specific | * +-------------+ * * A g_sched_softc{} is created with a "geom sched insert" call. * In turn this instantiates a specific scheduling algorithm, * which sets sc_gsched to point to the algorithm callbacks, * and calls gs_init() to create the g_*_softc{} . * The other callbacks (gs_start, gs_next, ...) are invoked * as needed * * g_sched_softc{} is defined in g_sched.h and mostly used here; * g_gsched{}, and the gs_callbacks, are documented in gs_scheduler.h; * g_*_softc{} is defined/implemented by each algorithm (gs_*.c) * * DATA MOVING * When a bio is received on the provider, it goes to the * g_sched_start() which calls gs_start() to initially queue it; * then we call g_sched_dispatch() that loops around gs_next() * to select zero or more bio's to be sent downstream. * * g_sched_dispatch() can also be called as a result of a timeout, * e.g. when doing anticipation or pacing requests. * * When a bio comes back, it goes to g_sched_done() which in turn * calls gs_done(). The latter does any necessary housekeeping in * the scheduling algorithm, and may decide to call g_sched_dispatch() * to send more bio's downstream. * * If an algorithm needs per-flow queues, these are created * calling gs_init_class() and destroyed with gs_fini_class(), * and they are also inserted in the hash table implemented in * the g_sched_softc{} * * If an algorithm is replaced, or a transparently-inserted node is * removed with "geom sched destroy", we need to remove all references * to the g_*_softc{} and g_sched_softc from the bio's still in * the scheduler. g_sched_forced_dispatch() helps doing this. * XXX need to explain better. */ #include #include #include #include #include #include #include #include #include #include #include #include #include /* we access curthread */ #include #include "gs_scheduler.h" #include "g_sched.h" /* geom hooks */ /* * Size of the per-geom hash table storing traffic classes. * We may decide to change it at a later time, it has no ABI * implications as it is only used for run-time allocations. 
*/ #define G_SCHED_HASH_SIZE 32 static int g_sched_destroy(struct g_geom *gp, boolean_t force); static int g_sched_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp); static void g_sched_config(struct gctl_req *req, struct g_class *mp, const char *verb); static struct g_geom *g_sched_taste(struct g_class *mp, struct g_provider *pp, int flags __unused); static void g_sched_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp); static void g_sched_init(struct g_class *mp); static void g_sched_fini(struct g_class *mp); struct g_class g_sched_class = { .name = G_SCHED_CLASS_NAME, .version = G_VERSION, .ctlreq = g_sched_config, .taste = g_sched_taste, .destroy_geom = g_sched_destroy_geom, .init = g_sched_init, .fini = g_sched_fini }; MALLOC_DEFINE(M_GEOM_SCHED, "GEOM_SCHED", "Geom schedulers data structures"); /* * Global variables describing the state of the geom_sched module. * There is only one static instance of this structure. */ LIST_HEAD(gs_list, g_gsched); /* type, link field */ struct geom_sched_vars { struct mtx gs_mtx; struct gs_list gs_scheds; /* list of algorithms */ u_int gs_debug; u_int gs_sched_count; /* how many algorithms ? */ u_int gs_patched; /* g_io_request was patched */ u_int gs_initialized; u_int gs_expire_secs; /* expiration of hash entries */ struct bio_queue_head gs_pending; u_int gs_npending; /* The following are for stats, usually protected by gs_mtx. */ u_long gs_requests; /* total requests */ u_long gs_done; /* total done */ u_int gs_in_flight; /* requests in flight */ u_int gs_writes_in_flight; u_int gs_bytes_in_flight; u_int gs_write_bytes_in_flight; char gs_names[256]; /* names of schedulers */ }; static struct geom_sched_vars me = { .gs_expire_secs = 10, }; SYSCTL_DECL(_kern_geom); SYSCTL_NODE(_kern_geom, OID_AUTO, sched, CTLFLAG_RW, 0, "GEOM_SCHED stuff"); SYSCTL_INT(_kern_geom_sched, OID_AUTO, in_flight_wb, CTLFLAG_RD, &me.gs_write_bytes_in_flight, 0, "Write bytes in flight"); SYSCTL_INT(_kern_geom_sched, OID_AUTO, in_flight_b, CTLFLAG_RD, &me.gs_bytes_in_flight, 0, "Bytes in flight"); SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight_w, CTLFLAG_RD, &me.gs_writes_in_flight, 0, "Write Requests in flight"); SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight, CTLFLAG_RD, &me.gs_in_flight, 0, "Requests in flight"); SYSCTL_ULONG(_kern_geom_sched, OID_AUTO, done, CTLFLAG_RD, &me.gs_done, 0, "Total done"); SYSCTL_ULONG(_kern_geom_sched, OID_AUTO, requests, CTLFLAG_RD, &me.gs_requests, 0, "Total requests"); SYSCTL_STRING(_kern_geom_sched, OID_AUTO, algorithms, CTLFLAG_RD, &me.gs_names, 0, "Algorithm names"); SYSCTL_UINT(_kern_geom_sched, OID_AUTO, alg_count, CTLFLAG_RD, &me.gs_sched_count, 0, "Number of algorithms"); SYSCTL_UINT(_kern_geom_sched, OID_AUTO, debug, CTLFLAG_RW, &me.gs_debug, 0, "Debug level"); SYSCTL_UINT(_kern_geom_sched, OID_AUTO, expire_secs, CTLFLAG_RW, &me.gs_expire_secs, 0, "Expire time in seconds"); /* * g_sched calls the scheduler algorithms with this lock held. * The locking functions are exposed so the scheduler algorithms can also * protect themselves e.g. when running a callout handler. */ void g_sched_lock(struct g_geom *gp) { struct g_sched_softc *sc = gp->softc; mtx_lock(&sc->sc_mtx); } void g_sched_unlock(struct g_geom *gp) { struct g_sched_softc *sc = gp->softc; mtx_unlock(&sc->sc_mtx); } /* * Support functions to handle references to the module, * which are coming from devices using this scheduler. 
*/ static inline void g_gsched_ref(struct g_gsched *gsp) { atomic_add_int(&gsp->gs_refs, 1); } static inline void g_gsched_unref(struct g_gsched *gsp) { atomic_add_int(&gsp->gs_refs, -1); } /* * Update the stats when this request is done. */ static void g_sched_update_stats(struct bio *bio) { me.gs_done++; me.gs_in_flight--; me.gs_bytes_in_flight -= bio->bio_length; if (bio->bio_cmd & BIO_WRITE) { me.gs_writes_in_flight--; me.gs_write_bytes_in_flight -= bio->bio_length; } } /* * Dispatch any pending request. */ static void g_sched_forced_dispatch(struct g_geom *gp) { struct g_sched_softc *sc = gp->softc; struct g_gsched *gsp = sc->sc_gsched; struct bio *bp; KASSERT(mtx_owned(&sc->sc_mtx), ("sc_mtx not owned during forced dispatch")); while ((bp = gsp->gs_next(sc->sc_data, 1)) != NULL) g_io_request(bp, LIST_FIRST(&gp->consumer)); } /* * The main dispatch loop, called either here after the start * routine, or by scheduling algorithms when they receive a timeout * or a 'done' notification. Does not share code with the forced * dispatch path, since the gs_done() callback can call us. */ void g_sched_dispatch(struct g_geom *gp) { struct g_sched_softc *sc = gp->softc; struct g_gsched *gsp = sc->sc_gsched; struct bio *bp; KASSERT(mtx_owned(&sc->sc_mtx), ("sc_mtx not owned during dispatch")); if ((sc->sc_flags & G_SCHED_FLUSHING)) return; while ((bp = gsp->gs_next(sc->sc_data, 0)) != NULL) g_io_request(bp, LIST_FIRST(&gp->consumer)); } /* * Recent (8.0 and above) versions of FreeBSD have support to * register classifiers of disk requests. The classifier is * invoked by g_io_request(), and stores the information into * bp->bio_classifier1. * * Support for older versions, which is left here only for * documentation purposes, relies on two hacks: * 1. classification info is written into the bio_caller1 * field of the topmost node in the bio chain. This field * is rarely used, but this module is incompatible with * those that use bio_caller1 for other purposes, * such as ZFS and gjournal; * 2. g_io_request() is patched in-memory when the module is * loaded, so that the function calls a classifier as its * first thing. g_io_request() is restored when the module * is unloaded. This functionality is only supported for * x86 and amd64, other architectures need source code changes. */ /* * Lookup the identity of the issuer of the original request. * In the current implementation we use the curthread of the * issuer, but different mechanisms may be implemented later * so we do not make assumptions on the return value which for * us is just an opaque identifier. */ static inline u_long g_sched_classify(struct bio *bp) { #if __FreeBSD_version > 800098 /* we have classifier fields in the struct bio */ #define HAVE_BIO_CLASSIFIER return ((u_long)bp->bio_classifier1); #else #warning old version!!! while (bp->bio_parent != NULL) bp = bp->bio_parent; return ((u_long)bp->bio_caller1); #endif } /* Return the hash chain for the given key. */ static inline struct g_hash * g_sched_hash(struct g_sched_softc *sc, u_long key) { return (&sc->sc_hash[key & sc->sc_mask]); } /* * Helper function for the children classes, which takes * a geom and a bio and returns the private descriptor * associated to the request. This involves fetching * the classification field and [al]locating the * corresponding entry in the hash table. 
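 *
 * Usage sketch (modeled on how gs_ssd.c wraps this pair; "sc" and "qp"
 * here stand for the algorithm's own softc and per-flow queue):
 *
 *	qp = g_sched_get_class(sc->sc_geom, bp);
 *	if (qp == NULL)
 *		return (-1);	(allocation failed, the bio is passed down)
 *	...
 *	g_sched_put_class(sc->sc_geom, qp);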
*/ void * g_sched_get_class(struct g_geom *gp, struct bio *bp) { struct g_sched_softc *sc; struct g_sched_class *gsc; struct g_gsched *gsp; struct g_hash *bucket; u_long key; sc = gp->softc; key = g_sched_classify(bp); bucket = g_sched_hash(sc, key); LIST_FOREACH(gsc, bucket, gsc_clist) { if (key == gsc->gsc_key) { gsc->gsc_refs++; return (gsc->gsc_priv); } } gsp = sc->sc_gsched; gsc = malloc(sizeof(*gsc) + gsp->gs_priv_size, M_GEOM_SCHED, M_NOWAIT | M_ZERO); if (!gsc) return (NULL); if (gsp->gs_init_class(sc->sc_data, gsc->gsc_priv)) { free(gsc, M_GEOM_SCHED); return (NULL); } gsc->gsc_refs = 2; /* 1 for the hash table, 1 for the caller. */ gsc->gsc_key = key; LIST_INSERT_HEAD(bucket, gsc, gsc_clist); gsc->gsc_expire = ticks + me.gs_expire_secs * hz; return (gsc->gsc_priv); } /* * Release a reference to the per-client descriptor, */ void g_sched_put_class(struct g_geom *gp, void *priv) { struct g_sched_class *gsc; struct g_sched_softc *sc; gsc = g_sched_priv2class(priv); gsc->gsc_expire = ticks + me.gs_expire_secs * hz; if (--gsc->gsc_refs > 0) return; sc = gp->softc; sc->sc_gsched->gs_fini_class(sc->sc_data, priv); LIST_REMOVE(gsc, gsc_clist); free(gsc, M_GEOM_SCHED); } static void g_sched_hash_fini(struct g_geom *gp, struct g_hash *hp, u_long mask, struct g_gsched *gsp, void *data) { struct g_sched_class *cp, *cp2; int i; if (!hp) return; if (data && gsp->gs_hash_unref) gsp->gs_hash_unref(data); for (i = 0; i < G_SCHED_HASH_SIZE; i++) { LIST_FOREACH_SAFE(cp, &hp[i], gsc_clist, cp2) g_sched_put_class(gp, cp->gsc_priv); } hashdestroy(hp, M_GEOM_SCHED, mask); } static struct g_hash * g_sched_hash_init(struct g_gsched *gsp, u_long *mask, int flags) { struct g_hash *hash; if (gsp->gs_priv_size == 0) return (NULL); hash = hashinit_flags(G_SCHED_HASH_SIZE, M_GEOM_SCHED, mask, flags); return (hash); } static void g_sched_flush_classes(struct g_geom *gp) { struct g_sched_softc *sc; struct g_sched_class *cp, *cp2; int i; sc = gp->softc; if (!sc->sc_hash || ticks - sc->sc_flush_ticks <= 0) return; for (i = 0; i < G_SCHED_HASH_SIZE; i++) { LIST_FOREACH_SAFE(cp, &sc->sc_hash[i], gsc_clist, cp2) { if (cp->gsc_refs == 1 && ticks - cp->gsc_expire > 0) g_sched_put_class(gp, cp->gsc_priv); } } sc->sc_flush_ticks = ticks + me.gs_expire_secs * hz; } /* * Wait for the completion of any outstanding request. To ensure * that this does not take forever the caller has to make sure that * no new request enter the scehduler before calling us. * * Must be called with the gp mutex held and topology locked. */ static int g_sched_wait_pending(struct g_geom *gp) { struct g_sched_softc *sc = gp->softc; int endticks = ticks + hz; g_topology_assert(); while (sc->sc_pending && endticks - ticks >= 0) msleep(gp, &sc->sc_mtx, 0, "sched_wait_pending", hz / 4); return (sc->sc_pending ? ETIMEDOUT : 0); } static int g_sched_remove_locked(struct g_geom *gp, struct g_gsched *gsp) { struct g_sched_softc *sc = gp->softc; int error; /* Set the flushing flag: new bios will not enter the scheduler. */ sc->sc_flags |= G_SCHED_FLUSHING; g_sched_forced_dispatch(gp); error = g_sched_wait_pending(gp); if (error) goto failed; /* No more requests pending or in flight from the old gsp. */ g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask, gsp, sc->sc_data); sc->sc_hash = NULL; /* * Avoid deadlock here by releasing the gp mutex and reacquiring * it once done. 
It should be safe, since no reconfiguration or * destruction can take place due to the geom topology lock; no * new request can use the current sc_data since we flagged the * geom as being flushed. */ g_sched_unlock(gp); gsp->gs_fini(sc->sc_data); g_sched_lock(gp); sc->sc_gsched = NULL; sc->sc_data = NULL; g_gsched_unref(gsp); failed: sc->sc_flags &= ~G_SCHED_FLUSHING; return (error); } static int g_sched_remove(struct g_geom *gp, struct g_gsched *gsp) { int error; g_sched_lock(gp); error = g_sched_remove_locked(gp, gsp); /* gsp is surely non-null */ g_sched_unlock(gp); return (error); } /* * Support function for create/taste -- locate the desired * algorithm and grab a reference to it. */ static struct g_gsched * g_gsched_find(const char *name) { struct g_gsched *gsp = NULL; mtx_lock(&me.gs_mtx); LIST_FOREACH(gsp, &me.gs_scheds, glist) { if (strcmp(name, gsp->gs_name) == 0) { g_gsched_ref(gsp); break; } } mtx_unlock(&me.gs_mtx); return (gsp); } /* * Rebuild the list of scheduler names. * To be called with me.gs_mtx lock held. */ static void g_gsched_build_names(struct g_gsched *gsp) { int pos, l; struct g_gsched *cur; pos = 0; LIST_FOREACH(cur, &me.gs_scheds, glist) { l = strlen(cur->gs_name); if (l + pos + 1 + 1 < sizeof(me.gs_names)) { if (pos != 0) me.gs_names[pos++] = ' '; strcpy(me.gs_names + pos, cur->gs_name); pos += l; } } me.gs_names[pos] = '\0'; } /* * Register or unregister individual scheduling algorithms. */ static int g_gsched_register(struct g_gsched *gsp) { struct g_gsched *cur; int error = 0; mtx_lock(&me.gs_mtx); LIST_FOREACH(cur, &me.gs_scheds, glist) { if (strcmp(gsp->gs_name, cur->gs_name) == 0) break; } if (cur != NULL) { G_SCHED_DEBUG(0, "A scheduler named %s already" "exists.", gsp->gs_name); error = EEXIST; } else { LIST_INSERT_HEAD(&me.gs_scheds, gsp, glist); gsp->gs_refs = 1; me.gs_sched_count++; g_gsched_build_names(gsp); } mtx_unlock(&me.gs_mtx); return (error); } struct g_gsched_unregparm { struct g_gsched *gup_gsp; int gup_error; }; static void g_gsched_unregister(void *arg, int flag) { struct g_gsched_unregparm *parm = arg; struct g_gsched *gsp = parm->gup_gsp, *cur, *tmp; struct g_sched_softc *sc; struct g_geom *gp, *gp_tmp; int error; parm->gup_error = 0; g_topology_assert(); if (flag == EV_CANCEL) return; mtx_lock(&me.gs_mtx); LIST_FOREACH_SAFE(gp, &g_sched_class.geom, geom, gp_tmp) { if (gp->class != &g_sched_class) continue; /* Should not happen. */ sc = gp->softc; if (sc->sc_gsched == gsp) { error = g_sched_remove(gp, gsp); if (error) goto failed; } } LIST_FOREACH_SAFE(cur, &me.gs_scheds, glist, tmp) { if (cur != gsp) continue; if (gsp->gs_refs != 1) { G_SCHED_DEBUG(0, "%s still in use.", gsp->gs_name); parm->gup_error = EBUSY; } else { LIST_REMOVE(gsp, glist); me.gs_sched_count--; g_gsched_build_names(gsp); } break; } if (cur == NULL) { G_SCHED_DEBUG(0, "%s not registered.", gsp->gs_name); parm->gup_error = ENOENT; } failed: mtx_unlock(&me.gs_mtx); } static inline void g_gsched_global_init(void) { if (!me.gs_initialized) { G_SCHED_DEBUG(0, "Initializing global data."); mtx_init(&me.gs_mtx, "gsched", NULL, MTX_DEF); LIST_INIT(&me.gs_scheds); gs_bioq_init(&me.gs_pending); me.gs_initialized = 1; } } /* * Module event called when a scheduling algorithm module is loaded or * unloaded. 
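 *
 * On MOD_LOAD the algorithm is registered and the SCHED class is
 * re-tasted; on MOD_UNLOAD the unregistration is run as a GEOM event
 * (g_gsched_unregister), which first removes the algorithm from any
 * geom still using it and reports EBUSY if it is still referenced.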
*/ int g_gsched_modevent(module_t mod, int cmd, void *arg) { struct g_gsched *gsp = arg; struct g_gsched_unregparm parm; int error; G_SCHED_DEBUG(0, "Modevent %d.", cmd); /* * If the module is loaded at boot, the geom thread that calls * g_sched_init() might actually run after g_gsched_modevent(), * so make sure that the module is properly initialized. */ g_gsched_global_init(); error = EOPNOTSUPP; switch (cmd) { case MOD_LOAD: error = g_gsched_register(gsp); G_SCHED_DEBUG(0, "Loaded module %s error %d.", gsp->gs_name, error); if (error == 0) g_retaste(&g_sched_class); break; case MOD_UNLOAD: parm.gup_gsp = gsp; parm.gup_error = 0; error = g_waitfor_event(g_gsched_unregister, &parm, M_WAITOK, NULL); if (error == 0) error = parm.gup_error; G_SCHED_DEBUG(0, "Unloaded module %s error %d.", gsp->gs_name, error); break; }; return (error); } #ifdef KTR #define TRC_BIO_EVENT(e, bp) g_sched_trace_bio_ ## e (bp) static inline int g_sched_issuer_pid(struct bio *bp) { struct thread *thread = g_sched_issuer(bp); return (thread->td_tid); } static inline char g_sched_type(struct bio *bp) { if (0 != (bp->bio_cmd & BIO_READ)) return ('R'); else if (0 != (bp->bio_cmd & BIO_WRITE)) return ('W'); return ('U'); } static inline void g_sched_trace_bio_START(struct bio *bp) { CTR5(KTR_GSCHED, "S %d %c %lu/%lu %lu", g_sched_issuer_pid(bp), g_sched_type(bp), bp->bio_offset / ULONG_MAX, bp->bio_offset, bp->bio_length); } static inline void g_sched_trace_bio_DONE(struct bio *bp) { CTR5(KTR_GSCHED, "D %d %c %lu/%lu %lu", g_sched_issuer_pid(bp), g_sched_type(bp), bp->bio_offset / ULONG_MAX, bp->bio_offset, bp->bio_length); } #else #define TRC_BIO_EVENT(e, bp) #endif /* * g_sched_done() and g_sched_start() dispatch the geom requests to * the scheduling algorithm in use. */ static void g_sched_done(struct bio *bio) { struct g_geom *gp = bio->bio_caller2; struct g_sched_softc *sc = gp->softc; TRC_BIO_EVENT(DONE, bio); KASSERT(bio->bio_caller1, ("null bio_caller1 in g_sched_done")); g_sched_lock(gp); g_sched_update_stats(bio); sc->sc_gsched->gs_done(sc->sc_data, bio); if (!--sc->sc_pending) wakeup(gp); g_sched_flush_classes(gp); g_sched_unlock(gp); g_std_done(bio); } static void g_sched_start(struct bio *bp) { struct g_geom *gp = bp->bio_to->geom; struct g_sched_softc *sc = gp->softc; struct bio *cbp; TRC_BIO_EVENT(START, bp); G_SCHED_LOGREQ(bp, "Request received."); cbp = g_clone_bio(bp); if (cbp == NULL) { g_io_deliver(bp, ENOMEM); return; } cbp->bio_done = g_sched_done; cbp->bio_to = LIST_FIRST(&gp->provider); KASSERT(cbp->bio_to != NULL, ("NULL provider")); /* We only schedule reads and writes. */ if (0 == (bp->bio_cmd & (BIO_READ | BIO_WRITE))) goto bypass; G_SCHED_LOGREQ(cbp, "Sending request."); g_sched_lock(gp); /* * Call the algorithm's gs_start to queue the request in the * scheduler. If gs_start fails then pass the request down, * otherwise call g_sched_dispatch() which tries to push * one or more requests down. */ if (!sc->sc_gsched || (sc->sc_flags & G_SCHED_FLUSHING) || sc->sc_gsched->gs_start(sc->sc_data, cbp)) { g_sched_unlock(gp); goto bypass; } /* * We use bio_caller1 to mark requests that are scheduled * so make sure it is not NULL. */ if (cbp->bio_caller1 == NULL) cbp->bio_caller1 = &me; /* anything not NULL */ cbp->bio_caller2 = gp; sc->sc_pending++; /* Update general stats. 
*/ me.gs_in_flight++; me.gs_requests++; me.gs_bytes_in_flight += bp->bio_length; if (bp->bio_cmd & BIO_WRITE) { me.gs_writes_in_flight++; me.gs_write_bytes_in_flight += bp->bio_length; } g_sched_dispatch(gp); g_sched_unlock(gp); return; bypass: cbp->bio_done = g_std_done; cbp->bio_caller1 = NULL; /* not scheduled */ g_io_request(cbp, LIST_FIRST(&gp->consumer)); } /* * The next few functions are the geom glue. */ static void g_sched_orphan(struct g_consumer *cp) { g_topology_assert(); g_sched_destroy(cp->geom, 1); } static int g_sched_access(struct g_provider *pp, int dr, int dw, int de) { struct g_geom *gp; struct g_consumer *cp; int error; gp = pp->geom; cp = LIST_FIRST(&gp->consumer); error = g_access(cp, dr, dw, de); return (error); } static void g_sched_temporary_start(struct bio *bio) { mtx_lock(&me.gs_mtx); me.gs_npending++; gs_bioq_disksort(&me.gs_pending, bio); mtx_unlock(&me.gs_mtx); } static void g_sched_flush_pending(g_start_t *start) { struct bio *bp; while ((bp = gs_bioq_takefirst(&me.gs_pending))) start(bp); } static int g_insert_proxy(struct g_geom *gp, struct g_provider *newpp, struct g_geom *dstgp, struct g_provider *pp, struct g_consumer *cp) { struct g_sched_softc *sc = gp->softc; g_start_t *saved_start, *flush = g_sched_start; int error = 0, endticks = ticks + hz; g_cancel_event(newpp); /* prevent taste() */ /* copy private fields */ newpp->private = pp->private; newpp->index = pp->index; /* Queue all the early requests coming for us. */ me.gs_npending = 0; saved_start = pp->geom->start; dstgp->start = g_sched_temporary_start; while (pp->nstart - pp->nend != me.gs_npending && endticks - ticks >= 0) tsleep(pp, PRIBIO, "-", hz/10); if (pp->nstart - pp->nend != me.gs_npending) { flush = saved_start; error = ETIMEDOUT; goto fail; } /* link pp to this geom */ LIST_REMOVE(pp, provider); pp->geom = gp; LIST_INSERT_HEAD(&gp->provider, pp, provider); /* * replicate the counts from the parent in the * new provider and consumer nodes */ cp->acr = newpp->acr = pp->acr; cp->acw = newpp->acw = pp->acw; cp->ace = newpp->ace = pp->ace; sc->sc_flags |= G_SCHED_PROXYING; fail: dstgp->start = saved_start; g_sched_flush_pending(flush); return (error); } /* * Create a geom node for the device passed as *pp. * If successful, add a reference to this gsp. */ static int g_sched_create(struct gctl_req *req, struct g_class *mp, struct g_provider *pp, struct g_gsched *gsp, int proxy) { struct g_sched_softc *sc = NULL; struct g_geom *gp, *dstgp; struct g_provider *newpp = NULL; struct g_consumer *cp = NULL; char name[64]; int error; g_topology_assert(); snprintf(name, sizeof(name), "%s%s", pp->name, G_SCHED_SUFFIX); LIST_FOREACH(gp, &mp->geom, geom) { if (strcmp(gp->name, name) == 0) { gctl_error(req, "Geom %s already exists.", name); return (EEXIST); } } gp = g_new_geomf(mp, name); dstgp = proxy ? pp->geom : gp; /* where do we link the provider */ if (gp == NULL) { gctl_error(req, "Cannot create geom %s.", name); error = ENOMEM; goto fail; } sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO); sc->sc_gsched = gsp; sc->sc_data = gsp->gs_init(gp); if (sc->sc_data == NULL) { error = ENOMEM; goto fail; } sc->sc_hash = g_sched_hash_init(gsp, &sc->sc_mask, HASH_WAITOK); /* * Do not initialize the flush mechanism, will be initialized * on the first insertion on the hash table. 
*/ mtx_init(&sc->sc_mtx, "g_sched_mtx", NULL, MTX_DEF); gp->softc = sc; gp->start = g_sched_start; gp->orphan = g_sched_orphan; gp->access = g_sched_access; gp->dumpconf = g_sched_dumpconf; newpp = g_new_providerf(dstgp, gp->name); if (newpp == NULL) { gctl_error(req, "Cannot create provider %s.", name); error = ENOMEM; goto fail; } newpp->mediasize = pp->mediasize; newpp->sectorsize = pp->sectorsize; cp = g_new_consumer(gp); if (cp == NULL) { gctl_error(req, "Cannot create consumer for %s.", gp->name); error = ENOMEM; goto fail; } error = g_attach(cp, proxy ? newpp : pp); if (error != 0) { gctl_error(req, "Cannot attach to provider %s.", pp->name); goto fail; } g_error_provider(newpp, 0); if (proxy) { error = g_insert_proxy(gp, newpp, dstgp, pp, cp); if (error) goto fail; } G_SCHED_DEBUG(0, "Device %s created.", gp->name); g_gsched_ref(gsp); return (0); fail: if (cp != NULL) { if (cp->provider != NULL) g_detach(cp); g_destroy_consumer(cp); } if (newpp != NULL) g_destroy_provider(newpp); if (sc && sc->sc_hash) { g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask, gsp, sc->sc_data); } if (sc && sc->sc_data) gsp->gs_fini(sc->sc_data); if (gp != NULL) { if (gp->softc != NULL) g_free(gp->softc); g_destroy_geom(gp); } return (error); } /* * Support for dynamic switching of scheduling algorithms. * First initialize the data structures for the new algorithm, * then call g_sched_remove_locked() to flush all references * to the old one, finally link the new algorithm. */ static int g_sched_change_algo(struct gctl_req *req, struct g_class *mp, struct g_provider *pp, struct g_gsched *gsp) { struct g_sched_softc *sc; struct g_geom *gp; struct g_hash *newh; void *data; u_long mask; int error = 0; gp = pp->geom; sc = gp->softc; data = gsp->gs_init(gp); if (data == NULL) return (ENOMEM); newh = g_sched_hash_init(gsp, &mask, HASH_WAITOK); if (gsp->gs_priv_size && !newh) { error = ENOMEM; goto fail; } g_sched_lock(gp); if (sc->sc_gsched) { /* can be NULL in some cases */ error = g_sched_remove_locked(gp, sc->sc_gsched); if (error) goto fail; } g_gsched_ref(gsp); sc->sc_gsched = gsp; sc->sc_data = data; sc->sc_hash = newh; sc->sc_mask = mask; g_sched_unlock(gp); return (0); fail: if (newh) g_sched_hash_fini(gp, newh, mask, gsp, data); if (data) gsp->gs_fini(data); g_sched_unlock(gp); return (error); } /* * Stop the request flow directed to the proxy, redirecting the new * requests to the me.gs_pending queue. 
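 *
 * This is the first half of the transparent ("insert") removal path:
 * g_sched_destroy() calls g_detach_proxy() to divert new bios to
 * g_sched_temporary_start(), drains the scheduler, and then calls
 * g_destroy_proxy(), which relinks the original provider and replays
 * the collected bios through g_sched_flush_pending().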
*/ static struct g_provider * g_detach_proxy(struct g_geom *gp) { struct g_consumer *cp; struct g_provider *pp, *newpp; do { pp = LIST_FIRST(&gp->provider); if (pp == NULL) break; cp = LIST_FIRST(&gp->consumer); if (cp == NULL) break; newpp = cp->provider; if (newpp == NULL) break; me.gs_npending = 0; pp->geom->start = g_sched_temporary_start; return (pp); } while (0); printf("%s error detaching proxy %s\n", __FUNCTION__, gp->name); return (NULL); } static void g_sched_blackhole(struct bio *bp) { g_io_deliver(bp, ENXIO); } static inline void g_reparent_provider(struct g_provider *pp, struct g_geom *gp, struct g_provider *newpp) { LIST_REMOVE(pp, provider); if (newpp) { pp->private = newpp->private; pp->index = newpp->index; } pp->geom = gp; LIST_INSERT_HEAD(&gp->provider, pp, provider); } static inline void g_unproxy_provider(struct g_provider *oldpp, struct g_provider *newpp) { struct g_geom *gp = oldpp->geom; g_reparent_provider(oldpp, newpp->geom, newpp); /* * Hackish: let the system destroy the old provider for us, just * in case someone attached a consumer to it, in which case a * direct call to g_destroy_provider() would not work. */ g_reparent_provider(newpp, gp, NULL); } /* * Complete the proxy destruction, linking the old provider to its * original geom, and destroying the proxy provider. Also take care * of issuing the pending requests collected in me.gs_pending (if any). */ static int g_destroy_proxy(struct g_geom *gp, struct g_provider *oldpp) { struct g_consumer *cp; struct g_provider *newpp; do { cp = LIST_FIRST(&gp->consumer); if (cp == NULL) break; newpp = cp->provider; if (newpp == NULL) break; /* Relink the provider to its original geom. */ g_unproxy_provider(oldpp, newpp); /* Detach consumer from provider, and destroy provider. */ cp->acr = newpp->acr = 0; cp->acw = newpp->acw = 0; cp->ace = newpp->ace = 0; g_detach(cp); /* Send the pending bios through the right start function. */ g_sched_flush_pending(oldpp->geom->start); return (0); } while (0); printf("%s error destroying proxy %s\n", __FUNCTION__, gp->name); /* We cannot send the pending bios anywhere... */ g_sched_flush_pending(g_sched_blackhole); return (EINVAL); } static int g_sched_destroy(struct g_geom *gp, boolean_t force) { struct g_provider *pp, *oldpp = NULL; struct g_sched_softc *sc; struct g_gsched *gsp; int error; g_topology_assert(); sc = gp->softc; if (sc == NULL) return (ENXIO); if (!(sc->sc_flags & G_SCHED_PROXYING)) { pp = LIST_FIRST(&gp->provider); if (pp && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) { const char *msg = force ? "but we force removal" : "cannot remove"; G_SCHED_DEBUG(!force, "Device %s is still open (r%dw%de%d), %s.", pp->name, pp->acr, pp->acw, pp->ace, msg); if (!force) return (EBUSY); } else { G_SCHED_DEBUG(0, "Device %s removed.", gp->name); } } else oldpp = g_detach_proxy(gp); gsp = sc->sc_gsched; if (gsp) { /* * XXX bad hack here: force a dispatch to release * any reference to the hash table still held by * the scheduler. */ g_sched_lock(gp); /* * We are dying here, no new requests should enter * the scheduler. This is granted by the topolgy, * either in case we were proxying (new bios are * being redirected) or not (see the access check * above). */ g_sched_forced_dispatch(gp); error = g_sched_wait_pending(gp); if (error) { /* * Not all the requests came home: this might happen * under heavy load, or if we were waiting for any * bio which is served in the event path (see * geom_slice.c for an example of how this can * happen). 
Try to restore a working configuration * if we can fail. */ if ((sc->sc_flags & G_SCHED_PROXYING) && oldpp) { g_sched_flush_pending(force ? g_sched_blackhole : g_sched_start); } /* * In the forced destroy case there is not so much * we can do, we have pending bios that will call * g_sched_done() somehow, and we don't want them * to crash the system using freed memory. We tell * the user that something went wrong, and leak some * memory here. * Note: the callers using force = 1 ignore the * return value. */ if (force) { G_SCHED_DEBUG(0, "Pending requests while " " destroying geom, some memory leaked."); } return (error); } g_sched_unlock(gp); g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask, gsp, sc->sc_data); sc->sc_hash = NULL; gsp->gs_fini(sc->sc_data); g_gsched_unref(gsp); sc->sc_gsched = NULL; } if ((sc->sc_flags & G_SCHED_PROXYING) && oldpp) { error = g_destroy_proxy(gp, oldpp); if (error) { if (force) { G_SCHED_DEBUG(0, "Unrecoverable error while " "destroying a proxy geom, leaking some " " memory."); } return (error); } } mtx_destroy(&sc->sc_mtx); g_free(gp->softc); gp->softc = NULL; g_wither_geom(gp, ENXIO); return (error); } static int g_sched_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp) { return (g_sched_destroy(gp, 0)); } /* * Functions related to the classification of requests. * * On recent FreeBSD versions (8.0 and above), we store a reference * to the issuer of a request in bp->bio_classifier1 as soon * as the bio is posted to the geom queue (and not later, because * requests are managed by the g_down thread afterwards). * * On older versions of the system (but this code is not used * in any existing release), we [ab]use the caller1 field in the * root element of the bio tree to store the classification info. * The marking is done at the beginning of g_io_request() * and only if we find that the field is NULL. * * To avoid rebuilding the kernel, this module will patch the * initial part of g_io_request() so it jumps to some hand-coded * assembly that does the marking and then executes the original * body of g_io_request(). * * fake_ioreq[] is architecture-specific machine code * that implements the above. CODE_SIZE, STORE_SIZE etc. * are constants used in the patching routine. Look at the * code in g_ioreq_patch() for the details. */ #ifndef HAVE_BIO_CLASSIFIER /* * Support for old FreeBSD versions */ #if defined(__i386__) #define CODE_SIZE 29 #define STORE_SIZE 5 #define EPILOGUE 5 #define SIZE (CODE_SIZE + STORE_SIZE + EPILOGUE) static u_char fake_ioreq[SIZE] = { 0x8b, 0x44, 0x24, 0x04, /* mov bp, %eax */ /* 1: */ 0x89, 0xc2, /* mov %eax, %edx # edx = bp */ 0x8b, 0x40, 0x64, /* mov bp->bio_parent, %eax */ 0x85, 0xc0, /* test %eax, %eax */ 0x75, 0xf7, /* jne 1b */ 0x8b, 0x42, 0x30, /* mov bp->bp_caller1, %eax */ 0x85, 0xc0, /* test %eax, %eax */ 0x75, 0x09, /* jne 2f */ 0x64, 0xa1, 0x00, 0x00, /* mov %fs:0, %eax */ 0x00, 0x00, 0x89, 0x42, 0x30, /* mov %eax, bp->bio_caller1 */ /* 2: */ 0x55, 0x89, 0xe5, 0x57, 0x56, 0xe9, 0x00, 0x00, 0x00, 0x00, /* jmp back... 
*/ }; #elif defined(__amd64) #define CODE_SIZE 38 #define STORE_SIZE 6 #define EPILOGUE 5 #define SIZE (CODE_SIZE + STORE_SIZE + EPILOGUE) static u_char fake_ioreq[SIZE] = { 0x48, 0x89, 0xf8, /* mov bp, %rax */ /* 1: */ 0x48, 0x89, 0xc2, /* mov %rax, %rdx # rdx = bp */ 0x48, 0x8b, 0x82, 0xa8, /* mov bp->bio_parent, %rax */ 0x00, 0x00, 0x00, 0x48, 0x85, 0xc0, /* test %rax, %rax */ 0x75, 0xf1, /* jne 1b */ 0x48, 0x83, 0x7a, 0x58, /* cmp $0, bp->bp_caller1 */ 0x00, 0x75, 0x0d, /* jne 2f */ 0x65, 0x48, 0x8b, 0x04, /* mov %gs:0, %rax */ 0x25, 0x00, 0x00, 0x00, 0x00, 0x48, 0x89, 0x42, 0x58, /* mov %rax, bp->bio_caller1 */ /* 2: */ 0x55, 0x48, 0x89, 0xe5, 0x41, 0x56, 0xe9, 0x00, 0x00, 0x00, 0x00, /* jmp back... */ }; #else /* neither x86 nor amd64 */ static void g_new_io_request(struct bio *bp, struct g_consumer *cp) { struct bio *top = bp; /* * bio classification: if bio_caller1 is available in the * root of the 'struct bio' tree, store there the thread id * of the thread that originated the request. * More sophisticated classification schemes can be used. */ while (top->bio_parent) top = top->bio_parent; if (top->bio_caller1 == NULL) top->bio_caller1 = curthread; } #error please add the code above in g_new_io_request() to the beginning of \ /sys/geom/geom_io.c::g_io_request(), and remove this line. #endif /* end of arch-specific code */ static int g_ioreq_patch(void) { u_char *original; u_long ofs; int found; if (me.gs_patched) return (-1); original = (u_char *)g_io_request; found = !bcmp(original, fake_ioreq + CODE_SIZE, STORE_SIZE); if (!found) return (-1); /* Jump back to the original + STORE_SIZE. */ ofs = (original + STORE_SIZE) - (fake_ioreq + SIZE); bcopy(&ofs, fake_ioreq + CODE_SIZE + STORE_SIZE + 1, 4); /* Patch the original address with a jump to the trampoline. */ *original = 0xe9; /* jump opcode */ ofs = fake_ioreq - (original + 5); bcopy(&ofs, original + 1, 4); me.gs_patched = 1; return (0); } /* * Restore the original code, this is easy. */ static void g_ioreq_restore(void) { u_char *original; if (me.gs_patched) { original = (u_char *)g_io_request; bcopy(fake_ioreq + CODE_SIZE, original, STORE_SIZE); me.gs_patched = 0; } } static inline void g_classifier_ini(void) { g_ioreq_patch(); } static inline void g_classifier_fini(void) { g_ioreq_restore(); } /*--- end of support code for older FreeBSD versions */ #else /* HAVE_BIO_CLASSIFIER */ /* * Classifier support for recent FreeBSD versions: we use * a very simple classifier, only use curthread to tag a request. * The classifier is registered at module load, and unregistered * at module unload. */ static int g_sched_tag(void *arg, struct bio *bp) { bp->bio_classifier1 = curthread; return (1); } static struct g_classifier_hook g_sched_classifier = { .func = g_sched_tag, }; static inline void g_classifier_ini(void) { g_register_classifier(&g_sched_classifier); } static inline void g_classifier_fini(void) { g_unregister_classifier(&g_sched_classifier); } #endif /* HAVE_BIO_CLASSIFIER */ static void g_sched_init(struct g_class *mp) { g_gsched_global_init(); G_SCHED_DEBUG(0, "Loading: mp = %p, g_sched_class = %p.", mp, &g_sched_class); /* Patch g_io_request to store classification info in the bio. */ g_classifier_ini(); } static void g_sched_fini(struct g_class *mp) { g_classifier_fini(); G_SCHED_DEBUG(0, "Unloading..."); KASSERT(LIST_EMPTY(&me.gs_scheds), ("still registered schedulers")); mtx_destroy(&me.gs_mtx); } /* * Read the i-th argument for a request, skipping the /dev/ * prefix if present. 
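 *
 * For example, a hypothetical argument of either "ada0" or "/dev/ada0"
 * resolves to the provider name "ada0"; if the argument is missing,
 * gctl_error() is called and NULL is returned to the caller.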
*/ static const char * g_sched_argi(struct gctl_req *req, int i) { static const char *dev_prefix = "/dev/"; const char *name; char param[16]; int l = strlen(dev_prefix); snprintf(param, sizeof(param), "arg%d", i); name = gctl_get_asciiparam(req, param); if (name == NULL) gctl_error(req, "No 'arg%d' argument", i); else if (strncmp(name, dev_prefix, l) == 0) name += l; return (name); } /* * Fetch nargs and do appropriate checks. */ static int g_sched_get_nargs(struct gctl_req *req) { int *nargs; nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No 'nargs' argument"); return (0); } if (*nargs <= 0) gctl_error(req, "Missing device(s)."); return (*nargs); } /* * Check whether we should add the class on certain volumes when * this geom is created. Right now this is under control of a kenv * variable containing the names of all devices that we care about. * Probably we should only support transparent insertion as the * preferred mode of operation. */ static struct g_geom * g_sched_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) { struct g_gsched *gsp = NULL; /* the . algorithm we want */ const char *s; /* generic string pointer */ const char *taste_names; /* devices we like */ int l; g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name); g_topology_assert(); G_SCHED_DEBUG(2, "Tasting %s.", pp->name); do { /* do not taste on ourselves */ if (pp->geom->class == mp) break; taste_names = getenv("geom.sched.taste"); if (taste_names == NULL) break; l = strlen(pp->name); for (s = taste_names; *s && (s = strstr(s, pp->name)); s++) { /* further checks for an exact match */ if ( (s == taste_names || s[-1] == ' ') && (s[l] == '\0' || s[l] == ' ') ) break; } if (s == NULL) break; G_SCHED_DEBUG(0, "Attach device %s match [%s]\n", pp->name, s); /* look up the provider name in the list */ s = getenv("geom.sched.algo"); if (s == NULL) s = "rr"; gsp = g_gsched_find(s); /* also get a reference */ if (gsp == NULL) { G_SCHED_DEBUG(0, "Bad '%s' algorithm.", s); break; } /* XXX create with 1 as last argument ? */ g_sched_create(NULL, mp, pp, gsp, 0); g_gsched_unref(gsp); } while (0); return NULL; } static void g_sched_ctl_create(struct gctl_req *req, struct g_class *mp, int proxy) { struct g_provider *pp; struct g_gsched *gsp; const char *name; int i, nargs; g_topology_assert(); name = gctl_get_asciiparam(req, "algo"); if (name == NULL) { gctl_error(req, "No '%s' argument", "algo"); return; } gsp = g_gsched_find(name); /* also get a reference */ if (gsp == NULL) { gctl_error(req, "Bad algorithm '%s'", name); return; } nargs = g_sched_get_nargs(req); /* * Run on the arguments, and break on any error. * We look for a device name, but skip the /dev/ prefix if any. 
*/ for (i = 0; i < nargs; i++) { name = g_sched_argi(req, i); if (name == NULL) break; pp = g_provider_by_name(name); if (pp == NULL) { G_SCHED_DEBUG(1, "Provider %s is invalid.", name); gctl_error(req, "Provider %s is invalid.", name); break; } if (g_sched_create(req, mp, pp, gsp, proxy) != 0) break; } g_gsched_unref(gsp); } static void g_sched_ctl_configure(struct gctl_req *req, struct g_class *mp) { struct g_provider *pp; struct g_gsched *gsp; const char *name; int i, nargs; g_topology_assert(); name = gctl_get_asciiparam(req, "algo"); if (name == NULL) { gctl_error(req, "No '%s' argument", "algo"); return; } gsp = g_gsched_find(name); /* also get a reference */ if (gsp == NULL) { gctl_error(req, "Bad algorithm '%s'", name); return; } nargs = g_sched_get_nargs(req); /* * Run on the arguments, and break on any error. * We look for a device name, but skip the /dev/ prefix if any. */ for (i = 0; i < nargs; i++) { name = g_sched_argi(req, i); if (name == NULL) break; pp = g_provider_by_name(name); if (pp == NULL || pp->geom->class != mp) { G_SCHED_DEBUG(1, "Provider %s is invalid.", name); gctl_error(req, "Provider %s is invalid.", name); break; } if (g_sched_change_algo(req, mp, pp, gsp) != 0) break; } g_gsched_unref(gsp); } static struct g_geom * g_sched_find_geom(struct g_class *mp, const char *name) { struct g_geom *gp; LIST_FOREACH(gp, &mp->geom, geom) { if (strcmp(gp->name, name) == 0) return (gp); } return (NULL); } static void g_sched_ctl_destroy(struct gctl_req *req, struct g_class *mp) { int nargs, *force, error, i; struct g_geom *gp; const char *name; g_topology_assert(); nargs = g_sched_get_nargs(req); force = gctl_get_paraml(req, "force", sizeof(*force)); if (force == NULL) { gctl_error(req, "No 'force' argument"); return; } for (i = 0; i < nargs; i++) { name = g_sched_argi(req, i); if (name == NULL) break; gp = g_sched_find_geom(mp, name); if (gp == NULL) { G_SCHED_DEBUG(1, "Device %s is invalid.", name); gctl_error(req, "Device %s is invalid.", name); break; } error = g_sched_destroy(gp, *force); if (error != 0) { gctl_error(req, "Cannot destroy device %s (error=%d).", gp->name, error); break; } } } static void g_sched_config(struct gctl_req *req, struct g_class *mp, const char *verb) { uint32_t *version; g_topology_assert(); version = gctl_get_paraml(req, "version", sizeof(*version)); if (version == NULL) { gctl_error(req, "No '%s' argument.", "version"); return; } if (*version != G_SCHED_VERSION) { gctl_error(req, "Userland and kernel parts are " "out of sync."); return; } if (strcmp(verb, "create") == 0) { g_sched_ctl_create(req, mp, 0); return; } else if (strcmp(verb, "insert") == 0) { g_sched_ctl_create(req, mp, 1); return; } else if (strcmp(verb, "configure") == 0) { g_sched_ctl_configure(req, mp); return; } else if (strcmp(verb, "destroy") == 0) { g_sched_ctl_destroy(req, mp); return; } gctl_error(req, "Unknown verb."); } static void g_sched_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { struct g_sched_softc *sc = gp->softc; struct g_gsched *gsp = sc->sc_gsched; if (indent == NULL) { /* plaintext */ sbuf_printf(sb, " algo %s", gsp ? gsp->gs_name : "--"); } if (gsp->gs_dumpconf) gsp->gs_dumpconf(sb, indent, gp, cp, pp); } DECLARE_GEOM_CLASS(g_sched_class, g_sched); MODULE_VERSION(geom_sched, 0); geom_sched/sys/geom/sched/gs_as.c000644 000423 000000 00000021547 11360626257 017525 0ustar00luigiwheel000000 000000 /*- * Copyright (c) 2009-2010 Fabio Checconi, Luigi Rizzo * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * $Id$ * $FreeBSD$ * * A simple algorithm for Anticipatory disk Scheduler (AS). * This version does not track process state or behaviour, and is * just a proof of concept to show how non work-conserving policies * can be implemented within this framework. */ /* * Anticipatory scheduling without per-client state. * * The goal of this implementation is to improve throughput compared * to the pure elevator algorithm, but also make sure that clients * do not starve. * * To this purpose, we use anticipation to help reducing seeks, but * set a maximum service share (in time or data) to prevent starvation. * Also, we mark requests as "good" or "bad" depending on how well they * lend to clustering. To improve throughput, we try to avoid serving * too many "bad" requests in a row. * * The scheduler can be in one of three states * READY immediately serve the first pending request * BUSY one request is under service, wait for completion * IDLING do not serve incoming requests immediately, unless * they are "eligible" as defined later. * * The event that triggers the state machine is mainly gs_next(), * the body of the dispatch loop, called after all events. * The calls to the scheduling algorithm (gs_start(), gs_done() * and gs_timeout()) do the following: * * gs_start() arrival of a new request. * Just enqueue; the dispatch loop will be called * right after. * * gs_done() completion of the request under service. * Set state to IDLING, start timer and * call the dispatch loop. * * gs_timeout() timeout while idling. * Set state to READY, call the dispatch loop. 
* * In turn, gs_next() does the following (in pseudocode, * see the implementation for more details): * * bio = * if (bio == NULL) { * state = READY; * } else if (state == BUSY) { * bio = NULL; // do nothing * } else if (state == READY) { * state = BUSY; // dispatch request * } else { // state is IDLING, main anticipation logic goes here * if (!in_sequence(bio) && !expired()) { * bio = NULL; // do nothing * } else if (!expired()) { * state = BUSY; // dispatch request * } else { * mark(bio, GOOD); * rotate_queue(); * bio = // surely not null * state = BUSY; // dispatch request * } * } * if (bio != NULL) * ; stop timer; * return bio; * * in_sequence() returns true if the request is right after the * current head position (so it will not cause a seek). * * expired(bio) returns true if the current sequence is longer * than the maximum allowed. * * rotate_queue() implements the selection of the next batch. * In the simplest case we just jump to the next request * in round-robin order; more complex policies are possible * (and likely necessary) to avoid that a seeky client * receives an exceedingly high amount of service. */ #include #include #include #include #include #include #include #include #include #include #include #include "gs_scheduler.h" /* * Status values for AS. */ enum g_as_status { G_AS_READY = 0, /* Not waiting at all. */ G_AS_BUSY, /* Waiting a request to complete. */ G_AS_IDLING /* Waiting a new request. */ }; struct g_as_softc { struct g_geom *sc_geom; enum g_as_status sc_status; long sc_service; /* received so far */ int sc_start_tick; /* starting tick */ off_t sc_last_offset; /* configuration parameters */ int sc_wait_ticks; int sc_budget; int sc_max_ticks; struct callout sc_wait; struct bio_queue_head sc_bioq; }; static int g_as_in_sequence(struct g_as_softc *sc, struct bio *bp) { return (bp->bio_offset == sc->sc_last_offset); } static int g_as_expired(struct g_as_softc *sc) { return (sc->sc_service > sc->sc_budget || ticks - sc->sc_start_tick > sc->sc_max_ticks); } static struct bio * g_as_rotate(struct bio_queue_head *head) { struct bio *bp; bp = gs_bioq_takefirst(head); gs_bioq_disksort(head, bp); return (gs_bioq_first(head)); } /* * The body of the dispatch loop. Returns the bio to be dispatched * (and in case set sc_status == G_AS_BUSY) * or NULL when no bio needs to be dispatched. */ static struct bio * g_as_next(void *data, int force) { struct g_as_softc *sc = data; struct bio *bio; bio = gs_bioq_first(&sc->sc_bioq); if (bio == NULL || force) { /* reset parameters */ sc->sc_start_tick = ticks; sc->sc_service = 0; if (force) sc->sc_status = G_AS_READY; } else if (sc->sc_status == G_AS_BUSY) { /* We were called after _start. */ bio = NULL; } else if (sc->sc_status == G_AS_READY) { /* Dispatch the request. */ sc->sc_status = G_AS_BUSY; } else { /* we are IDLING here */ if (!g_as_in_sequence(sc, bio) && !g_as_expired(sc)) { bio = NULL; } else if (!g_as_expired(sc)) { sc->sc_status = G_AS_BUSY; } else { bio = g_as_rotate(&sc->sc_bioq); sc->sc_start_tick = ticks; sc->sc_service = 0; } } if (bio != NULL) { /* Remove the bio we decided to serve. */ gs_bioq_remove(&sc->sc_bioq, bio); sc->sc_service += bio->bio_length; sc->sc_last_offset = bio->bio_offset + bio->bio_length; callout_stop(&sc->sc_wait); } return (bio); } static void g_as_wait_timeout(void *data) { struct g_as_softc *sc = data; struct g_geom *geom = sc->sc_geom; g_sched_lock(geom); /* * If we timed out waiting for a new request for the current * client, just dispatch whatever we have. 
* Otherwise ignore the timeout (should not happen). */ if (sc->sc_status == G_AS_IDLING) { sc->sc_status = G_AS_READY; g_sched_dispatch(geom); } g_sched_unlock(geom); } /* * Called when there is a schedulable disk I/O request. * Just enqueue, the dispatch loop will take care of things. */ static int g_as_start(void *data, struct bio *bio) { struct g_as_softc *sc = data; gs_bioq_disksort(&sc->sc_bioq, bio); return 0; } /* * Callback from the geom when a request is complete. * Change to idling, start timer and call the dispatch loop. */ static void g_as_done(void *data, struct bio *bio) { struct g_as_softc *sc = data; sc->sc_status = G_AS_IDLING; callout_reset(&sc->sc_wait, sc->sc_wait_ticks, g_as_wait_timeout, sc); g_sched_dispatch(sc->sc_geom); } /* * Module glue, called when the module is loaded. * Allocate a descriptor and initialize its fields, including the * callout queue for timeouts, and a bioq to store pending requests. * * The fini routine deallocates everything. */ static void * g_as_init(struct g_geom *geom) { struct g_as_softc *sc; sc = malloc(sizeof(*sc), M_GEOM_SCHED, M_WAITOK | M_ZERO); sc->sc_geom = geom; sc->sc_status = G_AS_READY; sc->sc_wait_ticks = (hz >= 400) ? hz/200 : 2; sc->sc_max_ticks = (hz >= 40) ? hz/20 : 2; sc->sc_budget = 0x00800000; /* 8 MB */ callout_init(&sc->sc_wait, CALLOUT_MPSAFE); gs_bioq_init(&sc->sc_bioq); return sc; } static void g_as_fini(void *data) { struct g_as_softc *sc = data; /* * geom should guarantee that _fini is only called when there * are no more bio's active (GEOM does not know about the queue, * but it can count existing bio's associated to the geom). */ KASSERT(gs_bioq_first(&sc->sc_bioq) == NULL, ("Requests still pending.")); callout_drain(&sc->sc_wait); free(sc, M_GEOM_SCHED); } static struct g_gsched g_as = { .gs_name = "as", .gs_init = g_as_init, .gs_fini = g_as_fini, .gs_start = g_as_start, .gs_done = g_as_done, .gs_next = g_as_next, }; DECLARE_GSCHED_MODULE(as, &g_as); geom_sched/sys/geom/sched/gs_rr.c000644 000423 000000 00000045063 11360626461 017541 0ustar00luigiwheel000000 000000 /*- * Copyright (c) 2009-2010 Fabio Checconi, Luigi Rizzo * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * $Id$ * $FreeBSD$ * * A round-robin (RR) anticipatory scheduler, with per-client queues. 
* * The goal of this implementation is to improve throughput compared * to the pure elevator algorithm, and insure some fairness among * clients. * * Requests coming from the same client are put in the same queue. * We use anticipation to help reducing seeks, and each queue * is never served continuously for more than a given amount of * time or data. Queues are then served in a round-robin fashion. * * Each queue can be in any of the following states: * READY immediately serve the first pending request; * BUSY one request is under service, wait for completion; * IDLING do not serve incoming requests immediately, unless * they are "eligible" as defined later. * * Scheduling is made looking at the status of all queues, * and the first one in round-robin order is privileged. */ #include #include #include #include #include #include #include #include #include #include #include "gs_scheduler.h" /* possible states of the scheduler */ enum g_rr_state { G_QUEUE_READY = 0, /* Ready to dispatch. */ G_QUEUE_BUSY, /* Waiting for a completion. */ G_QUEUE_IDLING /* Waiting for a new request. */ }; /* possible queue flags */ enum g_rr_flags { G_FLAG_COMPLETED = 1, /* Completed a req. in the current budget. */ }; struct g_rr_softc; /* * Queue descriptor, containing reference count, scheduling * state, a queue of pending requests, configuration parameters. * Queues with pending request(s) and not under service are also * stored in a Round Robin (RR) list. */ struct g_rr_queue { struct g_rr_softc *q_sc; /* link to the parent */ enum g_rr_state q_status; unsigned int q_service; /* service received so far */ int q_slice_end; /* actual slice end in ticks */ enum g_rr_flags q_flags; /* queue flags */ struct bio_queue_head q_bioq; /* Scheduling parameters */ unsigned int q_budget; /* slice size in bytes */ unsigned int q_slice_duration; /* slice size in ticks */ unsigned int q_wait_ticks; /* wait time for anticipation */ /* Stats to drive the various heuristics. */ struct g_savg q_thinktime; /* Thinktime average. */ struct g_savg q_seekdist; /* Seek distance average. */ int q_bionum; /* Number of requests. */ off_t q_lastoff; /* Last submitted req. offset. */ int q_lastsub; /* Last submitted req. time. */ /* Expiration deadline for an empty queue. */ int q_expire; TAILQ_ENTRY(g_rr_queue) q_tailq; /* RR list link field */ }; /* List types. */ TAILQ_HEAD(g_rr_tailq, g_rr_queue); /* list of scheduler instances */ LIST_HEAD(g_scheds, g_rr_softc); /* Default quantum for RR between queues. */ #define G_RR_DEFAULT_BUDGET 0x00800000 /* * Per device descriptor, holding the Round Robin list of queues * accessing the disk, a reference to the geom, and the timer. */ struct g_rr_softc { struct g_geom *sc_geom; /* * sc_active is the queue we are anticipating for. * It is set only in gs_rr_next(), and possibly cleared * only in gs_rr_next() or on a timeout. * The active queue is never in the Round Robin list * even if it has requests queued. */ struct g_rr_queue *sc_active; struct callout sc_wait; /* timer for sc_active */ struct g_rr_tailq sc_rr_tailq; /* the round-robin list */ int sc_nqueues; /* number of queues */ /* Statistics */ int sc_in_flight; /* requests in the driver */ LIST_ENTRY(g_rr_softc) sc_next; }; /* Descriptor for bounded values, min and max are constant. 
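 *
 * The x_cur field is what the sysctl knobs below export; get_bounded()
 * clamps it into [x_min, x_max] and optionally converts the result to
 * ticks. For example, get_bounded(&me.wait_ms, 2) yields the configured
 * anticipation wait converted to ticks, but never less than 2 ticks.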
*/ struct x_bound { const int x_min; int x_cur; const int x_max; }; /* * parameters, config and stats */ struct g_rr_params { int queues; /* total number of queues */ int w_anticipate; /* anticipate writes */ int bypass; /* bypass scheduling writes */ int units; /* how many instances */ /* sc_head is used for debugging */ struct g_scheds sc_head; /* first scheduler instance */ struct x_bound queue_depth; /* max parallel requests */ struct x_bound wait_ms; /* wait time, milliseconds */ struct x_bound quantum_ms; /* quantum size, milliseconds */ struct x_bound quantum_kb; /* quantum size, Kb (1024 bytes) */ /* statistics */ int wait_hit; /* success in anticipation */ int wait_miss; /* failure in anticipation */ }; /* * Default parameters for the scheduler. The quantum sizes target * a 80MB/s disk; if the hw is faster or slower the minimum of the * two will have effect: the clients will still be isolated but * the fairness may be limited. A complete solution would involve * the on-line measurement of the actual disk throughput to derive * these parameters. Or we may just choose to ignore service domain * fairness and accept what can be achieved with time-only budgets. */ static struct g_rr_params me = { .sc_head = LIST_HEAD_INITIALIZER(&me.sc_head), .w_anticipate = 1, .queue_depth = { 1, 1, 50 }, .wait_ms = { 1, 10, 30 }, .quantum_ms = { 1, 100, 500 }, .quantum_kb = { 16, 8192, 65536 }, }; struct g_rr_params *gs_rr_me = &me; SYSCTL_DECL(_kern_geom_sched); SYSCTL_NODE(_kern_geom_sched, OID_AUTO, rr, CTLFLAG_RW, 0, "GEOM_SCHED ROUND ROBIN stuff"); SYSCTL_UINT(_kern_geom_sched_rr, OID_AUTO, units, CTLFLAG_RD, &me.units, 0, "Scheduler instances"); SYSCTL_UINT(_kern_geom_sched_rr, OID_AUTO, queues, CTLFLAG_RD, &me.queues, 0, "Total rr queues"); SYSCTL_UINT(_kern_geom_sched_rr, OID_AUTO, wait_ms, CTLFLAG_RW, &me.wait_ms.x_cur, 0, "Wait time milliseconds"); SYSCTL_UINT(_kern_geom_sched_rr, OID_AUTO, quantum_ms, CTLFLAG_RW, &me.quantum_ms.x_cur, 0, "Quantum size milliseconds"); SYSCTL_UINT(_kern_geom_sched_rr, OID_AUTO, bypass, CTLFLAG_RW, &me.bypass, 0, "Bypass scheduler"); SYSCTL_UINT(_kern_geom_sched_rr, OID_AUTO, w_anticipate, CTLFLAG_RW, &me.w_anticipate, 0, "Do anticipation on writes"); SYSCTL_UINT(_kern_geom_sched_rr, OID_AUTO, quantum_kb, CTLFLAG_RW, &me.quantum_kb.x_cur, 0, "Quantum size Kbytes"); SYSCTL_UINT(_kern_geom_sched_rr, OID_AUTO, queue_depth, CTLFLAG_RW, &me.queue_depth.x_cur, 0, "Maximum simultaneous requests"); SYSCTL_UINT(_kern_geom_sched_rr, OID_AUTO, wait_hit, CTLFLAG_RW, &me.wait_hit, 0, "Hits in anticipation"); SYSCTL_UINT(_kern_geom_sched_rr, OID_AUTO, wait_miss, CTLFLAG_RW, &me.wait_miss, 0, "Misses in anticipation"); #ifdef DEBUG_QUEUES /* print the status of a queue */ static void gs_rr_dump_q(struct g_rr_queue *qp, int index) { int l = 0; struct bio *bp; TAILQ_FOREACH(bp, &(qp->q_bioq.queue), bio_queue) { l++; } printf("--- rr queue %d %p status %d len %d ---\n", index, qp, qp->q_status, l); } /* * Dump the scheduler status when writing to this sysctl variable. * XXX right now we only dump the status of the last instance created. 
* not a severe issue because this is only for debugging */ static int gs_rr_sysctl_status(SYSCTL_HANDLER_ARGS) { int error, val = 0; struct g_rr_softc *sc; error = sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr ) return (error); printf("called %s\n", __FUNCTION__); LIST_FOREACH(sc, &me.sc_head, sc_next) { int i, tot = 0; printf("--- sc %p active %p nqueues %d " "callout %d in_flight %d ---\n", sc, sc->sc_active, sc->sc_nqueues, callout_active(&sc->sc_wait), sc->sc_in_flight); for (i = 0; i < G_RR_HASH_SIZE; i++) { struct g_rr_queue *qp; LIST_FOREACH(qp, &sc->sc_hash[i], q_hash) { gs_rr_dump_q(qp, tot); tot++; } } } return (0); } SYSCTL_PROC(_kern_geom_sched_rr, OID_AUTO, status, CTLTYPE_UINT | CTLFLAG_RW, 0, sizeof(int), gs_rr_sysctl_status, "I", "status"); #endif /* DEBUG_QUEUES */ /* * Get a bounded value, optionally convert to a min of t_min ticks. */ static int get_bounded(struct x_bound *v, int t_min) { int x; x = v->x_cur; if (x < v->x_min) x = v->x_min; else if (x > v->x_max) x = v->x_max; if (t_min) { x = x * hz / 1000; /* convert to ticks */ if (x < t_min) x = t_min; } return x; } /* * Get a reference to the queue for bp, using the generic * classification mechanism. */ static struct g_rr_queue * g_rr_queue_get(struct g_rr_softc *sc, struct bio *bp) { return (g_sched_get_class(sc->sc_geom, bp)); } static int g_rr_init_class(void *data, void *priv) { struct g_rr_softc *sc = data; struct g_rr_queue *qp = priv; gs_bioq_init(&qp->q_bioq); /* * Set the initial parameters for the client: * slice size in bytes and ticks, and wait ticks. * Right now these are constant, but we could have * autoconfiguration code to adjust the values based on * the actual workload. */ qp->q_budget = 1024 * get_bounded(&me.quantum_kb, 0); qp->q_slice_duration = get_bounded(&me.quantum_ms, 2); qp->q_wait_ticks = get_bounded(&me.wait_ms, 2); qp->q_sc = sc; /* link to the parent */ qp->q_sc->sc_nqueues++; me.queues++; return (0); } /* * Release a reference to the queue. */ static void g_rr_queue_put(struct g_rr_queue *qp) { g_sched_put_class(qp->q_sc->sc_geom, qp); } static void g_rr_fini_class(void *data, void *priv) { struct g_rr_queue *qp = priv; KASSERT(gs_bioq_first(&qp->q_bioq) == NULL, ("released nonempty queue")); qp->q_sc->sc_nqueues--; me.queues--; } static inline int g_rr_queue_expired(struct g_rr_queue *qp) { if (qp->q_service >= qp->q_budget) return (1); if ((qp->q_flags & G_FLAG_COMPLETED) && ticks - qp->q_slice_end >= 0) return (1); return (0); } static inline int g_rr_should_anticipate(struct g_rr_queue *qp, struct bio *bp) { int wait = get_bounded(&me.wait_ms, 2); if (!me.w_anticipate && (bp->bio_cmd & BIO_WRITE)) return (0); if (g_savg_valid(&qp->q_thinktime) && g_savg_read(&qp->q_thinktime) > wait) return (0); if (g_savg_valid(&qp->q_seekdist) && g_savg_read(&qp->q_seekdist) > 8192) return (0); return (1); } /* * Called on a request arrival, timeout or completion. * Try to serve a request among those queued. */ static struct bio * g_rr_next(void *data, int force) { struct g_rr_softc *sc = data; struct g_rr_queue *qp; struct bio *bp, *next; int expired; qp = sc->sc_active; if (me.bypass == 0 && !force) { if (sc->sc_in_flight >= get_bounded(&me.queue_depth, 0)) return (NULL); /* Try with the queue under service first. */ if (qp != NULL && qp->q_status != G_QUEUE_READY) { /* * Queue is anticipating, ignore request. * We should check that we are not past * the timeout, but in that case the timeout * will fire immediately afterwards so we * don't bother. 
*/ return (NULL); } } else if (qp != NULL && qp->q_status != G_QUEUE_READY) { g_rr_queue_put(qp); sc->sc_active = qp = NULL; } /* * No queue under service, look for the first in RR order. * If we find it, select if as sc_active, clear service * and record the end time of the slice. */ if (qp == NULL) { qp = TAILQ_FIRST(&sc->sc_rr_tailq); if (qp == NULL) return (NULL); /* no queues at all, return */ /* otherwise select the new queue for service. */ TAILQ_REMOVE(&sc->sc_rr_tailq, qp, q_tailq); sc->sc_active = qp; qp->q_service = 0; qp->q_flags &= ~G_FLAG_COMPLETED; } bp = gs_bioq_takefirst(&qp->q_bioq); /* surely not NULL */ qp->q_service += bp->bio_length; /* charge the service */ /* * The request at the head of the active queue is always * dispatched, and gs_rr_next() will be called again * immediately. * We need to prepare for what to do next: * * 1. have we reached the end of the (time or service) slice ? * If so, clear sc_active and possibly requeue the previous * active queue if it has more requests pending; * 2. do we have more requests in sc_active ? * If yes, do not anticipate, as gs_rr_next() will run again; * if no, decide whether or not to anticipate depending * on read or writes (e.g., anticipate only on reads). */ expired = g_rr_queue_expired(qp); /* are we expired ? */ next = gs_bioq_first(&qp->q_bioq); /* do we have one more ? */ if (expired) { sc->sc_active = NULL; /* Either requeue or release reference. */ if (next != NULL) TAILQ_INSERT_TAIL(&sc->sc_rr_tailq, qp, q_tailq); else g_rr_queue_put(qp); } else if (next != NULL) { qp->q_status = G_QUEUE_READY; } else { if (!force && g_rr_should_anticipate(qp, bp)) { /* anticipate */ qp->q_status = G_QUEUE_BUSY; } else { /* do not anticipate, release reference */ g_rr_queue_put(qp); sc->sc_active = NULL; } } /* If sc_active != NULL, its q_status is always correct. */ sc->sc_in_flight++; return (bp); } static inline void g_rr_update_thinktime(struct g_rr_queue *qp) { int delta = ticks - qp->q_lastsub, wait = get_bounded(&me.wait_ms, 2); if (qp->q_sc->sc_active != qp) return; qp->q_lastsub = ticks; delta = (delta > 2 * wait) ? 2 * wait : delta; if (qp->q_bionum > 7) g_savg_add_sample(&qp->q_thinktime, delta); } static inline void g_rr_update_seekdist(struct g_rr_queue *qp, struct bio *bp) { off_t dist; if (qp->q_lastoff > bp->bio_offset) dist = qp->q_lastoff - bp->bio_offset; else dist = bp->bio_offset - qp->q_lastoff; if (dist > (8192 * 8)) dist = 8192 * 8; qp->q_lastoff = bp->bio_offset + bp->bio_length; if (qp->q_bionum > 7) g_savg_add_sample(&qp->q_seekdist, dist); } /* * Called when a real request for disk I/O arrives. * Locate the queue associated with the client. * If the queue is the one we are anticipating for, reset its timeout; * if the queue is not in the round robin list, insert it in the list. * On any error, do not queue the request and return -1, the caller * will take care of this request. */ static int g_rr_start(void *data, struct bio *bp) { struct g_rr_softc *sc = data; struct g_rr_queue *qp; if (me.bypass) return (-1); /* bypass the scheduler */ /* Get the queue for the request. */ qp = g_rr_queue_get(sc, bp); if (qp == NULL) return (-1); /* allocation failed, tell upstream */ if (gs_bioq_first(&qp->q_bioq) == NULL) { /* * We are inserting into an empty queue. * Reset its state if it is sc_active, * otherwise insert it in the RR list. 
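 *
 * Note on reference counting: a queue linked in the RR list (or kept
 * as sc_active) holds its own reference, taken below with
 * g_sched_priv_ref() and released in g_rr_next() or in
 * g_rr_wait_timeout() once the queue is no longer tracked.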
*/ if (qp == sc->sc_active) { qp->q_status = G_QUEUE_READY; callout_stop(&sc->sc_wait); } else { g_sched_priv_ref(qp); TAILQ_INSERT_TAIL(&sc->sc_rr_tailq, qp, q_tailq); } } qp->q_bionum = 1 + qp->q_bionum - (qp->q_bionum >> 3); g_rr_update_thinktime(qp); g_rr_update_seekdist(qp, bp); /* Inherit the reference returned by g_rr_queue_get(). */ bp->bio_caller1 = qp; gs_bioq_disksort(&qp->q_bioq, bp); return (0); } /* * Callout executed when a queue times out anticipating a new request. */ static void g_rr_wait_timeout(void *data) { struct g_rr_softc *sc = data; struct g_geom *geom = sc->sc_geom; g_sched_lock(geom); /* * We can race with other events, so check if * sc_active is still valid. */ if (sc->sc_active != NULL) { /* Release the reference to the queue. */ g_rr_queue_put(sc->sc_active); sc->sc_active = NULL; me.wait_hit--; me.wait_miss++; /* record the miss */ } g_sched_dispatch(geom); g_sched_unlock(geom); } /* * Module glue: allocate descriptor, initialize its fields. */ static void * g_rr_init(struct g_geom *geom) { struct g_rr_softc *sc; /* XXX check whether we can sleep */ sc = malloc(sizeof *sc, M_GEOM_SCHED, M_NOWAIT | M_ZERO); sc->sc_geom = geom; TAILQ_INIT(&sc->sc_rr_tailq); callout_init(&sc->sc_wait, CALLOUT_MPSAFE); LIST_INSERT_HEAD(&me.sc_head, sc, sc_next); me.units++; return (sc); } /* * Module glue -- drain the callout structure, destroy the * hash table and its element, and free the descriptor. */ static void g_rr_fini(void *data) { struct g_rr_softc *sc = data; callout_drain(&sc->sc_wait); KASSERT(sc->sc_active == NULL, ("still a queue under service")); KASSERT(TAILQ_EMPTY(&sc->sc_rr_tailq), ("still scheduled queues")); LIST_REMOVE(sc, sc_next); me.units--; free(sc, M_GEOM_SCHED); } /* * Called when the request under service terminates. * Start the anticipation timer if needed. */ static void g_rr_done(void *data, struct bio *bp) { struct g_rr_softc *sc = data; struct g_rr_queue *qp; sc->sc_in_flight--; qp = bp->bio_caller1; if (qp == sc->sc_active && qp->q_status == G_QUEUE_BUSY) { if (!(qp->q_flags & G_FLAG_COMPLETED)) { qp->q_flags |= G_FLAG_COMPLETED; /* in case we want to make the slice adaptive */ qp->q_slice_duration = get_bounded(&me.quantum_ms, 2); qp->q_slice_end = ticks + qp->q_slice_duration; } /* The queue is trying anticipation, start the timer. */ qp->q_status = G_QUEUE_IDLING; /* may make this adaptive */ qp->q_wait_ticks = get_bounded(&me.wait_ms, 2); me.wait_hit++; callout_reset(&sc->sc_wait, qp->q_wait_ticks, g_rr_wait_timeout, sc); } else g_sched_dispatch(sc->sc_geom); /* Release a reference to the queue. */ g_rr_queue_put(qp); } static void g_rr_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { if (indent == NULL) { /* plaintext */ sbuf_printf(sb, " units %d queues %d", me.units, me.queues); } } static struct g_gsched g_rr = { .gs_name = "rr", .gs_priv_size = sizeof(struct g_rr_queue), .gs_init = g_rr_init, .gs_fini = g_rr_fini, .gs_start = g_rr_start, .gs_done = g_rr_done, .gs_next = g_rr_next, .gs_dumpconf = g_rr_dumpconf, .gs_init_class = g_rr_init_class, .gs_fini_class = g_rr_fini_class, }; DECLARE_GSCHED_MODULE(rr, &g_rr); geom_sched/sys/geom/sched/subr_disk.c000644 000423 000000 00000014171 11360626600 020401 0ustar00luigiwheel000000 000000 /*- * ---------------------------------------------------------------------------- * "THE BEER-WARE LICENSE" (Revision 42): * wrote this file. As long as you retain this notice you * can do whatever you want with this stuff. 
If we meet some day, and you think * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp * ---------------------------------------------------------------------------- * * The bioq_disksort() (and the specification of the bioq API) * have been written by Luigi Rizzo and Fabio Checconi under the same * license as above. */ #include __FBSDID("$FreeBSD: stable/7/sys/kern/subr_disk.c 189502 2009-03-08 00:11:26Z luigi $"); //#include "opt_geom.h" #include #include #include #include #include #include #include "g_sched.h" /* * BIO queue implementation * * Please read carefully the description below before making any change * to the code, or you might change the behaviour of the data structure * in undesirable ways. * * A bioq stores disk I/O request (bio), normally sorted according to * the distance of the requested position (bio->bio_offset) from the * current head position (bioq->last_offset) in the scan direction, i.e. * * (uoff_t)(bio_offset - last_offset) * * Note that the cast to unsigned (uoff_t) is fundamental to insure * that the distance is computed in the scan direction. * * The main methods for manipulating the bioq are: * * bioq_disksort() performs an ordered insertion; * * bioq_first() return the head of the queue, without removing; * * bioq_takefirst() return and remove the head of the queue, * updating the 'current head position' as * bioq->last_offset = bio->bio_offset + bio->bio_length; * * When updating the 'current head position', we assume that the result of * bioq_takefirst() is dispatched to the device, so bioq->last_offset * represents the head position once the request is complete. * * If the bioq is manipulated using only the above calls, it starts * with a sorted sequence of requests with bio_offset >= last_offset, * possibly followed by another sorted sequence of requests with * 0 <= bio_offset < bioq->last_offset * * NOTE: historical behaviour was to ignore bio->bio_length in the * update, but its use tracks the head position in a better way. * Historical behaviour was also to update the head position when * the request under service is complete, rather than when the * request is extracted from the queue. However, the current API * has no method to update the head position; secondly, once * a request has been submitted to the disk, we have no idea of * the actual head position, so the final one is our best guess. * * --- Direct queue manipulation --- * * A bioq uses an underlying TAILQ to store requests, so we also * export methods to manipulate the TAILQ, in particular: * * bioq_insert_tail() insert an entry at the end. * It also creates a 'barrier' so all subsequent * insertions through bioq_disksort() will end up * after this entry; * * bioq_insert_head() insert an entry at the head, update * bioq->last_offset = bio->bio_offset so that * all subsequent insertions through bioq_disksort() * will end up after this entry; * * bioq_remove() remove a generic element from the queue, act as * bioq_takefirst() if invoked on the head of the queue. * * The semantic of these methods is the same of the operations * on the underlying TAILQ, but with additional guarantees on * subsequent bioq_disksort() calls. E.g. bioq_insert_tail() * can be useful for making sure that all previous ops are flushed * to disk before continuing. * * Updating bioq->last_offset on a bioq_insert_head() guarantees * that the bio inserted with the last bioq_insert_head() will stay * at the head of the queue even after subsequent bioq_disksort(). 
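 *
 * A small worked example of the one-way scan, with hypothetical
 * numbers: if last_offset is 1000, a request at offset 1200 gets key
 * 200, while a request at offset 900 gets key (uoff_t)(900 - 1000),
 * which wraps to a very large unsigned value; the latter therefore
 * sorts after every request at or beyond the current head position,
 * i.e. it is deferred to the next sweep of the disk head.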
* * Note that when the direct queue manipulation functions are used, * the queue may contain multiple inversion points (i.e. more than * two sorted sequences of requests). * */ void gs_bioq_init(struct bio_queue_head *head) { TAILQ_INIT(&head->queue); head->last_offset = 0; head->insert_point = NULL; } void gs_bioq_remove(struct bio_queue_head *head, struct bio *bp) { if (bp == TAILQ_FIRST(&head->queue)) head->last_offset = bp->bio_offset + bp->bio_length; if (bp == head->insert_point) head->insert_point = NULL; TAILQ_REMOVE(&head->queue, bp, bio_queue); } void gs_bioq_flush(struct bio_queue_head *head, struct devstat *stp, int error) { struct bio *bp; while ((bp = gs_bioq_takefirst(head)) != NULL) biofinish(bp, stp, error); } void gs_bioq_insert_head(struct bio_queue_head *head, struct bio *bp) { head->last_offset = bp->bio_offset; TAILQ_INSERT_HEAD(&head->queue, bp, bio_queue); } void gs_bioq_insert_tail(struct bio_queue_head *head, struct bio *bp) { TAILQ_INSERT_TAIL(&head->queue, bp, bio_queue); head->insert_point = bp; } struct bio * gs_bioq_first(struct bio_queue_head *head) { return (TAILQ_FIRST(&head->queue)); } struct bio * gs_bioq_takefirst(struct bio_queue_head *head) { struct bio *bp; bp = TAILQ_FIRST(&head->queue); if (bp != NULL) gs_bioq_remove(head, bp); return (bp); } /* * Compute the sorting key. The cast to unsigned is * fundamental for correctness, see the description * near the beginning of the file. */ static inline uoff_t gs_bioq_bio_key(struct bio_queue_head *head, struct bio *bp) { return ((uoff_t)(bp->bio_offset - head->last_offset)); } /* * Seek sort for disks. * * Sort all requests in a single queue while keeping * track of the current position of the disk with last_offset. * See above for details. */ void gs_bioq_disksort(struct bio_queue_head *head, struct bio *bp) { struct bio *cur, *prev = NULL; uoff_t key = gs_bioq_bio_key(head, bp); cur = TAILQ_FIRST(&head->queue); if (head->insert_point) cur = head->insert_point; while (cur != NULL && key >= gs_bioq_bio_key(head, cur)) { prev = cur; cur = TAILQ_NEXT(cur, bio_queue); } if (prev == NULL) TAILQ_INSERT_HEAD(&head->queue, bp, bio_queue); else TAILQ_INSERT_AFTER(&head->queue, prev, bp, bio_queue); } geom_sched/sys/geom/sched/gs_bfq.c000644 000423 000000 00000072224 11360626312 017660 0ustar00luigiwheel000000 000000 /*- * Copyright (c) 2009-2010 Fabio Checconi, Luigi Rizzo, Paolo Valente * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * $Id$ * $FreeBSD$ * * Budget Fair Queueing: mixed service/time-domain fair queueing * scheduling of disk access, among per-process queues. */ #include #include #include #include #include #include #include #include #include #include #include #include "gs_scheduler.h" /* Possible states of the scheduler. */ enum g_bfq_state { G_QUEUE_READY = 0, /* Ready to dispatch. */ G_QUEUE_BUSY, /* Waiting for a completion. */ G_QUEUE_IDLING /* Waiting for a new request. */ }; /* Possible queue flags. */ enum g_bfq_flags { G_FLAG_COMPLETED = 1, /* Completed a req. in the current budget. */ }; /* Possible expiration reasons. */ enum g_bfq_reason { G_BUDGET_EXHAUSTED, G_TIME_EXHAUSTED, G_TIMED_OUT, }; struct g_bfq_softc; RB_HEAD(vtree, g_bfq_queue); /* * Queue descriptor, containing reference count, scheduling * state, a queue of pending requests, configuration parameters. * Queues with pending request(s) and not under service are also * stored in the BFQ tree. */ struct g_bfq_queue { struct g_bfq_softc *q_sc; /* Link to parent. */ enum g_bfq_state q_status; unsigned int q_service; /* Service received so far. */ int q_slice_end; /* Actual slice end in ticks. */ enum g_bfq_flags q_flags; /* Queue flags. */ struct bio_queue_head q_bioq; /* Scheduling parameters */ uint64_t q_finish; /* Finish time. */ uint64_t q_start; /* Start time. */ struct vtree *q_tree; /* Tree we are enqueued into. */ RB_ENTRY(g_bfq_queue) q_node; /* RB tree node. */ uint64_t q_minstart; /* Min. ts_start in our subtree. */ unsigned q_weight; /* Weight. */ unsigned q_newweight; /* Target weight after a renice. */ uint64_t q_budget; /* Entitled service for next slice. */ unsigned int q_slice_duration; /* Slice size in ticks. */ unsigned int q_wait_ticks; /* Wait time for anticipation. */ /* Stats to drive the various heuristics. */ struct g_savg q_thinktime; /* Thinktime average. */ struct g_savg q_seekdist; /* Seek distance average. */ int q_bionum; /* Number of requests. */ off_t q_lastoff; /* Last submitted req. offset. */ int q_lastsub; /* Last submitted req. time. */ /* Expiration deadline for an empty queue. */ int q_expire; }; /* List of scheduler instances. */ LIST_HEAD(g_scheds, g_bfq_softc); /* Parameters for feedback on budget. */ #define G_BFQ_BUDGET_STEP 0x00180000 #define G_BFQ_BUDGET_MIN 0x00200000 /* Bits for fixed point precision in timestamp calculations. */ #define G_BFQ_SERVICE_SHIFT 22 /* * Per device descriptor, holding the BFQ tree of the per-process queues * accessing the disk, a reference to the geom, and the timer. */ struct g_bfq_softc { struct g_geom *sc_geom; /* * sc_current is the queue we are anticipating for. * It is set only in gs_bfq_next(), and possibly cleared * only in gs_bfq_next() or on a timeout. * The active queue is never in the Round Robin list * even if it has requests queued. */ struct g_bfq_queue *sc_current; struct callout sc_wait; /* Timer for sc_current. */ struct vtree sc_active; /* Active tree. */ struct vtree sc_idle; /* Idle tree. 
*/ struct g_bfq_queue *sc_firstidle; /* First idle queue. */ struct g_bfq_queue *sc_lastidle; /* Last idle queue. */ uint64_t sc_vtime; /* Virtual time. */ unsigned long sc_error; /* Fractional part of vtime. */ unsigned sc_wsum; /* Weight sum. */ int sc_nqueues; /* Number of queues. */ /* Statistics */ int sc_in_flight; /* Requests in the driver. */ LIST_ENTRY(g_bfq_softc) sc_next; }; /* Descriptor for bounded values, min and max are constant. */ struct x_bound { const int x_min; int x_cur; const int x_max; }; /* * Parameters, config and stats. */ struct g_bfq_params { int queues; /* Total number of queues. */ int w_anticipate; /* Anticipate writes. */ int bypass; /* Bypass scheduling writes. */ int units; /* How many instances. */ /* sc_head is used for debugging. */ struct g_scheds sc_head; /* First scheduler instance. */ struct x_bound queue_depth; /* Max parallel requests. */ struct x_bound wait_ms; /* Wait time, milliseconds. */ struct x_bound quantum_ms; /* Quantum size, milliseconds. */ struct x_bound quantum_kb; /* Quantum size, Kb (1024 bytes). */ /* statistics */ int wait_hit; /* Success in anticipation. */ int wait_miss; /* Failure in anticipation. */ }; /* * Default parameters for the scheduler. The quantum sizes target * a 80MB/s disk; if the hw is faster or slower the minimum of the * two will have effect: the clients will still be isolated but * the fairness may be limited. A complete solution would involve * the on-line measurement of the actual disk throughput to derive * these parameters. Or we may just choose to ignore service domain * fairness and accept what can be achieved with time-only budgets. */ static struct g_bfq_params me = { .sc_head = LIST_HEAD_INITIALIZER(&me.sc_head), .w_anticipate = 1, .queue_depth = { 1, 1, 50 }, .wait_ms = { 1, 10, 30 }, .quantum_ms = { 1, 100, 500 }, .quantum_kb = { 16, 8192, 65536 }, }; struct g_bfq_params *gs_bfq_me = &me; SYSCTL_DECL(_kern_geom_sched); SYSCTL_NODE(_kern_geom_sched, OID_AUTO, bfq, CTLFLAG_RW, 0, "GEOM_SCHED BFQ stuff"); SYSCTL_UINT(_kern_geom_sched_bfq, OID_AUTO, units, CTLFLAG_RD, &me.units, 0, "Scheduler instances"); SYSCTL_UINT(_kern_geom_sched_bfq, OID_AUTO, queues, CTLFLAG_RD, &me.queues, 0, "Total BFQ queues"); SYSCTL_UINT(_kern_geom_sched_bfq, OID_AUTO, wait_ms, CTLFLAG_RW, &me.wait_ms.x_cur, 0, "Wait time milliseconds"); SYSCTL_UINT(_kern_geom_sched_bfq, OID_AUTO, quantum_ms, CTLFLAG_RW, &me.quantum_ms.x_cur, 0, "Quantum size milliseconds"); SYSCTL_UINT(_kern_geom_sched_bfq, OID_AUTO, bypass, CTLFLAG_RW, &me.bypass, 0, "Bypass scheduler"); SYSCTL_UINT(_kern_geom_sched_bfq, OID_AUTO, w_anticipate, CTLFLAG_RW, &me.w_anticipate, 0, "Do anticipation on writes"); SYSCTL_UINT(_kern_geom_sched_bfq, OID_AUTO, quantum_kb, CTLFLAG_RW, &me.quantum_kb.x_cur, 0, "Quantum size Kbytes"); SYSCTL_UINT(_kern_geom_sched_bfq, OID_AUTO, queue_depth, CTLFLAG_RW, &me.queue_depth.x_cur, 0, "Maximum simultaneous requests"); SYSCTL_UINT(_kern_geom_sched_bfq, OID_AUTO, wait_hit, CTLFLAG_RW, &me.wait_hit, 0, "Hits in anticipation"); SYSCTL_UINT(_kern_geom_sched_bfq, OID_AUTO, wait_miss, CTLFLAG_RW, &me.wait_miss, 0, "Misses in anticipation"); #ifdef DEBUG_QUEUES /* print the status of a queue */ static void gs_bfq_dump_q(struct g_bfq_queue *qp, int index) { int l = 0; struct bio *bp; TAILQ_FOREACH(bp, &(qp->q_bioq.queue), bio_queue) { l++; } printf("--- bfq queue %d %p status %d len %d ---\n", index, qp, qp->q_status, l); } /* * Dump the scheduler status when writing to this sysctl variable. 
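 * With DEBUG_QUEUES compiled in, writing any value to the
 * kern.geom.sched.bfq.status sysctl (e.g. "sysctl kern.geom.sched.bfq.status=1")
 * triggers this dump on the console.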
* XXX right now we only dump the status of the last instance created. * not a severe issue because this is only for debugging */ static int gs_bfq_sysctl_status(SYSCTL_HANDLER_ARGS) { int error, val = 0; struct g_bfq_softc *sc; error = sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr ) return (error); printf("called %s\n", __FUNCTION__); LIST_FOREACH(sc, &me.sc_head, sc_next) { int i, tot = 0; printf("--- sc %p active %p nqueues %d " "callout %d in_flight %d ---\n", sc, sc->sc_current, sc->sc_nqueues, callout_active(&sc->sc_wait), sc->sc_in_flight); for (i = 0; i < G_RR_HASH_SIZE; i++) { struct g_bfq_queue *qp; LIST_FOREACH(qp, &sc->sc_hash[i], q_hash) { gs_bfq_dump_q(qp, tot); tot++; } } } return (0); } SYSCTL_PROC(_kern_geom_sched_bfq, OID_AUTO, status, CTLTYPE_UINT | CTLFLAG_RW, 0, sizeof(int), gs_bfq_sysctl_status, "I", "status"); #endif /* DEBUG_QUEUES */ /* * Get a bounded value, optionally convert to a min of t_min ticks. */ static int get_bounded(struct x_bound *v, int t_min) { int x; x = v->x_cur; if (x < v->x_min) x = v->x_min; else if (x > v->x_max) x = v->x_max; if (t_min) { x = x * hz / 1000; /* convert to ticks */ if (x < t_min) x = t_min; } return (x); } /* * Get a reference to the queue for bp, using the generic * classification mechanism. */ static struct g_bfq_queue * g_bfq_queue_get(struct g_bfq_softc *sc, struct bio *bp) { return (g_sched_get_class(sc->sc_geom, bp)); } static int g_bfq_init_class(void *data, void *priv) { struct g_bfq_softc *sc = data; struct g_bfq_queue *qp = priv; gs_bioq_init(&qp->q_bioq); /* * Set the initial parameters for the client: * slice size in bytes and ticks, and wait ticks. * Right now these are constant, but we could have * autoconfiguration code to adjust the values based on * the actual workload. */ qp->q_budget = 1024 * get_bounded(&me.quantum_kb, 0); qp->q_slice_duration = get_bounded(&me.quantum_ms, 2); qp->q_wait_ticks = get_bounded(&me.wait_ms, 2); qp->q_sc = sc; /* link to the parent */ qp->q_sc->sc_nqueues++; /* * XXX should use the nice value of the issuer, or * anything smarter than this... */ qp->q_weight = qp->q_newweight = 1; me.queues++; return (0); } /* * Release a reference to the queue. */ static void g_bfq_queue_put(struct g_bfq_queue *qp) { g_sched_put_class(qp->q_sc->sc_geom, qp); } static void g_bfq_fini_class(void *data, void *priv) { struct g_bfq_queue *qp = priv; KASSERT(!gs_bioq_first(&qp->q_bioq), ("released nonempty queue")); qp->q_sc->sc_nqueues--; me.queues--; } static inline int g_bfq_gt(uint64_t a, uint64_t b) { return ((int64_t)(a - b) > 0); } static inline int g_bfq_cmp(struct g_bfq_queue *a, struct g_bfq_queue *b) { int64_t v = (int64_t)(a->q_finish - b->q_finish); return (v < 0 ? 
-1 : 1); } RB_GENERATE_STATIC(vtree, g_bfq_queue, q_node, g_bfq_cmp); static inline uint64_t g_bfq_delta(uint64_t service, unsigned weight) { return ((service << G_BFQ_SERVICE_SHIFT) / weight); } static inline void g_bfq_inc_vtime(struct g_bfq_softc *sc, uint64_t service) { uint64_t delta = (service << G_BFQ_SERVICE_SHIFT) + sc->sc_error; sc->sc_error = delta % sc->sc_wsum; sc->sc_vtime += delta / sc->sc_wsum; } static inline void g_bfq_update_finish(struct g_bfq_queue *qp) { uint64_t delta; if (qp->q_service < qp->q_budget) { delta = qp->q_budget - qp->q_service; qp->q_finish -= g_bfq_delta(delta, qp->q_weight); } qp->q_service = 0; } static inline void g_bfq_calc_finish(struct g_bfq_queue *qp, uint64_t service) { KASSERT(!qp->q_service, ("q_service not zero")); qp->q_finish = qp->q_start + g_bfq_delta(service, qp->q_weight); } static inline void g_bfq_extract(struct vtree *root, struct g_bfq_queue *qp) { KASSERT(qp->q_tree == root, ("Extracting from the wrong tree")); qp->q_tree = NULL; RB_REMOVE(vtree, root, qp); } static void g_bfq_idle_extract(struct g_bfq_softc *sc, struct g_bfq_queue *qp) { struct g_bfq_queue *next; KASSERT(qp->q_tree == &sc->sc_idle, ("Not on idle tree")); if (qp == sc->sc_firstidle) { next = RB_NEXT(vtree, &sc->sc_idle, qp); sc->sc_firstidle = next; } if (qp == sc->sc_lastidle) { next = RB_PREV(vtree, &sc->sc_idle, qp); sc->sc_lastidle = next; } g_bfq_extract(&sc->sc_idle, qp); } static void g_bfq_forget(struct g_bfq_softc *sc, struct g_bfq_queue *qp) { sc->sc_nqueues--; sc->sc_wsum -= qp->q_weight; g_bfq_queue_put(qp); } static void g_bfq_forget_idle(struct g_bfq_softc *sc) { struct g_bfq_queue *firstidle = sc->sc_firstidle; if (firstidle && !g_bfq_gt(firstidle->q_finish, sc->sc_vtime)) { g_bfq_idle_extract(sc, firstidle); g_bfq_forget(sc, firstidle); } } static void g_bfq_insert(struct vtree *root, struct g_bfq_queue *qp) { RB_INSERT(vtree, root, qp); qp->q_tree = root; } static void g_bfq_idle_insert(struct g_bfq_softc *sc, struct g_bfq_queue *qp) { struct g_bfq_queue *firstidle = sc->sc_firstidle; struct g_bfq_queue *lastidle = sc->sc_lastidle; if (!firstidle || g_bfq_gt(firstidle->q_finish, qp->q_finish)) sc->sc_firstidle = qp; if (!lastidle || g_bfq_gt(qp->q_finish, lastidle->q_finish)) sc->sc_lastidle = qp; g_bfq_insert(&sc->sc_idle, qp); } /* * Given qp and one of its two children child, update the qp->q_minstart * if necessary. */ static inline void g_bfq_update_min(struct g_bfq_queue *qp, struct g_bfq_queue *child) { if (child && g_bfq_gt(qp->q_minstart, child->q_minstart)) qp->q_minstart = child->q_minstart; } /* * When qp changes position or one of its children has moved, this function * updates qp->q_minstart. The left and right subtrees are assumed to hold * a correct ts_minstart value. */ static inline void g_bfq_update_active_node(struct g_bfq_queue *qp) { qp->q_minstart = qp->q_start; g_bfq_update_min(qp, RB_RIGHT(qp, q_node)); g_bfq_update_min(qp, RB_LEFT(qp, q_node)); } /* * qp must be the deepest modified node after an update. This function * updates its ts_minstart using the values held by its children, assuming * that they did not change, and then updates all the nodes that may have * changed in the path to the root. The only nodes that may have changed * are those in the path or their siblings. 
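 * As a small illustrative example: if a node with q_start = 30 has
 * children whose q_minstart values are 10 and 25, its q_minstart is
 * updated to 10, so a single field is enough to tell whether some
 * entity in the subtree is already eligible.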
*/ static void g_bfq_update_active_tree(struct g_bfq_queue *qp) { struct g_bfq_queue *parent; up: g_bfq_update_active_node(qp); parent = RB_PARENT(qp, q_node); if (!parent) return; if (qp == RB_LEFT(parent, q_node) && RB_RIGHT(parent, q_node)) g_bfq_update_active_node(RB_RIGHT(parent, q_node)); else if (RB_LEFT(parent, q_node)) g_bfq_update_active_node(RB_LEFT(parent, q_node)); qp = parent; goto up; } /* * The active tree is ordered by finish time, but an extra key is kept * per each node, containing the minimum value for the start times of * its children (and the node itself), so it's possible to search for * the eligible node with the lowest finish time. */ static void g_bfq_active_insert(struct g_bfq_softc *sc, struct g_bfq_queue *qp) { g_bfq_insert(&sc->sc_active, qp); if (RB_LEFT(qp, q_node)) qp = RB_LEFT(qp, q_node); else if (RB_RIGHT(qp, q_node)) qp = RB_RIGHT(qp, q_node); g_bfq_update_active_tree(qp); } /* * Do the first step of an extraction in an rb tree, looking for the * node that will replace qp, and returning the deepest node that the * following modifications to the tree can touch. If qp is the * last node in the tree return NULL. */ static struct g_bfq_queue * g_bfq_find_deepest(struct g_bfq_queue *qp) { struct g_bfq_queue *deepest; if (!RB_RIGHT(qp, q_node) && !RB_LEFT(qp, q_node)) deepest = RB_PARENT(qp, q_node); else if (!RB_RIGHT(qp, q_node)) deepest = RB_LEFT(qp, q_node); else if (!RB_LEFT(qp, q_node)) deepest = RB_RIGHT(qp, q_node); else { deepest = RB_NEXT(vtree, NULL, qp); if (RB_RIGHT(deepest, q_node)) deepest = RB_RIGHT(deepest, q_node); else if (RB_PARENT(deepest, q_node) != qp) deepest = RB_PARENT(deepest, q_node); } return (deepest); } static void g_bfq_active_extract(struct g_bfq_softc *sc, struct g_bfq_queue *qp) { struct g_bfq_queue *deepest; deepest = g_bfq_find_deepest(qp); g_bfq_extract(&sc->sc_active, qp); if (deepest) g_bfq_update_active_tree(deepest); } static inline void g_bfq_update_weight(struct g_bfq_softc *sc, struct g_bfq_queue *qp) { if (qp->q_weight != qp->q_newweight) { sc->sc_wsum += qp->q_newweight - qp->q_weight; qp->q_weight = qp->q_newweight; } } static void g_bfq_enqueue(struct g_bfq_softc *sc, struct g_bfq_queue *qp) { if (qp == sc->sc_current) { KASSERT(!qp->q_tree, ("Current queue is on a tree")); /* * If we are requeueing the current entity we have * to take care of not charging to it service it has * not received. */ g_bfq_update_finish(qp); qp->q_start = qp->q_finish; sc->sc_current = NULL; } else if (qp->q_tree == &sc->sc_active) { g_bfq_active_extract(sc, qp); } else if (qp->q_tree) { /* * Must be on the idle tree, tdq_idle_extract() will * check for that. */ g_bfq_idle_extract(sc, qp); if (g_bfq_gt(sc->sc_vtime, qp->q_finish)) qp->q_start = sc->sc_vtime; else qp->q_start = qp->q_finish; } else { /* * The finish time of the entity can be invalid, and * it is in the past for sure, otherwise the entity * would have been on the idle tree. 
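 * The queue is therefore (re)activated starting from the current
 * virtual time, accounted for in the weight sum, and referenced
 * because the scheduler data structures now point to it.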
*/
		qp->q_start = sc->sc_vtime;
		sc->sc_nqueues++;
		sc->sc_wsum += qp->q_weight;
		g_sched_priv_ref(qp);
	}

	//printf("ENQ: %d S=%llu F=%llu P=%llu\n", ts_id(qp), qp->q_start,
	//	qp->q_finish, sc->sc_vtime);

	g_bfq_update_weight(sc, qp);
	g_bfq_calc_finish(qp, qp->q_budget);
	g_bfq_active_insert(sc, qp);
}

static inline void
g_bfq_go_idle(struct g_bfq_softc *sc, struct g_bfq_queue *qp)
{

	if (g_bfq_gt(qp->q_finish, sc->sc_vtime))
		g_bfq_idle_insert(sc, qp);
	else
		g_bfq_forget(sc, qp);

	if (qp == sc->sc_current)
		sc->sc_current = NULL;
}

/*
 * Search for the first schedulable entity, starting from the root of
 * the tree and descending to the left whenever the left subtree contains
 * at least one eligible (start <= vtime) entity.  The path on the right
 * is followed only if a) the left subtree contains no eligible entity
 * and b) no eligible entity has been found yet.
 */
static struct g_bfq_queue *
g_bfq_first_active(struct g_bfq_softc *sc)
{
	struct g_bfq_queue *qp = RB_ROOT(&sc->sc_active), *next, *first = NULL;

	while (qp != NULL) {
		if (!g_bfq_gt(qp->q_start, sc->sc_vtime))
			first = qp;

		KASSERT(!g_bfq_gt(qp->q_minstart, sc->sc_vtime),
		    ("No eligible entries on this subtree"));

		if (RB_LEFT(qp, q_node)) {
			next = RB_LEFT(qp, q_node);
			if (!g_bfq_gt(next->q_minstart, sc->sc_vtime)) {
				qp = next;
				continue;
			}
		}

		if (first != NULL)
			break;

		qp = RB_RIGHT(qp, q_node);
	}

	return (first);
}

/*
 * If necessary, update the scheduler virtual time so that at least one
 * entity is eligible, jumping ahead to that entity's start time.
 * Assumes that the active tree is not empty.
 */
static void
g_bfq_update_vtime(struct g_bfq_softc *sc)
{
	struct g_bfq_queue *qp = RB_ROOT(&sc->sc_active);

	if (g_bfq_gt(qp->q_minstart, sc->sc_vtime)) {
		sc->sc_vtime = qp->q_minstart;
		sc->sc_error = 0;
		g_bfq_forget_idle(sc);
	}
}

static struct g_bfq_queue *
g_bfq_getnext(struct g_bfq_softc *sc)
{
	struct g_bfq_queue *qp;

	if (RB_EMPTY(&sc->sc_active))
		return (NULL);

	g_bfq_update_vtime(sc);
	qp = g_bfq_first_active(sc);
	g_bfq_active_extract(sc, qp);

	KASSERT(!g_bfq_gt(qp->q_start, sc->sc_vtime),
	    ("Selected an entity that was not eligible"));

	//printf("NXT: %d S=%llu F=%llu P=%llu\n", ts_id(qp), qp->q_start,
	//	qp->q_finish, sc->sc_vtime);

	return (qp);
}

static void
g_bfq_served(struct g_bfq_softc *sc, struct g_bfq_queue *qp, uint64_t served)
{
	struct g_bfq_queue *cur = sc->sc_current;

	KASSERT(cur == qp, ("Wrong qp"));
	KASSERT(cur, ("No current queue"));
	KASSERT(cur->q_budget >= cur->q_service + served,
	    ("Too much service received"));

	cur->q_service += served;
	g_bfq_inc_vtime(sc, served);
	//printf("P=%llu\n", sc->sc_vtime);
	g_bfq_forget_idle(sc);
}

static inline int
g_bfq_queue_expired(struct g_bfq_queue *qp)
{

	if ((qp->q_flags & G_FLAG_COMPLETED) && ticks - qp->q_slice_end >= 0)
		return (1);

	return (0);
}

static inline int
g_bfq_should_anticipate(struct g_bfq_queue *qp, struct bio *bp)
{
	int wait = get_bounded(&me.wait_ms, 2);

	if (!me.w_anticipate && (bp->bio_cmd & BIO_WRITE))
		return (0);

	if (g_savg_valid(&qp->q_thinktime) &&
	    g_savg_read(&qp->q_thinktime) > wait)
		return (0);

	if (g_savg_valid(&qp->q_seekdist) &&
	    g_savg_read(&qp->q_seekdist) > 8192)
		return (0);

	return (1);
}

static void
g_bfq_expire(struct g_bfq_softc *sc, int reason)
{
	struct g_bfq_queue *qp = sc->sc_current;
	struct bio *first;
	uint64_t new_budget;

	switch (reason) {
	case G_BUDGET_EXHAUSTED:
	case G_TIME_EXHAUSTED:
		new_budget = qmax(qp->q_budget + G_BFQ_BUDGET_STEP,
		    1024 * get_bounded(&me.quantum_kb, 0));
		break;
	case G_TIMED_OUT:
		if (qp->q_budget >
G_BFQ_BUDGET_MIN + G_BFQ_BUDGET_STEP) { new_budget = qp->q_budget - G_BFQ_BUDGET_STEP; break; } default: new_budget = qp->q_budget; break; } // printf("%ld -> %ld\n", (unsigned long)qp->q_budget, // (unsigned long)new_budget); first = gs_bioq_first(&qp->q_bioq); if (first) { qp->q_budget = qmax(new_budget, first->bio_length); g_bfq_enqueue(sc, qp); } else { qp->q_budget = new_budget; g_bfq_go_idle(sc, qp); } } /* * Called on a request arrival, timeout or completion. * Try to serve a request among those queued. */ static struct bio * g_bfq_next(void *data, int force) { struct g_bfq_softc *sc = data; struct g_bfq_queue *qp; struct bio *bp; qp = sc->sc_current; if (me.bypass == 0 && !force) { if (sc->sc_in_flight >= get_bounded(&me.queue_depth, 0)) return (NULL); /* Try with the queue under service first. */ if (qp != NULL && qp->q_status != G_QUEUE_READY) { /* * Queue is anticipating, ignore request. * We should check that we are not past * the timeout, but in that case the timeout * will fire immediately afterwards so we * don't bother. */ return (NULL); } } else if (qp != NULL && qp->q_status != G_QUEUE_READY) { g_bfq_queue_put(qp); qp = NULL; } if (qp == NULL) { retry: qp = g_bfq_getnext(sc); sc->sc_current = qp; if (qp == NULL) return (NULL); /* No queues at all, return. */ /* Select the new queue for service. */ qp->q_service = 0; qp->q_flags &= ~G_FLAG_COMPLETED; } bp = gs_bioq_first(&qp->q_bioq); /* surely not NULL */ if (bp->bio_length > qp->q_budget - qp->q_service) { /* * This awkward retry loop is there for the following * corner case: a new request is inserted at the head * of the bioq of sc_current, and the queue has not * enough budget to serve it. The scheduling decision * is not retried more than once. */ g_bfq_expire(sc, G_BUDGET_EXHAUSTED); goto retry; } gs_bioq_remove(&qp->q_bioq, bp); g_bfq_served(sc, qp, bp->bio_length); if (g_bfq_queue_expired(qp)) { g_bfq_expire(sc, G_TIME_EXHAUSTED); } else if (gs_bioq_first(&qp->q_bioq)) { qp->q_status = G_QUEUE_READY; } else { if (!force && g_bfq_should_anticipate(qp, bp)) { /* Anticipate. */ qp->q_status = G_QUEUE_BUSY; } else { /* Do not anticipate. */ g_bfq_go_idle(sc, qp); } } /* * If sc_current != NULL, its q_status should always * be correct here... */ sc->sc_in_flight++; return (bp); } static inline void g_bfq_update_thinktime(struct g_bfq_queue *qp) { int delta = ticks - qp->q_lastsub, wait = get_bounded(&me.wait_ms, 2); if (qp->q_sc->sc_current != qp) return; qp->q_lastsub = ticks; delta = (delta > 2 * wait) ? 2 * wait : delta; if (qp->q_bionum > 7) g_savg_add_sample(&qp->q_thinktime, delta); } static inline void g_bfq_update_seekdist(struct g_bfq_queue *qp, struct bio *bp) { off_t dist; if (qp->q_lastoff > bp->bio_offset) dist = qp->q_lastoff - bp->bio_offset; else dist = bp->bio_offset - qp->q_lastoff; if (dist > 8192 * 8) dist = 8192 * 8; qp->q_lastoff = bp->bio_offset + bp->bio_length; if (qp->q_bionum > 7) g_savg_add_sample(&qp->q_seekdist, dist); } /* * Insert a new request in qp, activating the queue if necessary; * update the budget of active queues if their front request changes * and the old budget value is not enough to serve their first request. */ static void g_bfq_disksort(struct g_bfq_softc *sc, struct g_bfq_queue *qp, struct bio *bp) { /* Enqueue the new bio. */ gs_bioq_disksort(&qp->q_bioq, bp); /* Queue was not empty, and we don't need to update its budget. */ if (bp != gs_bioq_first(&qp->q_bioq)) return; /* Cannot update the budget of the queue under service, return. 
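 * Its budget will be recomputed by g_bfq_expire() at the end of the
 * current slice, taking the length of the head request into account.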
*/ if (qp == sc->sc_current) return; /* New budget value: be sure we have enough room for the first bio. */ qp->q_budget = qmax(qp->q_budget, bp->bio_length); /* (Re-)enqueue qp. */ g_bfq_enqueue(sc, qp); } /* * Called when a real request for disk I/O arrives. * Locate the queue associated with the client. * If the queue is the one we are anticipating for, reset its timeout; * if the queue is not in the round robin list, insert it in the list. * On any error, do not queue the request and return -1, the caller * will take care of this request. */ static int g_bfq_start(void *data, struct bio *bp) { struct g_bfq_softc *sc = data; struct g_bfq_queue *qp; int empty; if (me.bypass) return (-1); /* Bypass the scheduler. */ /* Get the queue for the request. */ qp = g_bfq_queue_get(sc, bp); if (qp == NULL) return (-1); /* Allocation failed, tell upstream. */ empty = !gs_bioq_first(&qp->q_bioq); bp->bio_caller1 = qp; g_bfq_disksort(sc, qp, bp); if (empty && qp == sc->sc_current) { /* We were anticipating this request... */ qp->q_status = G_QUEUE_READY; callout_stop(&sc->sc_wait); } qp->q_bionum = 1 + qp->q_bionum - (qp->q_bionum >> 3); g_bfq_update_thinktime(qp); g_bfq_update_seekdist(qp, bp); return (0); } /* * Callout executed when a queue times out anticipating a new request. */ static void g_bfq_wait_timeout(void *data) { struct g_bfq_softc *sc = data; struct g_geom *geom = sc->sc_geom; g_sched_lock(geom); /* * We can race with other events, so check if * sc_current is still valid. */ if (sc->sc_current != NULL) { g_bfq_expire(sc, G_TIMED_OUT); me.wait_hit--; me.wait_miss++; /* record the miss */ } g_sched_dispatch(geom); g_sched_unlock(geom); } /* * Module glue: allocate descriptor, initialize its fields. */ static void * g_bfq_init(struct g_geom *geom) { struct g_bfq_softc *sc; sc = malloc(sizeof *sc, M_GEOM_SCHED, M_NOWAIT | M_ZERO); sc->sc_geom = geom; callout_init(&sc->sc_wait, CALLOUT_MPSAFE); RB_INIT(&sc->sc_active); RB_INIT(&sc->sc_idle); LIST_INSERT_HEAD(&me.sc_head, sc, sc_next); me.units++; return (sc); } static void g_bfq_hash_unref(void *data) { struct g_bfq_softc *sc = data; struct g_bfq_queue *qp, *tmp; RB_FOREACH_SAFE(qp, vtree, &sc->sc_idle, tmp) { g_bfq_idle_extract(sc, qp); g_bfq_forget(sc, qp); } KASSERT(sc->sc_current == NULL, ("still a queue under service")); KASSERT(RB_EMPTY(&sc->sc_active), ("still scheduled queues")); KASSERT(RB_EMPTY(&sc->sc_idle), ("still idle queues")); } /* * Module glue -- drain the callout structure, destroy the * hash table and its element, and free the descriptor. */ static void g_bfq_fini(void *data) { struct g_bfq_softc *sc = data; callout_drain(&sc->sc_wait); KASSERT(sc->sc_current == NULL, ("still a queue under service")); KASSERT(RB_EMPTY(&sc->sc_active), ("still scheduled queues")); KASSERT(RB_EMPTY(&sc->sc_idle), ("still idle queues")); LIST_REMOVE(sc, sc_next); me.units--; free(sc, M_GEOM_SCHED); } /* * Called when the request under service terminates. * Start the anticipation timer if needed. */ static void g_bfq_done(void *data, struct bio *bp) { struct g_bfq_softc *sc = data; struct g_bfq_queue *qp; sc->sc_in_flight--; qp = bp->bio_caller1; if (qp == sc->sc_current && qp->q_status == G_QUEUE_BUSY) { if (!(qp->q_flags & G_FLAG_COMPLETED)) { qp->q_flags |= G_FLAG_COMPLETED; /* in case we want to make the slice adaptive */ qp->q_slice_duration = get_bounded(&me.quantum_ms, 2); qp->q_slice_end = ticks + qp->q_slice_duration; } /* The queue is trying anticipation, start the timer. 
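 * If no new request from this client shows up within q_wait_ticks,
 * g_bfq_wait_timeout() expires the queue and turns the provisional
 * hit recorded below into a miss.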
*/ qp->q_status = G_QUEUE_IDLING; /* may make this adaptive */ qp->q_wait_ticks = get_bounded(&me.wait_ms, 2); me.wait_hit++; callout_reset(&sc->sc_wait, qp->q_wait_ticks, g_bfq_wait_timeout, sc); } else g_sched_dispatch(sc->sc_geom); /* Release the bio's reference to the queue. */ g_bfq_queue_put(qp); } static void g_bfq_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { if (!indent) { /* plaintext */ sbuf_printf(sb, " units %d queues %d", me.units, me.queues); } } static struct g_gsched g_bfq = { .gs_name = "bfq", .gs_priv_size = sizeof(struct g_bfq_queue), .gs_init = g_bfq_init, .gs_fini = g_bfq_fini, .gs_start = g_bfq_start, .gs_done = g_bfq_done, .gs_next = g_bfq_next, .gs_dumpconf = g_bfq_dumpconf, .gs_init_class = g_bfq_init_class, .gs_fini_class = g_bfq_fini_class, .gs_hash_unref = g_bfq_hash_unref, }; DECLARE_GSCHED_MODULE(bfq, &g_bfq);
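/*
 * A worked example of the timestamping above, assuming the default
 * parameters: quantum_kb = 8192 gives each queue a budget of
 * 1024 * 8192 = 8 MiB per slice, and quantum_ms = 100 gives a 100 ms
 * time quantum.  For a weight-1 queue, g_bfq_calc_finish() computes
 *
 *	q_finish = q_start + (8388608 << G_BFQ_SERVICE_SHIFT) / 1
 *
 * and g_bfq_inc_vtime() advances the virtual time by roughly
 * (served << G_BFQ_SERVICE_SHIFT) / sc_wsum on every completion, so two
 * continuously backlogged weight-1 queues obtain roughly alternating
 * 8 MiB slices.  On the 80 MB/s disk the default parameters target, the
 * 100 ms time quantum and the 8 MiB service quantum are approximately
 * equivalent (80 MB/s * 0.1 s = 8 MB), which keeps the time-domain and
 * service-domain budgets consistent with each other.
 */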