ipfw/dn_heap.c000644 000423 000000 00000030207 11334513576 014035 0ustar00luigiwheel000000 000000 /*- * Copyright (c) 1998-2002,2010 Luigi Rizzo, Universita` di Pisa * All rights reserved * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * A binary heap data structure used in dummynet */ #include #include #ifdef _KERNEL __FBSDID("$FreeBSD: user/luigi/ipfw3-head/sys/netinet/ipfw/dn_heap.c 203382 2010-02-02 16:18:51Z luigi $"); #include #include #include #include #ifndef log #define log(x, arg...) #endif #else /* !_KERNEL */ #include #include #include #include #include "dn_heap.h" #define log(x, arg...) fprintf(stderr, ## arg) #define panic(x...) fprintf(stderr, ## x), exit(1) #define MALLOC_DEFINE(a, b, c) static void *my_malloc(int s) { return malloc(s); } static void my_free(void *p) { free(p); } #define malloc(s, t, w) my_malloc(s) #define free(p, t) my_free(p) #endif /* !_KERNEL */ MALLOC_DEFINE(M_DN_HEAP, "dummynet", "dummynet heap"); /* * Heap management functions. * * In the heap, first node is element 0. Children of i are 2i+1 and 2i+2. * Some macros help finding parent/children so we can optimize them. * * heap_init() is called to expand the heap when needed. * Increment size in blocks of 16 entries. * Returns 1 on error, 0 on success */ #define HEAP_FATHER(x) ( ( (x) - 1 ) / 2 ) #define HEAP_LEFT(x) ( (x)+(x) + 1 ) #define HEAP_SWAP(a, b, buffer) { buffer = a ; a = b ; b = buffer ; } #define HEAP_INCREMENT 15 static int heap_resize(struct dn_heap *h, unsigned int new_size) { struct dn_heap_entry *p; if (h->size >= new_size ) /* have enough room */ return 0; #if 1 /* round to the next power of 2 */ new_size |= new_size >> 1; new_size |= new_size >> 2; new_size |= new_size >> 4; new_size |= new_size >> 8; new_size |= new_size >> 16; #else new_size = (new_size + HEAP_INCREMENT ) & ~HEAP_INCREMENT; #endif p = malloc(new_size * sizeof(*p), M_DN_HEAP, M_NOWAIT); if (p == NULL) { printf("--- %s, resize %d failed\n", __func__, new_size ); return 1; /* error */ } if (h->size > 0) { bcopy(h->p, p, h->size * sizeof(*p) ); free(h->p, M_DN_HEAP); } h->p = p; h->size = new_size; return 0; } int heap_init(struct dn_heap *h, int size, int ofs) { if (heap_resize(h, size)) return 1; h->elements = 0; h->ofs = ofs; return 0; } /* * Insert element in heap. 
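 *
 * A minimal usage sketch of this API (the calling code, 'struct foo'
 * and its fields below are hypothetical, shown only for illustration):
 *
 *	struct foo { uint64_t deadline; int32_t heap_pos; };
 *	struct dn_heap h;
 *	struct foo *f, *g;
 *
 *	bzero(&h, sizeof(h));
 *	heap_init(&h, 16, offsetof(struct foo, heap_pos));
 *	heap_insert(&h, f->deadline, f);
 *	heap_insert(&h, g->deadline, g);
 *	f = HEAP_TOP(&h)->object;	(object with the smallest key)
 *	heap_extract(&h, NULL);		(remove the top element)
 *	heap_extract(&h, g);		(remove 'g' from the middle, via heap_pos)
 *	heap_free(&h);
 *
 * Passing -1 as the last argument of heap_init() disables the
 * "extract from the middle" feature, so no heap_pos field is needed.
 *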
 * Normally, p != NULL, we insert p in
 * a new position and bubble up. If p == NULL, then the element is
 * already in place, and key is the position where to start the
 * bubble-up.
 * Returns 1 on failure (cannot allocate new heap entry)
 *
 * If ofs > 0 the position (index, int) of the element in the heap is
 * also stored in the element itself at the given offset in bytes.
 */
#define SET_OFFSET(h, i) do {						\
	if (h->ofs > 0)							\
	    *((int32_t *)((char *)(h->p[i].object) + h->ofs)) = i;	\
	} while (0)
/*
 * RESET_OFFSET is used for sanity checks. It sets ofs
 * to an invalid value.
 */
#define RESET_OFFSET(h, i) do {						\
	if (h->ofs > 0)							\
	    *((int32_t *)((char *)(h->p[i].object) + h->ofs)) = -16;	\
	} while (0)

int
heap_insert(struct dn_heap *h, uint64_t key1, void *p)
{
	int son = h->elements;

	//log("%s key %llu p %p\n", __FUNCTION__, key1, p);
	if (p == NULL) { /* data already there, set starting point */
		son = key1;
	} else { /* insert new element at the end, possibly resize */
		son = h->elements;
		if (son == h->size) /* need resize... */
			// XXX expand by 16 or so
			if (heap_resize(h, h->elements+16) )
				return 1; /* failure... */
		h->p[son].object = p;
		h->p[son].key = key1;
		h->elements++;
	}
	/* make sure that son >= father along the path */
	while (son > 0) {
		int father = HEAP_FATHER(son);
		struct dn_heap_entry tmp;

		if (DN_KEY_LT( h->p[father].key, h->p[son].key ) )
			break; /* found right position */
		/* son smaller than father, swap and repeat */
		HEAP_SWAP(h->p[son], h->p[father], tmp);
		SET_OFFSET(h, son);
		son = father;
	}
	SET_OFFSET(h, son);
	return 0;
}

/*
 * remove top element from heap, or obj if obj != NULL
 */
void
heap_extract(struct dn_heap *h, void *obj)
{
	int child, father, max = h->elements - 1;

	if (max < 0) {
		printf("--- %s: empty heap 0x%p\n", __FUNCTION__, h);
		return;
	}
	if (obj == NULL)
		father = 0; /* default: move up smallest child */
	else { /* extract specific element, index is at offset */
		if (h->ofs <= 0)
			panic("%s: extract from middle not set on %p\n",
				__FUNCTION__, h);
		father = *((int *)((char *)obj + h->ofs));
		if (father < 0 || father >= h->elements) {
			panic("%s: father %d out of bound 0..%d\n",
				__FUNCTION__, father, h->elements);
		}
	}
	/*
	 * below, father is the index of the empty element, which
	 * we replace at each step with the smallest child until we
	 * reach the bottom level.
	 */
	// XXX why removing RESET_OFFSET increases runtime by 10% ?
	RESET_OFFSET(h, father);
	while ( (child = HEAP_LEFT(father)) <= max ) {
		if (child != max &&
		    DN_KEY_LT(h->p[child+1].key, h->p[child].key) )
			child++; /* take right child, otherwise left */
		h->p[father] = h->p[child];
		SET_OFFSET(h, father);
		father = child;
	}
	h->elements--;
	if (father != max) {
		/*
		 * Fill hole with last entry and bubble up,
		 * reusing the insert code
		 */
		h->p[father] = h->p[max];
		heap_insert(h, father, NULL);
	}
}

#if 0
/*
 * change object position and update references
 * XXX this one is never used!
 */
static void
heap_move(struct dn_heap *h, uint64_t new_key, void *object)
{
	int temp, i, max = h->elements-1;
	struct dn_heap_entry *p, buf;

	if (h->ofs <= 0)
		panic("cannot move items on this heap");

	p = h->p;	/* shortcut */
	i = *((int *)((char *)object + h->ofs));
	if (DN_KEY_LT(new_key, p[i].key) ) { /* must move up */
		p[i].key = new_key;
		for (; i>0 &&
		    DN_KEY_LT(new_key, p[(temp = HEAP_FATHER(i))].key);
		    i = temp ) { /* bubble up */
			HEAP_SWAP(p[i], p[temp], buf);
			SET_OFFSET(h, i);
		}
	} else {	/* must move down */
		p[i].key = new_key;
		while ( (temp = HEAP_LEFT(i)) <= max ) {
			/* found left child */
			if (temp != max &&
			    DN_KEY_LT(p[temp+1].key, p[temp].key))
				temp++; /* select child with min key */
			if (DN_KEY_LT(p[temp].key, new_key)) { /* go down */
				HEAP_SWAP(p[i], p[temp], buf);
				SET_OFFSET(h, i);
			} else
				break;
			i = temp;
		}
	}
	SET_OFFSET(h, i);
}
#endif /* heap_move, unused */

/*
 * heapify() will reorganize data inside an array to maintain the
 * heap property. It is needed when we delete a bunch of entries.
 */
static void
heapify(struct dn_heap *h)
{
	int i;

	for (i = 0; i < h->elements; i++ )
		heap_insert(h, i , NULL);
}

int
heap_scan(struct dn_heap *h, int (*fn)(void *, uintptr_t),
	uintptr_t arg)
{
	int i, ret, found;

	for (i = found = 0 ; i < h->elements ;) {
		ret = fn(h->p[i].object, arg);
		if (ret & HEAP_SCAN_DEL) {
			h->elements-- ;
			h->p[i] = h->p[h->elements] ;
			found++ ;
		} else
			i++ ;
		if (ret & HEAP_SCAN_END)
			break;
	}
	if (found)
		heapify(h);
	return found;
}

/*
 * cleanup the heap and free data structure
 */
void
heap_free(struct dn_heap *h)
{
	if (h->size >0 )
		free(h->p, M_DN_HEAP);
	bzero(h, sizeof(*h) );
}

/*
 * hash table support.
 */

struct dn_ht {
	int buckets;		/* how many buckets */
	int entries;		/* how many entries */
	int ofs;		/* offset of link field */
	uint32_t (*hash)(uintptr_t, int, void *arg);
	int (*match)(void *_el, uintptr_t key, int, void *);
	void *(*new)(uintptr_t, int, void *);
	void **ht;		/* bucket heads */
};

/*
 * Initialize, allocating bucket pointers inline.
 * Recycle previous record if possible.
 * If the 'new' function is not supplied, we assume that the
 * key passed to ht_find is the same object to be stored in.
 */
struct dn_ht *
dn_ht_init(struct dn_ht *ht, int buckets, int ofs,
	uint32_t (*h)(uintptr_t, int, void *),
	int (*match)(void *, uintptr_t, int, void *),
	void *(*new)(uintptr_t, int, void *))
{
	int l;

	// printf("%s buckets %d ofs %d\n", __FUNCTION__, buckets, ofs);
	if (h == NULL || match == NULL) {
		printf("--- missing hash or match function");
		return NULL;
	}
	if (buckets < 1 || buckets > 65536)
		return NULL;
	if (ht) {	/* see if we can reuse */
		if (buckets <= ht->buckets) {
			ht->buckets = buckets;
		} else {
			/* free pointers if not allocated inline */
			if (ht->ht != (void *)(ht + 1))
				free(ht->ht, M_DN_HEAP);
			free(ht, M_DN_HEAP);
			ht = NULL;
		}
	}
	if (ht == NULL) {
		l = sizeof(*ht) + buckets * sizeof(void **);
		ht = malloc(l, M_DN_HEAP, M_NOWAIT | M_ZERO);
	}
	if (ht) {
		ht->ht = (void **)(ht + 1);
		ht->buckets = buckets;
		ht->ofs = ofs;
		ht->hash = h;
		ht->match = match;
		ht->new = new;
	}
	return ht;
}

/* dummy callback for dn_ht_free to unlink all */
static int
do_del(void *obj, void *arg)
{
	return DNHT_SCAN_DEL;
}

void
dn_ht_free(struct dn_ht *ht, int flags)
{
	if (ht == NULL)
		return;
	if (flags & DNHT_REMOVE) {
		(void)dn_ht_scan(ht, do_del, NULL);
	} else {
		if (ht->ht && ht->ht != (void *)(ht + 1))
			free(ht->ht, M_DN_HEAP);
		free(ht, M_DN_HEAP);
	}
}

int
dn_ht_entries(struct dn_ht *ht)
{
	return ht ?
ht->entries : 0; } /* lookup and optionally create or delete element */ void * dn_ht_find(struct dn_ht *ht, uintptr_t key, int flags, void *arg) { int i; void **pp, *p; if (ht == NULL) /* easy on an empty hash */ return NULL; i = (ht->buckets == 1) ? 0 : (ht->hash(key, flags, arg) % ht->buckets); // printf("%s key %p in bucket %d entries %d\n", // __FUNCTION__, (void *)key, i, ht->entries); for (pp = &ht->ht[i]; (p = *pp); pp = (void **)((char *)p + ht->ofs)) { if (flags & DNHT_MATCH_PTR) { if (key == (uintptr_t)p) break; } else if (ht->match(p, key, flags, arg)) /* found match */ break; } if (p) { if (flags & DNHT_REMOVE) { /* link in the next element */ *pp = *(void **)((char *)p + ht->ofs); *(void **)((char *)p + ht->ofs) = NULL; ht->entries--; } } else if (flags & DNHT_INSERT) { // printf("%s before calling new, bucket %d ofs %d\n", // __FUNCTION__, i, ht->ofs); p = ht->new ? ht->new(key, flags, arg) : (void *)key; // printf("%s new returns %p\n", __FUNCTION__, p); if (p) { ht->entries++; *(void **)((char *)p + ht->ofs) = ht->ht[i]; ht->ht[i] = p; } } return p; } /* * do a scan with the option to delete the object. Extract next before * running the callback because the element may be destroyed there. */ int dn_ht_scan(struct dn_ht *ht, int (*fn)(void *, void *), void *arg) { int i, ret, found = 0; void **curp, *cur, *next; // printf("%p ht %p fn %p\n", __FUNCTION__, ht, fn); if (ht == NULL || fn == NULL) return 0; for (i = 0; i < ht->buckets; i++) { curp = &ht->ht[i]; while ( (cur = *curp) != NULL) { next = *(void **)((char *)cur + ht->ofs); ret = fn(cur, arg); if (ret & DNHT_SCAN_DEL) { // printf("element %p removed\n", cur); found++; ht->entries--; *curp = next; } else { curp = (void **)((char *)cur + ht->ofs); } if (ret & DNHT_SCAN_END) return found; } } return found; } ipfw/dn_heap.h000644 000423 000000 00000016263 11326710075 014042 0ustar00luigiwheel000000 000000 /*- * Copyright (c) 1998-2010 Luigi Rizzo, Universita` di Pisa * All rights reserved * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD: user/luigi/ipfw3-head/sys/netinet/ipfw/dn_heap.h 202695 2010-01-20 13:34:05Z luigi $ */ #ifndef _IP_DN_HEAP_H #define _IP_DN_HEAP_H #define DN_KEY_LT(a,b) ((int64_t)((a)-(b)) < 0) #define DN_KEY_LEQ(a,b) ((int64_t)((a)-(b)) <= 0) /* * This module implements a binary heap supporting random extraction. * * A heap entry contains an uint64_t key and a pointer to object. * DN_KEY_LT(a,b) returns true if key 'a' is smaller than 'b' * * The heap is a struct dn_heap plus a dynamically allocated * array of dn_heap_entry entries. 'size' represents the size of * the array, 'elements' count entries in use. The topmost * element has the smallest key. * The heap supports ordered insert, and extract from the top. * To extract an object from the middle of the heap, we the object * must reserve an 'int32_t' to store the position of the object * in the heap itself, and the location of this field must be * passed as an argument to heap_init() -- use -1 if the feature * is not used. */ struct dn_heap_entry { uint64_t key; /* sorting key, smallest comes first */ void *object; /* object pointer */ }; struct dn_heap { int size; /* the size of the array */ int elements; /* elements in use */ int ofs; /* offset in the object of heap index */ struct dn_heap_entry *p; /* array of "size" entries */ }; enum { HEAP_SCAN_DEL = 1, HEAP_SCAN_END = 2, }; /* * heap_init() reinitializes the heap setting the size and the offset * of the index for random extraction (use -1 if not used). * The 'elements' counter is set to 0. * * SET_HEAP_OFS() indicates where, in the object, is stored the index * for random extractions from the heap. * * heap_free() frees the memory associated to a heap. * * heap_insert() adds a key-pointer pair to the heap * * HEAP_TOP() returns a pointer to the top element of the heap, * but makes no checks on its existance (XXX should we change ?) * * heap_extract() removes the entry at the top, returing the pointer. * (the key should have been read before). * * heap_scan() invokes a callback on each entry of the heap. * The callback can return a combination of HEAP_SCAN_DEL and * HEAP_SCAN_END. HEAP_SCAN_DEL means the current element must * be removed, and HEAP_SCAN_END means to terminate the scan. * heap_scan() returns the number of elements removed. * Because the order is not guaranteed, we should use heap_scan() * only as a last resort mechanism. */ #define HEAP_TOP(h) ((h)->p) #define SET_HEAP_OFS(h, n) do { (h)->ofs = n; } while (0) int heap_init(struct dn_heap *h, int size, int ofs); int heap_insert(struct dn_heap *h, uint64_t key1, void *p); void heap_extract(struct dn_heap *h, void *obj); void heap_free(struct dn_heap *h); int heap_scan(struct dn_heap *, int (*)(void *, uintptr_t), uintptr_t); /*------------------------------------------------------ * This module implements a generic hash table with support for * running callbacks on the entire table. To avoid allocating * memory during hash table operations, objects must reserve * space for a link field. XXX if the heap is moderately full, * an SLIST suffices, and we can tolerate the cost of a hash * computation on each removal. * * dn_ht_init() initializes the table, setting the number of * buckets, the offset of the link field, the main callbacks. * Callbacks are: * * hash(key, flags, arg) called to return a bucket index. * match(obj, key, flags, arg) called to determine if key * matches the current 'obj' in the heap * new(key, flags, arg) optional, used to allocate a new * object during insertions. 
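 *
 * As an illustration, for objects embedding a numeric id and a link
 * pointer the three callbacks might look as follows ('struct obj' and
 * the callback names are hypothetical, only a sketch):
 *
 *	struct obj { struct obj *ht_link; uint32_t id; };
 *
 *	static uint32_t
 *	obj_hash(uintptr_t key, int flags, void *arg)
 *	{
 *		return (flags & DNHT_KEY_IS_OBJ) ?
 *			((struct obj *)key)->id : (uint32_t)key;
 *	}
 *
 *	static int
 *	obj_match(void *o, uintptr_t key, int flags, void *arg)
 *	{
 *		uint32_t id = (flags & DNHT_KEY_IS_OBJ) ?
 *			((struct obj *)key)->id : (uint32_t)key;
 *		return ((struct obj *)o)->id == id;
 *	}
 *
 *	ht = dn_ht_init(NULL, 64, offsetof(struct obj, ht_link),
 *		obj_hash, obj_match, NULL);
 *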
* * dn_ht_free() frees the heap or unlink elements. * DNHT_REMOVE unlink elements, 0 frees the heap. * You need two calls to do both. * * dn_ht_find() is the main lookup function, which can also be * used to insert or delete elements in the hash table. * The final 'arg' is passed to all callbacks. * * dn_ht_scan() is used to invoke a callback on all entries of * the heap, or possibly on just one bucket. The callback * is invoked with a pointer to the object, and must return * one of DNHT_SCAN_DEL or DNHT_SCAN_END to request the * removal of the object from the heap and the end of the * scan, respectively. * * A combination of flags can be used to modify the operation * of the dn_ht_find(), and of the callbacks: * * DNHT_KEY_IS_OBJ means the key is the object pointer. * It is usally of interest for the hash and match functions. * * DNHT_MATCH_PTR during a lookup, match pointers instead * of calling match(). Normally used when removing specific * entries. Does not imply KEY_IS_OBJ as the latter _is_ used * by the match function. * * DNHT_INSERT insert the element if not found. * Calls new() to allocates a new object unless * DNHT_KEY_IS_OBJ is set. * * DNHT_UNIQUE only insert if object not found. * XXX should it imply DNHT_INSERT ? * * DNHT_REMOVE remove objects if we find them. */ struct dn_ht; /* should be opaque */ struct dn_ht *dn_ht_init(struct dn_ht *, int buckets, int ofs, uint32_t (*hash)(uintptr_t, int, void *), int (*match)(void *, uintptr_t, int, void *), void *(*new)(uintptr_t, int, void *)); void dn_ht_free(struct dn_ht *, int flags); void *dn_ht_find(struct dn_ht *, uintptr_t, int, void *); int dn_ht_scan(struct dn_ht *, int (*)(void *, void *), void *); int dn_ht_entries(struct dn_ht *); enum { /* flags values. * first two are returned by the scan callback to indicate * to delete the matching element or to end the scan */ DNHT_SCAN_DEL = 0x0001, DNHT_SCAN_END = 0x0002, DNHT_KEY_IS_OBJ = 0x0004, /* key is the obj pointer */ DNHT_MATCH_PTR = 0x0008, /* match by pointer, not match() */ DNHT_INSERT = 0x0010, /* insert if not found */ DNHT_UNIQUE = 0x0020, /* report error if already there */ DNHT_REMOVE = 0x0040, /* remove on find or dn_ht_free */ }; #endif /* _IP_DN_HEAP_H */ ipfw/dn_sched.h000644 000423 000000 00000013162 11334513576 014214 0ustar00luigiwheel000000 000000 /* * Copyright (c) 2010 Riccardo Panicucci, Luigi Rizzo, Universita` di Pisa * All rights reserved * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * The API to write a packet scheduling algorithm for dummynet. */ #ifndef _DN_SCHED_H #define _DN_SCHED_H #define DN_MULTIQUEUE 0x01 /* * Descriptor for a scheduling algorithm. * Contains all function pointers for a given scheduler * This is typically created when a module is loaded, and stored * in a global list of schedulers. */ struct dn_alg { uint32_t type; /* the scheduler type */ const char *name; /* scheduler name */ uint32_t flags; /* DN_MULTIQUEUE if supports multiple queues */ /* * The following define the size of 3 optional data structures * that may need to be allocated at runtime, and are appended * to each of the base data structures: scheduler, sched.inst, * and queue. We don't have a per-flowset structure. */ /* + parameters attached to the template, e.g. * default queue sizes, weights, quantum size, and so on; */ size_t schk_datalen; /* + per-instance parameters, such as timestamps, * containers for queues, etc; */ size_t si_datalen; size_t q_datalen; /* per-queue parameters (e.g. S,F) */ /* * Methods implemented by the scheduler: * enqueue enqueue packet 'm' on scheduler 's', queue 'q'. * q is NULL for !MULTIQUEUE. * Return 0 on success, 1 on drop (packet consumed anyways). * * dequeue Called when scheduler instance 's' can * dequeue a packet. Return NULL if none are available. * XXX what about non work-conserving ? * * config called on 'sched X config ...', normally writes * in the area of size sch_arg * * destroy called on 'sched delete', frees everything * in sch_arg (other parts are handled by more specific * functions) * * new_sched called when a new instance is created, e.g. * to create the local queue for !MULTIQUEUE, set V or * copy parameters for WFQ, and so on. * * free_sched called when deleting an instance, cleans * extra data in the per-instance area. * * new_fsk called when a flowset is linked to a scheduler, * e.g. to validate parameters such as weights etc. * free_fsk when a flowset is unlinked from a scheduler. * (probably unnecessary) * * new_queue called to set the per-queue parameters, * e.g. S and F, adjust sum of weights in the parent, etc. * If the queue has packets in it, add them to the scheduler * as well. * * free_queue actions related to a queue removal, e.g. undo * all the above. If the queue has data in it, also remove * from the scheduler. This can e.g. happen during a reconfigure. 
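 *
 * For instance, a scheduler declaring si_datalen = sizeof(struct my_si)
 * finds its private per-instance area right past the dn_sch_inst, as the
 * schedulers below do ('struct my_si' and the function name are only
 * illustrative):
 *
 *	static int
 *	my_new_sched(struct dn_sch_inst *si)
 *	{
 *		struct my_si *priv = (struct my_si *)(si + 1);
 *
 *		bzero(priv, sizeof(*priv));	(or whatever setup is needed)
 *		return 0;
 *	}
 *
 * The same convention applies to schk_datalen (area appended to the
 * scheduler template) and q_datalen (area appended to each dn_queue).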
*/ int (*enqueue)(struct dn_sch_inst *, struct dn_queue *, struct mbuf *); struct mbuf * (*dequeue)(struct dn_sch_inst *); int (*config)(struct dn_schk *); int (*destroy)(struct dn_schk*); int (*new_sched)(struct dn_sch_inst *); int (*free_sched)(struct dn_sch_inst *); int (*new_fsk)(struct dn_fsk *f); int (*free_fsk)(struct dn_fsk *f); int (*new_queue)(struct dn_queue *q); int (*free_queue)(struct dn_queue *q); /* run-time fields */ int ref_count; /* XXX number of instances in the system */ SLIST_ENTRY(dn_alg) next; /* Next scheduler in the list */ }; /* MSVC does not support initializers so we need this ugly macro */ #ifdef _WIN32 #define _SI(fld) #else #define _SI(fld) fld #endif /* * Additionally, dummynet exports some functions and macros * to be used by schedulers: */ void dn_free_pkts(struct mbuf *mnext); int dn_enqueue(struct dn_queue *q, struct mbuf* m, int drop); /* bound a variable between min and max */ int ipdn_bound_var(int *v, int dflt, int lo, int hi, const char *msg); /* * Extract the head of a queue, update stats. Must be the very last * thing done on a dequeue as the queue itself may go away. */ static __inline struct mbuf* dn_dequeue(struct dn_queue *q) { struct mbuf *m = q->mq.head; if (m == NULL) return NULL; q->mq.head = m->m_nextpkt; q->ni.length--; q->ni.len_bytes -= m->m_pkthdr.len; if (q->_si) { q->_si->ni.length--; q->_si->ni.len_bytes -= m->m_pkthdr.len; } return m; } int dn_sched_modevent(module_t mod, int cmd, void *arg); #define DECLARE_DNSCHED_MODULE(name, dnsched) \ static moduledata_t name##_mod = { \ #name, dn_sched_modevent, dnsched \ }; \ DECLARE_MODULE(name, name##_mod, \ SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY); \ MODULE_DEPEND(name, dummynet, 3, 3, 3); #endif /* _DN_SCHED_H */ ipfw/dn_sched_fifo.c000644 000423 000000 00000007257 11334513576 015222 0ustar00luigiwheel000000 000000 /* * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa * All rights reserved * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifdef _KERNEL #include #include #include #include #include #include #include /* IFNAMSIZ */ #include #include /* ipfw_rule_ref */ #include /* flow_id */ #include #include #include #include #else #include #endif /* * This file implements a FIFO scheduler for a single queue. 
* The queue is allocated as part of the scheduler instance, * and there is a single flowset is in the template which stores * queue size and policy. * Enqueue and dequeue use the default library functions. */ static int fifo_enqueue(struct dn_sch_inst *si, struct dn_queue *q, struct mbuf *m) { /* XXX if called with q != NULL and m=NULL, this is a * re-enqueue from an existing scheduler, which we should * handle. */ q = (struct dn_queue *)(si+1); mq_append(&q->mq, m); q->ni.length++; q->ni.tot_bytes += m->m_pkthdr.len; return 0; } static struct mbuf * fifo_dequeue(struct dn_sch_inst *si) { return dn_dequeue((struct dn_queue *)(si + 1)); } static int fifo_new_sched(struct dn_sch_inst *si) { /* This scheduler instance contains the queue */ struct dn_queue *q = (struct dn_queue *)(si + 1); set_oid(&q->ni.oid, DN_QUEUE, sizeof(*q)); q->_si = si; q->fs = si->sched->fs; return 0; } static int fifo_free_sched(struct dn_sch_inst *si) { struct dn_queue *q = (struct dn_queue *)(si + 1); dn_free_pkts(q->mq.head); bzero(q, sizeof(*q)); return 0; } /* * FIFO scheduler descriptor * contains the type of the scheduler, the name, the size of extra * data structures, and function pointers. */ static struct dn_alg fifo_desc = { _SI( .type = ) DN_SCHED_FIFO, _SI( .name = ) "FIFO", _SI( .flags = ) 0, _SI( .schk_datalen = ) 0, _SI( .si_datalen = ) sizeof(struct dn_queue), _SI( .q_datalen = ) 0, _SI( .enqueue = ) fifo_enqueue, _SI( .dequeue = ) fifo_dequeue, _SI( .config = ) NULL, _SI( .destroy = ) NULL, _SI( .new_sched = ) fifo_new_sched, _SI( .free_sched = ) fifo_free_sched, _SI( .new_fsk = ) NULL, _SI( .free_fsk = ) NULL, _SI( .new_queue = ) NULL, _SI( .free_queue = ) NULL, }; DECLARE_DNSCHED_MODULE(dn_fifo, &fifo_desc); ipfw/dn_sched_kps.c000644 000423 000000 00000042277 11333321023 015054 0ustar00luigiwheel000000 000000 /* * Copyright (c) 2010 .... * All rights reserved * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifdef _KERNEL #include #include #include #include #include #include #include /* IFNAMSIZ */ #include #include /* ipfw_rule_ref */ #include /* flow_id */ #include #include #include #include #else #include #endif #define DN_SCHED_KPS 18 // XXX Where? /* Maximum number of levels and number of buckets per level. */ #define KPS_MAX_LEVELS 32 #define KPS_MAX_BUCKETS 256 /* Mask used to retrieve the bucket number. 
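 * With KPS_MAX_BUCKETS = 256 the mask is 0xff, so a (possibly large)
 * slot index is reduced to a bucket index with
 * 'kps_slot_to_bucket(...) & KPS_BUCKETS_MASK', as kps_get_bucket()
 * does below.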
*/ #define KPS_BUCKETS_MASK (KPS_MAX_BUCKETS - 1) /* Allowed weights are in [1, 2^KPS_MAX_WSHIFT]. */ #define KPS_MAX_WSHIFT 16 #define KPS_MAX_WEIGHT (1<prev = (l)->next = (l); } while (0) #define list_empty(l) ( (l)->next == l ) static inline void __list_add(struct list_head *new, struct list_head *prev, struct list_head *next) { next->prev = new; new->next = next; new->prev = prev; prev->next = new; } static inline void list_add_tail(struct list_head *new, struct list_head *head) { __list_add(new, head->prev, head); } #define list_first_entry(pL, ty, member) \ (ty *)((char *)((pL)->next) - offsetof(ty, member)) static inline void __list_del(struct list_head *prev, struct list_head *next) { next->prev = prev; prev->next = next; } static inline void list_del(struct list_head *entry) { __list_del(entry->prev, entry->next); entry->next = entry->prev = NULL; } #endif /* INIT_LIST_HEAD */ typedef uint64_t u64; typedef int64_t s64; typedef uint32_t u32; typedef uint32_t bitmap; #define test_bit(ix, pData) ((*pData) & (1<<(ix))) #define __set_bit(ix, pData) (*pData) |= (1<<(ix)) #define __clear_bit(ix, pData) (*pData) &= ~(1<<(ix)) /* SIWFQ flow descriptor. */ struct kps_class { struct dn_queue _q; /* Link to create a per-bucket linked list. */ struct list_head list; // tailq entry u64 S, F; /* Precise timestamps of the class. */ unsigned int weight; unsigned int lmax; /* Max packet length for this class. */ unsigned int level; /* Class' level. */ }; /* Interleaved-Stratified Timer Wheel */ struct kps_istw { bitmap levelbits; /* Bits set indicate flows at the given level. */ /* Bits set indicate flows in the head bucket at the given level. */ bitmap frontbits; int nr_flows[KPS_MAX_LEVELS]; /* Number of flows per each level. */ /* The actual buckets. */ struct list_head buckets[KPS_MAX_LEVELS][KPS_MAX_BUCKETS]; }; /* scheduler descriptor. */ struct kps_sched { u64 V; /* Precise virtual time. */ /* Active and blocked containers. */ struct kps_istw blocked; struct kps_istw active; struct mq tx_queue; /* List of packets ready to be transmitted. */ unsigned long wsum; /* Weights sum up to one, keep a counter to enforce it. 
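 * The sum is bounded by KPS_MAX_WSUM: kps_new_queue() below refuses a
 * queue whose weight would push the counter past that limit.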
*/ }; static inline int kps_get_level(u32 mask) { if (mask & 1) return 0; return ffs(mask >> 1); } static u64 kps_get_rel_rate(unsigned int weight) { u64 rate = (u64)weight << KPS_FP_SHIFT; return rate / KPS_MAX_WSUM; } static inline int kps_floor_log2(unsigned int x) { x |= x >> 1; x |= x >> 2; x |= x >> 4; x |= x >> 8; x |= x >> 16; return ffs((x ^ (x >> 1)) >> 1); } static int kps_get_rate_level(unsigned int plen, unsigned int rate) { u64 slot_size = ((u64)plen << KPS_FP_SHIFT) / rate; return kps_floor_log2(slot_size >> KPS_MIN_PSHIFT); } #if 0 static int kps_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca, unsigned long *arg) { struct kps_sched *q = qdisc_priv(sch); struct kps_class *cl = (struct kps_class *)*arg; struct nlattr *tb[TCA_KPS_MAX + 1]; spinlock_t *lock; u32 weight, lmax; int err; if (tca[TCA_OPTIONS] == NULL) return -EINVAL; err = nla_parse_nested(tb, TCA_QFQ_MAX, tca[TCA_OPTIONS], kps_policy); if (err < 0) return err; if (tb[TCA_KPS_WEIGHT]) { weight = nla_get_u32(tb[TCA_KPS_WEIGHT]); if (!weight || weight > (1 << KPS_MAX_WSHIFT)) return -EINVAL; } else weight = 1; if (tb[TCA_KPS_LMAX]) { lmax = nla_get_u32(tb[TCA_KPS_LMAX]); if (lmax < (1UL << KPS_MIN_PSHIFT) || lmax > (1UL << KPS_MTU_SHIFT)) return -EINVAL; } else lmax = 1UL << KPS_MTU_SHIFT; if (cl != NULL) { if (tca[TCA_RATE]) { lock = qdisc_root_sleeping_lock(sch); err = gen_replace_estimator(&cl->bstats, &cl->rate_est, lock, tca[TCA_RATE]); if (err) return err; } err = 0; sch_tree_lock(sch); if (tb[TCA_KPS_WEIGHT]) { if (q->wsum - cl->weight + weight > KPS_MAX_WSUM) return -EINVAL; q->wsum += weight - cl->weight; cl->weight = weight; } sch_tree_unlock(sch); return 0; } cl = kmem_cache_alloc(kps_pool, GFP_KERNEL | __GFP_ZERO); if (cl == NULL) return -ENOBUFS; cl->refcnt = 1; cl->common.classid = classid; cl->weight = weight; cl->rate = kps_get_rel_rate(weight); INIT_LIST_HEAD(&cl->list); cl->level = kps_get_rate_level(lmax, cl->rate); q->wsum += cl->weight; cl->qdisc = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue, &pfifo_qdisc_ops, classid); if (cl->qdisc == NULL) cl->qdisc = &noop_qdisc; if (tca[TCA_RATE]) { lock = qdisc_root_sleeping_lock(sch); err = gen_replace_estimator(&cl->bstats, &cl->rate_est, lock, tca[TCA_RATE]); if (err) { qdisc_destroy(cl->qdisc); kmem_cache_free(kps_pool, cl); return err; } } sch_tree_lock(sch); qdisc_class_hash_insert(&q->clhash, &cl->common); sch_tree_unlock(sch); qdisc_class_hash_grow(sch, &q->clhash); *arg = (unsigned long)cl; return 0; } static void kps_destroy_class(struct Qdisc *sch, struct kps_class *cl) { //struct kps_sched *q = (struct kps_sched *)sch; /* Do not support the destruction of an active class... */ gen_kill_estimator(&cl->bstats, &cl->rate_est); qdisc_destroy(cl->qdisc); kmem_cache_free(kps_pool, cl); } static int kps_delete_class(struct Qdisc *sch, unsigned long arg) { struct kps_sched *q = qdisc_priv(sch); struct kps_class *cl = (struct kps_class *)arg; if (cl->filter_cnt > 0) return -EBUSY; sch_tree_lock(sch); kps_purge_queue(cl); qdisc_class_hash_remove(&q->clhash, &cl->common); if (--cl->refcnt == 0) kps_destroy_class(sch, cl); sch_tree_unlock(sch); return 0; } #endif /* Compare two timestamps, handling wraparound. 
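 * For example, if 'a' has wrapped around while 'b' has not (say a = 2
 * and b = 2^64 - 4), the unsigned difference a - b is 6, so
 * (s64)(a - b) > 0 and 'a' is still correctly considered the later
 * timestamp, even though a < b as unsigned values.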
*/ static inline int kps_gt(u64 a, u64 b) { return (s64)(a - b) > 0; } static inline int kps_slot_gt(u64 a, u64 b) { return (s64)((a << KPS_SLOT_SHIFT) - (b << KPS_SLOT_SHIFT)) > 0; } static inline void kps_calc_finish(struct kps_class *cl, unsigned int len) { u64 delta = ((u64)len << KPS_FP_SHIFT)/cl->weight; cl->F = cl->S + delta; } static inline u64 kps_slot_to_bucket(u64 slot, int level) { return (slot - (1 << level)) >> (level + 1); } static inline u64 kps_bucket_to_slot(u64 bucket, int level) { return (bucket << (level + 1)) + (1 << level); } static inline u64 kps_round_slot(u64 slot, int level) { return kps_bucket_to_slot(kps_slot_to_bucket(slot, level), level); } static inline u64 kps_vtime_slot(u64 t) { return (u64)(t >> KPS_SLOT_SHIFT); } static inline struct list_head *kps_get_bucket(struct kps_istw *istw, u64 slot, int level) { int bucket = kps_slot_to_bucket(slot, level) & KPS_BUCKETS_MASK; return &istw->buckets[level][bucket]; } static inline void kps_istw_insert(struct kps_istw *istw, struct kps_class *cl, u64 slot, int level) { struct list_head *bucket = kps_get_bucket(istw, slot, level); istw->nr_flows[level]++; __set_bit(level, &istw->levelbits); list_add_tail(&cl->list, bucket); } /* Insert a flow into given ISTW container. */ static inline void kps_insert_flow(struct kps_sched *q, struct kps_class *cl, unsigned int len) { u64 slot; kps_calc_finish(cl, len); /* Calculate the start time slot. */ slot = kps_vtime_slot(cl->S) - (2ULL << cl->level); if (!kps_slot_gt(kps_round_slot(slot, cl->level), kps_vtime_slot(q->V))) { /* We're eligible, insert into active at the right slot. */ slot = kps_vtime_slot(cl->F) + (2ULL << cl->level); kps_istw_insert(&q->active, cl, slot, cl->level); ND("ENQ %p ACT slot = %llx, len = %d, weight = %u", cl, slot, len, cl->weight); } else { /* Ineligible, insert into blocked. */ kps_istw_insert(&q->blocked, cl, slot, cl->level); ND("ENQ %p BLK slot = %llx, len = %d, weight = %u", cl, slot, len, cl->weight); } } /* Remove the first flow from a slot/level. */ static inline struct kps_class * kps_remove(struct kps_istw *istw, u64 slot, int level) { struct list_head *bucket = kps_get_bucket(istw, slot, level); struct kps_class *cl; cl = list_first_entry(bucket, struct kps_class, list); list_del(&cl->list); istw->nr_flows[level]--; if (!istw->nr_flows[level]) __clear_bit(level, &istw->levelbits); return cl; } static inline struct kps_class * kps_istw_search(struct kps_istw *istw, u64 slot) { int level, step; /* No flows, nothing to search. */ if (!istw->levelbits) return NULL; /* Start from the first slot of the lowest level with some flows. */ level = kps_get_level(istw->levelbits); slot = kps_round_slot(slot, level); /* Skip levels below the first non-empty one. */ step = 1 << level; /* * Iterate over the buckets, calculating on-the-fly the level of the * slot we are looking up. */ while (list_empty(kps_get_bucket(istw, slot, level))) { slot += step; level = kps_get_level(slot); } /* Return the first element, removing it from the container.*/ return kps_remove(istw, slot, level); } static struct kps_class * kps_istw_scan(struct kps_istw *istw, u64 slot) { int level = kps_get_level(slot); struct kps_class *cl; /* Update frontbits if we entered a non-empty slot. */ if (!list_empty(kps_get_bucket(istw, slot, level))) __set_bit(level, &istw->frontbits); /* Nothing to return. */ if (!istw->frontbits) return NULL; /* The first non-empty bucket is at the level indicated by frontbits. 
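 * (E.g. frontbits = 0x4 means the head bucket of level 2 is the lowest
 * non-empty one, so the flow is taken from that bucket.)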
*/ level = kps_get_level(istw->frontbits); cl = kps_remove(istw, slot, level); /* Update frontbits if necessary. */ if (list_empty(kps_get_bucket(istw, slot, level))) __clear_bit(level, &istw->frontbits); return cl; } struct mbuf * kps_dequeue(struct dn_sch_inst *_si) { struct kps_sched *q = (struct kps_sched *)(_si + 1); struct mbuf *m = NULL, *m2; struct kps_class *cl; u64 slot, sslot, old_vslot; struct list_head *bucket; int level; int len; old_vslot = kps_vtime_slot(q->V); m = q->tx_queue.head; if (m != NULL) { q->tx_queue.head = m->m_nextpkt; m->m_nextpkt = NULL; cl = NULL; } else { if (!q->active.levelbits) goto out; cl = kps_istw_search(&q->active, old_vslot); m = dn_dequeue(&cl->_q); if (!m) return NULL; /* Update flow timestamps. */ cl->S = cl->F; } /* Update system virtual time. */ q->V += (u64)m->m_pkthdr.len << KPS_ONE_SHIFT; if (cl && cl->_q.mq.head) { len = cl->_q.mq.head->m_pkthdr.len; kps_insert_flow(q, cl, len); } /* * Unlike what's in the paper, it seems that we need to start one * slot before the virtual time: While the slot is effectively cleared * before the virtual time increases of the same span covered by * the slot, it seems that due to unaligment between the beginning * of the slot and the value of the virtual time, starting exactly * from old_vslot leaves flows behind. (Should prove that or * something...) */ slot = old_vslot ; // - 1; // XXX do we need -1 ? while (kps_slot_gt(kps_vtime_slot(q->V), slot) && q->blocked.levelbits) { if (!q->active.levelbits && !q->blocked.frontbits) { cl = kps_istw_search(&q->blocked, slot); slot = kps_vtime_slot(cl->S); slot = kps_round_slot(slot, cl->level) - (2 << cl->level); /* * Another difference with the paper: This should * be the only case where virtual time jumps. */ if (kps_slot_gt(slot, kps_vtime_slot(q->V))) { q->V = slot << KPS_SLOT_SHIFT; old_vslot = slot; ND("JMP %llx/%llx\n", q->V, slot); } } else { cl = kps_istw_scan(&q->blocked, slot); slot++; } if (cl) { /* Do the transfer. */ sslot = kps_vtime_slot(cl->F) + (2ULL << cl->level); kps_istw_insert(&q->active, cl, sslot, cl->level); } } slot = old_vslot; while (kps_slot_gt(kps_vtime_slot(q->V), slot)) { level = kps_get_level(slot); bucket = kps_get_bucket(&q->active, slot, level); while (!list_empty(bucket)) { cl = kps_remove(&q->active, slot, level); m2 = dn_dequeue(&cl->_q); mq_append(&q->tx_queue, m2); cl->S = cl->F; if (cl->_q.mq.head) { len = cl->_q.mq.head->m_pkthdr.len; kps_insert_flow(q, cl, len); } } slot++; } out: return m; } int kps_enqueue(struct dn_sch_inst *_si, struct dn_queue *_q, struct mbuf *m) { struct kps_sched *q = (struct kps_sched *)(_si + 1); struct kps_class *cl = (struct kps_class *)_q; u64 rlimit; /* verify length */ if (m != _q->mq.head) { if (dn_enqueue(_q, m, 0)) /* packet was dropped */ return 1; if (m != _q->mq.head) return 0; } if (m == _q->mq.head) { /* * This ugly extra check comes from the need of avoiding * flows with too old timestamps reappearing after a * vtime wraparound and being put too far in the future. 
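 * In other words, the previous S is reused only when it lies in the
 * window [V, V + (5 << level)]; anything outside that window is
 * treated as stale and clamped back to the current virtual time.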
*/ rlimit = q->V + (5ULL << cl->level); if (kps_gt(q->V, cl->S) || kps_gt(cl->S, rlimit)) cl->S = q->V; else if (!q->active.levelbits) q->V = cl->S; kps_insert_flow(q, cl, m->m_pkthdr.len); } return 0; } static int kps_new_queue(struct dn_queue *_q) { struct kps_sched *q = (struct kps_sched *)(_q->_si + 1); struct kps_class *cl = (struct kps_class *)_q; uint32_t w; /* approximated weight */ u64 rate; w = _q->fs->fs.par[0]; cl->lmax = _q->fs->fs.par[1]; if (!w || w > KPS_MAX_WEIGHT) { w = 1; D("rounding weight to 1"); } if (q->wsum + w > KPS_MAX_WSUM) return EINVAL; rate = kps_get_rel_rate(w); cl->weight = w; cl->level = kps_get_rate_level(cl->lmax, rate); return 0; } static int kps_free_queue(struct dn_queue *_q) { struct kps_sched *q = (struct kps_sched *)(_q->_si + 1); struct kps_class *cl = (struct kps_class *)(_q + 1); if (cl->weight) { q->wsum -= cl->weight; cl->weight = 0; /* reset weight to avoid run twice */ } return 0; } static int kps_new_fsk(struct dn_fsk *f) { ipdn_bound_var(&f->fs.par[0], 1, 1, KPS_MAX_WEIGHT, "qfq weight"); ipdn_bound_var(&f->fs.par[1], 1500, 1, 2000, "qfq maxlen"); ND("weight %d len %d\n", f->fs.par[0], f->fs.par[1]); return 0; } static int kps_new_sched(struct dn_sch_inst *si) { struct kps_sched *q = (struct kps_sched *)(si + 1); int i, j; for (i = 0; i < KPS_MAX_LEVELS; i++) { for (j = 0; j < KPS_MAX_BUCKETS; j++) { INIT_LIST_HEAD(&q->blocked.buckets[i][j]); INIT_LIST_HEAD(&q->active.buckets[i][j]); } } return 0; } /* * SIWFQ scheduler descriptor */ static struct dn_alg kps_desc = { .type = DN_SCHED_KPS, .name = "KPS", .flags = DN_MULTIQUEUE, .si_datalen = sizeof(struct kps_sched), .q_datalen = sizeof(struct kps_class) - sizeof(struct dn_queue), .enqueue = kps_enqueue, .dequeue = kps_dequeue, #if 0 .config = rr_config, #endif .new_sched = kps_new_sched, .new_fsk = kps_new_fsk, .new_queue = kps_new_queue, .free_queue = kps_free_queue, }; DECLARE_DNSCHED_MODULE(dn_kps, &kps_desc); ipfw/dn_sched_qfq.c000644 000423 000000 00000055260 11334513576 015063 0ustar00luigiwheel000000 000000 /* * Copyright (c) 2010 Fabio Checconi, Luigi Rizzo, Paolo Valente * All rights reserved * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #ifdef _KERNEL #include #include #include #include #include #include #include /* IFNAMSIZ */ #include #include /* ipfw_rule_ref */ #include /* flow_id */ #include #include #include #include #else #include #endif #ifdef QFQ_DEBUG struct qfq_sched; static void dump_sched(struct qfq_sched *q, const char *msg); #define NO(x) x #else #define NO(x) #endif #define DN_SCHED_QFQ 4 // XXX Where? typedef unsigned long bitmap; /* * bitmaps ops are critical. Some linux versions have __fls * and the bitmap ops. Some machines have ffs */ #if defined(_WIN32) static int fls(unsigned long n) { int i = 0; for (i = 0; n > 0; n >>= 1, i++) ; return i; } #endif #if !defined(_KERNEL) || defined( __FreeBSD__ ) || defined(_WIN32) static inline unsigned long __fls(unsigned long word) { return fls(word) - 1; } #endif #if !defined(_KERNEL) || !defined(__linux__) #ifdef QFQ_DEBUG int test_bit(int ix, bitmap *p) { if (ix < 0 || ix > 31) D("bad index %d", ix); return *p & (1< 31) D("bad index %d", ix); *p |= (1< 31) D("bad index %d", ix); *p &= ~(1<index = 0 *.__grp->slot_shift where MIN_SLOT_SHIFT is derived by difference from the others. The max group index corresponds to Lmax/w_min, where Lmax=1<group mapping. Class weights are * in the range [1, QFQ_MAX_WEIGHT], we to map each class i to the * group with the smallest index that can support the L_i / r_i * configured for the class. * * grp->index is the index of the group; and grp->slot_shift * is the shift for the corresponding (scaled) sigma_i. * * When computing the group index, we do (len<i_wsum) #define IWSUM ((1< 0; } /* Round a precise timestamp to its slotted value. */ static inline uint64_t qfq_round_down(uint64_t ts, unsigned int shift) { return ts & ~((1ULL << shift) - 1); } /* return the pointer to the group with lowest index in the bitmap */ static inline struct qfq_group *qfq_ffs(struct qfq_sched *q, unsigned long bitmap) { int index = ffs(bitmap) - 1; // zero-based return &q->groups[index]; } /* * Calculate a flow index, given its weight and maximum packet length. * index = log_2(maxlen/weight) but we need to apply the scaling. * This is used only once at flow creation. */ static int qfq_calc_index(uint32_t inv_w, unsigned int maxlen) { uint64_t slot_size = (uint64_t)maxlen *inv_w; unsigned long size_map; int index = 0; size_map = (unsigned long)(slot_size >> QFQ_MIN_SLOT_SHIFT); if (!size_map) goto out; index = __fls(size_map) + 1; // basically a log_2() index -= !(slot_size - (1ULL << (index + QFQ_MIN_SLOT_SHIFT - 1))); if (index < 0) index = 0; out: ND("W = %d, L = %d, I = %d\n", ONE_FP/inv_w, maxlen, index); return index; } /*---- end support functions ----*/ /*-------- API calls --------------------------------*/ /* * Validate and copy parameters from flowset. */ static int qfq_new_queue(struct dn_queue *_q) { struct qfq_sched *q = (struct qfq_sched *)(_q->_si + 1); struct qfq_class *cl = (struct qfq_class *)_q; int i; uint32_t w; /* approximated weight */ /* import parameters from the flowset. They should be correct * already. */ w = _q->fs->fs.par[0]; cl->lmax = _q->fs->fs.par[1]; if (!w || w > QFQ_MAX_WEIGHT) { w = 1; D("rounding weight to 1"); } cl->inv_w = ONE_FP/w; w = ONE_FP/cl->inv_w; if (q->wsum + w > QFQ_MAX_WSUM) return EINVAL; i = qfq_calc_index(cl->inv_w, cl->lmax); cl->grp = &q->groups[i]; q->wsum += w; // XXX cl->S = q->V; ? 
// XXX compute q->i_wsum return 0; } /* remove an empty queue */ static int qfq_free_queue(struct dn_queue *_q) { struct qfq_sched *q = (struct qfq_sched *)(_q->_si + 1); struct qfq_class *cl = (struct qfq_class *)_q; if (cl->inv_w) { q->wsum -= ONE_FP/cl->inv_w; cl->inv_w = 0; /* reset weight to avoid run twice */ } return 0; } /* Calculate a mask to mimic what would be ffs_from(). */ static inline unsigned long mask_from(unsigned long bitmap, int from) { return bitmap & ~((1UL << from) - 1); } /* * The state computation relies on ER=0, IR=1, EB=2, IB=3 * First compute eligibility comparing grp->S, q->V, * then check if someone is blocking us and possibly add EB */ static inline unsigned int qfq_calc_state(struct qfq_sched *q, struct qfq_group *grp) { /* if S > V we are not eligible */ unsigned int state = qfq_gt(grp->S, q->V); unsigned long mask = mask_from(q->bitmaps[ER], grp->index); struct qfq_group *next; if (mask) { next = qfq_ffs(q, mask); if (qfq_gt(grp->F, next->F)) state |= EB; } return state; } /* * In principle * q->bitmaps[dst] |= q->bitmaps[src] & mask; * q->bitmaps[src] &= ~mask; * but we should make sure that src != dst */ static inline void qfq_move_groups(struct qfq_sched *q, unsigned long mask, int src, int dst) { q->bitmaps[dst] |= q->bitmaps[src] & mask; q->bitmaps[src] &= ~mask; } static inline void qfq_unblock_groups(struct qfq_sched *q, int index, uint64_t old_finish) { unsigned long mask = mask_from(q->bitmaps[ER], index + 1); struct qfq_group *next; if (mask) { next = qfq_ffs(q, mask); if (!qfq_gt(next->F, old_finish)) return; } mask = (1UL << index) - 1; qfq_move_groups(q, mask, EB, ER); qfq_move_groups(q, mask, IB, IR); } /* * perhaps * old_V ^= q->V; old_V >>= QFQ_MIN_SLOT_SHIFT; if (old_V) { ... } * */ static inline void qfq_make_eligible(struct qfq_sched *q, uint64_t old_V) { unsigned long mask, vslot, old_vslot; vslot = q->V >> QFQ_MIN_SLOT_SHIFT; old_vslot = old_V >> QFQ_MIN_SLOT_SHIFT; if (vslot != old_vslot) { mask = (2UL << (__fls(vslot ^ old_vslot))) - 1; qfq_move_groups(q, mask, IR, ER); qfq_move_groups(q, mask, IB, EB); } } /* * XXX we should make sure that slot becomes less than 32. * This is guaranteed by the input values. * roundedS is always cl->S rounded on grp->slot_shift bits. */ static inline void qfq_slot_insert(struct qfq_group *grp, struct qfq_class *cl, uint64_t roundedS) { uint64_t slot = (roundedS - grp->S) >> grp->slot_shift; unsigned int i = (grp->front + slot) % QFQ_MAX_SLOTS; cl->next = grp->slots[i]; grp->slots[i] = cl; __set_bit(slot, &grp->full_slots); } /* * remove the entry from the slot */ static inline void qfq_front_slot_remove(struct qfq_group *grp) { struct qfq_class **h = &grp->slots[grp->front]; *h = (*h)->next; if (!*h) __clear_bit(0, &grp->full_slots); } /* * Returns the first full queue in a group. As a side effect, * adjust the bucket list so the first non-empty bucket is at * position 0 in full_slots. */ static inline struct qfq_class * qfq_slot_scan(struct qfq_group *grp) { int i; ND("grp %d full %x", grp->index, grp->full_slots); if (!grp->full_slots) return NULL; i = ffs(grp->full_slots) - 1; // zero-based if (i > 0) { grp->front = (grp->front + i) % QFQ_MAX_SLOTS; grp->full_slots >>= i; } return grp->slots[grp->front]; } /* * adjust the bucket list. When the start time of a group decreases, * we move the index down (modulo QFQ_MAX_SLOTS) so we don't need to * move the objects. The mask of occupied slots must be shifted * because we use ffs() to find the first non-empty slot. 
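 * For example, if the group start moves back by i = 2 slots, 'front'
 * goes from 5 to 3 (mod QFQ_MAX_SLOTS) and full_slots is shifted left
 * by 2, so the bit for the old front bucket now sits at position 2.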
* This covers decreases in the group's start time, but what about * increases of the start time ? * Here too we should make sure that i is less than 32 */ static inline void qfq_slot_rotate(struct qfq_sched *q, struct qfq_group *grp, uint64_t roundedS) { unsigned int i = (grp->S - roundedS) >> grp->slot_shift; grp->full_slots <<= i; grp->front = (grp->front - i) % QFQ_MAX_SLOTS; } static inline void qfq_update_eligible(struct qfq_sched *q, uint64_t old_V) { bitmap ineligible; ineligible = q->bitmaps[IR] | q->bitmaps[IB]; if (ineligible) { if (!q->bitmaps[ER]) { struct qfq_group *grp; grp = qfq_ffs(q, ineligible); if (qfq_gt(grp->S, q->V)) q->V = grp->S; } qfq_make_eligible(q, old_V); } } /* * Updates the class, returns true if also the group needs to be updated. */ static inline int qfq_update_class(struct qfq_sched *q, struct qfq_group *grp, struct qfq_class *cl) { cl->S = cl->F; if (cl->_q.mq.head == NULL) { qfq_front_slot_remove(grp); } else { unsigned int len; uint64_t roundedS; len = cl->_q.mq.head->m_pkthdr.len; cl->F = cl->S + (uint64_t)len * cl->inv_w; roundedS = qfq_round_down(cl->S, grp->slot_shift); if (roundedS == grp->S) return 0; qfq_front_slot_remove(grp); qfq_slot_insert(grp, cl, roundedS); } return 1; } static struct mbuf * qfq_dequeue(struct dn_sch_inst *si) { struct qfq_sched *q = (struct qfq_sched *)(si + 1); struct qfq_group *grp; struct qfq_class *cl; struct mbuf *m; uint64_t old_V; NO(q->loops++;) if (!q->bitmaps[ER]) { NO(if (q->queued) dump_sched(q, "start dequeue");) return NULL; } grp = qfq_ffs(q, q->bitmaps[ER]); cl = grp->slots[grp->front]; /* extract from the first bucket in the bucket list */ m = dn_dequeue(&cl->_q); if (!m) { D("BUG/* non-workconserving leaf */"); return NULL; } NO(q->queued--;) old_V = q->V; q->V += (uint64_t)m->m_pkthdr.len * IWSUM; ND("m is %p F 0x%llx V now 0x%llx", m, cl->F, q->V); if (qfq_update_class(q, grp, cl)) { uint64_t old_F = grp->F; cl = qfq_slot_scan(grp); if (!cl) { /* group gone, remove from ER */ __clear_bit(grp->index, &q->bitmaps[ER]); // grp->S = grp->F + 1; // XXX debugging only } else { uint64_t roundedS = qfq_round_down(cl->S, grp->slot_shift); unsigned int s; if (grp->S == roundedS) goto skip_unblock; grp->S = roundedS; grp->F = roundedS + (2ULL << grp->slot_shift); /* remove from ER and put in the new set */ __clear_bit(grp->index, &q->bitmaps[ER]); s = qfq_calc_state(q, grp); __set_bit(grp->index, &q->bitmaps[s]); } /* we need to unblock even if the group has gone away */ qfq_unblock_groups(q, grp->index, old_F); } skip_unblock: qfq_update_eligible(q, old_V); NO(if (!q->bitmaps[ER] && q->queued) dump_sched(q, "end dequeue");) return m; } /* * Assign a reasonable start time for a new flow k in group i. * Admissible values for \hat(F) are multiples of \sigma_i * no greater than V+\sigma_i . Larger values mean that * we had a wraparound so we consider the timestamp to be stale. * * If F is not stale and F >= V then we set S = F. * Otherwise we should assign S = V, but this may violate * the ordering in ER. So, if we have groups in ER, set S to * the F_j of the first group j which would be blocking us. * We are guaranteed not to move S backward because * otherwise our group i would still be blocked. 
*/ static inline void qfq_update_start(struct qfq_sched *q, struct qfq_class *cl) { unsigned long mask; uint32_t limit, roundedF; int slot_shift = cl->grp->slot_shift; roundedF = qfq_round_down(cl->F, slot_shift); limit = qfq_round_down(q->V, slot_shift) + (1UL << slot_shift); if (!qfq_gt(cl->F, q->V) || qfq_gt(roundedF, limit)) { /* timestamp was stale */ mask = mask_from(q->bitmaps[ER], cl->grp->index); if (mask) { struct qfq_group *next = qfq_ffs(q, mask); if (qfq_gt(roundedF, next->F)) { cl->S = next->F; return; } } cl->S = q->V; } else { /* timestamp is not stale */ cl->S = cl->F; } } static int qfq_enqueue(struct dn_sch_inst *si, struct dn_queue *_q, struct mbuf *m) { struct qfq_sched *q = (struct qfq_sched *)(si + 1); struct qfq_group *grp; struct qfq_class *cl = (struct qfq_class *)_q; uint64_t roundedS; int s; NO(q->loops++;) DX(4, "len %d flow %p inv_w 0x%x grp %d", m->m_pkthdr.len, _q, cl->inv_w, cl->grp->index); /* XXX verify that the packet obeys the parameters */ if (m != _q->mq.head) { if (dn_enqueue(_q, m, 0)) /* packet was dropped */ return 1; NO(q->queued++;) if (m != _q->mq.head) return 0; } /* If reach this point, queue q was idle */ grp = cl->grp; qfq_update_start(q, cl); /* adjust start time */ /* compute new finish time and rounded start. */ cl->F = cl->S + (uint64_t)(m->m_pkthdr.len) * cl->inv_w; roundedS = qfq_round_down(cl->S, grp->slot_shift); /* * insert cl in the correct bucket. * If cl->S >= grp->S we don't need to adjust the * bucket list and simply go to the insertion phase. * Otherwise grp->S is decreasing, we must make room * in the bucket list, and also recompute the group state. * Finally, if there were no flows in this group and nobody * was in ER make sure to adjust V. */ if (grp->full_slots) { if (!qfq_gt(grp->S, cl->S)) goto skip_update; /* create a slot for this cl->S */ qfq_slot_rotate(q, grp, roundedS); /* group was surely ineligible, remove */ __clear_bit(grp->index, &q->bitmaps[IR]); __clear_bit(grp->index, &q->bitmaps[IB]); } else if (!q->bitmaps[ER] && qfq_gt(roundedS, q->V)) q->V = roundedS; grp->S = roundedS; grp->F = roundedS + (2ULL << grp->slot_shift); // i.e. 2\sigma_i s = qfq_calc_state(q, grp); __set_bit(grp->index, &q->bitmaps[s]); ND("new state %d 0x%x", s, q->bitmaps[s]); ND("S %llx F %llx V %llx", cl->S, cl->F, q->V); skip_update: qfq_slot_insert(grp, cl, roundedS); return 0; } #if 0 static inline void qfq_slot_remove(struct qfq_sched *q, struct qfq_group *grp, struct qfq_class *cl, struct qfq_class **pprev) { unsigned int i, offset; uint64_t roundedS; roundedS = qfq_round_down(cl->S, grp->slot_shift); offset = (roundedS - grp->S) >> grp->slot_shift; i = (grp->front + offset) % QFQ_MAX_SLOTS; #ifdef notyet if (!pprev) { pprev = &grp->slots[i]; while (*pprev && *pprev != cl) pprev = &(*pprev)->next; } #endif *pprev = cl->next; if (!grp->slots[i]) __clear_bit(offset, &grp->full_slots); } /* * called to forcibly destroy a queue. * If the queue is not in the front bucket, or if it has * other queues in the front bucket, we can simply remove * the queue with no other side effects. * Otherwise we must propagate the event up. * XXX description to be completed. */ static void qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl, struct qfq_class **pprev) { struct qfq_group *grp = &q->groups[cl->index]; unsigned long mask; uint64_t roundedS; int s; cl->F = cl->S; // not needed if the class goes away. qfq_slot_remove(q, grp, cl, pprev); if (!grp->full_slots) { /* nothing left in the group, remove from all sets. 
* Do ER last because if we were blocking other groups * we must unblock them. */ __clear_bit(grp->index, &q->bitmaps[IR]); __clear_bit(grp->index, &q->bitmaps[EB]); __clear_bit(grp->index, &q->bitmaps[IB]); if (test_bit(grp->index, &q->bitmaps[ER]) && !(q->bitmaps[ER] & ~((1UL << grp->index) - 1))) { mask = q->bitmaps[ER] & ((1UL << grp->index) - 1); if (mask) mask = ~((1UL << __fls(mask)) - 1); else mask = ~0UL; qfq_move_groups(q, mask, EB, ER); qfq_move_groups(q, mask, IB, IR); } __clear_bit(grp->index, &q->bitmaps[ER]); } else if (!grp->slots[grp->front]) { cl = qfq_slot_scan(grp); roundedS = qfq_round_down(cl->S, grp->slot_shift); if (grp->S != roundedS) { __clear_bit(grp->index, &q->bitmaps[ER]); __clear_bit(grp->index, &q->bitmaps[IR]); __clear_bit(grp->index, &q->bitmaps[EB]); __clear_bit(grp->index, &q->bitmaps[IB]); grp->S = roundedS; grp->F = roundedS + (2ULL << grp->slot_shift); s = qfq_calc_state(q, grp); __set_bit(grp->index, &q->bitmaps[s]); } } qfq_update_eligible(q, q->V); } #endif static int qfq_new_fsk(struct dn_fsk *f) { ipdn_bound_var(&f->fs.par[0], 1, 1, QFQ_MAX_WEIGHT, "qfq weight"); ipdn_bound_var(&f->fs.par[1], 1500, 1, 2000, "qfq maxlen"); ND("weight %d len %d\n", f->fs.par[0], f->fs.par[1]); return 0; } /* * initialize a new scheduler instance */ static int qfq_new_sched(struct dn_sch_inst *si) { struct qfq_sched *q = (struct qfq_sched *)(si + 1); struct qfq_group *grp; int i; for (i = 0; i <= QFQ_MAX_INDEX; i++) { grp = &q->groups[i]; grp->index = i; grp->slot_shift = QFQ_MTU_SHIFT + FRAC_BITS - (QFQ_MAX_INDEX - i); } return 0; } /* * QFQ scheduler descriptor */ static struct dn_alg qfq_desc = { _SI( .type = ) DN_SCHED_QFQ, _SI( .name = ) "QFQ", _SI( .flags = ) DN_MULTIQUEUE, _SI( .schk_datalen = ) 0, _SI( .si_datalen = ) sizeof(struct qfq_sched), _SI( .q_datalen = ) sizeof(struct qfq_class) - sizeof(struct dn_queue), _SI( .enqueue = ) qfq_enqueue, _SI( .dequeue = ) qfq_dequeue, _SI( .config = ) NULL, _SI( .destroy = ) NULL, _SI( .new_sched = ) qfq_new_sched, _SI( .free_sched = ) NULL, _SI( .new_fsk = ) qfq_new_fsk, _SI( .free_fsk = ) NULL, _SI( .new_queue = ) qfq_new_queue, _SI( .free_queue = ) qfq_free_queue, }; DECLARE_DNSCHED_MODULE(dn_qfq, &qfq_desc); #ifdef QFQ_DEBUG static void dump_groups(struct qfq_sched *q, uint32_t mask) { int i, j; for (i = 0; i < QFQ_MAX_INDEX + 1; i++) { struct qfq_group *g = &q->groups[i]; if (0 == (mask & (1<slots[j]) D(" bucket %d %p", j, g->slots[j]); } D("full_slots 0x%x", g->full_slots); D(" %2d S 0x%20llx F 0x%llx %c", i, g->S, g->F, mask & (1<loops, q->queued, q->V); D(" ER 0x%08x", q->bitmaps[ER]); D(" EB 0x%08x", q->bitmaps[EB]); D(" IR 0x%08x", q->bitmaps[IR]); D(" IB 0x%08x", q->bitmaps[IB]); dump_groups(q, 0xffffffff); }; #endif /* QFQ_DEBUG */ ipfw/dn_sched_rr.c000644 000423 000000 00000016047 11334513576 014717 0ustar00luigiwheel000000 000000 /* * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa * All rights reserved * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifdef _KERNEL #include #include #include #include #include #include #include /* IFNAMSIZ */ #include #include /* ipfw_rule_ref */ #include /* flow_id */ #include #include #include #include #else #include #endif #define DN_SCHED_RR 3 // XXX Where? struct rr_queue { struct dn_queue q; /* Standard queue */ int status; /* 1: queue is in the list */ int credit; /* Number of bytes to transmit */ int quantum; /* quantum * C */ struct rr_queue *qnext; /* */ }; /* struct rr_schk contains global config parameters * and is right after dn_schk */ struct rr_schk { int min_q; /* Min quantum */ int max_q; /* Max quantum */ int q_bytes; /* Bytes per quantum */ }; /* per-instance round robin list, right after dn_sch_inst */ struct rr_si { struct rr_queue *head, *tail; /* Pointer to current queue */ }; /* Append a queue to the rr list */ static inline void rr_append(struct rr_queue *q, struct rr_si *si) { q->status = 1; /* mark as in-rr_list */ q->credit = q->quantum; /* initialize credit */ /* append to the tail */ if (si->head == NULL) si->head = q; else si->tail->qnext = q; si->tail = q; /* advance the tail pointer */ q->qnext = si->head; /* make it circular */ } /* Remove the head queue from circular list. */ static inline void rr_remove_head(struct rr_si *si) { if (si->head == NULL) return; /* empty queue */ si->head->status = 0; if (si->head == si->tail) { si->head = si->tail = NULL; return; } si->head = si->head->qnext; si->tail->qnext = si->head; } /* Remove a queue from circular list. 
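 *
 * For reference, a stripped-down sketch of the invariant kept by
 * rr_append()/rr_remove_head() above: the list is singly linked and
 * circular, with tail->qnext always pointing back at head:
 *
 *	struct node { struct node *next; };
 *	struct ring { struct node *head, *tail; };
 *
 *	void ring_append(struct ring *r, struct node *n)
 *	{
 *		if (r->head == NULL)
 *			r->head = n;
 *		else
 *			r->tail->next = n;
 *		r->tail = n;
 *		n->next = r->head;	// keep it circular
 *	}
 *
 * Removing an arbitrary node (below) therefore needs a scan to find the
 * predecessor, since there is no back pointer.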
* XXX see if ti can be merge with remove_queue() */ static inline void remove_queue_q(struct rr_queue *q, struct rr_si *si) { struct rr_queue *prev; if (q->status != 1) return; if (q == si->head) { rr_remove_head(si); return; } for (prev = si->head; prev; prev = prev->qnext) { if (prev->qnext != q) continue; prev->qnext = q->qnext; if (q == si->tail) si->tail = prev; q->status = 0; break; } } static inline void next_pointer(struct rr_si *si) { if (si->head == NULL) return; /* empty queue */ si->head = si->head->qnext; si->tail = si->tail->qnext; } static int rr_enqueue(struct dn_sch_inst *_si, struct dn_queue *q, struct mbuf *m) { struct rr_si *si; struct rr_queue *rrq; if (m != q->mq.head) { if (dn_enqueue(q, m, 0)) /* packet was dropped */ return 1; if (m != q->mq.head) return 0; } /* If reach this point, queue q was idle */ si = (struct rr_si *)(_si + 1); rrq = (struct rr_queue *)q; if (rrq->status == 1) /* Queue is already in the queue list */ return 0; /* Insert the queue in the queue list */ rr_append(rrq, si); return 0; } static struct mbuf * rr_dequeue(struct dn_sch_inst *_si) { /* Access scheduler instance private data */ struct rr_si *si = (struct rr_si *)(_si + 1); struct rr_queue *rrq; uint64_t len; while ( (rrq = si->head) ) { struct mbuf *m = rrq->q.mq.head; if ( m == NULL) { /* empty queue, remove from list */ rr_remove_head(si); continue; } len = m->m_pkthdr.len; if (len > rrq->credit) { /* Packet too big */ rrq->credit += rrq->quantum; /* Try next queue */ next_pointer(si); } else { rrq->credit -= len; return dn_dequeue(&rrq->q); } } /* no packet to dequeue*/ return NULL; } static int rr_config(struct dn_schk *_schk) { struct rr_schk *schk = (struct rr_schk *)(_schk + 1); ND("called"); /* use reasonable quantums (64..2k bytes, default 1500) */ schk->min_q = 64; schk->max_q = 2048; schk->q_bytes = 1500; /* quantum */ return 0; } static int rr_new_sched(struct dn_sch_inst *_si) { struct rr_si *si = (struct rr_si *)(_si + 1); ND("called"); si->head = si->tail = NULL; return 0; } static int rr_free_sched(struct dn_sch_inst *_si) { ND("called"); /* Nothing to do? */ return 0; } static int rr_new_fsk(struct dn_fsk *fs) { struct rr_schk *schk = (struct rr_schk *)(fs->sched + 1); /* par[0] is the weight, par[1] is the quantum step */ ipdn_bound_var(&fs->fs.par[0], 1, 1, 65536, "RR weight"); ipdn_bound_var(&fs->fs.par[1], schk->q_bytes, schk->min_q, schk->max_q, "RR quantum"); return 0; } static int rr_new_queue(struct dn_queue *_q) { struct rr_queue *q = (struct rr_queue *)_q; _q->ni.oid.subtype = DN_SCHED_RR; q->quantum = _q->fs->fs.par[0] * _q->fs->fs.par[1]; ND("called, q->quantum %d", q->quantum); q->credit = q->quantum; q->status = 0; if (_q->mq.head != NULL) { /* Queue NOT empty, insert in the queue list */ rr_append(q, (struct rr_si *)(_q->_si + 1)); } return 0; } static int rr_free_queue(struct dn_queue *_q) { struct rr_queue *q = (struct rr_queue *)_q; ND("called"); if (q->status == 1) { struct rr_si *si = (struct rr_si *)(_q->_si + 1); remove_queue_q(q, si); } return 0; } /* * RR scheduler descriptor * contains the type of the scheduler, the name, the size of the * structures and function pointers. 
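 *
 * As a sketch of how a consumer uses these fields (this is roughly what
 * init() in the userland test harness main.c does; the variable names
 * here are only illustrative):
 *
 *	struct dn_alg *p = &rr_desc;
 *	struct dn_schk *sched = calloc(1, sizeof(*sched) + p->schk_datalen);
 *	struct dn_sch_inst *si = calloc(1, sizeof(*si) + p->si_datalen);
 *
 *	if (p->config)
 *		p->config(sched);	// fills rr_schk right after dn_schk
 *	if (p->new_sched)
 *		p->new_sched(si);	// inits rr_si right after dn_sch_inst
 *	// per packet: p->enqueue(si, q, m) and p->dequeue(si)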
*/ static struct dn_alg rr_desc = { _SI( .type = ) DN_SCHED_RR, _SI( .name = ) "RR", _SI( .flags = ) DN_MULTIQUEUE, _SI( .schk_datalen = ) 0, _SI( .si_datalen = ) sizeof(struct rr_si), _SI( .q_datalen = ) sizeof(struct rr_queue) - sizeof(struct dn_queue), _SI( .enqueue = ) rr_enqueue, _SI( .dequeue = ) rr_dequeue, _SI( .config = ) rr_config, _SI( .destroy = ) NULL, _SI( .new_sched = ) rr_new_sched, _SI( .free_sched = ) rr_free_sched, _SI( .new_fsk = ) rr_new_fsk, _SI( .free_fsk = ) NULL, _SI( .new_queue = ) rr_new_queue, _SI( .free_queue = ) rr_free_queue, }; DECLARE_DNSCHED_MODULE(dn_rr, &rr_desc); ipfw/dn_sched_wf2q.c000644 000423 000000 00000027315 11334513576 015153 0ustar00luigiwheel000000 000000 /* * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa * Copyright (c) 2000-2002 Luigi Rizzo, Universita` di Pisa * All rights reserved * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifdef _KERNEL #include #include #include #include #include #include #include /* IFNAMSIZ */ #include #include /* ipfw_rule_ref */ #include /* flow_id */ #include #include #include #include #else #include #endif #ifndef MAX64 #define MAX64(x,y) (( (int64_t) ( (y)-(x) )) > 0 ) ? (y) : (x) #endif /* * timestamps are computed on 64 bit using fixed point arithmetic. * LMAX_BITS, WMAX_BITS are the max number of bits for the packet len * and sum of weights, respectively. FRAC_BITS is the number of * fractional bits. We want FRAC_BITS >> WMAX_BITS to avoid too large * errors when computing the inverse, FRAC_BITS < 32 so we can do 1/w * using an unsigned 32-bit division, and to avoid wraparounds we need * LMAX_BITS + WMAX_BITS + FRAC_BITS << 64 * As an example * FRAC_BITS = 26, LMAX_BITS=14, WMAX_BITS = 19 */ #ifndef FRAC_BITS #define FRAC_BITS 28 /* shift for fixed point arithmetic */ #define ONE_FP (1UL << FRAC_BITS) #endif /* * Private information for the scheduler instance: * sch_heap (key is Finish time) returns the next queue to serve * ne_heap (key is Start time) stores not-eligible queues * idle_heap (key=start/finish time) stores idle flows. It must * support extract-from-middle. * A flow is only in 1 of the three heaps. * XXX todo: use a more efficient data structure, e.g. 
a tree sorted * by F with min_subtree(S) in each node */ struct wf2qp_si { struct dn_heap sch_heap; /* top extract - key Finish time */ struct dn_heap ne_heap; /* top extract - key Start time */ struct dn_heap idle_heap; /* random extract - key Start=Finish time */ uint64_t V; /* virtual time */ uint32_t inv_wsum; /* inverse of sum of weights */ uint32_t wsum; /* sum of weights */ }; struct wf2qp_queue { struct dn_queue _q; uint64_t S, F; /* start time, finish time */ uint32_t inv_w; /* ONE_FP / weight */ int32_t heap_pos; /* position (index) of struct in heap */ }; /* * This file implements a WF2Q+ scheduler as it has been in dummynet * since 2000. * The scheduler supports per-flow queues and has O(log N) complexity. * * WF2Q+ needs to drain entries from the idle heap so that we * can keep the sum of weights up to date. We can do it whenever * we get a chance, or periodically, or following some other * strategy. The function idle_check() drains at most N elements * from the idle heap. */ static void idle_check(struct wf2qp_si *si, int n, int force) { struct dn_heap *h = &si->idle_heap; while (n-- > 0 && h->elements > 0 && (force || DN_KEY_LT(HEAP_TOP(h)->key, si->V))) { struct dn_queue *q = HEAP_TOP(h)->object; struct wf2qp_queue *alg_fq = (struct wf2qp_queue *)q; heap_extract(h, NULL); /* XXX to let the flowset delete the queue we should * mark it as 'unused' by the scheduler. */ alg_fq->S = alg_fq->F + 1; /* Mark timestamp as invalid. */ si->wsum -= q->fs->fs.par[0]; /* adjust sum of weights */ if (si->wsum > 0) si->inv_wsum = ONE_FP/si->wsum; } } static int wf2qp_enqueue(struct dn_sch_inst *_si, struct dn_queue *q, struct mbuf *m) { struct dn_fsk *fs = q->fs; struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1); struct wf2qp_queue *alg_fq; uint64_t len = m->m_pkthdr.len; if (m != q->mq.head) { if (dn_enqueue(q, m, 0)) /* packet was dropped */ return 1; if (m != q->mq.head) /* queue was already busy */ return 0; } /* If reach this point, queue q was idle */ alg_fq = (struct wf2qp_queue *)q; if (DN_KEY_LT(alg_fq->F, alg_fq->S)) { /* Fbrand new queue. */ alg_fq->S = si->V; /* init start time */ si->wsum += fs->fs.par[0]; /* add weight of new queue. */ si->inv_wsum = ONE_FP/si->wsum; } else { /* if it was idle then it was in the idle heap */ heap_extract(&si->idle_heap, q); alg_fq->S = MAX64(alg_fq->F, si->V); /* compute new S */ } alg_fq->F = alg_fq->S + len * alg_fq->inv_w; /* if nothing is backlogged, make sure this flow is eligible */ if (si->ne_heap.elements == 0 && si->sch_heap.elements == 0) si->V = MAX64(alg_fq->S, si->V); /* * Look at eligibility. A flow is not eligibile if S>V (when * this happens, it means that there is some other flow already * scheduled for the same pipe, so the sch_heap cannot be * empty). If the flow is not eligible we just store it in the * ne_heap. Otherwise, we store in the sch_heap. * Note that for all flows in sch_heap (SCH), S_i <= V, * and for all flows in ne_heap (NEH), S_i > V. * So when we need to compute max(V, min(S_i)) forall i in * SCH+NEH, we only need to look into NEH. */ if (DN_KEY_LT(si->V, alg_fq->S)) { /* S>V means flow Not eligible. */ if (si->sch_heap.elements == 0) D("++ ouch! 
not eligible but empty scheduler!"); heap_insert(&si->ne_heap, alg_fq->S, q); } else { heap_insert(&si->sch_heap, alg_fq->F, q); } return 0; } /* XXX invariant: sch > 0 || V >= min(S in neh) */ static struct mbuf * wf2qp_dequeue(struct dn_sch_inst *_si) { /* Access scheduler instance private data */ struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1); struct mbuf *m; struct dn_queue *q; struct dn_heap *sch = &si->sch_heap; struct dn_heap *neh = &si->ne_heap; struct wf2qp_queue *alg_fq; if (sch->elements == 0 && neh->elements == 0) { /* we have nothing to do. We could kill the idle heap * altogether and reset V */ idle_check(si, 0x7fffffff, 1); si->V = 0; si->wsum = 0; /* should be set already */ return NULL; /* quick return if nothing to do */ } idle_check(si, 1, 0); /* drain something from the idle heap */ /* make sure at least one element is eligible, bumping V * and moving entries that have become eligible. * We need to repeat the first part twice, before and * after extracting the candidate, or enqueue() will * find the data structure in a wrong state. */ m = NULL; for(;;) { /* * Compute V = max(V, min(S_i)). Remember that all elements * in sch have by definition S_i <= V so if sch is not empty, * V is surely the max and we must not update it. Conversely, * if sch is empty we only need to look at neh. * We don't need to move the queues, as it will be done at the * next enqueue */ if (sch->elements == 0 && neh->elements > 0) { si->V = MAX64(si->V, HEAP_TOP(neh)->key); } while (neh->elements > 0 && DN_KEY_LEQ(HEAP_TOP(neh)->key, si->V)) { q = HEAP_TOP(neh)->object; alg_fq = (struct wf2qp_queue *)q; heap_extract(neh, NULL); heap_insert(sch, alg_fq->F, q); } if (m) /* pkt found in previous iteration */ break; /* ok we have at least one eligible pkt */ q = HEAP_TOP(sch)->object; alg_fq = (struct wf2qp_queue *)q; m = dn_dequeue(q); heap_extract(sch, NULL); /* Remove queue from heap. */ si->V += (uint64_t)(m->m_pkthdr.len) * si->inv_wsum; alg_fq->S = alg_fq->F; /* Update start time. */ if (q->mq.head == 0) { /* not backlogged any more. */ heap_insert(&si->idle_heap, alg_fq->F, q); } else { /* Still backlogged. */ /* Update F, store in neh or sch */ uint64_t len = q->mq.head->m_pkthdr.len; alg_fq->F += len * alg_fq->inv_w; if (DN_KEY_LEQ(alg_fq->S, si->V)) { heap_insert(sch, alg_fq->F, q); } else { heap_insert(neh, alg_fq->S, q); } } } return m; } static int wf2qp_new_sched(struct dn_sch_inst *_si) { struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1); int ofs = offsetof(struct wf2qp_queue, heap_pos); /* all heaps support extract from middle */ if (heap_init(&si->idle_heap, 16, ofs) || heap_init(&si->sch_heap, 16, ofs) || heap_init(&si->ne_heap, 16, ofs)) { heap_free(&si->ne_heap); heap_free(&si->sch_heap); heap_free(&si->idle_heap); return ENOMEM; } return 0; } static int wf2qp_free_sched(struct dn_sch_inst *_si) { struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1); heap_free(&si->sch_heap); heap_free(&si->ne_heap); heap_free(&si->idle_heap); return 0; } static int wf2qp_new_fsk(struct dn_fsk *fs) { ipdn_bound_var(&fs->fs.par[0], 1, 1, 100, "WF2Q+ weight"); return 0; } static int wf2qp_new_queue(struct dn_queue *_q) { struct wf2qp_queue *q = (struct wf2qp_queue *)_q; _q->ni.oid.subtype = DN_SCHED_WF2QP; q->F = 0; /* not strictly necessary */ q->S = q->F + 1; /* mark timestamp as invalid. */ q->inv_w = ONE_FP / _q->fs->fs.par[0]; if (_q->mq.head != NULL) { wf2qp_enqueue(_q->_si, _q, _q->mq.head); } return 0; } /* * Called when the infrastructure removes a queue (e.g. flowset * is reconfigured). 
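 *
 * A compact restatement of the rule applied below (a sketch using the
 * wf2qp_queue/wf2qp_si fields defined above):
 *
 *	struct dn_heap *
 *	owning_heap(struct wf2qp_si *si, struct dn_queue *q)
 *	{
 *		struct wf2qp_queue *fq = (struct wf2qp_queue *)q;
 *
 *		if (fq->S >= fq->F + 1)
 *			return NULL;		// timestamps invalid: in no heap
 *		if (q->mq.head == NULL)
 *			return &si->idle_heap;	// not backlogged: idle
 *		if (DN_KEY_LT(si->V, fq->S))
 *			return &si->ne_heap;	// S > V: not yet eligible
 *		return &si->sch_heap;		// eligible, keyed by F
 *	}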
Nothing to do if we did not 'own' the queue, * otherwise remove it from the right heap and adjust the sum * of weights. */ static int wf2qp_free_queue(struct dn_queue *q) { struct wf2qp_queue *alg_fq = (struct wf2qp_queue *)q; struct wf2qp_si *si = (struct wf2qp_si *)(q->_si + 1); if (alg_fq->S >= alg_fq->F + 1) return 0; /* nothing to do, not in any heap */ si->wsum -= q->fs->fs.par[0]; if (si->wsum > 0) si->inv_wsum = ONE_FP/si->wsum; /* extract from the heap. XXX TODO we may need to adjust V * to make sure the invariants hold. */ if (q->mq.head == NULL) { heap_extract(&si->idle_heap, q); } else if (DN_KEY_LT(si->V, alg_fq->S)) { heap_extract(&si->ne_heap, q); } else { heap_extract(&si->sch_heap, q); } return 0; } /* * WF2Q+ scheduler descriptor * contains the type of the scheduler, the name, the size of the * structures and function pointers. */ static struct dn_alg wf2qp_desc = { _SI( .type = ) DN_SCHED_WF2QP, _SI( .name = ) "WF2Q+", _SI( .flags = ) DN_MULTIQUEUE, /* we need extra space in the si and the queue */ _SI( .schk_datalen = ) 0, _SI( .si_datalen = ) sizeof(struct wf2qp_si), _SI( .q_datalen = ) sizeof(struct wf2qp_queue) - sizeof(struct dn_queue), _SI( .enqueue = ) wf2qp_enqueue, _SI( .dequeue = ) wf2qp_dequeue, _SI( .config = ) NULL, _SI( .destroy = ) NULL, _SI( .new_sched = ) wf2qp_new_sched, _SI( .free_sched = ) wf2qp_free_sched, _SI( .new_fsk = ) wf2qp_new_fsk, _SI( .free_fsk = ) NULL, _SI( .new_queue = ) wf2qp_new_queue, _SI( .free_queue = ) wf2qp_free_queue, }; DECLARE_DNSCHED_MODULE(dn_wf2qp, &wf2qp_desc); ipfw/ip_dn_private.h000644 000423 000000 00000021565 11334513576 015276 0ustar00luigiwheel000000 000000 /*- * Copyright (c) 2010 Luigi Rizzo, Riccardo Panicucci, Universita` di Pisa * All rights reserved * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef _IP_DN_PRIVATE_H #define _IP_DN_PRIVATE_H /* * internal dummynet APIs. */ /* debugging support * use ND() to remove debugging, D() to print a line, * DX(level, ...) to print above a certain level * If you redefine D() you are expected to redefine all. */ #ifndef D #define ND(fmt, args...) do {} while (0) #define D1(fmt, args...) do {} while (0) #define D(fmt, args...) printf("%-10s " fmt "\n", \ __FUNCTION__, ## args) #define DX(lev, fmt, args...) 
do { \ if (dn_cfg.debug > lev) D(fmt, ## args); } while (0) #endif MALLOC_DECLARE(M_DUMMYNET); #ifndef FREE_PKT #define FREE_PKT(m) m_freem(m) #endif #define div64(a, b) ((int64_t)(a) / (int64_t)(b)) #define DN_LOCK_INIT() do { \ mtx_init(&dn_cfg.uh_mtx, "dn_uh", NULL, MTX_DEF); \ mtx_init(&dn_cfg.bh_mtx, "dn_bh", NULL, MTX_DEF); \ } while (0) #define DN_LOCK_DESTROY() do { \ mtx_destroy(&dn_cfg.uh_mtx); \ mtx_destroy(&dn_cfg.bh_mtx); \ } while (0) #if 0 /* not used yet */ #define DN_UH_RLOCK() mtx_lock(&dn_cfg.uh_mtx) #define DN_UH_RUNLOCK() mtx_unlock(&dn_cfg.uh_mtx) #define DN_UH_WLOCK() mtx_lock(&dn_cfg.uh_mtx) #define DN_UH_WUNLOCK() mtx_unlock(&dn_cfg.uh_mtx) #define DN_UH_LOCK_ASSERT() mtx_assert(&dn_cfg.uh_mtx, MA_OWNED) #endif #define DN_BH_RLOCK() mtx_lock(&dn_cfg.uh_mtx) #define DN_BH_RUNLOCK() mtx_unlock(&dn_cfg.uh_mtx) #define DN_BH_WLOCK() mtx_lock(&dn_cfg.uh_mtx) #define DN_BH_WUNLOCK() mtx_unlock(&dn_cfg.uh_mtx) #define DN_BH_LOCK_ASSERT() mtx_assert(&dn_cfg.uh_mtx, MA_OWNED) SLIST_HEAD(dn_schk_head, dn_schk); SLIST_HEAD(dn_sch_inst_head, dn_sch_inst); SLIST_HEAD(dn_fsk_head, dn_fsk); SLIST_HEAD(dn_queue_head, dn_queue); SLIST_HEAD(dn_alg_head, dn_alg); struct mq { /* a basic queue of packets*/ struct mbuf *head, *tail; }; static inline void set_oid(struct dn_id *o, int type, int len) { o->type = type; o->len = len; o->subtype = 0; }; /* * configuration and global data for a dummynet instance * * When a configuration is modified from userland, 'id' is incremented * so we can use the value to check for stale pointers. */ struct dn_parms { uint32_t id; /* configuration version */ /* defaults (sysctl-accessible) */ int red_lookup_depth; int red_avg_pkt_size; int red_max_pkt_size; int hash_size; int max_hash_size; long byte_limit; /* max queue sizes */ long slot_limit; int io_fast; int debug; /* timekeeping */ struct timeval prev_t; /* last time dummynet_tick ran */ struct dn_heap evheap; /* scheduled events */ /* counters of objects -- used for reporting space */ int schk_count; int si_count; int fsk_count; int queue_count; /* flowsets and schedulers are in hash tables, with 'hash_size' * buckets. fshash is looked up at every packet arrival * so better be generous if we expect many entries. */ struct dn_ht *fshash; struct dn_ht *schedhash; /* list of flowsets without a scheduler -- use sch_chain */ struct dn_fsk_head fsu; /* list of unlinked flowsets */ struct dn_alg_head schedlist; /* list of algorithms */ /* if the upper half is busy doing something long, * can set the busy flag and we will enqueue packets in * a queue for later processing. */ int busy; struct mq pending; #ifdef _KERNEL /* * This file is normally used in the kernel, unless we do * some userland tests, in which case we do not need a mtx. * uh_mtx arbitrates between system calls and also * protects fshash, schedhash and fsunlinked. * These structures are readonly for the lower half. * bh_mtx protects all other structures which may be * modified upon packet arrivals */ struct mtx uh_mtx; struct mtx bh_mtx; #endif /* _KERNEL */ }; /* * Delay line, contains all packets on output from a link. * Every scheduler instance has one. */ struct delay_line { struct dn_id oid; struct dn_sch_inst *si; struct mq mq; }; /* * The kernel side of a flowset. It is linked in a hash table * of flowsets, and in a list of children of their parent scheduler. * qht is either the queue or (if HAVE_MASK) a hash table queues. * Note that the mask to use is the (flow_mask|sched_mask), which * changes as we attach/detach schedulers. 
So we store it here. * * XXX If we want to add scheduler-specific parameters, we need to * put them in external storage because the scheduler may not be * available when the fsk is created. */ struct dn_fsk { /* kernel side of a flowset */ struct dn_fs fs; SLIST_ENTRY(dn_fsk) fsk_next; /* hash chain for fshash */ struct ipfw_flow_id fsk_mask; /* qht is a hash table of queues, or just a single queue * a bit in fs.flags tells us which one */ struct dn_ht *qht; struct dn_schk *sched; /* Sched we are linked to */ SLIST_ENTRY(dn_fsk) sch_chain; /* list of fsk attached to sched */ }; /* * A queue is created as a child of a flowset unless it belongs to * a !MULTIQUEUE scheduler. It is normally in a hash table in the * flowset. fs always points to the parent flowset. * si normally points to the sch_inst, unless the flowset has been * detached from the scheduler -- in this case si == NULL and we * should not enqueue. */ struct dn_queue { struct dn_flow ni; /* oid, flow_id, stats */ struct mq mq; /* packets queue */ struct dn_sch_inst *_si; /* owner scheduler instance */ SLIST_ENTRY(dn_queue) q_next; /* hash chain list for qht */ struct dn_fsk *fs; /* parent flowset. */ }; /* * The kernel side of a scheduler. Contains the userland config, * a link, pointer to extra config arguments from command line, * kernel flags, and a pointer to the scheduler methods. * It is stored in a hash table, and holds a list of all * flowsets and scheduler instances. * XXX sch must be at the beginning, see schk_hash(). */ struct dn_schk { struct dn_sch sch; struct dn_alg *fp; /* Pointer to scheduler functions */ struct dn_link link; /* The link, embedded */ struct dn_profile *profile; /* delay profile, if any */ struct dn_id *cfg; /* extra config arguments */ SLIST_ENTRY(dn_schk) schk_next; /* hash chain for schedhash */ struct dn_fsk_head fsk_list; /* all fsk linked to me */ struct dn_fsk *fs; /* Flowset for !MULTIQUEUE */ /* Hash table of all instances (through sch.sched_mask) * or single instance if no mask. Always valid. */ struct dn_ht *siht; }; /* * Scheduler instance. * Contains variables and all queues relative to a this instance. * This struct is created a runtime. */ struct dn_sch_inst { struct dn_flow ni; /* oid, flowid and stats */ SLIST_ENTRY(dn_sch_inst) si_next; /* hash chain for siht */ struct delay_line dline; struct dn_schk *sched; /* the template */ int kflags; /* DN_ACTIVE */ int64_t credit; /* bits I can transmit (more or less). */ uint64_t sched_time; /* time link was scheduled in ready_heap */ uint64_t idle_time; /* start of scheduler instance idle time */ }; /* kernel-side flags. 
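 *
 * (Note on layout, added for clarity: schedulers keep their per-instance,
 * per-queue and per-config private data directly behind these base
 * structures; dn_alg.si_datalen/q_datalen/schk_datalen say how much extra
 * room to allocate, which is why the scheduler modules use casts such as
 *
 *	struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1);
 *
 * meaning "the private area starts right after struct dn_sch_inst".)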
Linux has DN_DELETE in fcntl.h */ enum { /* 1 and 2 are reserved for the SCAN flags */ DN_DESTROY = 0x0004, /* destroy */ DN_DELETE_FS = 0x0008, /* destroy flowset */ DN_DETACH = 0x0010, DN_ACTIVE = 0x0020, /* object is in evheap */ DN_F_DLINE = 0x0040, /* object is a delay line */ DN_F_SCHI = 0x00C0, /* object is a sched.instance */ DN_QHT_IS_Q = 0x0100, /* in flowset, qht is a single queue */ }; extern struct dn_parms dn_cfg; int dummynet_io(struct mbuf **, int , struct ip_fw_args *); void dummynet_task(void *context, int pending); void dn_reschedule(void); struct dn_queue *ipdn_q_find(struct dn_fsk *, struct dn_sch_inst *, struct ipfw_flow_id *); struct dn_sch_inst *ipdn_si_find(struct dn_schk *, struct ipfw_flow_id *); #endif /* _IP_DN_PRIVATE_H */ Makefile000644 000423 000000 00000001275 11334514747 012772 0ustar00luigiwheel000000 000000 # $Id: Makefile 5225 2010-02-10 10:41:46Z luigi $ # testing framework for schedulers # Use gmake to build this program # make IPFW=... to override the ipfw sources CFLAGS= -Wall -O3 CFLAGS += -DWITH_KPS OBJS := main.o ifeq ($(IPFW),) DN_SRCS=ipfw else DN_SRCS=$(IPFW) endif VPATH= .:$(DN_SRCS) CFLAGS += -DIPFW -I. -I$(DN_SRCS) OBJS += test_dn_sched.o OBJS += dn_sched_fifo.o OBJS += dn_sched_wf2q.o OBJS += dn_sched_rr.o OBJS += dn_sched_qfq.o OBJS += dn_sched_kps.o OBJS += dn_heap.o LDFLAGS:=-lm -g all: test test: $(OBJS) $(CC) $(LDFLAGS) -o test $(OBJS) $(OBJS): dn_test.h clean: rm -rf test *.o *.core ALLSRC = ipfw/*[ch] Makefile *.[ch] tar: tar cvzf /tmp/qfq-test.tgz $(ALLSRC) dn_test.h000644 000423 000000 00000006200 11334514565 013132 0ustar00luigiwheel000000 000000 /* * $Id: dn_test.h 5225 2010-02-10 10:41:46Z luigi $ * * userspace compatibility code for dummynet schedulers */ #ifndef _DN_TEST_H #define _DN_TEST_H #include #include #include #include /* bzero, ffs, ... */ #include /* strcmp */ #include #include #include #include extern int debug; #define ND(fmt, args...) do {} while (0) #define D1(fmt, args...) do {} while (0) #define D(fmt, args...) fprintf(stderr, "%-8s " fmt "\n", \ __FUNCTION__, ## args) #define DX(lev, fmt, args...) do { \ if (debug > lev) D(fmt, ## args); } while (0) #define offsetof(t,m) (int)((&((t *)0L)->m)) #include /* prevent include of other system headers */ #define _NETINET_IP_VAR_H_ /* ip_fw_args */ #define _IPFW2_H #define _SYS_MBUF_H_ enum { DN_QUEUE, }; enum { DN_SCHED_FIFO, DN_SCHED_WF2QP, }; struct dn_id { int type, subtype, len, id; }; struct dn_fs { int par[4]; /* flowset parameters */ /* simulation entries. 
* 'index' is not strictly necessary * y is used for the inverse mapping , */ int index; int y; /* inverse mapping */ int base_y; /* inverse mapping */ int next_y; /* inverse mapping */ int n_flows; int first_flow; int next_flow; /* first_flow + n_flows */ /* * when generating, let 'cur' go from 0 to n_flows-1, * then point to flow first_flow + cur */ int cur; }; struct dn_sch { }; struct dn_flow { struct dn_id oid; int length; int len_bytes; int drops; uint64_t tot_bytes; uint32_t flow_id; struct list_head h; /* used by the generator */ }; struct dn_link { }; struct ip_fw_args { }; struct mbuf { struct { int len; } m_pkthdr; struct mbuf *m_nextpkt; int flow_id; /* for testing, index of a flow */ //int flowset_id; /* for testing, index of a flowset */ void *cfg; /* config args */ }; #define MALLOC_DECLARE(x) #define KASSERT(x, y) do { if (!(x)) printf y ; exit(0); } while (0) struct ipfw_flow_id { }; typedef void * module_t; struct _md_t { const char *name; int (*f)(module_t, int, void *); void *p; }; typedef struct _md_t moduledata_t; #define DECLARE_MODULE(name, b, c, d) \ moduledata_t *_g_##name = & b #define MODULE_DEPEND(a, b, c, d, e) #ifdef IPFW #include #include #include #else struct dn_queue { struct dn_fsk *fs; /* parent flowset. */ struct dn_sch_inst *_si; /* parent sched instance. */ }; struct dn_schk { }; struct dn_fsk { struct dn_fs fs; struct dn_schk *sched; }; struct dn_sch_inst { struct dn_schk *sched; }; struct dn_alg { int type; const char *name; void *enqueue, *dequeue; int q_datalen, si_datalen, schk_datalen; int (*config)(struct dn_schk *); int (*new_sched)(struct dn_sch_inst *); int (*new_fsk)(struct dn_fsk *); int (*new_queue)(struct dn_queue *q); }; #endif static inline void mq_append(struct mq *q, struct mbuf *m) { if (q->head == NULL) q->head = m; else q->tail->m_nextpkt = m; q->tail = m; m->m_nextpkt = NULL; } #endif /* _DN_TEST_H */ main.c000644 000423 000000 00000036570 11334514565 012426 0ustar00luigiwheel000000 000000 /* * $Id: main.c 5225 2010-02-10 10:41:46Z luigi $ * * Testing program for schedulers * * The framework include a simple controller which, at each * iteration, decides whether we can enqueue and/or dequeue. * Then the mainloop runs the required number of tests, * keeping track of statistics. 
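 *
 * One detail worth spelling out (a sketch; the FI2Q/Q2FI macros below do
 * the same): queues are allocated as a flat byte array because their real
 * size is only known at run time (base struct plus the scheduler's private
 * area), so flow_id <-> queue translation is done with byte-stride
 * arithmetic rather than normal array indexing:
 *
 *	struct dn_queue *fi2q(char *base, int q_len, int i)
 *	{
 *		return (struct dn_queue *)(base + (size_t)i * q_len);
 *	}
 *
 *	int q2fi(char *base, int q_len, struct dn_queue *q)
 *	{
 *		return ((char *)q - base) / q_len;
 *	}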
*/ #include "dn_test.h" struct q_list { struct list_head h; }; struct cfg_s { int ac; char * const *av; const char *name; int loops; struct timeval time; /* running counters */ uint32_t _enqueue; uint32_t drop; uint32_t pending; uint32_t dequeue; /* generator parameters */ int th_min, th_max; int maxburst; int lmin, lmax; /* packet len */ int flows; /* number of flows */ int flowsets; /* number of flowsets */ int wsum; /* sum of weights of all flows */ int max_y; /* max random number in the generation */ int cur_y, cur_fs; /* used in generation, between 0 and max_y - 1 */ const char *fs_config; /* flowset config */ int can_dequeue; int burst; /* count of packets sent in a burst */ struct mbuf *tosend; /* packet to send -- also flag to enqueue */ struct mbuf *freelist; struct mbuf *head, *tail; /* a simple tailq */ /* scheduler hooks */ int (*enq)(struct dn_sch_inst *, struct dn_queue *, struct mbuf *); struct mbuf * (*deq)(struct dn_sch_inst *); /* size of the three fields including sched-specific areas */ int schk_len; int q_len; /* size of a queue including sched-fields */ int si_len; /* size of a sch_inst including sched-fields */ char *q; /* array of flow queues */ /* use a char* because size is variable */ struct dn_fsk *fs; /* array of flowsets */ struct dn_sch_inst *si; struct dn_schk *sched; /* generator state */ int state; /* 0 = going up, 1: going down */ /* * We keep lists for each backlog level, and always serve * the one with shortest backlog. llmask contains a bitmap * of lists, and ll are the heads of the lists. The last * entry (BACKLOG) contains all entries considered 'full' * XXX to optimize things, entry i could contain queues with * 2^{i-1}+1 .. 2^i entries. */ #define BACKLOG 30 uint32_t llmask; struct list_head ll[BACKLOG + 10]; }; /* FI2Q and Q2FI converts from flow_id to dn_queue and back. * We cannot easily use pointer arithmetic because it is variable size. */ #define FI2Q(c, i) ((struct dn_queue *)((c)->q + (c)->q_len * (i))) #define Q2FI(c, q) (((char *)(q) - (c)->q)/(c)->q_len) int debug = 0; static void controller(struct cfg_s *c); /* release a packet: put the mbuf in the freelist, and the queue in * the bucket. */ int drop(struct cfg_s *c, struct mbuf *m) { struct dn_queue *q; int i; c->drop++; q = FI2Q(c, m->flow_id); i = q->ni.length; // XXX or ffs... 
ND("q %p id %d current length %d", q, m->flow_id, i); if (i < BACKLOG) { struct list_head *h = &q->ni.h; c->llmask &= ~(1<<(i+1)); c->llmask |= (1<<(i)); list_del(h); list_add_tail(h, &c->ll[i]); } m->m_nextpkt = c->freelist; c->freelist = m; return 0; } /* dequeue returns NON-NULL when a packet is dropped */ static int enqueue(struct cfg_s *c, void *_m) { struct mbuf *m = _m; if (c->enq) return c->enq(c->si, FI2Q(c, m->flow_id), m); if (c->head == NULL) c->head = m; else c->tail->m_nextpkt = m; c->tail = m; return 0; /* default - success */ } /* dequeue returns NON-NULL when a packet is available */ static void * dequeue(struct cfg_s *c) { struct mbuf *m; if (c->deq) return c->deq(c->si); if ((m = c->head)) { m = c->head; c->head = m->m_nextpkt; m->m_nextpkt = NULL; } return m; } static int mainloop(struct cfg_s *c) { int i; struct mbuf *m; for (i=0; i < c->loops; i++) { /* implement histeresis */ controller(c); DX(3, "loop %d enq %d send %p rx %d", i, c->_enqueue, c->tosend, c->can_dequeue); if ( (m = c->tosend) ) { c->_enqueue++; if (enqueue(c, m)) { drop(c, m); ND("loop %d enqueue fail", i ); } else { ND("enqueue ok"); c->pending++; } } if (c->can_dequeue) { c->dequeue++; if ((m = dequeue(c))) { c->pending--; drop(c, m); c->drop--; /* compensate */ } } } DX(1, "mainloop ends %d", i); return 0; } int dump(struct cfg_s *c) { int i; struct dn_queue *q; for (i=0; i < c->flows; i++) { q = FI2Q(c, i); DX(1, "queue %4d tot %10lld", i, q->ni.tot_bytes); } DX(1, "done %d loops\n", c->loops); return 0; } /* interpret a number in human form */ static long getnum(const char *s, char **next, const char *key) { char *end = NULL; long l; if (next) /* default */ *next = NULL; if (s && *s) { DX(3, "token is <%s> %s", s, key ? key : "-"); l = strtol(s, &end, 0); } else { DX(3, "empty string"); l = -1; } if (l < 0) { DX(2, "invalid %s for %s", s ? s : "NULL", (key ? key : "") ); return 0; // invalid } if (!end || !*end) return l; if (*end == 'n') l = -l; /* multiply by n */ else if (*end == 'K') l = l*1000; else if (*end == 'M') l = l*1000000; else if (*end == 'k') l = l*1024; else if (*end == 'm') l = l*1024*1024; else if (*end == 'w') ; else {/* not recognized */ D("suffix %s for %s, next %p", end, key, next); end--; } end++; DX(3, "suffix now %s for %s, next %p", end, key, next); if (next && *end) { DX(3, "setting next to %s for %s", end, key); *next = end; } return l; } /* * flowsets are a comma-separated list of * weight:maxlen:flows * indicating how many flows are hooked to that fs. * Both weight and range can be min-max-steps. * In a first pass we just count the number of flowsets and flows, * in a second pass we complete the setup. */ static void parse_flowsets(struct cfg_s *c, const char *fs, int pass) { char *s, *cur, *next; int n_flows = 0, n_fs = 0, wsum = 0; int i, j; struct dn_fs *prev = NULL; DX(3, "--- pass %d flows %d flowsets %d", pass, c->flows, c->flowsets); if (pass == 0) c->fs_config = fs; s = c->fs_config ? strdup(c->fs_config) : NULL; if (s == NULL) { if (pass == 0) D("no fsconfig"); return; } for (next = s; (cur = strsep(&next, ","));) { char *p = NULL; int w, w_h, w_steps, wi; int len, len_h, l_steps, li; int flows; w = getnum(strsep(&cur, ":"), &p, "weight"); if (w <= 0) w = 1; w_h = p ? getnum(p+1, &p, "weight_max") : w; w_steps = p ? getnum(p+1, &p, "w_steps") : (w_h == w ?1:2); len = getnum(strsep(&cur, ":"), &p, "len"); if (len <= 0) len = 1000; len_h = p ? getnum(p+1, &p, "len_max") : len; l_steps = p ? getnum(p+1, &p, "l_steps") : (len_h == len ? 
1 : 2); flows = getnum(strsep(&cur, ":"), NULL, "flows"); if (flows == 0) flows = 1; DX(4, "weight %d..%d (%d) len %d..%d (%d) flows %d", w, w_h, w_steps, len, len_h, l_steps, flows); if (w == 0 || w_h < w || len == 0 || len_h < len || flows == 0) { DX(4,"wrong parameters %s", fs); return; } n_flows += flows * w_steps * l_steps; for (i = 0; i < w_steps; i++) { wi = w + ((w_h - w)* i)/(w_steps == 1 ? 1 : (w_steps-1)); for (j = 0; j < l_steps; j++, n_fs++) { struct dn_fs *fs = &c->fs[n_fs].fs; // tentative int x; li = len + ((len_h - len)* j)/(l_steps == 1 ? 1 : (l_steps-1)); x = (wi*2048)/li; DX(3, "----- fs %4d weight %4d lmax %4d X %4d flows %d", n_fs, wi, li, x, flows); if (pass == 0) continue; if (c->fs == NULL || c->flowsets <= n_fs) { D("error in number of flowsets"); return; } wsum += wi * flows; fs->par[0] = wi; fs->par[1] = li; fs->index = n_fs; fs->n_flows = flows; fs->cur = fs->first_flow = prev==NULL ? 0 : prev->next_flow; fs->next_flow = fs->first_flow + fs->n_flows; fs->y = x * flows; fs->base_y = (prev == NULL) ? 0 : prev->next_y; fs->next_y = fs->base_y + fs->y; prev = fs; } } } c->max_y = prev ? prev->base_y + prev->y : 0; c->flows = n_flows; c->flowsets = n_fs; c->wsum = wsum; if (pass == 0) return; /* now link all flows to their parent flowsets */ DX(1,"%d flows on %d flowsets max_y %d", c->flows, c->flowsets, c->max_y); for (i=0; i < c->flowsets; i++) { struct dn_fs *fs = &c->fs[i].fs; DX(1, "fs %3d w %5d l %4d flow %5d .. %5d y %6d .. %6d", i, fs->par[0], fs->par[1], fs->first_flow, fs->next_flow, fs->base_y, fs->next_y); for (j = fs->first_flow; j < fs->next_flow; j++) { struct dn_queue *q = FI2Q(c, j); q->fs = &c->fs[i]; } } } static int init(struct cfg_s *c) { int i; int ac = c->ac; char * const *av = c->av; c->si_len = sizeof(struct dn_sch_inst); c->q_len = sizeof(struct dn_queue); moduledata_t *mod = NULL; struct dn_alg *p = NULL; c->th_min = 0; c->th_max = -20;/* 20 packets per flow */ c->lmin = c->lmax = 1280; /* packet len */ c->flows = 1; c->flowsets = 1; c->name = "null"; ac--; av++; while (ac > 1) { if (!strcmp(*av, "-n")) { c->loops = getnum(av[1], NULL, av[0]); } else if (!strcmp(*av, "-d")) { debug = atoi(av[1]); } else if (!strcmp(*av, "-alg")) { extern moduledata_t *_g_dn_fifo; extern moduledata_t *_g_dn_wf2qp; extern moduledata_t *_g_dn_rr; extern moduledata_t *_g_dn_qfq; #ifdef WITH_KPS extern moduledata_t *_g_dn_kps; #endif if (!strcmp(av[1], "rr")) mod = _g_dn_rr; else if (!strcmp(av[1], "wf2qp")) mod = _g_dn_wf2qp; else if (!strcmp(av[1], "fifo")) mod = _g_dn_fifo; else if (!strcmp(av[1], "qfq")) mod = _g_dn_qfq; #ifdef WITH_KPS else if (!strcmp(av[1], "kps")) mod = _g_dn_kps; #endif else mod = NULL; c->name = mod ? 
mod->name : "NULL"; DX(3, "using scheduler %s", c->name); } else if (!strcmp(*av, "-len")) { c->lmin = getnum(av[1], NULL, av[0]); c->lmax = c->lmin; DX(3, "setting max to %d", c->th_max); } else if (!strcmp(*av, "-burst")) { c->maxburst = getnum(av[1], NULL, av[0]); DX(3, "setting max to %d", c->th_max); } else if (!strcmp(*av, "-qmax")) { c->th_max = getnum(av[1], NULL, av[0]); DX(3, "setting max to %d", c->th_max); } else if (!strcmp(*av, "-qmin")) { c->th_min = getnum(av[1], NULL, av[0]); DX(3, "setting min to %d", c->th_min); } else if (!strcmp(*av, "-flows")) { c->flows = getnum(av[1], NULL, av[0]); DX(3, "setting flows to %d", c->flows); } else if (!strcmp(*av, "-flowsets")) { parse_flowsets(c, av[1], 0); DX(3, "setting flowsets to %d", c->flowsets); } else { D("option %s not recognised, ignore", *av); } ac -= 2; av += 2; } if (c->maxburst <= 0) c->maxburst = 1; if (c->loops <= 0) c->loops = 1; if (c->flows <= 0) c->flows = 1; if (c->flowsets <= 0) c->flowsets = 1; if (c->lmin <= 0) c->lmin = 1; if (c->lmax <= 0) c->lmax = 1; /* multiply by N */ if (c->th_min < 0) c->th_min = c->flows * -c->th_min; if (c->th_max < 0) c->th_max = c->flows * -c->th_max; if (c->th_max <= c->th_min) c->th_max = c->th_min + 1; if (mod) { p = mod->p; DX(3, "using module %s f %p p %p", mod->name, mod->f, mod->p); DX(3, "modname %s ty %d", p->name, p->type); c->enq = p->enqueue; c->deq = p->dequeue; c->si_len += p->si_datalen; c->q_len += p->q_datalen; c->schk_len += p->schk_datalen; } /* allocate queues, flowsets and one scheduler */ c->q = calloc(c->flows, c->q_len); c->fs = calloc(c->flowsets, sizeof(struct dn_fsk)); c->si = calloc(1, c->si_len); c->sched = calloc(c->flows, c->schk_len); if (c->q == NULL || c->fs == NULL) { D("error allocating memory for flows"); exit(1); } c->si->sched = c->sched; if (p) { if (p->config) p->config(c->sched); if (p->new_sched) p->new_sched(c->si); } /* parse_flowsets links queues to their flowsets */ parse_flowsets(c, av[1], 1); /* complete the work calling new_fsk */ for (i = 0; i < c->flowsets; i++) { if (c->fs[i].fs.par[1] == 0) c->fs[i].fs.par[1] = 1000; /* default pkt len */ c->fs[i].sched = c->sched; if (p && p->new_fsk) p->new_fsk(&c->fs[i]); } /* initialize the lists for the generator, and put * all flows in the list for backlog = 0 */ for (i=0; i <= BACKLOG+5; i++) INIT_LIST_HEAD(&c->ll[i]); for (i = 0; i < c->flows; i++) { struct dn_queue *q = FI2Q(c, i); if (q->fs == NULL) q->fs = &c->fs[0]; /* XXX */ q->_si = c->si; if (p && p->new_queue) p->new_queue(q); INIT_LIST_HEAD(&q->ni.h); list_add_tail(&q->ni.h, &c->ll[0]); } c->llmask = 1; return 0; } int main(int ac, char *av[]) { struct cfg_s c; struct timeval end; double ll; int i; char msg[40]; bzero(&c, sizeof(c)); c.ac = ac; c.av = av; init(&c); gettimeofday(&c.time, NULL); mainloop(&c); gettimeofday(&end, NULL); end.tv_sec -= c.time.tv_sec; end.tv_usec -= c.time.tv_usec; if (end.tv_usec < 0) { end.tv_usec += 1000000; end.tv_sec--; } c.time = end; ll = end.tv_sec*1000000 + end.tv_usec; ll *= 1000; /* convert to nanoseconds */ ll /= c._enqueue; sprintf(msg, "1::%d", c.flows); D("%-8s n %d %d time %d.%06d %8.3f qlen %d %d flows %s drops %d", c.name, c._enqueue, c.loops, (int)c.time.tv_sec, (int)c.time.tv_usec, ll, c.th_min, c.th_max, c.fs_config ? 
c.fs_config : msg, c.drop); dump(&c); DX(1, "done ac %d av %p", ac, av); for (i=0; i < ac; i++) DX(1, "arg %d %s", i, av[i]); return 0; } /* * The controller decides whether in this iteration we should send * (the packet is in c->tosend) and/or receive (flag c->can_dequeue) */ static void controller(struct cfg_s *c) { struct mbuf *m; struct dn_fs *fs; int flow_id; /* histeresis between max and min */ if (c->state == 0 && c->pending >= c->th_max) c->state = 1; else if (c->state == 1 && c->pending <= c->th_min) c->state = 0; ND(1, "state %d pending %2d", c->state, c->pending); c->can_dequeue = c->state; c->tosend = NULL; if (c->state) return; if (1) { int i; struct dn_queue *q; struct list_head *h; i = ffs(c->llmask) - 1; if (i < 0) { DX(2, "no candidate"); c->can_dequeue = 1; return; } h = &c->ll[i]; ND(1, "backlog %d p %p prev %p next %p", i, h, h->prev, h->next); q = list_first_entry(h, struct dn_queue, ni.h); list_del(&q->ni.h); flow_id = Q2FI(c, q); DX(2, "extracted flow %p %d backlog %d", q, flow_id, i); if (list_empty(h)) { ND(2, "backlog %d empty", i); c->llmask &= ~(1<ni.h, h+1); ND(1, " after %d p %p prev %p next %p", i+1, h+1, h[1].prev, h[1].next); if (i < BACKLOG) { ND(2, "backlog %d full", i+1); c->llmask |= 1<<(1+i); } fs = &q->fs->fs; c->cur_fs = q->fs - c->fs; fs->cur = flow_id; } else { /* XXX this does not work ? */ /* now decide whom to send the packet, and the length */ /* lookup in the flow table */ if (c->cur_y >= c->max_y) { /* handle wraparound */ c->cur_y = 0; c->cur_fs = 0; } fs = &c->fs[c->cur_fs].fs; flow_id = fs->cur++; if (fs->cur >= fs->next_flow) fs->cur = fs->first_flow; c->cur_y++; if (c->cur_y >= fs->next_y) c->cur_fs++; } /* construct a packet */ if (c->freelist) { m = c->tosend = c->freelist; c->freelist = c->freelist->m_nextpkt; } else { m = c->tosend = calloc(1, sizeof(struct mbuf)); } if (m == NULL) return; m->cfg = c; m->m_nextpkt = NULL; m->m_pkthdr.len = fs->par[1]; // XXX maxlen m->flow_id = flow_id; ND(2,"y %6d flow %5d fs %3d weight %4d len %4d", c->cur_y, m->flow_id, c->cur_fs, fs->par[0], m->m_pkthdr.len); } /* Packet allocation: to achieve a distribution that matches weights, for each X=w/lmax class we should generate a number of packets proportional to Y = X times the number of flows in the class. So we construct an array with the cumulative distribution of Y's, and use it to identify the flow via inverse mapping (if the Y's are not too many we can use an array for the lookup). In practice, each flow will have X entries [virtually] pointing to it. 
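 *
 * A linear-scan sketch of that inverse mapping (the generator above walks
 * it incrementally with cur_y/cur_fs instead of searching every time;
 * flow_for() is only illustrative):
 *
 *	int flow_for(struct cfg_s *c, int y)	// y in [0, c->max_y)
 *	{
 *		int i;
 *
 *		for (i = 0; i < c->flowsets; i++) {
 *			struct dn_fs *fs = &c->fs[i].fs;
 *
 *			if (y < fs->next_y)
 *				return fs->first_flow +
 *				    (y - fs->base_y) % fs->n_flows;
 *		}
 *		return 0;	// out of range, should not happen
 *	}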
*/ mylist.h000644 000423 000000 00000002030 11331234700 012773 0ustar00luigiwheel000000 000000 /* * linux-like bidirectional lists */ #ifndef _MYLIST_H #define _MYLIST_H struct list_head { struct list_head *prev, *next; }; #define INIT_LIST_HEAD(l) do { (l)->prev = (l)->next = (l); } while (0) #define list_empty(l) ( (l)->next == l ) static inline void __list_add(struct list_head *new, struct list_head *prev, struct list_head *next) { next->prev = new; new->next = next; new->prev = prev; prev->next = new; } static inline void list_add_tail(struct list_head *new, struct list_head *head) { __list_add(new, head->prev, head); } #define list_first_entry(pL, ty, member) \ (ty *)((char *)((pL)->next) - offsetof(ty, member)) static inline void __list_del(struct list_head *prev, struct list_head *next) { next->prev = prev; prev->next = next; } static inline void list_del(struct list_head *entry) { ND("called on %p", entry); __list_del(entry->prev, entry->next); entry->next = entry->prev = NULL; } #endif /* _MYLIST_H */ test_dn_sched.c000644 000423 000000 00000002636 11334514565 014304 0ustar00luigiwheel000000 000000 /* * $Id: test_dn_sched.c 5225 2010-02-10 10:41:46Z luigi $ library functions for dummynet schedulers */ #include "dn_test.h" void m_freem(struct mbuf *m) { printf("free %p\n", m); } int dn_sched_modevent(module_t mod, int cmd, void *arg) { return 0; } void dn_free_pkts(struct mbuf *m) { struct mbuf *x; while ( (x = m) ) { m = m->m_nextpkt; m_freem(x); } } int dn_delete_queue(void *_q, void *do_free) { struct dn_queue *q = _q; if (q->mq.head) dn_free_pkts(q->mq.head); free(q); return 0; } /* * This is a simplified function for testing purposes, which does * not implement statistics or random loss. * Enqueue a packet in q, subject to space and queue management policy * (whose parameters are in q->fs). * Update stats for the queue and the scheduler. * Return 0 on success, 1 on drop. The packet is consumed anyways. */ int dn_enqueue(struct dn_queue *q, struct mbuf* m, int drop) { if (drop) goto drop; if (q->ni.length >= 200) goto drop; mq_append(&q->mq, m); q->ni.length++; q->ni.tot_bytes += m->m_pkthdr.len; return 0; drop: q->ni.drops++; return 1; } int ipdn_bound_var(int *v, int dflt, int lo, int hi, const char *msg) { if (*v < lo) { *v = dflt; } else if (*v > hi) { *v = hi; } return *v; }
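/*
 * A minimal usage sketch of ipdn_bound_var() above (example_bound_var() is
 * only an illustrative helper, not one of the dummynet entry points):
 * out-of-range parameters either fall back to the default or are clamped
 * to the upper bound, which is how the *_new_fsk() routines sanitize the
 * user-supplied weight and quantum values.
 */
void
example_bound_var(void)
{
	int w = 0;	/* below the minimum: replaced by the default */
	int q = 5000;	/* above the maximum: clamped to hi */

	ipdn_bound_var(&w, 1, 1, 100, "WF2Q+ weight");	/* w becomes 1 */
	ipdn_bound_var(&q, 1500, 64, 2048, "RR quantum");	/* q becomes 2048 */
	printf("w=%d q=%d\n", w, q);
}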