Index: sys/conf/files
===================================================================
RCS file: /home/ncvs/src/sys/conf/files,v
retrieving revision 1.340.2.94
diff -u -r1.340.2.94 files
--- sys/conf/files	26 Mar 2002 10:12:22 -0000	1.340.2.94
+++ sys/conf/files	18 Jul 2002 20:17:01 -0000
@@ -530,6 +530,7 @@
 isofs/cd9660/cd9660_util.c	optional cd9660
 isofs/cd9660/cd9660_vfsops.c	optional cd9660
 isofs/cd9660/cd9660_vnops.c	optional cd9660
+kern/heap.c		standard
 kern/imgact_aout.c	standard
 kern/imgact_elf.c	standard
 kern/imgact_gzip.c	optional gzip
@@ -570,6 +571,7 @@
 kern/kern_sig.c		standard
 kern/kern_subr.c	standard
 kern/kern_switch.c	standard
+kern/kern_switch_ps.c	standard
 kern/kern_synch.c	standard
 kern/kern_syscalls.c	standard
 kern/kern_sysctl.c	standard
Index: sys/sys/proc.h
===================================================================
RCS file: /home/ncvs/src/sys/sys/proc.h,v
retrieving revision 1.99.2.7
diff -u -r1.99.2.7 proc.h
--- sys/sys/proc.h	31 Jan 2002 18:40:29 -0000	1.99.2.7
+++ sys/sys/proc.h	18 Jul 2002 19:58:45 -0000
@@ -55,6 +55,53 @@
 #include <sys/event.h>			/* For struct klist */
 
 /*
+ * Interface for schedulers
+ */
+#define	FEEDBACK_PRIORITY	0
+#define	PROPORTIONAL_SHARE	1
+
+typedef void sched_fun_t (struct proc *);
+typedef void sched_conf_t (void *);
+typedef int curpriority_cmp_t (struct proc *);
+typedef struct proc * chooseproc_t (void);
+typedef u_int32_t procrunnable_t (void);
+
+struct _sched_interface {
+	struct _sched_interface *next;
+	char	*name;
+	int	id;
+
+	sched_conf_t	*sched_init;
+	void		(*sched_move)(void);
+	sched_fun_t	*setrunqueue;
+	sched_fun_t	*remrunqueue;
+	sched_fun_t	*schedulerexit;
+	sched_fun_t	*resetpriority;
+	sched_fun_t	*schedcpu1;
+	sched_fun_t	*schedclock1;
+	curpriority_cmp_t *curpriority_cmp;
+	chooseproc_t	*chooseproc;
+	procrunnable_t	*procrunnable;
+};
+
+extern struct _sched_interface *schedulers;
+
+extern sched_conf_t *sched_init;
+extern sched_fun_t *setrunqueue;
+extern sched_fun_t *remrunqueue;
+extern sched_fun_t *schedulerexit;
+extern sched_fun_t *resetpriority;
+extern sched_fun_t *schedcpu1;
+extern sched_fun_t *schedclock1;
+extern curpriority_cmp_t *curpriority_cmp;
+extern chooseproc_t *Xchooseproc;
+extern procrunnable_t *Xprocrunnable;
+
+/*
+ * End of scheduler interface
+ */
+
+/*
  * One structure allocated per session.
  */
 struct session {
@@ -429,15 +476,13 @@
 void	mi_switch __P((void));
 void	procinit __P((void));
 int	p_trespass __P((struct proc *p1, struct proc *p2));
-void	resetpriority __P((struct proc *));
+void	maybe_resched __P((struct proc *chk));
 int	roundrobin_interval __P((void));
 void	schedclock __P((struct proc *));
 void	setrunnable __P((struct proc *));
-void	setrunqueue __P((struct proc *));
 void	sleepinit __P((void));
 int	suser __P((struct proc *));
 int	suser_xxx __P((struct ucred *cred, struct proc *proc, int flag));
-void	remrunqueue __P((struct proc *));
 void	cpu_switch __P((struct proc *));
 void	unsleep __P((struct proc *));
Index: sys/kern/kern_exit.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/kern_exit.c,v
retrieving revision 1.92.2.10
diff -u -r1.92.2.10 kern_exit.c
--- sys/kern/kern_exit.c	29 Apr 2002 09:42:35 -0000	1.92.2.10
+++ sys/kern/kern_exit.c	18 Jul 2002 22:14:58 -0000
@@ -269,6 +269,7 @@
 		p->p_textvp = NULL;
 		vrele(vtmp);
 	}
+	schedulerexit(p);	/* Remove proc from scheduler */
 
 	/*
 	 * Remove proc from allproc queue and pidhash chain.
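The proc.h hunk above is the heart of the patch: schedulers register themselves on a linked list at boot, and the kernel dispatches through a set of function pointers that can be re-targeted at runtime. Here is a minimal user-space sketch of that dispatch scheme; the two-hook interface and the demo_* names are invented for illustration (the real patch wires up many more hooks):

/*
 * Sketch of function-pointer scheduler dispatch, user-space only.
 */
#include <stdio.h>

struct sched_if {
	struct sched_if	*next;		/* registration list, as in the patch */
	const char	*name;
	int		id;
	void		(*choose)(void);	/* stand-in for chooseproc */
};

static struct sched_if *schedulers;	/* head of the registration list */
static void (*choose_hook)(void);	/* currently active hook */

static void fp_choose(void) { printf("feedback-priority choice\n"); }
static void ps_choose(void) { printf("proportional-share choice\n"); }

static struct sched_if fp = { NULL, "fp", 0, fp_choose };
static struct sched_if ps = { NULL, "ps", 1, ps_choose };

/* mimic SYSINIT-time registration: push on the list head */
static void reg(struct sched_if *s) { s->next = schedulers; schedulers = s; }

/* mimic sched_setup()/sysctl_kern_scheduler(): find by id, swap the hook */
static int activate(int id)
{
	struct sched_if *p;

	for (p = schedulers; p && p->id != id; p = p->next)
		;
	if (p == NULL)
		return -1;
	choose_hook = p->choose;
	return 0;
}

int main(void)
{
	reg(&fp);
	reg(&ps);
	activate(1);		/* like sysctl kern.scheduler=1 */
	choose_hook();		/* callers never know which scheduler runs */
	activate(0);
	choose_hook();
	return 0;
}

The point of the indirection is that callers such as kern_exit.c above only ever see the hook name (schedulerexit), never the implementation.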
Index: sys/kern/kern_switch.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/kern_switch.c,v
retrieving revision 1.3.2.1
diff -u -r1.3.2.1 kern_switch.c
--- sys/kern/kern_switch.c	16 May 2000 06:58:12 -0000	1.3.2.1
+++ sys/kern/kern_switch.c	18 Jul 2002 19:58:26 -0000
@@ -32,6 +32,9 @@
 #include
 #include
 #include
+#include
+#include
+#include
 
 /*
  * We have NQS (32) run queues per scheduling class.  For the normal
@@ -52,6 +55,28 @@
 u_int32_t rtqueuebits;
 u_int32_t idqueuebits;
 
+static struct callout_handle thandle;	/* round robin timeout */
+/*
+ * Force switch among equal priority processes every sched_quantum
+ * (default value is 100ms).
+ */
+/* ARGSUSED */
+static void
+roundrobin(void *arg)
+{
+#ifdef SMP
+	need_resched();
+	forward_roundrobin();
+#else
+	struct proc *p = curproc;	/* XXX */
+
+	if (p == 0 || RTP_PRIO_NEED_RR(p->p_rtprio.type))
+		need_resched();
+#endif
+
+	thandle = timeout(roundrobin, NULL, sched_quantum);
+}
+
 /*
  * Initialize the run queues at boot time.
  */
@@ -65,8 +90,9 @@
 		TAILQ_INIT(&rtqueues[i]);
 		TAILQ_INIT(&idqueues[i]);
 	}
+	/* Kick off timeout driven events by calling first time. */
+	roundrobin(NULL);
 }
-SYSINIT(runqueue, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, rqinit, NULL)
 
 /*
  * setrunqueue() examines a process priority and class and inserts it on
@@ -75,8 +101,8 @@
  * The process must be runnable.
  * This must be called at splhigh().
  */
-void
-setrunqueue(struct proc *p)
+static void
+fp_setrunqueue(struct proc *p)
 {
 	struct rq *q;
 	u_int8_t pri;
@@ -107,8 +133,8 @@
  * clearing the queue busy bit if it becomes empty.
  * This must be called at splhigh().
  */
-void
-remrunqueue(struct proc *p)
+static void
+fp_remrunqueue(struct proc *p)
 {
 	struct rq *q;
 	u_int32_t *which;
@@ -143,8 +169,8 @@
  *
  * MP SAFE.  CALLED WITHOUT THE MP LOCK
 */
-u_int32_t
-procrunnable(void)
+static u_int32_t
+fp_procrunnable(void)
 {
 	return (rtqueuebits || queuebits || idqueuebits);
 }
@@ -162,8 +188,8 @@
 * choose a slightly lower priority process in order to preserve the cpu
 * caches.
 */
-struct proc *
-chooseproc(void)
+static struct proc *
+fp_chooseproc(void)
 {
 	struct proc *p;
 	struct rq *q;
@@ -206,3 +232,133 @@
 	*which &= ~(1 << pri);
 	return p;
 }
+
+/*-
+ * Compare priorities.  Return:
+ *  <0: priority of p < current priority
+ *   0: priority of p == current priority
+ *  >0: priority of p > current priority
+ * The priorities are the normal priorities or the normal realtime priorities
+ * if p is on the same scheduler as curproc.  Otherwise the process on the
+ * more realtimeish scheduler has lowest priority.  As usual, a higher
+ * priority really means a lower priority.
+ */
+static int
+fp_curpriority_cmp(struct proc *p)
+{
+	int c_class, p_class;
+
+	c_class = RTP_PRIO_BASE(curproc->p_rtprio.type);
+	p_class = RTP_PRIO_BASE(p->p_rtprio.type);
+	if (p_class != c_class)
+		return (p_class - c_class);
+	if (p_class == RTP_PRIO_NORMAL)
+		return (((int)p->p_priority - (int)curpriority) / PPQ);
+	return ((int)p->p_rtprio.prio - (int)curproc->p_rtprio.prio);
+}
+
+/*
+ * Dummy function entry: this scheduler doesn't need a schedulerexit
+ */
+static void
+fp_schedulerexit(struct proc *p)
+{
+}
+
+/*
+ * Compute the priority of a process when running in user mode.
+ * Arrange to reschedule if the resulting priority is better
+ * than that of the current process.
+ */
+static void
+fp_resetpriority(struct proc *p)
+{
+	unsigned int newpriority;
+
+	if (p->p_rtprio.type == RTP_PRIO_NORMAL) {
+		newpriority = PUSER + p->p_estcpu / INVERSE_ESTCPU_WEIGHT +
+		    NICE_WEIGHT * p->p_nice;
+		newpriority = min(newpriority, MAXPRI);
+		p->p_usrpri = newpriority;
+	}
+	maybe_resched(p);
+}
+
+static void
+fp_schedcpu1(struct proc *p)
+{
+	fp_resetpriority(p);
+	if (p->p_priority >= PUSER) {
+		if ((p != curproc) &&
+#ifdef SMP
+		    p->p_oncpu == 0xff &&	/* idle */
+#endif
+		    p->p_stat == SRUN &&
+		    (p->p_flag & P_INMEM) &&
+		    (p->p_priority / PPQ) != (p->p_usrpri / PPQ)) {
+			remrunqueue(p);
+			p->p_priority = p->p_usrpri;
+			setrunqueue(p);
+		} else
+			p->p_priority = p->p_usrpri;
+	}
+}
+
+static void
+fp_schedclock1(struct proc *p)
+{
+	if ((p->p_estcpu % INVERSE_ESTCPU_WEIGHT) == 0) {
+		resetpriority(p);
+		if (p->p_priority >= PUSER)
+			p->p_priority = p->p_usrpri;
+	}
+}
+
+/*
+ * Moves processes from the Feedback Priority scheduler to the new
+ * current scheduler.
+ */
+static void
+fp_sched_move(void)
+{
+	struct rq *work_queue[3] = { queues, rtqueues, idqueues };
+	int i, j;
+	struct proc *p;
+
+	untimeout(roundrobin, NULL, thandle);	/* stop roundrobin */
+	sched_init(NULL);			/* init new scheduler */
+	/* move processes to the new scheduler */
+	for (i = 0; i < 3; i++)
+		for (j = 0; j < NQS; j++)
+			TAILQ_FOREACH(p, &work_queue[i][j], p_procq)
+				setrunqueue(p);
+
+	queuebits = rtqueuebits = idqueuebits = 0;
+}
+
+static struct _sched_interface fp_scheduler = {
+	NULL,
+	"fp",
+	FEEDBACK_PRIORITY,
+	rqinit,
+	fp_sched_move,
+	fp_setrunqueue,
+	fp_remrunqueue,
+	fp_schedulerexit,
+	fp_resetpriority,
+	fp_schedcpu1,
+	fp_schedclock1,
+	fp_curpriority_cmp,
+	fp_chooseproc,
+	fp_procrunnable
+};
+
+static void
+fp_load(void)
+{
+	printf("Loaded feedback priority scheduler\n");
+	fp_scheduler.next = schedulers;
+	schedulers = &fp_scheduler;
+}
+SYSINIT(fpload, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, fp_load, NULL);
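The "fp" scheduler keeps the classic design that fp_chooseproc() inherits from the stock kernel: 32 run queues per class, a bitmask with one bit per non-empty queue, and a find-first-set to pick the best queue in O(1). A user-space sketch of that trick (using ffs() from <strings.h>; the kernel has its own ffs):

/*
 * The 32-run-queue bitmask trick: bit i set means queue i is non-empty,
 * and the least significant set bit is the highest-priority queue.
 */
#include <stdio.h>
#include <strings.h>	/* ffs() */

#define NQS 32

static unsigned int queuebits;		/* bit i set => queue i non-empty */

static void mark_nonempty(int q) { queuebits |= 1u << q; }
static void mark_empty(int q)    { queuebits &= ~(1u << q); }

static int best_queue(void)
{
	/* ffs() returns the 1-based index of the lowest set bit, 0 if none */
	return ffs(queuebits) - 1;	/* -1 means no runnable process */
}

int main(void)
{
	mark_nonempty(17);
	mark_nonempty(4);
	printf("best queue: %d\n", best_queue());	/* 4 */
	mark_empty(4);
	printf("best queue: %d\n", best_queue());	/* 17 */
	return 0;
}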
Index: sys/kern/kern_synch.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/kern_synch.c,v
retrieving revision 1.87.2.5
diff -u -r1.87.2.5 kern_synch.c
--- sys/kern/kern_synch.c	28 Jun 2002 00:21:44 -0000	1.87.2.5
+++ sys/kern/kern_synch.c	18 Jul 2002 19:58:35 -0000
@@ -58,8 +58,33 @@
 #include
 #include
 
+#define DBB(a) a
+#define DEB(a)
+
+/*
+ * Scheduler interface
+ */
+
+sched_fun_t *setrunqueue;
+sched_fun_t *remrunqueue;
+sched_fun_t *schedulerexit;
+sched_fun_t *resetpriority;
+sched_fun_t *schedcpu1;
+sched_fun_t *schedclock1;
+sched_conf_t *sched_init;
+static void (*sched_move)(void);
+curpriority_cmp_t *curpriority_cmp;
+
+chooseproc_t *Xchooseproc;
+procrunnable_t *Xprocrunnable;
+
+static int sched_algorithm = PROPORTIONAL_SHARE;	/* XXX */
+struct _sched_interface *schedulers = NULL;		/* XXX */
+
+
 static void sched_setup __P((void *dummy));
-SYSINIT(sched_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, sched_setup, NULL)
+SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_ANY, sched_setup, NULL)
+/* XXX It was: SI_SUB_KICK_SCHEDULER */
 
 u_char	curpriority;
 int	hogticks;
@@ -80,11 +105,8 @@
 	0.9944598480048967 * FSCALE,	/* exp(-1/180) */
 };
 
-static int	curpriority_cmp __P((struct proc *p));
 static void	endtsleep __P((void *));
 static void	loadav __P((void *arg));
-static void	maybe_resched __P((struct proc *chk));
-static void	roundrobin __P((void *arg));
 static void	schedcpu __P((void *arg));
 static void	updatepri __P((struct proc *p));
 
@@ -107,36 +129,11 @@
 SYSCTL_PROC(_kern, OID_AUTO, quantum,
 	CTLTYPE_INT|CTLFLAG_RW, 0, sizeof sched_quantum, sysctl_kern_quantum, "I", "");
 
-/*-
- * Compare priorities.  Return:
- *  <0: priority of p < current priority
- *   0: priority of p == current priority
- *  >0: priority of p > current priority
- * The priorities are the normal priorities or the normal realtime priorities
- * if p is on the same scheduler as curproc.  Otherwise the process on the
- * more realtimeish scheduler has lowest priority.  As usual, a higher
- * priority really means a lower priority.
- */
-static int
-curpriority_cmp(p)
-	struct proc *p;
-{
-	int c_class, p_class;
-
-	c_class = RTP_PRIO_BASE(curproc->p_rtprio.type);
-	p_class = RTP_PRIO_BASE(p->p_rtprio.type);
-	if (p_class != c_class)
-		return (p_class - c_class);
-	if (p_class == RTP_PRIO_NORMAL)
-		return (((int)p->p_priority - (int)curpriority) / PPQ);
-	return ((int)p->p_rtprio.prio - (int)curproc->p_rtprio.prio);
-}
-
 /*
  * Arrange to reschedule if necessary, taking the priorities and
  * schedulers into account.
  */
-static void
+void
 maybe_resched(chk)
 	struct proc *chk;
 {
@@ -167,29 +164,6 @@
 }
 
 /*
- * Force switch among equal priority processes every 100ms.
- */
-/* ARGSUSED */
-static void
-roundrobin(arg)
-	void *arg;
-{
-#ifndef SMP
-	struct proc *p = curproc;	/* XXX */
-#endif
-
-#ifdef SMP
-	need_resched();
-	forward_roundrobin();
-#else
-	if (p == 0 || RTP_PRIO_NEED_RR(p->p_rtprio.type))
-		need_resched();
-#endif
-
-	timeout(roundrobin, NULL, sched_quantum);
-}
-
-/*
  * Constants for digital decay and forget:
  *	90% of (p_estcpu) usage in 5 * loadav time
  *	95% of (p_pctcpu) usage in 60 seconds (load insensitive)
@@ -323,21 +297,7 @@
 #endif
 		p->p_cpticks = 0;
 		p->p_estcpu = decay_cpu(loadfac, p->p_estcpu);
-		resetpriority(p);
-		if (p->p_priority >= PUSER) {
-			if ((p != curproc) &&
-#ifdef SMP
-			    p->p_oncpu == 0xff &&	/* idle */
-#endif
-			    p->p_stat == SRUN &&
-			    (p->p_flag & P_INMEM) &&
-			    (p->p_priority / PPQ) != (p->p_usrpri / PPQ)) {
-				remrunqueue(p);
-				p->p_priority = p->p_usrpri;
-				setrunqueue(p);
-			} else
-				p->p_priority = p->p_usrpri;
-		}
+		schedcpu1(p);
 		splx(s);
 	}
 	wakeup((caddr_t)&lbolt);
@@ -419,6 +379,7 @@
 	int s, sig, catch = priority & PCATCH;
 	struct callout_handle thandle;
 
+	TSTMP(4, priority, 101, p->p_pid);
 #ifdef KTRACE
 	if (p && KTRPOINT(p, KTR_CSW))
 		ktrcsw(p->p_tracep, 1, 0);
@@ -811,6 +772,7 @@
 	register struct rlimit *rlim;
 	int x;
 
+	TSTMP(4, 0, 102, p ? p->p_pid : 0);
 	/*
 	 * XXX this spl is almost unnecessary.  It is partly to allow for
 	 * sloppy callers that don't do it (issignal() via CURSIG() is the
@@ -892,6 +854,7 @@
 	register int s;
 
 	s = splhigh();
+	TSTMP(4, 0, 103, p->p_pid);
 	switch (p->p_stat) {
 	case 0:
 	case SRUN:
@@ -922,26 +885,6 @@
 }
 
 /*
- * Compute the priority of a process when running in user mode.
- * Arrange to reschedule if the resulting priority is better
- * than that of the current process.
- */
-void
-resetpriority(p)
-	register struct proc *p;
-{
-	register unsigned int newpriority;
-
-	if (p->p_rtprio.type == RTP_PRIO_NORMAL) {
-		newpriority = PUSER + p->p_estcpu / INVERSE_ESTCPU_WEIGHT +
-		    NICE_WEIGHT * p->p_nice;
-		newpriority = min(newpriority, MAXPRI);
-		p->p_usrpri = newpriority;
-	}
-	maybe_resched(p);
-}
-
-/*
  * Compute a tenex style load average of a quantity on
  * 1, 5 and 15 minute intervals.
  */
@@ -973,20 +916,6 @@
 	    loadav, NULL);
 }
 
-/* ARGSUSED */
-static void
-sched_setup(dummy)
-	void *dummy;
-{
-
-	callout_init(&loadav_callout);
-
-	/* Kick off timeout driven events by calling first time. */
-	roundrobin(NULL);
-	schedcpu(NULL);
-	loadav(NULL);
-}
-
 /*
  * We adjust the priority of the current process.  The priority of
  * a process gets worse as it accumulates CPU time.  The cpu usage
@@ -1005,12 +934,108 @@
 schedclock(p)
 	struct proc *p;
 {
-
 	p->p_cpticks++;
 	p->p_estcpu = ESTCPULIM(p->p_estcpu + 1);
-	if ((p->p_estcpu % INVERSE_ESTCPU_WEIGHT) == 0) {
-		resetpriority(p);
-		if (p->p_priority >= PUSER)
-			p->p_priority = p->p_usrpri;
-	}
+	schedclock1(p);
+}
+
+/* ------------- scheduling algorithm dependent code ----------------------- */
+
+/* ARGSUSED */
+static void
+sched_setup(dummy)
+	void *dummy;
+{
+	struct _sched_interface *p;
+
+	for (p = schedulers; p && p->id != sched_algorithm; p = p->next)
+		;
+
+	if (p == NULL)
+		panic("scheduler not found\n");
+	printf("Active scheduler %s\n", p->name);
+
+	sched_init = p->sched_init;
+	sched_move = p->sched_move;
+	setrunqueue = p->setrunqueue;
+	remrunqueue = p->remrunqueue;
+	schedulerexit = p->schedulerexit;
+	resetpriority = p->resetpriority;
+	schedcpu1 = p->schedcpu1;
+	schedclock1 = p->schedclock1;
+	curpriority_cmp = p->curpriority_cmp;
+	Xchooseproc = p->chooseproc;
+	Xprocrunnable = p->procrunnable;
+
+	callout_init(&loadav_callout);
+
+	sched_init(NULL);
+	/* Kick off timeout driven events by calling first time. */
+	schedcpu(NULL);
+	loadav(NULL);
+}
+
+/*
+ * Implements the scheduling algorithm switch
+ */
+static int
+sysctl_kern_scheduler(SYSCTL_HANDLER_ARGS)
+{
+	int error, new_val, old_val, s;
+	struct _sched_interface *p;
+
+	old_val = new_val = sched_algorithm;
+	error = sysctl_handle_int(oidp, &new_val, 0, req);
+	if (error != 0 || req->newptr == NULL)
+		return (error);
+	s = splhigh();
+	for (p = schedulers; p && p->id != new_val; p = p->next)
+		;
+	if (p == NULL) {
+		printf("kern_scheduler %d not found\n", new_val);
+		splx(s);	/* restore spl before bailing out */
+		return (EINVAL);
+	}
+	if (new_val != old_val) {	/* switch */
+		sched_algorithm = new_val;
+		sched_init = p->sched_init;
+		setrunqueue = p->setrunqueue;
+		remrunqueue = p->remrunqueue;
+		schedulerexit = p->schedulerexit;
+		resetpriority = p->resetpriority;
+		schedcpu1 = p->schedcpu1;
+		schedclock1 = p->schedclock1;
+		curpriority_cmp = p->curpriority_cmp;
+		Xchooseproc = p->chooseproc;
+		Xprocrunnable = p->procrunnable;
+		callout_init(&loadav_callout);
+		sched_move();
+		sched_move = p->sched_move;
+	}
+	splx(s);
+	return (0);
+}
+
+SYSCTL_PROC(_kern, OID_AUTO, scheduler, CTLTYPE_INT|CTLFLAG_RW,
+	0, sizeof sched_algorithm, sysctl_kern_scheduler, "I",
+	"Current scheduling algorithm");
+/*
+ * procrunnable demultiplexing function
+ */
+u_int32_t
+procrunnable(void)
+{
+	return Xprocrunnable();
+}
+
+/*
+ * chooseproc demultiplexing function
+ */
+struct proc *
+chooseproc(void)
+{
+	return Xchooseproc();
 }
+
+
+/* ------------ end of scheduling algorithm dependent code ----------------- */
+
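With both schedulers registered, the kern.scheduler sysctl added above selects the active one at runtime, e.g. `sysctl -w kern.scheduler=1` from the shell. A user-space sketch of the same operation via sysctlbyname(3), assuming a kernel with this patch applied (the ids come from the proc.h hunk):

/*
 * Read and switch the active scheduler through kern.scheduler
 * (FEEDBACK_PRIORITY = 0, PROPORTIONAL_SHARE = 1).
 */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int main(void)
{
	int cur, new = 1;	/* PROPORTIONAL_SHARE */
	size_t len = sizeof(cur);

	if (sysctlbyname("kern.scheduler", &cur, &len, NULL, 0) == -1) {
		perror("sysctlbyname");	/* kernel without this patch */
		return 1;
	}
	printf("active scheduler id: %d\n", cur);
	if (sysctlbyname("kern.scheduler", NULL, NULL, &new, sizeof(new)) == -1)
		perror("switch failed");
	return 0;
}

Note how the handler calls the outgoing scheduler's sched_move() to migrate every queued process before installing the new sched_move pointer; that ordering is what keeps runnable processes from being lost across the switch.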
--- /dev/null	Thu Jul 18 21:51:23 2002
+++ sys/sys/heap.h	Thu Jul 18 22:09:44 2002
@@ -0,0 +1,66 @@
+#ifndef _HEAP_H
+#define _HEAP_H
+
+/*
+ * Definition of heap data structures. In the structures, I decided
+ * not to use the macros in <sys/queue.h> in the hope of making the code
+ * easier to port to other architectures. The types of lists and queues we
+ * use here are pretty simple anyway.
+ */
+
+/*
+ * We use a 64 bit key, "dn_key". Some macros are used to
+ * compare key values and handle wraparounds.
+ * MAX64 returns the largest of two key values.
+ * MY_M is used as a shift count when doing fixed point arithmetic
+ * (a better name would be useful...).
+ */
+typedef u_int64_t dn_key ;	/* sorting key */
+#define DN_KEY_LT(a,b)	((int64_t)((a)-(b)) < 0)
+#define DN_KEY_LEQ(a,b)	((int64_t)((a)-(b)) <= 0)
+#define DN_KEY_GT(a,b)	((int64_t)((a)-(b)) > 0)
+#define DN_KEY_GEQ(a,b)	((int64_t)((a)-(b)) >= 0)
+#define MAX64(x,y)	(( (int64_t) ( (y)-(x) )) > 0 ) ? (y) : (x)
+/*
+ * The OFFSET_OF macro is used to return the offset of a field within
+ * a structure. It is used by the heap management routines.
+ */
+#define OFFSET_OF(type, field) ((int)&( ((type *)0)->field) )
+
+/*
+ * A heap entry is made of a key and a pointer to the actual
+ * object stored in the heap.
+ * The heap is an array of dn_heap_entry entries, dynamically allocated.
+ * Current size is "size", with "elements" actually in use.
+ * The heap normally supports only ordered insert and extract from the top.
+ * If we want to extract an object from the middle of the heap, we
+ * have to know where the object itself is located in the heap (or we
+ * need to scan the whole array). To this purpose, an object has a
+ * field (int) which contains the index of the object itself into the
+ * heap. When the object is moved, the field must also be updated.
+ * The offset of the index in the object is stored in the 'offset'
+ * field in the heap descriptor. The assumption is that this offset
+ * is non-zero if we want to support extract from the middle.
+ */
+struct dn_heap_entry {
+	dn_key key ;	/* sorting key. Topmost element is smallest one */
+	void *object ;	/* object pointer */
+} ;
+
+struct dn_heap {
+	int size ;
+	int elements ;
+	int offset ;	/* XXX if > 0 this is the offset of direct ptr to obj */
+	struct dn_heap_entry *p ;	/* really an array of "size" entries */
+} ;
+
+int heap_init(struct dn_heap *h, int size) ;
+int heap_insert (struct dn_heap *h, dn_key key1, void *p);
+void heap_modify(struct dn_heap *h, void *old, void *new);
+void heap_extract(struct dn_heap *h, void *obj);
+void heapify(struct dn_heap *h);
+void heap_free(struct dn_heap *h);
+
+#endif /* _HEAP_H */
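The DN_KEY_* macros compare keys through a signed difference, so the ordering stays correct even after the 64-bit virtual clock wraps, provided the two keys are less than 2^63 apart. A standalone check of the trick:

/*
 * Why DN_KEY_LT() is wraparound-safe while a plain "<" is not.
 */
#include <stdint.h>
#include <stdio.h>

#define DN_KEY_LT(a,b)	((int64_t)((a)-(b)) < 0)

int main(void)
{
	uint64_t near_wrap = UINT64_MAX - 5;	/* just before wraparound */
	uint64_t wrapped   = 10;		/* logically *after* near_wrap */

	/* plain "<" claims wrapped comes first: wrong logical order */
	printf("naive:  %d\n", wrapped < near_wrap);		/* prints 1 */
	/* signed difference: near_wrap correctly sorts before wrapped */
	printf("DN_KEY: %d\n", DN_KEY_LT(near_wrap, wrapped));	/* prints 1 */
	return 0;
}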
--- /dev/null	Thu Jul 18 21:51:23 2002
+++ sys/kern/heap.c	Thu Jul 18 21:58:19 2002
@@ -0,0 +1,280 @@
+#define DEB(x)
+#define DDB(x)	x
+
+/*
+ * This module implements
+ *  + heap management functions;
+ *
+ * include files marked with XXX are probably not needed
+ */
+
+#include
+#include
+#include
+#include
+#include	/* XXX */
+#include
+#include
+#include
+#include
+
+MALLOC_DEFINE(M_HEAP, "heap", "heap data structures");	/* XXX Check this */
+
+/*
+ * Heap management functions.
+ *
+ * In the heap, first node is element 0. Children of i are 2i+1 and 2i+2.
+ * Some macros help finding parent/children so we can optimize them.
+ *
+ * heap_init() is called to expand the heap when needed.
+ * Increment size in blocks of 16 entries.
+ * XXX failure to allocate a new element is a pretty bad failure
+ * as we basically stall a whole queue forever!!
+ * Returns 1 on error, 0 on success
+ */
+#define HEAP_FATHER(x)	( ( (x) - 1 ) / 2 )
+#define HEAP_LEFT(x)	( 2*(x) + 1 )
+#define HEAP_IS_LEFT(x)	( (x) & 1 )
+#define HEAP_RIGHT(x)	( 2*(x) + 2 )
+#define HEAP_SWAP(a, b, buffer)	{ buffer = a ; a = b ; b = buffer ; }
+#define HEAP_INCREMENT	15
+
+/*
+ * DEBUGGING facility
+ */
+void
+check_heap(struct dn_heap *h, u_char *s)
+{
+	int i, error = 0 ;
+
+	for (i = 0 ; (i < h->elements) && (!error) ; i++) {
+		void *obj = h->p[i].object ;
+
+		error = 1 ;
+		if (obj == NULL)
+			printf("check_heap: null object") ;
+		else if (h->offset > 0 && *((int *)((char *)obj + h->offset)) != i)
+			printf("check_heap: internal offset %d index %d",
+				*((int *)((char *)obj + h->offset)), i) ;
+		else
+			error = 0 ;
+	}
+	if (error)
+		panic(s) ;
+	DEB(printf("%s: check_heap OK\n", s) );
+}
+
+int
+heap_init(struct dn_heap *h, int new_size)
+{
+	struct dn_heap_entry *p;
+
+	if (h->size >= new_size ) {
+		printf("heap_init, Bogus call, have %d want %d\n",
+			h->size, new_size);
+		return 0 ;
+	}
+	new_size = (new_size + HEAP_INCREMENT ) & ~HEAP_INCREMENT ;
+	p = malloc(new_size * sizeof(*p), M_HEAP, M_DONTWAIT );
+	if (p == NULL) {
+		printf(" heap_init, resize %d failed\n", new_size );
+		return 1 ; /* error */
+	}
+	if (h->size > 0) {
+		bcopy(h->p, p, h->size * sizeof(*p) );
+		free(h->p, M_HEAP);
+	}
+	h->p = p ;
+	h->size = new_size ;
+	DDB(check_heap(h, "heap_init")) ;
+	return 0 ;
+}
+
+/*
+ * Insert element in heap. Normally, p != NULL, we insert p in
+ * a new position and bubble up. If p == NULL, then the element is
+ * already in place, and key is the position where to start the
+ * bubble-up.
+ * Returns 1 on failure (cannot allocate new heap entry)
+ *
+ * If offset > 0 the position (index, int) of the element in the heap is
+ * also stored in the element itself at the given offset in bytes.
+ */
+#define SET_OFFSET(heap, node) \
+	if (heap->offset > 0) \
+	    *((int *)((char *)(heap->p[node].object) + heap->offset)) = node ;
+/*
+ * RESET_OFFSET sets the offset to an invalid value.
+ */
+#define RESET_OFFSET(heap, node) \
+	if (heap->offset > 0) \
+	    *((int *)((char *)(heap->p[node].object) + heap->offset)) = -1 ;
+int
+heap_insert(struct dn_heap *h, dn_key key1, void *p)
+{
+	int son = h->elements ;
+
+	if (p == NULL)	/* data already there, set starting point */
+		son = key1 ;
+	else {		/* insert new element at the end, possibly resize */
+		son = h->elements ;
+		if (son == h->size) /* need resize... */
+			if (heap_init(h, h->elements+1) )
+				return 1 ; /* failure... */
+		h->p[son].object = p ;
+		h->p[son].key = key1 ;
+		h->elements++ ;
+	}
+	while (son > 0) {	/* bubble up */
+		int father = HEAP_FATHER(son) ;
+		struct dn_heap_entry tmp ;
+
+		if (DN_KEY_LT( h->p[father].key, h->p[son].key ) )
+			break ; /* found right position */
+		/* son smaller than father, swap and repeat */
+		HEAP_SWAP(h->p[son], h->p[father], tmp) ;
+		SET_OFFSET(h, son);
+		son = father ;
+	}
+	SET_OFFSET(h, son);
+	return 0 ;
+}
+
+/*
+ * update the element pointing to old_obj to point to new_obj
+ */
+void
+heap_modify(struct dn_heap *h, void *old_obj, void *new_obj)
+{
+	/* modify specific element, index is at offset */
+	if (old_obj != NULL && h->offset > 0) {
+		int index = *((int *)((char *)old_obj + h->offset)) ;
+
+		if (index < 0 || index >= h->elements) {
+			printf("heap_modify, index %d out of bound 0..%d\n",
+				index, h->elements);
+			panic("heap_modify");
+		}
+		h->p[index].object = new_obj ;
+	} else {
+		printf("heap_modify, null obj or index offset\n") ;
+		panic("heap_modify") ;
+	}
+}
+
+/*
+ * remove the top element from the heap, or obj if obj != NULL
+ */
+void
+heap_extract(struct dn_heap *h, void *obj)
+{
+	int child, father, max = h->elements - 1 ;
+
+	DDB(check_heap(h, "entering heap_extract")) ;
+	if (max < 0) {
+		printf("warning, extract from empty heap 0x%p\n", h);
+		return ;
+	}
+	father = 0 ; /* default: move up smallest child */
+	if (obj != NULL) { /* extract specific element, index is at offset */
+		if (h->offset > 0)
+			father = *((int *)((char *)obj + h->offset)) ;
+		else
+			for ( ; father < h->elements &&
+			    h->p[father].object != obj ; father++ )
+				;
+		if (father < 0 || father >= h->elements) {
+			printf("dummynet: heap_extract, father %d out of bound 0..%d\n",
+				father, h->elements);
+			panic("heap_extract");
+		}
+	}
+	RESET_OFFSET(h, father);
+	child = HEAP_LEFT(father) ;		/* left child */
+	while (child <= max) {			/* valid entry */
+		if (child != max && DN_KEY_LT(h->p[child+1].key, h->p[child].key) )
+			child = child+1 ;	/* take right child, otherwise left */
+		h->p[father] = h->p[child] ;
+		SET_OFFSET(h, father);
+		father = child ;
+		child = HEAP_LEFT(child) ;	/* left child for next loop */
+	}
+	h->elements-- ;
+	if (father != max) {
+		/*
+		 * Fill hole with last entry and bubble up, reusing the insert code
+		 */
+		h->p[father] = h->p[max] ;
+		heap_insert(h, father, NULL);	/* this one cannot fail */
+	}
+}
+
+#if 0
+/*
+ * change object position and update references
+ * XXX this one is never used!
+ */
+void
+heap_move(struct dn_heap *h, dn_key new_key, void *object)
+{
+	int temp;
+	int i ;
+	int max = h->elements-1 ;
+	struct dn_heap_entry buf ;
+
+	if (h->offset <= 0)
+		panic("cannot move items on this heap");
+
+	i = *((int *)((char *)object + h->offset));
+	if (DN_KEY_LT(new_key, h->p[i].key) ) {		/* must move up */
+		h->p[i].key = new_key ;
+		for (; i>0 && DN_KEY_LT(new_key, h->p[(temp = HEAP_FATHER(i))].key) ;
+			 i = temp ) {	/* bubble up */
+			HEAP_SWAP(h->p[i], h->p[temp], buf) ;
+			SET_OFFSET(h, i);
+		}
+	} else {					/* must move down */
+		h->p[i].key = new_key ;
+		while ( (temp = HEAP_LEFT(i)) <= max ) { /* found left child */
+			if ((temp != max) && DN_KEY_GT(h->p[temp].key, h->p[temp+1].key))
+				temp++ ;	/* select child with min key */
+			if (DN_KEY_GT(new_key, h->p[temp].key)) { /* go down */
+				HEAP_SWAP(h->p[i], h->p[temp], buf) ;
+				SET_OFFSET(h, i);
+			} else
+				break ;
+			i = temp ;
+		}
+	}
+	SET_OFFSET(h, i);
}
+#endif /* heap_move, unused */
+
+/*
+ * heapify() will reorganize data inside an array to maintain the
+ * heap property. It is needed when we delete a bunch of entries.
+ */
+void
+heapify(struct dn_heap *h)
+{
+	int i ;
+
+	for (i = 0 ; i < h->elements ; i++ )
+		heap_insert(h, i, NULL) ;
+}
+
+/*
+ * cleanup the heap and free the data structure
+ */
+void
+heap_free(struct dn_heap *h)
+{
+	if (h->size > 0)
+		free(h->p, M_HEAP);
+	bzero(h, sizeof(*h) );
+}
+
+/*
+ * --- end of heap management functions ---
+ */
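The heap above is kernel code, but the ordering discipline is easy to exercise in user space. A simplified sketch (fixed-size array, no resize, no back-pointer "offset" support, and extraction rewritten as the textbook sift-down rather than the kernel's hole-walk plus re-bubble) showing that repeated extraction yields keys in increasing order:

/*
 * Simplified user-space rendition of the dn_heap ordering.
 */
#include <stdint.h>
#include <stdio.h>

typedef uint64_t dn_key;
#define DN_KEY_LT(a,b)	((int64_t)((a)-(b)) < 0)
#define HEAP_FATHER(x)	(((x) - 1) / 2)
#define HEAP_LEFT(x)	(2*(x) + 1)

struct entry { dn_key key; };
static struct entry hp[64];
static int elements;

static void insert(dn_key key)
{
	int son = elements++;

	hp[son].key = key;
	while (son > 0 && DN_KEY_LT(hp[son].key, hp[HEAP_FATHER(son)].key)) {
		struct entry tmp = hp[son];	/* bubble up */
		hp[son] = hp[HEAP_FATHER(son)];
		hp[HEAP_FATHER(son)] = tmp;
		son = HEAP_FATHER(son);
	}
}

static dn_key extract(void)
{
	dn_key top = hp[0].key;
	int i = 0, child, max;

	hp[0] = hp[--elements];			/* last entry fills the root */
	max = elements - 1;
	while ((child = HEAP_LEFT(i)) <= max) {	/* sift down */
		if (child != max && DN_KEY_LT(hp[child + 1].key, hp[child].key))
			child++;		/* pick the smaller child */
		if (!DN_KEY_LT(hp[child].key, hp[i].key))
			break;
		struct entry tmp = hp[i];
		hp[i] = hp[child];
		hp[child] = tmp;
		i = child;
	}
	return top;
}

int main(void)
{
	insert(42); insert(7); insert(99); insert(7); insert(13);
	while (elements > 0)
		printf("%llu ", (unsigned long long)extract());
	printf("\n");	/* prints: 7 7 13 42 99 */
	return 0;
}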
--- /dev/null	Thu Jul 18 21:51:23 2002
+++ sys/kern/kern_switch_ps.c	Fri Jul 19 00:54:06 2002
@@ -0,0 +1,778 @@
+/*
+ * Copyright (c) 2002 Paolo Valente, Universita` di Pisa
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include <sys/resource.h>	/* PRIO_MIN, PRIO_MAX */
+
+#include
+#include
+#include
+#include
+
+#define DBB(x)	x
+#define DEB(x)
+#define DASSERT(a, b)	if ((a) == 0) \
+				panic(b);
+
+
+/*
+ * This module implements a proportional share scheduler.
+ *
+ * Proportional Share
+ *
+ * Multitasking is implemented by dividing each process into small
+ * execution requests and multiplexing them on the available cpu(s).
+ *
+ * Processes (requests) are served according to the WF2Q+ scheduling
+ * algorithm, which approximates the behaviour of a fluid system where
+ * each process is given a share of the CPU proportional to the weight
+ * assigned to it.
+ *
+ * In the reference fluid system, all ready processes are served
+ * simultaneously, and each of them gets a share of the cpu speed
+ * equal to the ratio between its weight and the sum of the weights
+ * of all ready processes. The global progress in the fluid system
+ * is measured by the "system virtual time", which is the amount of
+ * service received by a running process divided by its weight.
+ * So, during the service of a request with length L and weight W,
+ * the system virtual time will increase by L/W.
+ *
+ * We define the "virtual finish time" of a request as the value of the
+ * system virtual time when the request terminates in the fluid system.
+ * We emulate the fluid system on the real system (where only one process
+ * per cpu can be run at any time) by scheduling requests in increasing
+ * order of the virtual finish times.
+ *
+ * Internally, the scheduler makes use of three heaps:
+ *
+ * ne_heap:
+ *	processes whose virtual start time is greater than the
+ *	system's virtual time are not yet eligible for execution, so
+ *	they are stored here, sorted by virtual start time.
+ *	As the system's virtual time increases, processes become
+ *	eligible for execution and are moved to the run_heap.
+ *
+ * run_heap:
+ *	runnable processes, sorted by virtual finish time; the next request
+ *	to be run is at the head of the heap.
+ *
+ * exp_heap:
+ *	processes which did not issue a new request before the
+ *	end (of the execution) of the last one, sorted by last
+ *	request finish time.
+ *	As the system's virtual time increases, processes whose
+ *	virtual finish time becomes lower than the system's virtual
+ *	time are removed (the process weight is removed from the sum,
+ *	process-specific data structures are freed, etc...)
+ *
+ * A process is called "active" if it is running or is present
+ * in one of the previous heaps, otherwise it is called "inactive".
+ */
+
+static dn_key V;	/* system virtual time */
+static u_int sum;	/* sum of the weights of active processes */
+static struct dn_heap run_heap, ne_heap, exp_heap;
+
+/*
+ * Process weight is computed from p_nice using a lookup table.
+ * p_nice can assume values between PRIO_MIN (-20) and PRIO_MAX (20).
+ */
+static const u_short wtable[PRIO_MAX - PRIO_MIN + 1] = {
+	1000, 950, 900, 850, 800,
+	750, 700, 650, 600, 550,
+	500, 450, 400, 350, 300,
+	250, 200, 180, 150, 120,
+	100,	/* base priority/weight */
+	90, 80, 75, 70, 65,
+	60, 55, 50, 45, 40,
+	35, 30, 25, 20, 15,
+	10, 5, 3, 2, 1
+};
+
+#define MINWEIGHT	1
+#define MAXWEIGHT	1000
+#define WEIGHT(ps)	(wtable[(ps)->proc->p_nice - PRIO_MIN])
+static dn_key lastRtime;		/* used to compute virtual time */
+static struct callout_handle thandle;	/* current request timeout */
+
+/*
+ * XXX fix for SMP -- can't you use curproc ?
+ *
+ * The next variable is used to appropriately handle request termination.
+ * When a process is selected for execution it becomes outstanding.
+ * The (running) process remains outstanding unless it issues
+ * its next request or removes itself from the scheduler.
+ * At the end of the request execution, if the process is still
+ * outstanding, it will be inserted into exp_heap.
+ */
+
+static struct proc *outproc;	/* outstanding process */
+#define END_PENDING	1
+static char ps_stat;
+
+/*
+ * Virtual times are used as keys for the heaps. They increase in steps of
+ *
+ *	TIMESCALE*ticks/sum_w
+ *
+ * where sum_w is the sum of all weights.
+ * The role of TIMESCALE is twofold: first and foremost, to perform
+ * fixed-point calculations; second, to make the rate of advance of
+ * virtual time independent of "hz".
+ *
+ * Max values are: sum_w = 2^20 (e.g. 1000 active processes with
+ * a weight of 1000 each), hz <= 10000.
+ *
+ * In these conditions we have 43 bits to represent time intervals,
+ * meaning a wrap after roughly 27 years.
+ */
+#define SCALE_SHIFT	20	/* to obtain a larger precision */
+#define TIMESCALE	(((dn_key)10000 << SCALE_SHIFT)/hz)
+
+/*
+ * Each process known to this scheduler has a descriptor (struct procps
+ * below), kept in a dynamically allocated array and referenced through
+ * an index stored in the proc structure (p->p_pad3).
+ * A zero in p_pad3 means the process does
+ * not have a descriptor associated with it (this also means
+ * that 0 is not a valid index in the array).
+ *
+ * The base and size of the array are ps_procs and ps_blocks,
+ * free slots are kept in a linked list (by index) and the first
+ * element is at offset ps_free_list.
+ */
+
+struct procps {
+	int nextfree;		/* free list */
+	struct proc *proc;	/* process we are attached to */
+	dn_key p_length;	/* nominal length of a request (ticks) */
+	dn_key p_S, p_F;	/* virtual start and finish time */
+	dn_key p_RS;		/* (real) start time */
+	int heap_pos;		/* position (index) of struct in exp_heap */
+	u_short old_weight;	/* to detect weight change */
+};
+
+static struct procps *ps_procs;
+static int ps_free_list;
+static int ps_blocks;
+static int ps_free_blocks;
+
+SYSCTL_INT(_kern, OID_AUTO, ps_free_blocks, CTLFLAG_RD, &ps_free_blocks, 0, "");
+
+#define PSBLOCKELEMS	64
+#define P2PS(p)		&ps_procs[ *((u_short *)(p->p_pad3)) ]
+#define PSISNEW(p)	((p) == ps_procs)
+
+/*
+ * After reallocating the procps array, update the pointers
+ * in the various heaps.
+ */
+static void
+procps_update(int delta)
+{
+	struct dn_heap *h, *work_heap[3] = { &run_heap, &ne_heap, &exp_heap };
+	int i, j;
+
+	for ( i = 0; i < 3; i++ ) {
+		h = work_heap[i];
+		for ( j = 0; j < h->elements; j++ )
+			(char *)h->p[j].object += delta;
+	}
+}
+
+/*
+ * Allocate to the process the slot at the head of the free list.
+ * If the free list is empty, allocate a new block of slots and
+ * link them into the free list.
+ */
+static void
+procps_realloc(void)
+{
+	struct procps *tmp;
+	int i = ps_blocks + PSBLOCKELEMS;	/* new size */
+
+printf("+++ procps_realloc %d --> %d\n", ps_blocks, i);
+	tmp = malloc(i*sizeof (struct procps), M_TEMP, M_NOWAIT);
+	DASSERT(tmp, "find_slot: couldn't alloc new ps_procs block");
+	if (ps_blocks > 0) {
+		/*
+		 * copy the old records to the new array, then scan the
+		 * heaps and update the object pointers
+		 */
+		bcopy(ps_procs, tmp, ps_blocks * sizeof (struct procps));
+		procps_update( (char *)(tmp) - (char *)(ps_procs) );
+		free(ps_procs, M_TEMP);
+	}
+	ps_procs = tmp;
+	ps_free_list = ps_blocks;	/* first free */
+	ps_blocks += PSBLOCKELEMS;	/* total blocks */
+	if (ps_free_list == 0)		/* entry 0 must be unused */
+		ps_free_list = 1;
+
+	for (i = ps_free_list; i < ps_blocks - 1; i++)
+		ps_procs[i].nextfree = i+1;
+	ps_free_blocks += ps_blocks - ps_free_list;
+	ps_procs[ps_blocks-1].nextfree = 0;	/* last entry */
+}
+
+static struct procps *
+find_slot(struct proc *p1)
+{
+	struct procps *ps;
+
+	DEB(printf("+++ FIND ps_blocks %d ps_free_blocks %d\n",
+		ps_blocks, ps_free_blocks);)
+	if (ps_free_list == 0)
+		procps_realloc();
+	/* assign slot to process */
+	*((u_short *)(p1->p_pad3)) = ps_free_list;
+	ps = &ps_procs[ps_free_list];
+	ps_free_blocks--;
+	/* remove slot from free list */
+	ps_free_list = ps->nextfree;
+	/* initialize structure */
+	bzero(ps, sizeof(*ps));
+	ps->proc = p1;
+	return ps;
+}
+
+/*
+ * Insert the slot at the head of the free list
+ */
+static void
+free_slot(struct procps *ps)
+{
+	int index = ps - ps_procs;
+
+	ps_procs[index].nextfree = ps_free_list;
+	ps_free_list = index;
+	ps_free_blocks++;
+	DEB(printf("+++ FREE ps_blocks %d ps_free_blocks %d\n",
+		ps_blocks, ps_free_blocks);)
+}
+
+/*
+ * interface functions for the Proportional Share scheduler
+ */
+
+
+/*
+ * Initialize the proportional share scheduler when selected.
+ */
+static void
+psinit(void *dummy)
+{
+	lastRtime = Rtime();
+	V = 0;
+	sum = 0;
+	outproc = NULL;
+	ps_stat = 0;
+
+	/* Initialize heaps */
+	exp_heap.size = exp_heap.elements = 0;
+	exp_heap.offset = OFFSET_OF(struct procps, heap_pos);
+
+	ne_heap.size = ne_heap.elements = 0;
+	ne_heap.offset = 0;
+
+	run_heap.size = run_heap.elements = 0;
+	run_heap.offset = 0;
+
+	/* Initialize the request timeout handler */
+	bzero(&thandle, sizeof(struct callout_handle));
+}
+
+/* request timeout expired: force a switch to the next request */
+static void
+propshare(void *arg)
+{
+	ps_stat &= ~END_PENDING;
+	need_resched();
+}
+
+/*
+ * updateV() updates the virtual time.
+ *
+ * updateV2() expires inactive processes and moves newly eligible
+ * processes from the not-eligible heap to the eligible one.
+ *
+ * Both must be called at splhigh().
+ */
+static void
+updateV(void)
+{
+	if (sum == 0 && (ne_heap.elements > 0 || run_heap.elements > 0))
+		panic("updateV: null weight sum, but pending requests!");
+	if (sum == 0)
+		return;
+	V += (Rtime() - lastRtime)/sum;
+	lastRtime = Rtime();
+}
+
+static void
+updateV2(void)
+{
+	if (sum == 0)
+		return;
+	/*
+	 * if no process is eligible, V jumps to the start time of the
+	 * first non-eligible process.
+	 */
+	if (run_heap.elements == 0 && ne_heap.elements > 0)
+		V = MAX64(V, ne_heap.p[0].key);
+
+	/* expire inactive processes */
+	while (exp_heap.elements > 0 && DN_KEY_LT(exp_heap.p[0].key, V) ) {
+		struct procps *ps = (struct procps *)exp_heap.p[0].object;
+
+		DEB(printf("+++ updateV2: expired slot %d\n", ps - ps_procs);)
+		heap_extract(&exp_heap, NULL);
+		sum -= ps->old_weight;
+		if (ps->proc != NULL)
+			ps->p_S = ps->p_F + 1;	/* mark timestamps as invalid */
+		else				/* remove proc */
+			free_slot(ps);
+	}
+
+	/*
+	 * move from ne_heap to run_heap any requests that have
+	 * become eligible
+	 */
+	while (ne_heap.elements > 0 && DN_KEY_LEQ(ne_heap.p[0].key, V) ) {
+		struct procps *ps = (struct procps *)ne_heap.p[0].object;
+
+		heap_extract(&ne_heap, NULL);
+		heap_insert(&run_heap, ps->p_F, ps);
+	}
+	TSTMP(4, sum, 124, run_heap.elements*100 + ne_heap.elements);
+}
+
+#define INCLENGTH(a, b)	((a) += ((b) < REQ_MAXLENGTH) ? \
+				REQ_DELTA : 0)	/* XXX */
+#define DECLENGTH(a, b)	((a) -= ((b) > REQ_MINLENGTH) ? \
+				REQ_DELTA : 0)	/* XXX */
+/*
+ * Account the process for the service received.
+ * Update the process virtual finish time and nominal request length.
+ */
+static void
+charge_service(struct procps *ps)
+{
+	dn_key act_length = ticks - ps->p_RS;
+
+	DASSERT(ps->proc, "charge_service: null process pointer" );
+	if (WEIGHT(ps) < MINWEIGHT || WEIGHT(ps) > MAXWEIGHT) {
+		printf("charge_service: %p %d %s, nice: %d, weight: %d, sum %d\n",
+			ps->proc, ps->proc->p_pid, ps->proc->p_comm,
+			ps->proc->p_nice, WEIGHT(ps), sum);
+		panic("charge_service: weight out of range");
+	}
+	ps->p_F = ps->p_S + (act_length * TIMESCALE) / WEIGHT(ps);
+#if 0	/* TESTING */
+	if (p->p_stat == SRUN)
+		INCLENGTH(p->p_length, act_length);
+	else
+		DECLENGTH(p->p_length, act_length);
+#endif
+}
+
+/*
+ * setrunheap: schedules a process for running.
+ * The first time a process is scheduled, its weight is added to the sum.
+ * If invoked by curproc to schedule its next request:
+ *  . curproc timestamps are updated (to account for the service received)
+ *    before inserting the new request into the heaps
+ *  . outproc is set to NULL
+ * If the weight of an active process has changed, the sum is updated.
+ * The process must be runnable.
+ * This must be called at splhigh().
+ */
+static void
+ps_setrunqueue(struct proc *p1)
+{
+	struct procps *ps;
+
+	TSTMP(4, 0, 120, p1->p_pid);
+
+	DEB( printf("setrunheap: %p %d %s slot %d\n", p1, p1->p_pid, p1->p_comm,
+		*((u_short *)p1->p_pad3) );)
+	ps = P2PS(p1);
+	if (PSISNEW(ps)) {	/* new process */
+		ps = find_slot(p1);
+		/* mark timestamps as invalid */
+		ps->p_S = ps->p_F + 1;
+#if 0
+		ps->p_length = REQ_MINLENGTH;
+#else	/* TESTING */
+		ps->p_length = hz/100;	/* 10 ms */
+#endif
+	}
+
+	updateV();
+	if (DN_KEY_GT(ps->p_S, ps->p_F)) {	/* invalid timestamps */
+		ps->p_S = V;
+		sum += WEIGHT(ps);	/* add weight of new process */
+	} else {
+		/*
+		 * A process with valid timestamps.
+		 * If it is not running, it should be in exp_heap,
+		 * so remove it from there.
+		 */
+		if (p1 != curproc)
+			heap_extract(&exp_heap, ps);
+		else {
+			/*
+			 * need to update my timestamps before
+			 * (re)scheduling myself
+			 */
+			charge_service(ps);
+			if (outproc == NULL || outproc != curproc)
+				panic("setrunheap: invalid outproc");
+			outproc = NULL;	/* came back into the active heaps */
+		}
+		/* if the weight has changed then update the sum */
+		if (WEIGHT(ps) != ps->old_weight)
+			sum += WEIGHT(ps) - ps->old_weight;
+		ps->p_S = MAX64(ps->p_F, V);
+	}
+
+	/* now we are done with possible weight (sum) changes */
+	if (WEIGHT(ps) != ps->old_weight)
+		ps->old_weight = WEIGHT(ps);
+
+	if (WEIGHT(ps) < MINWEIGHT || WEIGHT(ps) > MAXWEIGHT) {
+		printf("setrunqueue: %p %d %s, nice: %d, weight: %d, sum %d\n",
+			ps->proc,
+			ps->proc->p_pid, ps->proc->p_comm, ps->proc->p_nice,
+			WEIGHT(ps), sum);
+		panic("setrunqueue: weight out of range");
+	}
+	if (p1->p_priority >= PUSER) {
+		ps->p_length = hz/100;	/* default */
+	} else {	/* blocked on a resource */
+		ps->p_length = hz * p1->p_priority / (200*PUSER) + 1;
+	}
+
+	/* Set the request finish time */
+	ps->p_F = ps->p_S + ps->p_length*TIMESCALE/WEIGHT(ps);
+
+	/* insert the request in the proper heap */
+	if (ps->p_S == V)
+		heap_insert(&run_heap, ps->p_F, ps);
+	else
+		heap_insert(&ne_heap, ps->p_S, ps);
+}
+
+/*
+ * Removes an active, not running process from the scheduler.
+ * This must be called at splhigh().
+ */
+static void
+ps_remrunqueue(struct proc *p1)
+{
+	struct procps *ps;
+
+	DEB( printf("remrunqueue: %p %d %s\n", p1, p1->p_pid, p1->p_comm);)
+	DASSERT(p1->p_stat == SRUN, ("remrunqueue: proc not SRUN"));
+
+	TSTMP(4, 0, 121, p1->p_pid);
+	ps = P2PS(p1);
+	if (PSISNEW(ps))
+		panic("ps_remrunqueue: new procps");
+	if (DN_KEY_LEQ(ps->p_S, V))
+		heap_extract(&run_heap, ps);
+	else
+		heap_extract(&ne_heap, ps);
+	heap_insert(&exp_heap, ps->p_S, ps);
+}
+
+/*
+ * Removes a running process from the scheduler.
+ * Must be called by the process that wants to remove itself.
+ */
+static void
+ps_schedulerexit(struct proc *p1)
+{
+	struct procps *ps;
+
+	ps = P2PS(p1);
+	DEB(printf("schedulerexit: slot %d pid %d\n",
+		ps - ps_procs, p1->p_pid);)
+	if (PSISNEW(ps))	/* may happen on a scheduler algorithm switch */
+		return;
+	if (DN_KEY_GT(ps->p_S, ps->p_F))	/* timestamps are invalid */
+		panic("schedulerexit: invalid timestamps");
+	/*
+	 * The process is terminating, and its proc data will be freed
+	 * before this entry leaves the scheduler.
+	 * We mark ps to remember that the descriptor is no longer
+	 * associated with a process.
+	 * When this entry is extracted from exp_heap, it will be
+	 * removed from the scheduler.
+	 */
+	ps->proc = NULL;
+	/* we assume that schedulerexit is called by the process itself */
+	heap_insert(&exp_heap, ps->p_F, ps);
+	outproc = NULL;
+}
+
+/*
+ * returns the cpu share percentage
+ */
+static int
+ps_get_share(struct proc *p)
+{
+	struct procps *ps = P2PS(p);
+
+	if (sum == 0 || PSISNEW(ps))
+		panic("ps_get_share: null weight sum or unknown process");
+	if (sum == 0 || PSISNEW(ps) )
+		return 0;
+	if (ps->old_weight > sum && ps->proc->p_stat == SRUN) {
+		printf("ps_get_share: %d %s, nice %d weight %d sum %d",
+			ps->proc->p_pid, ps->proc->p_comm,
+			ps->proc->p_nice, WEIGHT(ps), sum);
+		panic("ps_get_share");
+	}
+	return (ps->old_weight * 100) / sum;
+}
+
+
+/*
+ * Compute the priority of a process when running in user mode.
+ */
+static void
+ps_resetpriority(struct proc *p)
+{
+	if (p->p_stat == SRUN)
+		p->p_usrpri = MAXPRI -
+			(MAXPRI - PUSER) * ps_get_share(p) / 100;
+	else
+		p->p_usrpri = PUSER;
+}
+
+/*
+ * OPTIMIZATION: implicitly does the job of resetpriority too, to avoid
+ * an explicit call to it
+ */
+static void
+ps_schedcpu1(struct proc *p)
+{
+	if (p->p_priority >= PUSER)
+		p->p_priority = p->p_usrpri = MAXPRI -
+			(MAXPRI - PUSER) * ps_get_share(p) / 100;
+}
+
+static void
+ps_schedclock1(struct proc *p)
+{
+}
+
+/*
+ * procrunnable() returns a boolean true (non-zero) value if there are
+ * any runnable processes. This is intended to be called from the idle
+ * loop to avoid the more expensive (and destructive) chooseproc().
+ *
+ * MP SAFE.  CALLED WITHOUT THE MP LOCK
+ */
+static u_int32_t
+ps_procrunnable(void)
+{
+	return run_heap.elements;
+}
+
+/*
+ * chooseproc() selects the next process to run.  Ideally, cpu_switch()
+ * would have determined that there is a process available before calling
+ * this, but it is not a requirement.  The selected process is removed
+ * from run_heap.  The process' real start time is set here.
+ * This must be called at splhigh().
+ *
+ * XXX For SMP, trivial affinity is implemented by locating the first process
+ * on the queue that has a matching lastcpu id.  Since normal priorities
+ * are mapped four priority levels per queue, this may allow the cpu to
+ * choose a slightly lower priority process in order to preserve the cpu
+ * caches.
+ */
+static struct proc *
+ps_chooseproc(void)
+{
+	struct proc *p = NULL;
+	struct procps *ps = NULL;
+#ifdef SMP
+	u_char id;
+#endif
+	if (ps_stat & END_PENDING)
+		/*
+		 * the request terminated before its timeout expired
+		 */
+		untimeout(propshare, NULL, thandle);
+	if (outproc != NULL) {
+		/*
+		 * the outstanding process did not reschedule itself
+		 */
+		struct procps *outps = P2PS(outproc);
+
+		if (DN_KEY_GT(outps->p_S, outps->p_F))
+			panic("chooseproc: outproc has invalid timestamps");
+		if (PSISNEW(outps))
+			panic("ps_chooseproc: new procps");
+		/* charge the process for the service received */
+		charge_service(outps);
+		heap_insert(&exp_heap, outps->p_F, outps);
+	}
+	updateV();
+	updateV2();
+	if (run_heap.elements > 0) {	/* need to dispatch a new process */
+		ps = (struct procps *)run_heap.p[0].object;
+		heap_extract(&run_heap, NULL);
+		ps->p_RS = ticks;
+		thandle = timeout(propshare, NULL, ps->p_length);
+		ps_stat |= END_PENDING;
+		p = ps->proc;
+		/* XXX what to do with SMP ? */
+#if 0 && defined(SMP)
+		/*
+		 * wander down the current run queue for this
+		 * pri level for a match
+		 */
+		id = cpuid;
+		while (p->p_lastcpu != id) {
+			p = TAILQ_NEXT(p, p_procq);
+			if (p == NULL) {
+				p = TAILQ_FIRST(q);
+				break;
+			}
+		}
+#endif /* SMP */
+	}
+	outproc = p;
+	TSTMP(4, 0, 122, p ? p->p_pid : 0);
+	return p;
+}
+
+/*-
+ * Compare priorities.  Return:
+ *  <0: priority of p < current priority
+ *   0: priority of p == current priority
+ *  >0: priority of p > current priority
+ * A lower priority here means a nearer virtual finish time, so it really
+ * means a higher priority.
+ */
+static int
+ps_curpriority_cmp(struct proc *p)
+{
+	struct procps *ps = P2PS(p);
+	struct procps *curps = P2PS(curproc);
+
+	return (ps->p_F == curps->p_F) ? 0 :
+		DN_KEY_LT(ps->p_F, curps->p_F) ? -1 : 1;
+}
+
+/*
+ * Moves processes from the proportional share scheduler to the destination
+ * (current) scheduler.
+ *
+ * 1. init the destination (current) scheduler
+ * 2. move active processes to the destination scheduler
+ * 3. remove the per-process references to the ps data structures
+ * 4. free the ps data structures
+ */
+static void
+ps_sched_move(void)
+{
+	struct dn_heap *h, *work_heap[2] = { &run_heap, &ne_heap };
+	struct proc *p;
+	int i, j;
+
+	sched_init(NULL);	/* init the destination scheduler */
+	/* move active processes from ps to the destination scheduler */
+	for ( i = 0; i < 2; i++ ) {
+		h = work_heap[i];
+		for ( j = 0; j < h->elements; j++ ) {
+			struct procps *ps = (struct procps *)h->p[j].object;
+			setrunqueue(ps->proc);
+		}
+		heap_free(h);
+	}
+	heap_free(&exp_heap);	/* remove served requests */
+	/* remove references to ps data structures from all processes */
+	LIST_FOREACH(p, &allproc, p_list)
+		*((u_short *)(p->p_pad3)) = 0;
+	free(ps_procs, M_TEMP);
+	ps_blocks = 0;
+	ps_free_list = 0;
+}
+
+struct _sched_interface ps_scheduler = {
+	NULL,
+	"ps",
+	PROPORTIONAL_SHARE,
+	psinit,
+	ps_sched_move,
+	ps_setrunqueue,
+	ps_remrunqueue,
+	ps_schedulerexit,
+	ps_resetpriority,
+	ps_schedcpu1,
+	ps_schedclock1,
+	ps_curpriority_cmp,
+	ps_chooseproc,
+	ps_procrunnable
+};
+
+/*
+ * Load the scheduler data structures
+ */
+static void
+ps_load(void)
+{
+	printf("Loaded proportional share scheduler\n");
+	ps_scheduler.next = schedulers;
+	schedulers = &ps_scheduler;
+}
+SYSINIT(psload, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, ps_load, NULL);
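To get a feel for the virtual-time arithmetic that ps_setrunqueue() and ps_chooseproc() implement, here is a small user-space simulation of the WF2Q+ bookkeeping. All constants are invented for the example (hz = 100, two processes with weights 100 and 50, 1-tick requests that are rescheduled immediately); the CPU split should come out close to 2:1:

/*
 * WF2Q+ bookkeeping in miniature: S/F timestamps, eligibility test,
 * and the V jump from updateV2() when nothing is eligible.
 */
#include <stdint.h>
#include <stdio.h>

#define SCALE_SHIFT	20
#define HZ		100
#define TIMESCALE	((10000ULL << SCALE_SHIFT) / HZ)
#define DN_KEY_LEQ(a,b)	((int64_t)((a)-(b)) <= 0)
#define MAX64(x,y)	(DN_KEY_LEQ(x,y) ? (y) : (x))

struct req { uint64_t S, F; unsigned weight, served; };

int main(void)
{
	struct req p[2] = { { 0, 0, 100, 0 }, { 0, 0, 50, 0 } };
	uint64_t V = 0;
	unsigned len = 1;	/* request length, ticks (hz/100 = 10 ms) */
	unsigned sum = p[0].weight + p[1].weight;
	int i, round;

	/* initial requests: S = V, F = S + len*TIMESCALE/weight */
	for (i = 0; i < 2; i++)
		p[i].F = p[i].S + len * TIMESCALE / p[i].weight;

	for (round = 0; round < 3000; round++) {
		/* pick the eligible request with the smallest finish time */
		int best = -1;
		for (i = 0; i < 2; i++) {
			if (!DN_KEY_LEQ(p[i].S, V))
				continue;	/* not yet eligible */
			if (best < 0 || DN_KEY_LEQ(p[i].F, p[best].F))
				best = i;
		}
		if (best < 0) {	/* none eligible: V jumps, as in updateV2() */
			V = DN_KEY_LEQ(p[0].S, p[1].S) ? p[0].S : p[1].S;
			continue;
		}
		/* serve it for `len` ticks; V advances as in updateV() */
		p[best].served += len;
		V += len * TIMESCALE / sum;
		/* the process immediately issues its next request */
		p[best].S = MAX64(p[best].F, V);
		p[best].F = p[best].S + len * TIMESCALE / p[best].weight;
	}
	printf("weight 100 got %u ticks, weight 50 got %u ticks\n",
	    p[0].served, p[1].served);	/* roughly 2000 vs 1000 */
	return 0;
}

The eligibility test (S <= V) is what distinguishes WF2Q+ from plain virtual-finish-time ordering: a process that just received service cannot monopolize the CPU by issuing requests whose start times lie in the future.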