Index: sys/conf/files
===================================================================
RCS file: /home/ncvs/src/sys/conf/files,v
retrieving revision 1.340.2.94
diff -u -r1.340.2.94 files
--- sys/conf/files	26 Mar 2002 10:12:22 -0000	1.340.2.94
+++ sys/conf/files	18 Jul 2002 20:17:01 -0000
@@ -530,6 +530,7 @@
 isofs/cd9660/cd9660_util.c	optional cd9660
 isofs/cd9660/cd9660_vfsops.c	optional cd9660
 isofs/cd9660/cd9660_vnops.c	optional cd9660
+kern/heap.c		standard
 kern/imgact_aout.c	standard
 kern/imgact_elf.c	standard
 kern/imgact_gzip.c	optional gzip
@@ -570,6 +571,7 @@
 kern/kern_sig.c		standard
 kern/kern_subr.c	standard
 kern/kern_switch.c	standard
+kern/kern_switch_ps.c	standard
 kern/kern_synch.c	standard
 kern/kern_syscalls.c	standard
 kern/kern_sysctl.c	standard
Index: sys/sys/proc.h
===================================================================
RCS file: /home/ncvs/src/sys/sys/proc.h,v
retrieving revision 1.99.2.7
diff -u -r1.99.2.7 proc.h
--- sys/sys/proc.h	31 Jan 2002 18:40:29 -0000	1.99.2.7
+++ sys/sys/proc.h	18 Jul 2002 19:58:45 -0000
@@ -55,6 +55,53 @@
 #include <sys/event.h>			/* For struct klist */
 
 /*
+ * Interface for schedulers
+ */
+#define	FEEDBACK_PRIORITY	0
+#define	PROPORTIONAL_SHARE	1
+
+typedef void sched_fun_t (struct proc *);
+typedef void sched_conf_t (void *);
+typedef int curpriority_cmp_t (struct proc *);
+typedef struct proc * chooseproc_t (void);
+typedef u_int32_t procrunnable_t (void);
+
+struct _sched_interface {
+	struct _sched_interface *next;
+	char	*name;
+	int	id;
+
+	sched_conf_t	*sched_init;
+	void		(*sched_move)(void);
+	sched_fun_t	*setrunqueue;
+	sched_fun_t	*remrunqueue;
+	sched_fun_t	*schedulerexit;
+	sched_fun_t	*resetpriority;
+	sched_fun_t	*schedcpu1;
+	sched_fun_t	*schedclock1;
+	curpriority_cmp_t *curpriority_cmp;
+	chooseproc_t	*chooseproc;
+	procrunnable_t	*procrunnable;
+};
+
+extern struct _sched_interface *schedulers;
+
+extern sched_conf_t *sched_init;
+extern sched_fun_t *setrunqueue;
+extern sched_fun_t *remrunqueue;
+extern sched_fun_t *schedulerexit;
+extern sched_fun_t *resetpriority;
+extern sched_fun_t *schedcpu1;
+extern sched_fun_t *schedclock1;
+extern curpriority_cmp_t *curpriority_cmp;
+extern chooseproc_t *Xchooseproc;
+extern procrunnable_t *Xprocrunnable;
+
+/*
+ * End of scheduler interface
+ */
+
+/*
  * One structure allocated per session.
  */
 struct session {
@@ -429,15 +476,13 @@
 void	mi_switch __P((void));
 void	procinit __P((void));
 int	p_trespass __P((struct proc *p1, struct proc *p2));
-void	resetpriority __P((struct proc *));
+void	maybe_resched __P((struct proc *chk));
 int	roundrobin_interval __P((void));
 void	schedclock __P((struct proc *));
 void	setrunnable __P((struct proc *));
-void	setrunqueue __P((struct proc *));
 void	sleepinit __P((void));
 int	suser __P((struct proc *));
 int	suser_xxx __P((struct ucred *cred, struct proc *proc, int flag));
-void	remrunqueue __P((struct proc *));
 void	cpu_switch __P((struct proc *));
 void	unsleep __P((struct proc *));
Index: sys/kern/kern_exit.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/kern_exit.c,v
retrieving revision 1.92.2.10
diff -u -r1.92.2.10 kern_exit.c
--- sys/kern/kern_exit.c	29 Apr 2002 09:42:35 -0000	1.92.2.10
+++ sys/kern/kern_exit.c	18 Jul 2002 22:14:58 -0000
@@ -269,6 +269,7 @@
 		p->p_textvp = NULL;
 		vrele(vtmp);
 	}
+	schedulerexit(p);	/* Remove proc from scheduler */
 
 	/*
 	 * Remove proc from allproc queue and pidhash chain.
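The proc.h hunk above is the heart of the patch: schedulers register themselves on a linked list at boot, and the kernel dispatches through a set of function pointers that can be re-targeted at runtime. Here is a minimal user-space sketch of that dispatch scheme; the two-hook interface and the demo_* names are invented for illustration (the real patch wires up many more hooks):

/*
 * Sketch of function-pointer scheduler dispatch, user-space only.
 */
#include <stdio.h>

struct sched_if {
	struct sched_if	*next;		/* registration list, as in the patch */
	const char	*name;
	int		id;
	void		(*choose)(void);	/* stand-in for chooseproc */
};

static struct sched_if *schedulers;	/* head of the registration list */
static void (*choose_hook)(void);	/* currently active hook */

static void fp_choose(void) { printf("feedback-priority choice\n"); }
static void ps_choose(void) { printf("proportional-share choice\n"); }

static struct sched_if fp = { NULL, "fp", 0, fp_choose };
static struct sched_if ps = { NULL, "ps", 1, ps_choose };

/* mimic SYSINIT-time registration: push on the list head */
static void reg(struct sched_if *s) { s->next = schedulers; schedulers = s; }

/* mimic sched_setup()/sysctl_kern_scheduler(): find by id, swap the hook */
static int activate(int id)
{
	struct sched_if *p;

	for (p = schedulers; p && p->id != id; p = p->next)
		;
	if (p == NULL)
		return -1;
	choose_hook = p->choose;
	return 0;
}

int main(void)
{
	reg(&fp);
	reg(&ps);
	activate(1);		/* like sysctl kern.scheduler=1 */
	choose_hook();		/* callers never know which scheduler runs */
	activate(0);
	choose_hook();
	return 0;
}

The point of the indirection is that callers such as kern_exit.c above only ever see the hook name (schedulerexit), never the implementation.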
Index: sys/kern/kern_switch.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/kern_switch.c,v
retrieving revision 1.3.2.1
diff -u -r1.3.2.1 kern_switch.c
--- sys/kern/kern_switch.c	16 May 2000 06:58:12 -0000	1.3.2.1
+++ sys/kern/kern_switch.c	18 Jul 2002 19:58:26 -0000
@@ -32,6 +32,9 @@
 #include
 #include
 #include
+#include
+#include
+#include
 
 /*
  * We have NQS (32) run queues per scheduling class.  For the normal
@@ -52,6 +55,28 @@
 u_int32_t rtqueuebits;
 u_int32_t idqueuebits;
 
+static struct callout_handle thandle;	/* round robin timeout */
+/*
+ * Force switch among equal priority processes every sched_quantum
+ * (default value is 100ms).
+ */
+/* ARGSUSED */
+static void
+roundrobin(void *arg)
+{
+#ifdef SMP
+	need_resched();
+	forward_roundrobin();
+#else
+	struct proc *p = curproc;	/* XXX */
+
+	if (p == 0 || RTP_PRIO_NEED_RR(p->p_rtprio.type))
+		need_resched();
+#endif
+
+	thandle = timeout(roundrobin, NULL, sched_quantum);
+}
+
 /*
  * Initialize the run queues at boot time.
  */
@@ -65,8 +90,9 @@
 		TAILQ_INIT(&rtqueues[i]);
 		TAILQ_INIT(&idqueues[i]);
 	}
+	/* Kick off timeout driven events by calling first time. */
+	roundrobin(NULL);
 }
-SYSINIT(runqueue, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, rqinit, NULL)
 
 /*
  * setrunqueue() examines a process priority and class and inserts it on
@@ -75,8 +101,8 @@
  * The process must be runnable.
  * This must be called at splhigh().
  */
-void
-setrunqueue(struct proc *p)
+static void
+fp_setrunqueue(struct proc *p)
 {
 	struct rq *q;
 	u_int8_t pri;
@@ -107,8 +133,8 @@
  * clearing the queue busy bit if it becomes empty.
  * This must be called at splhigh().
  */
-void
-remrunqueue(struct proc *p)
+static void
+fp_remrunqueue(struct proc *p)
 {
 	struct rq *q;
 	u_int32_t *which;
@@ -143,8 +169,8 @@
  *
  * MP SAFE.  CALLED WITHOUT THE MP LOCK
 */
-u_int32_t
-procrunnable(void)
+static u_int32_t
+fp_procrunnable(void)
 {
 	return (rtqueuebits || queuebits || idqueuebits);
 }
@@ -162,8 +188,8 @@
 * choose a slightly lower priority process in order to preserve the cpu
 * caches.
 */
-struct proc *
-chooseproc(void)
+static struct proc *
+fp_chooseproc(void)
 {
 	struct proc *p;
 	struct rq *q;
@@ -206,3 +232,133 @@
 	*which &= ~(1 << pri);
 	return p;
 }
+
+/*-
+ * Compare priorities.  Return:
+ *  <0: priority of p < current priority
+ *   0: priority of p == current priority
+ *  >0: priority of p > current priority
+ * The priorities are the normal priorities or the normal realtime priorities
+ * if p is on the same scheduler as curproc.  Otherwise the process on the
+ * more realtimeish scheduler has lowest priority.  As usual, a higher
+ * priority really means a lower priority.
+ */
+static int
+fp_curpriority_cmp(struct proc *p)
+{
+	int c_class, p_class;
+
+	c_class = RTP_PRIO_BASE(curproc->p_rtprio.type);
+	p_class = RTP_PRIO_BASE(p->p_rtprio.type);
+	if (p_class != c_class)
+		return (p_class - c_class);
+	if (p_class == RTP_PRIO_NORMAL)
+		return (((int)p->p_priority - (int)curpriority) / PPQ);
+	return ((int)p->p_rtprio.prio - (int)curproc->p_rtprio.prio);
+}
+
+/*
+ * Dummy function entry: this scheduler doesn't need a schedulerexit
+ */
+static void
+fp_schedulerexit(struct proc *p)
+{
+}
+
+/*
+ * Compute the priority of a process when running in user mode.
+ * Arrange to reschedule if the resulting priority is better
+ * than that of the current process.
+ */
+static void
+fp_resetpriority(struct proc *p)
+{
+	unsigned int newpriority;
+
+	if (p->p_rtprio.type == RTP_PRIO_NORMAL) {
+		newpriority = PUSER + p->p_estcpu / INVERSE_ESTCPU_WEIGHT +
+		    NICE_WEIGHT * p->p_nice;
+		newpriority = min(newpriority, MAXPRI);
+		p->p_usrpri = newpriority;
+	}
+	maybe_resched(p);
+}
+
+static void
+fp_schedcpu1(struct proc *p)
+{
+	fp_resetpriority(p);
+	if (p->p_priority >= PUSER) {
+		if ((p != curproc) &&
+#ifdef SMP
+		    p->p_oncpu == 0xff &&	/* idle */
+#endif
+		    p->p_stat == SRUN &&
+		    (p->p_flag & P_INMEM) &&
+		    (p->p_priority / PPQ) != (p->p_usrpri / PPQ)) {
+			remrunqueue(p);
+			p->p_priority = p->p_usrpri;
+			setrunqueue(p);
+		} else
+			p->p_priority = p->p_usrpri;
+	}
+}
+
+static void
+fp_schedclock1(struct proc *p)
+{
+	if ((p->p_estcpu % INVERSE_ESTCPU_WEIGHT) == 0) {
+		resetpriority(p);
+		if (p->p_priority >= PUSER)
+			p->p_priority = p->p_usrpri;
+	}
+}
+
+/*
+ * Moves processes from the Feedback Priority scheduler to the new
+ * current scheduler.
+ */
+static void
+fp_sched_move(void)
+{
+	struct rq *work_queue[3] = { queues, rtqueues, idqueues };
+	int i, j;
+	struct proc *p;
+
+	untimeout(roundrobin, NULL, thandle);	/* stop roundrobin */
+	sched_init(NULL);			/* init new scheduler */
+	/* move processes to the new scheduler */
+	for (i = 0; i < 3; i++)
+		for (j = 0; j < NQS; j++)
+			TAILQ_FOREACH(p, &work_queue[i][j], p_procq)
+				setrunqueue(p);
+
+	queuebits = rtqueuebits = idqueuebits = 0;
+}
+
+static struct _sched_interface fp_scheduler = {
+	NULL,
+	"fp",
+	FEEDBACK_PRIORITY,
+	rqinit,
+	fp_sched_move,
+	fp_setrunqueue,
+	fp_remrunqueue,
+	fp_schedulerexit,
+	fp_resetpriority,
+	fp_schedcpu1,
+	fp_schedclock1,
+	fp_curpriority_cmp,
+	fp_chooseproc,
+	fp_procrunnable
+};
+
+static void
+fp_load(void)
+{
+	printf("Loaded feedback priority scheduler\n");
+	fp_scheduler.next = schedulers;
+	schedulers = &fp_scheduler;
+}
+SYSINIT(fpload, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, fp_load, NULL);
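The "fp" scheduler keeps the classic design that fp_chooseproc() inherits from the stock kernel: 32 run queues per class, a bitmask with one bit per non-empty queue, and a find-first-set to pick the best queue in O(1). A user-space sketch of that trick (using ffs() from <strings.h>; the kernel has its own ffs):

/*
 * The 32-run-queue bitmask trick: bit i set means queue i is non-empty,
 * and the least significant set bit is the highest-priority queue.
 */
#include <stdio.h>
#include <strings.h>	/* ffs() */

#define NQS 32

static unsigned int queuebits;		/* bit i set => queue i non-empty */

static void mark_nonempty(int q) { queuebits |= 1u << q; }
static void mark_empty(int q)    { queuebits &= ~(1u << q); }

static int best_queue(void)
{
	/* ffs() returns the 1-based index of the lowest set bit, 0 if none */
	return ffs(queuebits) - 1;	/* -1 means no runnable process */
}

int main(void)
{
	mark_nonempty(17);
	mark_nonempty(4);
	printf("best queue: %d\n", best_queue());	/* 4 */
	mark_empty(4);
	printf("best queue: %d\n", best_queue());	/* 17 */
	return 0;
}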
Index: sys/kern/kern_synch.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/kern_synch.c,v
retrieving revision 1.87.2.5
diff -u -r1.87.2.5 kern_synch.c
--- sys/kern/kern_synch.c	28 Jun 2002 00:21:44 -0000	1.87.2.5
+++ sys/kern/kern_synch.c	18 Jul 2002 19:58:35 -0000
@@ -58,8 +58,33 @@
 #include
 #include
 
+#define DBB(a) a
+#define DEB(a)
+
+/*
+ * Scheduler interface
+ */
+
+sched_fun_t *setrunqueue;
+sched_fun_t *remrunqueue;
+sched_fun_t *schedulerexit;
+sched_fun_t *resetpriority;
+sched_fun_t *schedcpu1;
+sched_fun_t *schedclock1;
+sched_conf_t *sched_init;
+static void (*sched_move)(void);
+curpriority_cmp_t *curpriority_cmp;
+
+chooseproc_t *Xchooseproc;
+procrunnable_t *Xprocrunnable;
+
+static int sched_algorithm = PROPORTIONAL_SHARE;	/* XXX */
+struct _sched_interface *schedulers = NULL;		/* XXX */
+
+
 static void sched_setup __P((void *dummy));
-SYSINIT(sched_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, sched_setup, NULL)
+SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_ANY, sched_setup, NULL)
+/* XXX It was: SI_SUB_KICK_SCHEDULER */
 
 u_char	curpriority;
 int	hogticks;
@@ -80,11 +105,8 @@
 	0.9944598480048967 * FSCALE,	/* exp(-1/180) */
 };
 
-static int	curpriority_cmp __P((struct proc *p));
 static void	endtsleep __P((void *));
 static void	loadav __P((void *arg));
-static void	maybe_resched __P((struct proc *chk));
-static void	roundrobin __P((void *arg));
 static void	schedcpu __P((void *arg));
 static void	updatepri __P((struct proc *p));
 
@@ -107,36 +129,11 @@
 SYSCTL_PROC(_kern, OID_AUTO, quantum,
 	CTLTYPE_INT|CTLFLAG_RW, 0, sizeof sched_quantum, sysctl_kern_quantum, "I", "");
 
-/*-
- * Compare priorities.  Return:
- *  <0: priority of p < current priority
- *   0: priority of p == current priority
- *  >0: priority of p > current priority
- * The priorities are the normal priorities or the normal realtime priorities
- * if p is on the same scheduler as curproc.  Otherwise the process on the
- * more realtimeish scheduler has lowest priority.  As usual, a higher
- * priority really means a lower priority.
- */
-static int
-curpriority_cmp(p)
-	struct proc *p;
-{
-	int c_class, p_class;
-
-	c_class = RTP_PRIO_BASE(curproc->p_rtprio.type);
-	p_class = RTP_PRIO_BASE(p->p_rtprio.type);
-	if (p_class != c_class)
-		return (p_class - c_class);
-	if (p_class == RTP_PRIO_NORMAL)
-		return (((int)p->p_priority - (int)curpriority) / PPQ);
-	return ((int)p->p_rtprio.prio - (int)curproc->p_rtprio.prio);
-}
-
 /*
  * Arrange to reschedule if necessary, taking the priorities and
  * schedulers into account.
  */
-static void
+void
 maybe_resched(chk)
 	struct proc *chk;
 {
@@ -167,29 +164,6 @@
 }
 
 /*
- * Force switch among equal priority processes every 100ms.
- */
-/* ARGSUSED */
-static void
-roundrobin(arg)
-	void *arg;
-{
-#ifndef SMP
-	struct proc *p = curproc;	/* XXX */
-#endif
-
-#ifdef SMP
-	need_resched();
-	forward_roundrobin();
-#else
-	if (p == 0 || RTP_PRIO_NEED_RR(p->p_rtprio.type))
-		need_resched();
-#endif
-
-	timeout(roundrobin, NULL, sched_quantum);
-}
-
-/*
  * Constants for digital decay and forget:
  *	90% of (p_estcpu) usage in 5 * loadav time
  *	95% of (p_pctcpu) usage in 60 seconds (load insensitive)
@@ -323,21 +297,7 @@
 #endif
 		p->p_cpticks = 0;
 		p->p_estcpu = decay_cpu(loadfac, p->p_estcpu);
-		resetpriority(p);
-		if (p->p_priority >= PUSER) {
-			if ((p != curproc) &&
-#ifdef SMP
-			    p->p_oncpu == 0xff &&	/* idle */
-#endif
-			    p->p_stat == SRUN &&
-			    (p->p_flag & P_INMEM) &&
-			    (p->p_priority / PPQ) != (p->p_usrpri / PPQ)) {
-				remrunqueue(p);
-				p->p_priority = p->p_usrpri;
-				setrunqueue(p);
-			} else
-				p->p_priority = p->p_usrpri;
-		}
+		schedcpu1(p);
 		splx(s);
 	}
 	wakeup((caddr_t)&lbolt);
@@ -419,6 +379,7 @@
 	int s, sig, catch = priority & PCATCH;
 	struct callout_handle thandle;
 
+	TSTMP(4, priority, 101, p->p_pid);
 #ifdef KTRACE
 	if (p && KTRPOINT(p, KTR_CSW))
 		ktrcsw(p->p_tracep, 1, 0);
@@ -811,6 +772,7 @@
 	register struct rlimit *rlim;
 	int x;
 
+	TSTMP(4, 0, 102, p ? p->p_pid : 0);
 	/*
 	 * XXX this spl is almost unnecessary.  It is partly to allow for
 	 * sloppy callers that don't do it (issignal() via CURSIG() is the
@@ -892,6 +854,7 @@
 	register int s;
 
 	s = splhigh();
+	TSTMP(4, 0, 103, p->p_pid);
 	switch (p->p_stat) {
 	case 0:
 	case SRUN:
@@ -922,26 +885,6 @@
 }
 
 /*
- * Compute the priority of a process when running in user mode.
- * Arrange to reschedule if the resulting priority is better
- * than that of the current process.
- */
-void
-resetpriority(p)
-	register struct proc *p;
-{
-	register unsigned int newpriority;
-
-	if (p->p_rtprio.type == RTP_PRIO_NORMAL) {
-		newpriority = PUSER + p->p_estcpu / INVERSE_ESTCPU_WEIGHT +
-		    NICE_WEIGHT * p->p_nice;
-		newpriority = min(newpriority, MAXPRI);
-		p->p_usrpri = newpriority;
-	}
-	maybe_resched(p);
-}
-
-/*
  * Compute a tenex style load average of a quantity on
  * 1, 5 and 15 minute intervals.
  */
@@ -973,20 +916,6 @@
 	    loadav, NULL);
 }
 
-/* ARGSUSED */
-static void
-sched_setup(dummy)
-	void *dummy;
-{
-
-	callout_init(&loadav_callout);
-
-	/* Kick off timeout driven events by calling first time. */
-	roundrobin(NULL);
-	schedcpu(NULL);
-	loadav(NULL);
-}
-
 /*
  * We adjust the priority of the current process.  The priority of
  * a process gets worse as it accumulates CPU time.  The cpu usage
@@ -1005,12 +934,108 @@
 schedclock(p)
 	struct proc *p;
 {
-
 	p->p_cpticks++;
 	p->p_estcpu = ESTCPULIM(p->p_estcpu + 1);
-	if ((p->p_estcpu % INVERSE_ESTCPU_WEIGHT) == 0) {
-		resetpriority(p);
-		if (p->p_priority >= PUSER)
-			p->p_priority = p->p_usrpri;
-	}
+	schedclock1(p);
+}
+
+/* ------------- scheduling algorithm dependent code ----------------------- */
+
+/* ARGSUSED */
+static void
+sched_setup(dummy)
+	void *dummy;
+{
+	struct _sched_interface *p;
+
+	for (p = schedulers; p && p->id != sched_algorithm; p = p->next)
+		;
+
+	if (p == NULL)
+		panic("scheduler not found\n");
+	printf("Active scheduler %s\n", p->name);
+
+	sched_init = p->sched_init;
+	sched_move = p->sched_move;
+	setrunqueue = p->setrunqueue;
+	remrunqueue = p->remrunqueue;
+	schedulerexit = p->schedulerexit;
+	resetpriority = p->resetpriority;
+	schedcpu1 = p->schedcpu1;
+	schedclock1 = p->schedclock1;
+	curpriority_cmp = p->curpriority_cmp;
+	Xchooseproc = p->chooseproc;
+	Xprocrunnable = p->procrunnable;
+
+	callout_init(&loadav_callout);
+
+	sched_init(NULL);
+	/* Kick off timeout driven events by calling first time. */
+	schedcpu(NULL);
+	loadav(NULL);
+}
+
+/*
+ * Implements the scheduling algorithm switch
+ */
+static int
+sysctl_kern_scheduler(SYSCTL_HANDLER_ARGS)
+{
+	int error, new_val, old_val, s;
+	struct _sched_interface *p;
+
+	old_val = new_val = sched_algorithm;
+	error = sysctl_handle_int(oidp, &new_val, 0, req);
+	if (error != 0 || req->newptr == NULL)
+		return (error);
+	s = splhigh();
+	for (p = schedulers; p && p->id != new_val; p = p->next)
+		;
+	if (p == NULL) {
+		printf("kern_scheduler %d not found\n", new_val);
+		splx(s);	/* restore spl before bailing out */
+		return (EINVAL);
+	}
+	if (new_val != old_val) {	/* switch */
+		sched_algorithm = new_val;
+		sched_init = p->sched_init;
+		setrunqueue = p->setrunqueue;
+		remrunqueue = p->remrunqueue;
+		schedulerexit = p->schedulerexit;
+		resetpriority = p->resetpriority;
+		schedcpu1 = p->schedcpu1;
+		schedclock1 = p->schedclock1;
+		curpriority_cmp = p->curpriority_cmp;
+		Xchooseproc = p->chooseproc;
+		Xprocrunnable = p->procrunnable;
+		callout_init(&loadav_callout);
+		sched_move();
+		sched_move = p->sched_move;
+	}
+	splx(s);
+	return (0);
+}
+
+SYSCTL_PROC(_kern, OID_AUTO, scheduler, CTLTYPE_INT|CTLFLAG_RW,
+	0, sizeof sched_algorithm, sysctl_kern_scheduler, "I",
+	"Current scheduling algorithm");
+/*
+ * procrunnable demultiplexing function
+ */
+u_int32_t
+procrunnable(void)
+{
+	return Xprocrunnable();
+}
+
+/*
+ * chooseproc demultiplexing function
+ */
+struct proc *
+chooseproc(void)
+{
+	return Xchooseproc();
 }
+
+
+/* ------------ end of scheduling algorithm dependent code ----------------- */
+
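With both schedulers registered, the kern.scheduler sysctl added above selects the active one at runtime, e.g. `sysctl -w kern.scheduler=1` from the shell. A user-space sketch of the same operation via sysctlbyname(3), assuming a kernel with this patch applied (the ids come from the proc.h hunk):

/*
 * Read and switch the active scheduler through kern.scheduler
 * (FEEDBACK_PRIORITY = 0, PROPORTIONAL_SHARE = 1).
 */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int main(void)
{
	int cur, new = 1;	/* PROPORTIONAL_SHARE */
	size_t len = sizeof(cur);

	if (sysctlbyname("kern.scheduler", &cur, &len, NULL, 0) == -1) {
		perror("sysctlbyname");	/* kernel without this patch */
		return 1;
	}
	printf("active scheduler id: %d\n", cur);
	if (sysctlbyname("kern.scheduler", NULL, NULL, &new, sizeof(new)) == -1)
		perror("switch failed");
	return 0;
}

Note how the handler calls the outgoing scheduler's sched_move() to migrate every queued process before installing the new sched_move pointer; that ordering is what keeps runnable processes from being lost across the switch.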
--- /dev/null	Thu Jul 18 21:51:23 2002
+++ sys/sys/heap.h	Thu Jul 18 22:09:44 2002
@@ -0,0 +1,66 @@
+#ifndef _HEAP_H
+#define _HEAP_H
+
+/*
+ * Definition of heap data structures. In the structures, I decided
+ * not to use the macros in <sys/queue.h> in the hope of making the code
+ * easier to port to other architectures. The types of lists and queues we
+ * use here are pretty simple anyway.
+ */
+
+/*
+ * We use a 64 bit key, "dn_key". Some macros are used to
+ * compare key values and handle wraparounds.
+ * MAX64 returns the largest of two key values.
+ * MY_M is used as a shift count when doing fixed point arithmetic
+ * (a better name would be useful...).
+ */
+typedef u_int64_t dn_key ;	/* sorting key */
+#define DN_KEY_LT(a,b)	((int64_t)((a)-(b)) < 0)
+#define DN_KEY_LEQ(a,b)	((int64_t)((a)-(b)) <= 0)
+#define DN_KEY_GT(a,b)	((int64_t)((a)-(b)) > 0)
+#define DN_KEY_GEQ(a,b)	((int64_t)((a)-(b)) >= 0)
+#define MAX64(x,y)	(( (int64_t) ( (y)-(x) )) > 0 ) ? (y) : (x)
+/*
+ * The OFFSET_OF macro is used to return the offset of a field within
+ * a structure. It is used by the heap management routines.
+ */
+#define OFFSET_OF(type, field) ((int)&( ((type *)0)->field) )
+
+/*
+ * A heap entry is made of a key and a pointer to the actual
+ * object stored in the heap.
+ * The heap is an array of dn_heap_entry entries, dynamically allocated.
+ * Current size is "size", with "elements" actually in use.
+ * The heap normally supports only ordered insert and extract from the top.
+ * If we want to extract an object from the middle of the heap, we
+ * have to know where the object itself is located in the heap (or we
+ * need to scan the whole array). To this purpose, an object has a
+ * field (int) which contains the index of the object itself into the
+ * heap. When the object is moved, the field must also be updated.
+ * The offset of the index in the object is stored in the 'offset'
+ * field in the heap descriptor. The assumption is that this offset
+ * is non-zero if we want to support extract from the middle.
+ */
+struct dn_heap_entry {
+	dn_key key ;	/* sorting key. Topmost element is smallest one */
+	void *object ;	/* object pointer */
+} ;
+
+struct dn_heap {
+	int size ;
+	int elements ;
+	int offset ;	/* XXX if > 0 this is the offset of direct ptr to obj */
+	struct dn_heap_entry *p ;	/* really an array of "size" entries */
+} ;
+
+int heap_init(struct dn_heap *h, int size) ;
+int heap_insert (struct dn_heap *h, dn_key key1, void *p);
+void heap_modify(struct dn_heap *h, void *old, void *new);
+void heap_extract(struct dn_heap *h, void *obj);
+void heapify(struct dn_heap *h);
+void heap_free(struct dn_heap *h);
+
+#endif /* _HEAP_H */
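The DN_KEY_* macros compare keys through a signed difference, so the ordering stays correct even after the 64-bit virtual clock wraps, provided the two keys are less than 2^63 apart. A standalone check of the trick:

/*
 * Why DN_KEY_LT() is wraparound-safe while a plain "<" is not.
 */
#include <stdint.h>
#include <stdio.h>

#define DN_KEY_LT(a,b)	((int64_t)((a)-(b)) < 0)

int main(void)
{
	uint64_t near_wrap = UINT64_MAX - 5;	/* just before wraparound */
	uint64_t wrapped   = 10;		/* logically *after* near_wrap */

	/* plain "<" claims wrapped comes first: wrong logical order */
	printf("naive:  %d\n", wrapped < near_wrap);		/* prints 1 */
	/* signed difference: near_wrap correctly sorts before wrapped */
	printf("DN_KEY: %d\n", DN_KEY_LT(near_wrap, wrapped));	/* prints 1 */
	return 0;
}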
--- /dev/null	Thu Jul 18 21:51:23 2002
+++ sys/kern/heap.c	Thu Jul 18 21:58:19 2002
@@ -0,0 +1,280 @@
+#define DEB(x)
+#define DDB(x)	x
+
+/*
+ * This module implements
+ *  + heap management functions;
+ *
+ * include files marked with XXX are probably not needed
+ */
+
+#include
+#include
+#include
+#include
+#include	/* XXX */
+#include
+#include
+#include
+#include
+
+MALLOC_DEFINE(M_HEAP, "heap", "heap data structures");	/* XXX Check this */
+
+/*
+ * Heap management functions.
+ *
+ * In the heap, first node is element 0. Children of i are 2i+1 and 2i+2.
+ * Some macros help finding parent/children so we can optimize them.
+ *
+ * heap_init() is called to expand the heap when needed.
+ * Increment size in blocks of 16 entries.
+ * XXX failure to allocate a new element is a pretty bad failure
+ * as we basically stall a whole queue forever!!
+ * Returns 1 on error, 0 on success
+ */
+#define HEAP_FATHER(x)	( ( (x) - 1 ) / 2 )
+#define HEAP_LEFT(x)	( 2*(x) + 1 )
+#define HEAP_IS_LEFT(x)	( (x) & 1 )
+#define HEAP_RIGHT(x)	( 2*(x) + 2 )
+#define HEAP_SWAP(a, b, buffer)	{ buffer = a ; a = b ; b = buffer ; }
+#define HEAP_INCREMENT	15
+
+/*
+ * DEBUGGING facility
+ */
+void
+check_heap(struct dn_heap *h, u_char *s)
+{
+	int i, error = 0 ;
+
+	for (i = 0 ; (i < h->elements) && (!error) ; i++) {
+		void *obj = h->p[i].object ;
+
+		error = 1 ;
+		if (obj == NULL)
+			printf("check_heap: null object") ;
+		else if (h->offset > 0 && *((int *)((char *)obj + h->offset)) != i)
+			printf("check_heap: internal offset %d index %d",
+				*((int *)((char *)obj + h->offset)), i) ;
+		else
+			error = 0 ;
+	}
+	if (error)
+		panic(s) ;
+	DEB(printf("%s: check_heap OK\n", s) );
+}
+
+int
+heap_init(struct dn_heap *h, int new_size)
+{
+	struct dn_heap_entry *p;
+
+	if (h->size >= new_size ) {
+		printf("heap_init, Bogus call, have %d want %d\n",
+			h->size, new_size);
+		return 0 ;
+	}
+	new_size = (new_size + HEAP_INCREMENT ) & ~HEAP_INCREMENT ;
+	p = malloc(new_size * sizeof(*p), M_HEAP, M_DONTWAIT );
+	if (p == NULL) {
+		printf(" heap_init, resize %d failed\n", new_size );
+		return 1 ; /* error */
+	}
+	if (h->size > 0) {
+		bcopy(h->p, p, h->size * sizeof(*p) );
+		free(h->p, M_HEAP);
+	}
+	h->p = p ;
+	h->size = new_size ;
+	DDB(check_heap(h, "heap_init")) ;
+	return 0 ;
+}
+
+/*
+ * Insert element in heap. Normally, p != NULL, we insert p in
+ * a new position and bubble up. If p == NULL, then the element is
+ * already in place, and key is the position where to start the
+ * bubble-up.
+ * Returns 1 on failure (cannot allocate new heap entry)
+ *
+ * If offset > 0 the position (index, int) of the element in the heap is
+ * also stored in the element itself at the given offset in bytes.
+ */
+#define SET_OFFSET(heap, node) \
+	if (heap->offset > 0) \
+	    *((int *)((char *)(heap->p[node].object) + heap->offset)) = node ;
+/*
+ * RESET_OFFSET sets the offset to an invalid value.
+ */
+#define RESET_OFFSET(heap, node) \
+	if (heap->offset > 0) \
+	    *((int *)((char *)(heap->p[node].object) + heap->offset)) = -1 ;
+int
+heap_insert(struct dn_heap *h, dn_key key1, void *p)
+{
+	int son = h->elements ;
+
+	if (p == NULL)	/* data already there, set starting point */
+		son = key1 ;
+	else {		/* insert new element at the end, possibly resize */
+		son = h->elements ;
+		if (son == h->size) /* need resize... */
+			if (heap_init(h, h->elements+1) )
+				return 1 ; /* failure... */
+		h->p[son].object = p ;
+		h->p[son].key = key1 ;
+		h->elements++ ;
+	}
+	while (son > 0) {	/* bubble up */
+		int father = HEAP_FATHER(son) ;
+		struct dn_heap_entry tmp ;
+
+		if (DN_KEY_LT( h->p[father].key, h->p[son].key ) )
+			break ; /* found right position */
+		/* son smaller than father, swap and repeat */
+		HEAP_SWAP(h->p[son], h->p[father], tmp) ;
+		SET_OFFSET(h, son);
+		son = father ;
+	}
+	SET_OFFSET(h, son);
+	return 0 ;
+}
+
+/*
+ * update the element pointing to old_obj to point to new_obj
+ */
+void
+heap_modify(struct dn_heap *h, void *old_obj, void *new_obj)
+{
+	/* modify specific element, index is at offset */
+	if (old_obj != NULL && h->offset > 0) {
+		int index = *((int *)((char *)old_obj + h->offset)) ;
+
+		if (index < 0 || index >= h->elements) {
+			printf("heap_modify, index %d out of bound 0..%d\n",
+				index, h->elements);
+			panic("heap_modify");
+		}
+		h->p[index].object = new_obj ;
+	} else {
+		printf("heap_modify, null obj or index offset\n") ;
+		panic("heap_modify") ;
+	}
+}
+
+/*
+ * remove the top element from the heap, or obj if obj != NULL
+ */
+void
+heap_extract(struct dn_heap *h, void *obj)
+{
+	int child, father, max = h->elements - 1 ;
+
+	DDB(check_heap(h, "entering heap_extract")) ;
+	if (max < 0) {
+		printf("warning, extract from empty heap 0x%p\n", h);
+		return ;
+	}
+	father = 0 ; /* default: move up smallest child */
+	if (obj != NULL) { /* extract specific element, index is at offset */
+		if (h->offset > 0)
+			father = *((int *)((char *)obj + h->offset)) ;
+		else
+			for ( ; father < h->elements &&
+			    h->p[father].object != obj ; father++ )
+				;
+		if (father < 0 || father >= h->elements) {
+			printf("dummynet: heap_extract, father %d out of bound 0..%d\n",
+				father, h->elements);
+			panic("heap_extract");
+		}
+	}
+	RESET_OFFSET(h, father);
+	child = HEAP_LEFT(father) ;		/* left child */
+	while (child <= max) {			/* valid entry */
+		if (child != max && DN_KEY_LT(h->p[child+1].key, h->p[child].key) )
+			child = child+1 ;	/* take right child, otherwise left */
+		h->p[father] = h->p[child] ;
+		SET_OFFSET(h, father);
+		father = child ;
+		child = HEAP_LEFT(child) ;	/* left child for next loop */
+	}
+	h->elements-- ;
+	if (father != max) {
+		/*
+		 * Fill hole with last entry and bubble up, reusing the insert code
+		 */
+		h->p[father] = h->p[max] ;
+		heap_insert(h, father, NULL);	/* this one cannot fail */
+	}
+}
+
+#if 0
+/*
+ * change object position and update references
+ * XXX this one is never used!
+ */
+void
+heap_move(struct dn_heap *h, dn_key new_key, void *object)
+{
+	int temp;
+	int i ;
+	int max = h->elements-1 ;
+	struct dn_heap_entry buf ;
+
+	if (h->offset <= 0)
+		panic("cannot move items on this heap");
+
+	i = *((int *)((char *)object + h->offset));
+	if (DN_KEY_LT(new_key, h->p[i].key) ) {		/* must move up */
+		h->p[i].key = new_key ;
+		for (; i>0 && DN_KEY_LT(new_key, h->p[(temp = HEAP_FATHER(i))].key) ;
+			 i = temp ) {	/* bubble up */
+			HEAP_SWAP(h->p[i], h->p[temp], buf) ;
+			SET_OFFSET(h, i);
+		}
+	} else {					/* must move down */
+		h->p[i].key = new_key ;
+		while ( (temp = HEAP_LEFT(i)) <= max ) { /* found left child */
+			if ((temp != max) && DN_KEY_GT(h->p[temp].key, h->p[temp+1].key))
+				temp++ ;	/* select child with min key */
+			if (DN_KEY_GT(new_key, h->p[temp].key)) { /* go down */
+				HEAP_SWAP(h->p[i], h->p[temp], buf) ;
+				SET_OFFSET(h, i);
+			} else
+				break ;
+			i = temp ;
+		}
+	}
+	SET_OFFSET(h, i);
}
+#endif /* heap_move, unused */
+
+/*
+ * heapify() will reorganize data inside an array to maintain the
+ * heap property. It is needed when we delete a bunch of entries.
+ */
+void
+heapify(struct dn_heap *h)
+{
+	int i ;
+
+	for (i = 0 ; i < h->elements ; i++ )
+		heap_insert(h, i, NULL) ;
+}
+
+/*
+ * cleanup the heap and free the data structure
+ */
+void
+heap_free(struct dn_heap *h)
+{
+	if (h->size > 0)
+		free(h->p, M_HEAP);
+	bzero(h, sizeof(*h) );
+}
+
+/*
+ * --- end of heap management functions ---
+ */
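The heap above is kernel code, but the ordering discipline is easy to exercise in user space. A simplified sketch (fixed-size array, no resize, no back-pointer "offset" support, and extraction rewritten as the textbook sift-down rather than the kernel's hole-walk plus re-bubble) showing that repeated extraction yields keys in increasing order:

/*
 * Simplified user-space rendition of the dn_heap ordering.
 */
#include <stdint.h>
#include <stdio.h>

typedef uint64_t dn_key;
#define DN_KEY_LT(a,b)	((int64_t)((a)-(b)) < 0)
#define HEAP_FATHER(x)	(((x) - 1) / 2)
#define HEAP_LEFT(x)	(2*(x) + 1)

struct entry { dn_key key; };
static struct entry hp[64];
static int elements;

static void insert(dn_key key)
{
	int son = elements++;

	hp[son].key = key;
	while (son > 0 && DN_KEY_LT(hp[son].key, hp[HEAP_FATHER(son)].key)) {
		struct entry tmp = hp[son];	/* bubble up */
		hp[son] = hp[HEAP_FATHER(son)];
		hp[HEAP_FATHER(son)] = tmp;
		son = HEAP_FATHER(son);
	}
}

static dn_key extract(void)
{
	dn_key top = hp[0].key;
	int i = 0, child, max;

	hp[0] = hp[--elements];			/* last entry fills the root */
	max = elements - 1;
	while ((child = HEAP_LEFT(i)) <= max) {	/* sift down */
		if (child != max && DN_KEY_LT(hp[child + 1].key, hp[child].key))
			child++;		/* pick the smaller child */
		if (!DN_KEY_LT(hp[child].key, hp[i].key))
			break;
		struct entry tmp = hp[i];
		hp[i] = hp[child];
		hp[child] = tmp;
		i = child;
	}
	return top;
}

int main(void)
{
	insert(42); insert(7); insert(99); insert(7); insert(13);
	while (elements > 0)
		printf("%llu ", (unsigned long long)extract());
	printf("\n");	/* prints: 7 7 13 42 99 */
	return 0;
}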
--- /dev/null	Thu Jul 18 21:51:23 2002
+++ sys/kern/kern_switch_ps.c	Fri Jul 19 00:54:06 2002
@@ -0,0 +1,778 @@
+/*
+ * Copyright (c) 2002 Paolo Valente, Universita` di Pisa
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include <sys/resource.h>	/* PRIO_MIN, PRIO_MAX */
+
+#include
+#include
+#include
+#include
+
+#define DBB(x)	x
+#define DEB(x)
+#define DASSERT(a, b)	if ((a) == 0) \
+				panic(b);
+
+
+/*
+ * This module implements a proportional share scheduler.
+ *
+ * Proportional Share
+ *
+ * Multitasking is implemented by dividing each process into small
+ * execution requests and multiplexing them on the available cpu(s).
+ *
+ * Processes (requests) are served according to the WF2Q+ scheduling
+ * algorithm, which approximates the behaviour of a fluid system where
+ * each process is given a share of the CPU proportional to the weight
+ * assigned to it.
+ *
+ * In the reference fluid system, all ready processes are served
+ * simultaneously, and each of them gets a share of the cpu speed
+ * equal to the ratio between its weight and the sum of the weights
+ * of all ready processes. The global progress in the fluid system
+ * is measured by the "system virtual time", which is the amount of
+ * service received by a running process divided by its weight.
+ * So, during the service of a request with length L and weight W,
+ * the system virtual time will increase by L/W.
+ *
+ * We define the "virtual finish time" of a request as the value of the
+ * system virtual time when the request terminates in the fluid system.
+ * We emulate the fluid system on the real system (where only one process
+ * per cpu can be run at any time) by scheduling requests in increasing
+ * order of the virtual finish times.
+ *
+ * Internally, the scheduler makes use of three heaps:
+ *
+ * ne_heap:
+ *	processes whose virtual start time is greater than the
+ *	system's virtual time are not yet eligible for execution, so
+ *	they are stored here, sorted by virtual start time.
+ *	As the system's virtual time increases, processes become
+ *	eligible for execution and are moved to the run_heap.
+ *
+ * run_heap:
+ *	runnable processes, sorted by virtual finish time; the next request
+ *	to be run is at the head of the heap.
+ *
+ * exp_heap:
+ *	processes which did not issue a new request before the
+ *	end (of the execution) of the last one, sorted by last
+ *	request finish time.
+ *	As the system's virtual time increases, processes whose
+ *	virtual finish time becomes lower than the system's virtual
+ *	time are removed (the process weight is removed from the sum,
+ *	process-specific data structures are freed, etc...)
+ *
+ * A process is called "active" if it is running or is present
+ * in one of the previous heaps, otherwise it is called "inactive".
+ */
+
+static dn_key V;	/* system virtual time */
+static u_int sum;	/* sum of the weights of active processes */
+static struct dn_heap run_heap, ne_heap, exp_heap;
+
+/*
+ * Process weight is computed from p_nice using a lookup table.
+ * p_nice can assume values between PRIO_MIN (-20) and PRIO_MAX (20).
+ */
+static const u_short wtable[PRIO_MAX - PRIO_MIN + 1] = {
+	1000, 950, 900, 850, 800,
+	750, 700, 650, 600, 550,
+	500, 450, 400, 350, 300,
+	250, 200, 180, 150, 120,
+	100,	/* base priority/weight */
+	90, 80, 75, 70, 65,
+	60, 55, 50, 45, 40,
+	35, 30, 25, 20, 15,
+	10, 5, 3, 2, 1
+};
+
+#define MINWEIGHT	1
+#define MAXWEIGHT	1000
+#define WEIGHT(ps)	(wtable[(ps)->proc->p_nice - PRIO_MIN])
+static dn_key lastRtime;		/* used to compute virtual time */
+static struct callout_handle thandle;	/* current request timeout */
+
+/*
+ * XXX fix for SMP -- can't you use curproc ?
+ *
+ * The next variable is used to appropriately handle request termination.
+ * When a process is selected for execution it becomes outstanding.
+ * The (running) process remains outstanding unless it issues
+ * its next request or removes itself from the scheduler.
+ * At the end of the request execution, if the process is still
+ * outstanding, it will be inserted into exp_heap.
+ */
+
+static struct proc *outproc;	/* outstanding process */
+#define END_PENDING	1
+static char ps_stat;
+
+/*
+ * Virtual times are used as keys for the heaps. They increase in steps of
+ *
+ *	TIMESCALE*ticks/sum_w
+ *
+ * where sum_w is the sum of all weights.
+ * The role of TIMESCALE is twofold: first and foremost, to perform
+ * fixed-point calculations; second, to make the rate of advance of
+ * virtual time independent of "hz".
+ *
+ * Max values are: sum_w = 2^20 (e.g. 1000 active processes with
+ * a weight of 1000 each), hz <= 10000.
+ *
+ * In these conditions we have 43 bits to represent time intervals,
+ * meaning a wrap after roughly 27 years.
+ */
+#define SCALE_SHIFT	20	/* to obtain a larger precision */
+#define TIMESCALE	(((dn_key)10000 << SCALE_SHIFT)/hz)
+
+/*
+ * Each process known to this scheduler has a descriptor (struct procps
+ * below), kept in a dynamically allocated array and referenced through
+ * an index stored in the proc structure (p->p_pad3).
+ * A zero in p_pad3 means the process does
+ * not have a descriptor associated with it (this also means
+ * that 0 is not a valid index in the array).
+ *
+ * The base and size of the array are ps_procs and ps_blocks,
+ * free slots are kept in a linked list (by index) and the first
+ * element is at offset ps_free_list.
+ */
+
+struct procps {
+	int nextfree;		/* free list */
+	struct proc *proc;	/* process we are attached to */
+	dn_key p_length;	/* nominal length of a request (ticks) */
+	dn_key p_S, p_F;	/* virtual start and finish time */
+	dn_key p_RS;		/* (real) start time */
+	int heap_pos;		/* position (index) of struct in exp_heap */
+	u_short old_weight;	/* to detect weight change */
+};
+
+static struct procps *ps_procs;
+static int ps_free_list;
+static int ps_blocks;
+static int ps_free_blocks;
+
+SYSCTL_INT(_kern, OID_AUTO, ps_free_blocks, CTLFLAG_RD, &ps_free_blocks, 0, "");
+
+#define PSBLOCKELEMS	64
+#define P2PS(p)		&ps_procs[ *((u_short *)(p->p_pad3)) ]
+#define PSISNEW(p)	((p) == ps_procs)
+
+/*
+ * After reallocating the procps array, update the pointers
+ * in the various heaps.
+ */
+static void
+procps_update(int delta)
+{
+	struct dn_heap *h, *work_heap[3] = { &run_heap, &ne_heap, &exp_heap };
+	int i, j;
+
+	for ( i = 0; i < 3; i++ ) {
+		h = work_heap[i];
+		for ( j = 0; j < h->elements; j++ )
+			(char *)h->p[j].object += delta;
+	}
+}
+
+/*
+ * Allocate to the process the slot at the head of the free list.
+ * If the free list is empty, allocate a new block of slots and
+ * link them into the free list.
+ */
+static void
+procps_realloc(void)
+{
+	struct procps *tmp;
+	int i = ps_blocks + PSBLOCKELEMS;	/* new size */
+
+printf("+++ procps_realloc %d --> %d\n", ps_blocks, i);
+	tmp = malloc(i*sizeof (struct procps), M_TEMP, M_NOWAIT);
+	DASSERT(tmp, "find_slot: couldn't alloc new ps_procs block");
+	if (ps_blocks > 0) {
+		/*
+		 * copy the old records to the new array, then scan the
+		 * heaps and update the object pointers
+		 */
+		bcopy(ps_procs, tmp, ps_blocks * sizeof (struct procps));
+		procps_update( (char *)(tmp) - (char *)(ps_procs) );
+		free(ps_procs, M_TEMP);
+	}
+	ps_procs = tmp;
+	ps_free_list = ps_blocks;	/* first free */
+	ps_blocks += PSBLOCKELEMS;	/* total blocks */
+	if (ps_free_list == 0)		/* entry 0 must be unused */
+		ps_free_list = 1;
+
+	for (i = ps_free_list; i < ps_blocks - 1; i++)
+		ps_procs[i].nextfree = i+1;
+	ps_free_blocks += ps_blocks - ps_free_list;
+	ps_procs[ps_blocks-1].nextfree = 0;	/* last entry */
+}
+
+static struct procps *
+find_slot(struct proc *p1)
+{
+	struct procps *ps;
+
+	DEB(printf("+++ FIND ps_blocks %d ps_free_blocks %d\n",
+		ps_blocks, ps_free_blocks);)
+	if (ps_free_list == 0)
+		procps_realloc();
+	/* assign slot to process */
+	*((u_short *)(p1->p_pad3)) = ps_free_list;
+	ps = &ps_procs[ps_free_list];
+	ps_free_blocks--;
+	/* remove slot from free list */
+	ps_free_list = ps->nextfree;
+	/* initialize structure */
+	bzero(ps, sizeof(*ps));
+	ps->proc = p1;
+	return ps;
+}
+
+/*
+ * Insert the slot at the head of the free list
+ */
+static void
+free_slot(struct procps *ps)
+{
+	int index = ps - ps_procs;
+
+	ps_procs[index].nextfree = ps_free_list;
+	ps_free_list = index;
+	ps_free_blocks++;
+	DEB(printf("+++ FREE ps_blocks %d ps_free_blocks %d\n",
+		ps_blocks, ps_free_blocks);)
+}
+
+/*
+ * interface functions for the Proportional Share scheduler
+ */
+
+
+/*
+ * Initialize the proportional share scheduler when selected.
+ */
+static void
+psinit(void *dummy)
+{
+	lastRtime = Rtime();
+	V = 0;
+	sum = 0;
+	outproc = NULL;
+	ps_stat = 0;
+
+	/* Initialize heaps */
+	exp_heap.size = exp_heap.elements = 0;
+	exp_heap.offset = OFFSET_OF(struct procps, heap_pos);
+
+	ne_heap.size = ne_heap.elements = 0;
+	ne_heap.offset = 0;
+
+	run_heap.size = run_heap.elements = 0;
+	run_heap.offset = 0;
+
+	/* Initialize the request timeout handler */
+	bzero(&thandle, sizeof(struct callout_handle));
+}
+
+/* request timeout expired: force a switch to the next request */
+static void
+propshare(void *arg)
+{
+	ps_stat &= ~END_PENDING;
+	need_resched();
+}
+
+/*
+ * updateV() updates the virtual time.
+ *
+ * updateV2() expires inactive processes and moves newly eligible
+ * processes from the not-eligible heap to the eligible one.
+ *
+ * Both must be called at splhigh().
+ */
+static void
+updateV(void)
+{
+	if (sum == 0 && (ne_heap.elements > 0 || run_heap.elements > 0))
+		panic("updateV: null weight sum, but pending requests!");
+	if (sum == 0)
+		return;
+	V += (Rtime() - lastRtime)/sum;
+	lastRtime = Rtime();
+}
+
+static void
+updateV2(void)
+{
+	if (sum == 0)
+		return;
+	/*
+	 * if no process is eligible, V jumps to the start time of the
+	 * first non-eligible process.
+	 */
+	if (run_heap.elements == 0 && ne_heap.elements > 0)
+		V = MAX64(V, ne_heap.p[0].key);
+
+	/* expire inactive processes */
+	while (exp_heap.elements > 0 && DN_KEY_LT(exp_heap.p[0].key, V) ) {
+		struct procps *ps = (struct procps *)exp_heap.p[0].object;
+
+		DEB(printf("+++ updateV2: expired slot %d\n", ps - ps_procs);)
+		heap_extract(&exp_heap, NULL);
+		sum -= ps->old_weight;
+		if (ps->proc != NULL)
+			ps->p_S = ps->p_F + 1;	/* mark timestamps as invalid */
+		else				/* remove proc */
+			free_slot(ps);
+	}
+
+	/*
+	 * move from ne_heap to run_heap any requests that have
+	 * become eligible
+	 */
+	while (ne_heap.elements > 0 && DN_KEY_LEQ(ne_heap.p[0].key, V) ) {
+		struct procps *ps = (struct procps *)ne_heap.p[0].object;
+
+		heap_extract(&ne_heap, NULL);
+		heap_insert(&run_heap, ps->p_F, ps);
+	}
+	TSTMP(4, sum, 124, run_heap.elements*100 + ne_heap.elements);
+}
+
+#define INCLENGTH(a, b)	((a) += ((b) < REQ_MAXLENGTH) ? \
+				REQ_DELTA : 0)	/* XXX */
+#define DECLENGTH(a, b)	((a) -= ((b) > REQ_MINLENGTH) ? \
+				REQ_DELTA : 0)	/* XXX */
+/*
+ * Account the process for the service received.
+ * Update the process virtual finish time and nominal request length.
+ */
+static void
+charge_service(struct procps *ps)
+{
+	dn_key act_length = ticks - ps->p_RS;
+
+	DASSERT(ps->proc, "charge_service: null process pointer" );
+	if (WEIGHT(ps) < MINWEIGHT || WEIGHT(ps) > MAXWEIGHT) {
+		printf("charge_service: %p %d %s, nice: %d, weight: %d, sum %d\n",
+			ps->proc, ps->proc->p_pid, ps->proc->p_comm,
+			ps->proc->p_nice, WEIGHT(ps), sum);
+		panic("charge_service: weight out of range");
+	}
+	ps->p_F = ps->p_S + (act_length * TIMESCALE) / WEIGHT(ps);
+#if 0	/* TESTING */
+	if (p->p_stat == SRUN)
+		INCLENGTH(p->p_length, act_length);
+	else
+		DECLENGTH(p->p_length, act_length);
+#endif
+}
+
+/*
+ * setrunheap: schedules a process for running.
+ * The first time a process is scheduled, its weight is added to the sum.
+ * If invoked by curproc to schedule its next request:
+ *  . curproc timestamps are updated (to account for the service received)
+ *    before inserting the new request into the heaps
+ *  . outproc is set to NULL
+ * If the weight of an active process has changed, the sum is updated.
+ * The process must be runnable.
+ * This must be called at splhigh().
+ */
+static void
+ps_setrunqueue(struct proc *p1)
+{
+	struct procps *ps;
+
+	TSTMP(4, 0, 120, p1->p_pid);
+
+	DEB( printf("setrunheap: %p %d %s slot %d\n", p1, p1->p_pid, p1->p_comm,
+		*((u_short *)p1->p_pad3) );)
+	ps = P2PS(p1);
+	if (PSISNEW(ps)) {	/* new process */
+		ps = find_slot(p1);
+		/* mark timestamps as invalid */
+		ps->p_S = ps->p_F + 1;
+#if 0
+		ps->p_length = REQ_MINLENGTH;
+#else	/* TESTING */
+		ps->p_length = hz/100;	/* 10 ms */
+#endif
+	}
+
+	updateV();
+	if (DN_KEY_GT(ps->p_S, ps->p_F)) {	/* invalid timestamps */
+		ps->p_S = V;
+		sum += WEIGHT(ps);	/* add weight of new process */
+	} else {
+		/*
+		 * A process with valid timestamps.
+		 * If it is not running, it should be in exp_heap,
+		 * so remove it from there.
+		 */
+		if (p1 != curproc)
+			heap_extract(&exp_heap, ps);
+		else {
+			/*
+			 * need to update my timestamps before
+			 * (re)scheduling myself
+			 */
+			charge_service(ps);
+			if (outproc == NULL || outproc != curproc)
+				panic("setrunheap: invalid outproc");
+			outproc = NULL;	/* came back into the active heaps */
+		}
+		/* if the weight has changed then update the sum */
+		if (WEIGHT(ps) != ps->old_weight)
+			sum += WEIGHT(ps) - ps->old_weight;
+		ps->p_S = MAX64(ps->p_F, V);
+	}
+
+	/* now we are done with possible weight (sum) changes */
+	if (WEIGHT(ps) != ps->old_weight)
+		ps->old_weight = WEIGHT(ps);
+
+	if (WEIGHT(ps) < MINWEIGHT || WEIGHT(ps) > MAXWEIGHT) {
+		printf("setrunqueue: %p %d %s, nice: %d, weight: %d, sum %d\n",
+			ps->proc,
+			ps->proc->p_pid, ps->proc->p_comm, ps->proc->p_nice,
+			WEIGHT(ps), sum);
+		panic("setrunqueue: weight out of range");
+	}
+	if (p1->p_priority >= PUSER) {
+		ps->p_length = hz/100;	/* default */
+	} else {	/* blocked on a resource */
+		ps->p_length = hz * p1->p_priority / (200*PUSER) + 1;
+	}
+
+	/* Set the request finish time */
+	ps->p_F = ps->p_S + ps->p_length*TIMESCALE/WEIGHT(ps);
+
+	/* insert the request in the proper heap */
+	if (ps->p_S == V)
+		heap_insert(&run_heap, ps->p_F, ps);
+	else
+		heap_insert(&ne_heap, ps->p_S, ps);
+}
+
+/*
+ * Removes an active, not running process from the scheduler.
+ * This must be called at splhigh().
+ */
+static void
+ps_remrunqueue(struct proc *p1)
+{
+	struct procps *ps;
+
+	DEB( printf("remrunqueue: %p %d %s\n", p1, p1->p_pid, p1->p_comm);)
+	DASSERT(p1->p_stat == SRUN, ("remrunqueue: proc not SRUN"));
+
+	TSTMP(4, 0, 121, p1->p_pid);
+	ps = P2PS(p1);
+	if (PSISNEW(ps))
+		panic("ps_remrunqueue: new procps");
+	if (DN_KEY_LEQ(ps->p_S, V))
+		heap_extract(&run_heap, ps);
+	else
+		heap_extract(&ne_heap, ps);
+	heap_insert(&exp_heap, ps->p_S, ps);
+}
+
+/*
+ * Removes a running process from the scheduler.
+ * Must be called by the process that wants to remove itself.
+ */
+static void
+ps_schedulerexit(struct proc *p1)
+{
+	struct procps *ps;
+
+	ps = P2PS(p1);
+	DEB(printf("schedulerexit: slot %d pid %d\n",
+		ps - ps_procs, p1->p_pid);)
+	if (PSISNEW(ps))	/* may happen on a scheduler algorithm switch */
+		return;
+	if (DN_KEY_GT(ps->p_S, ps->p_F))	/* timestamps are invalid */
+		panic("schedulerexit: invalid timestamps");
+	/*
+	 * The process is terminating, and its proc data will be freed
+	 * before this entry leaves the scheduler.
+	 * We mark ps to remember that the descriptor is no longer
+	 * associated with a process.
+	 * When this entry is extracted from exp_heap, it will be
+	 * removed from the scheduler.
+	 */
+	ps->proc = NULL;
+	/* we assume that schedulerexit is called by the process itself */
+	heap_insert(&exp_heap, ps->p_F, ps);
+	outproc = NULL;
+}
+
+/*
+ * returns the cpu share percentage
+ */
+static int
+ps_get_share(struct proc *p)
+{
+	struct procps *ps = P2PS(p);
+
+	if (sum == 0 || PSISNEW(ps))
+		panic("ps_get_share: null weight sum or unknown process");
+	if (sum == 0 || PSISNEW(ps) )
+		return 0;
+	if (ps->old_weight > sum && ps->proc->p_stat == SRUN) {
+		printf("ps_get_share: %d %s, nice %d weight %d sum %d",
+			ps->proc->p_pid, ps->proc->p_comm,
+			ps->proc->p_nice, WEIGHT(ps), sum);
+		panic("ps_get_share");
+	}
+	return (ps->old_weight * 100) / sum;
+}
+
+
+/*
+ * Compute the priority of a process when running in user mode.
+ */
+static void
+ps_resetpriority(struct proc *p)
+{
+	if (p->p_stat == SRUN)
+		p->p_usrpri = MAXPRI -
+			(MAXPRI - PUSER) * ps_get_share(p) / 100;
+	else
+		p->p_usrpri = PUSER;
+}
+
+/*
+ * OPTIMIZATION: implicitly does the job of resetpriority too, to avoid
+ * an explicit call to it
+ */
+static void
+ps_schedcpu1(struct proc *p)
+{
+	if (p->p_priority >= PUSER)
+		p->p_priority = p->p_usrpri = MAXPRI -
+			(MAXPRI - PUSER) * ps_get_share(p) / 100;
+}
+
+static void
+ps_schedclock1(struct proc *p)
+{
+}
+
+/*
+ * procrunnable() returns a boolean true (non-zero) value if there are
+ * any runnable processes. This is intended to be called from the idle
+ * loop to avoid the more expensive (and destructive) chooseproc().
+ *
+ * MP SAFE.  CALLED WITHOUT THE MP LOCK
+ */
+static u_int32_t
+ps_procrunnable(void)
+{
+	return run_heap.elements;
+}
+
+/*
+ * chooseproc() selects the next process to run.  Ideally, cpu_switch()
+ * would have determined that there is a process available before calling
+ * this, but it is not a requirement.  The selected process is removed
+ * from run_heap.  The process' real start time is set here.
+ * This must be called at splhigh().
+ *
+ * XXX For SMP, trivial affinity is implemented by locating the first process
+ * on the queue that has a matching lastcpu id.  Since normal priorities
+ * are mapped four priority levels per queue, this may allow the cpu to
+ * choose a slightly lower priority process in order to preserve the cpu
+ * caches.
+ */
+static struct proc *
+ps_chooseproc(void)
+{
+	struct proc *p = NULL;
+	struct procps *ps = NULL;
+#ifdef SMP
+	u_char id;
+#endif
+	if (ps_stat & END_PENDING)
+		/*
+		 * the request terminated before its timeout expired
+		 */
+		untimeout(propshare, NULL, thandle);
+	if (outproc != NULL) {
+		/*
+		 * the outstanding process did not reschedule itself
+		 */
+		struct procps *outps = P2PS(outproc);
+
+		if (DN_KEY_GT(outps->p_S, outps->p_F))
+			panic("chooseproc: outproc has invalid timestamps");
+		if (PSISNEW(outps))
+			panic("ps_chooseproc: new procps");
+		/* charge the process for the service received */
+		charge_service(outps);
+		heap_insert(&exp_heap, outps->p_F, outps);
+	}
+	updateV();
+	updateV2();
+	if (run_heap.elements > 0) {	/* need to dispatch a new process */
+		ps = (struct procps *)run_heap.p[0].object;
+		heap_extract(&run_heap, NULL);
+		ps->p_RS = ticks;
+		thandle = timeout(propshare, NULL, ps->p_length);
+		ps_stat |= END_PENDING;
+		p = ps->proc;
+		/* XXX what to do with SMP ? */
+#if 0 && defined(SMP)
+		/*
+		 * wander down the current run queue for this
+		 * pri level for a match
+		 */
+		id = cpuid;
+		while (p->p_lastcpu != id) {
+			p = TAILQ_NEXT(p, p_procq);
+			if (p == NULL) {
+				p = TAILQ_FIRST(q);
+				break;
+			}
+		}
+#endif /* SMP */
+	}
+	outproc = p;
+	TSTMP(4, 0, 122, p ? p->p_pid : 0);
+	return p;
+}
+
+/*-
+ * Compare priorities.  Return:
+ *  <0: priority of p < current priority
+ *   0: priority of p == current priority
+ *  >0: priority of p > current priority
+ * A lower priority here means a nearer virtual finish time, so it really
+ * means a higher priority.
+ */
+static int
+ps_curpriority_cmp(struct proc *p)
+{
+	struct procps *ps = P2PS(p);
+	struct procps *curps = P2PS(curproc);
+
+	return (ps->p_F == curps->p_F) ? 0 :
+		DN_KEY_LT(ps->p_F, curps->p_F) ? -1 : 1;
+}
+
+/*
+ * Moves processes from the proportional share scheduler to the destination
+ * (current) scheduler.
+ *
+ * 1. init the destination (current) scheduler
+ * 2. move active processes to the destination scheduler
+ * 3. remove the per-process references to the ps data structures
+ * 4. free the ps data structures
+ */
+static void
+ps_sched_move(void)
+{
+	struct dn_heap *h, *work_heap[2] = { &run_heap, &ne_heap };
+	struct proc *p;
+	int i, j;
+
+	sched_init(NULL);	/* init the destination scheduler */
+	/* move active processes from ps to the destination scheduler */
+	for ( i = 0; i < 2; i++ ) {
+		h = work_heap[i];
+		for ( j = 0; j < h->elements; j++ ) {
+			struct procps *ps = (struct procps *)h->p[j].object;
+			setrunqueue(ps->proc);
+		}
+		heap_free(h);
+	}
+	heap_free(&exp_heap);	/* remove served requests */
+	/* remove references to ps data structures from all processes */
+	LIST_FOREACH(p, &allproc, p_list)
+		*((u_short *)(p->p_pad3)) = 0;
+	free(ps_procs, M_TEMP);
+	ps_blocks = 0;
+	ps_free_list = 0;
+}
+
+struct _sched_interface ps_scheduler = {
+	NULL,
+	"ps",
+	PROPORTIONAL_SHARE,
+	psinit,
+	ps_sched_move,
+	ps_setrunqueue,
+	ps_remrunqueue,
+	ps_schedulerexit,
+	ps_resetpriority,
+	ps_schedcpu1,
+	ps_schedclock1,
+	ps_curpriority_cmp,
+	ps_chooseproc,
+	ps_procrunnable
+};
+
+/*
+ * Load the scheduler data structures
+ */
+static void
+ps_load(void)
+{
+	printf("Loaded proportional share scheduler\n");
+	ps_scheduler.next = schedulers;
+	schedulers = &ps_scheduler;
+}
+SYSINIT(psload, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, ps_load, NULL);
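To get a feel for the virtual-time arithmetic that ps_setrunqueue() and ps_chooseproc() implement, here is a small user-space simulation of the WF2Q+ bookkeeping. All constants are invented for the example (hz = 100, two processes with weights 100 and 50, 1-tick requests that are rescheduled immediately); the CPU split should come out close to 2:1:

/*
 * WF2Q+ bookkeeping in miniature: S/F timestamps, eligibility test,
 * and the V jump from updateV2() when nothing is eligible.
 */
#include <stdint.h>
#include <stdio.h>

#define SCALE_SHIFT	20
#define HZ		100
#define TIMESCALE	((10000ULL << SCALE_SHIFT) / HZ)
#define DN_KEY_LEQ(a,b)	((int64_t)((a)-(b)) <= 0)
#define MAX64(x,y)	(DN_KEY_LEQ(x,y) ? (y) : (x))

struct req { uint64_t S, F; unsigned weight, served; };

int main(void)
{
	struct req p[2] = { { 0, 0, 100, 0 }, { 0, 0, 50, 0 } };
	uint64_t V = 0;
	unsigned len = 1;	/* request length, ticks (hz/100 = 10 ms) */
	unsigned sum = p[0].weight + p[1].weight;
	int i, round;

	/* initial requests: S = V, F = S + len*TIMESCALE/weight */
	for (i = 0; i < 2; i++)
		p[i].F = p[i].S + len * TIMESCALE / p[i].weight;

	for (round = 0; round < 3000; round++) {
		/* pick the eligible request with the smallest finish time */
		int best = -1;
		for (i = 0; i < 2; i++) {
			if (!DN_KEY_LEQ(p[i].S, V))
				continue;	/* not yet eligible */
			if (best < 0 || DN_KEY_LEQ(p[i].F, p[best].F))
				best = i;
		}
		if (best < 0) {	/* none eligible: V jumps, as in updateV2() */
			V = DN_KEY_LEQ(p[0].S, p[1].S) ? p[0].S : p[1].S;
			continue;
		}
		/* serve it for `len` ticks; V advances as in updateV() */
		p[best].served += len;
		V += len * TIMESCALE / sum;
		/* the process immediately issues its next request */
		p[best].S = MAX64(p[best].F, V);
		p[best].F = p[best].S + len * TIMESCALE / p[best].weight;
	}
	printf("weight 100 got %u ticks, weight 50 got %u ticks\n",
	    p[0].served, p[1].served);	/* roughly 2000 vs 1000 */
	return 0;
}

The eligibility test (S <= V) is what distinguishes WF2Q+ from plain virtual-finish-time ordering: a process that just received service cannot monopolize the CPU by issuing requests whose start times lie in the future.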