patch-2.0.30 linux/kernel/sched.c

diff -u --recursive --new-file v2.0.29/linux/kernel/sched.c linux/kernel/sched.c
@@ -4,6 +4,9 @@
  *  Copyright (C) 1991, 1992  Linus Torvalds
  *
  *  1996-04-21	Modified by Ulrich Windl to make NTP work
+ *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
+ *              make semaphores SMP safe
+ *  1997-01-28  Modified by Finn Arne Gangstad to make timers scale better.
  */
 
 /*
@@ -482,32 +485,47 @@
 	printk("       *q = %p\n",*q);
 }
 
+
 /*
  * Semaphores are implemented using a two-way counter:
  * The "count" variable is decremented for each process
- * that tries to sleep, while the "waiting" variable is
- * incremented _while_ the process is sleeping on that
- * semaphore. 
+ * that tries to sleep, while the "waking" variable is
+ * incremented when the "up()" code goes to wake up waiting
+ * processes.
  *
  * Notably, the inline "up()" and "down()" functions can
  * efficiently test if they need to do any extra work (up
  * needs to do something only if count was negative before
  * the increment operation).
+ *
+ * This routine must execute atomically.
  */
-static inline void normalize_semaphore(struct semaphore *sem)
+static inline int waking_non_zero(struct semaphore *sem)
 {
-	atomic_add(xchg(&sem->waiting,0), &sem->count);
+	int	ret ;
+	long	flags ;
+
+	get_buzz_lock(&sem->lock) ;
+	save_flags(flags) ;
+	cli() ;
+
+	if ((ret = (sem->waking > 0)))
+		sem->waking-- ;
+
+	restore_flags(flags) ;
+	give_buzz_lock(&sem->lock) ;
+	return(ret) ;
 }
 
 /*
  * When __up() is called, the count was negative before
- * incrementing it, and we need to wake up somebody. In
- * most cases "waiting" will be positive, and the normalization
- * will allow things to continue. However, if somebody has
- * /just/ done a down(), it may be that count was negative
- * without waiting being positive (or in the generic case
- * "count is more negative than waiting is positive"), and
- * the waiter needs to check this itself (see __down).
+ * incrementing it, and we need to wake up somebody.
+ *
+ * This routine adds one to the count of processes that need to
+ * wake up and exit.  ALL waiting processes actually wake up but
+ * only the one that gets to the "waking" field first will gate
+ * through and acquire the semaphore.  The others will go back
+ * to sleep.
  *
  * Note that these functions are only called when there is
  * contention on the lock, and as such all this is the
@@ -517,55 +535,86 @@
  */
 void __up(struct semaphore *sem)
 {
-	normalize_semaphore(sem);
+	atomic_inc(&sem->waking) ;
 	wake_up(&sem->wait);
 }
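
The count/waking protocol can be tried out in miniature in user space. The sketch below uses C11 atomics in place of the kernel's atomic_t, buzz lock and cli(), and only approximates the inline fast paths in asm/semaphore.h; the names (model_sem, model_down_fast, model_up, model_try_gate) are invented for this illustration.

	#include <stdatomic.h>
	#include <stdio.h>

	struct model_sem {
		atomic_int count;	/* 1 = free, 0 = held, < 0 = held with waiters */
		atomic_int waking;	/* wake-up tokens handed out by "up()" */
	};

	/* The inline down() fast path: succeed only if count stays >= 0. */
	static int model_down_fast(struct model_sem *sem)
	{
		return atomic_fetch_sub(&sem->count, 1) > 0;
	}

	/* up(): only the contended case (old count < 0) needs a token + wake_up(). */
	static void model_up(struct model_sem *sem)
	{
		if (atomic_fetch_add(&sem->count, 1) < 0)
			atomic_fetch_add(&sem->waking, 1);	/* plays the role of __up() */
	}

	/* waking_non_zero(): consume one token; only one waiter wins the race. */
	static int model_try_gate(struct model_sem *sem)
	{
		int w = atomic_load(&sem->waking);

		while (w > 0)
			if (atomic_compare_exchange_weak(&sem->waking, &w, w - 1))
				return 1;
		return 0;
	}

	int main(void)
	{
		struct model_sem s = { 1, 0 };

		printf("%d\n", model_down_fast(&s));	/* 1: acquired, count -> 0      */
		printf("%d\n", model_down_fast(&s));	/* 0: would sleep, count -> -1  */
		model_up(&s);				/* count -> 0, one token issued */
		printf("%d\n", model_try_gate(&s));	/* 1: this waiter gates through */
		printf("%d\n", model_try_gate(&s));	/* 0: everyone else sleeps again */
		return 0;
	}

The point of the second counter shows in the last two calls: however many sleepers wake_up() rouses, only as many as up() has issued tokens for get past the gate.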
 
-void __down(struct semaphore * sem)
+/*
+ * Perform the "down" function.  Return zero for semaphore acquired,
+ * return negative for signalled out of the function.
+ *
+ * If called from __down, the return is ignored and the wait loop is
+ * not interruptible.  This means that a task waiting on a semaphore
+ * using "down()" cannot be killed until someone does an "up()" on
+ * the semaphore.
+ *
+ * If called from __down_interruptible, the return value gets checked
+ * upon return.  If the return value is negative then the task continues
+ * with the negative value in the return register (it can be tested by
+ * the caller).
+ *
+ * Either form may be used in conjunction with "up()".
+ *
+ */
+int __do_down(struct semaphore * sem, int task_state)
 {
 	struct task_struct *tsk = current;
 	struct wait_queue wait = { tsk, NULL };
+	int		  ret = 0 ;
 
-	/*
-	 * The order here is important. We add ourselves to the
-	 * wait queues and mark ourselves sleeping _first_. That
-	 * way, if a "up()" comes in here, we'll either get
-	 * woken up (up happens after the wait queues are set up)
-	 * OR we'll have "waiting > 0".
-	 */
-	tsk->state = TASK_UNINTERRUPTIBLE;
+	tsk->state = task_state;
 	add_wait_queue(&sem->wait, &wait);
-	atomic_inc(&sem->waiting);
 
 	/*
-	 * Ok, we're set up. The only race here is really that
-	 * an "up()" might have incremented count before we got
-	 * here, so we check "count+waiting". If that is larger
-	 * than zero, we shouldn't sleep, but re-try the lock.
+	 * Ok, we're set up.  sem->count is known to be less than zero
+	 * so we must wait.
+	 *
+	 * We can let go the lock for purposes of waiting.
+	 * We re-acquire it after awaking so as to protect
+	 * all semaphore operations.
+	 *
+	 * If "up()" is called before we call waking_non_zero() then
+	 * we will catch it right away.  If it is called later then
+	 * we will have to go through a wakeup cycle to catch it.
+	 *
+	 * Multiple waiters contend for the semaphore lock to see
+	 * who gets to gate through and who has to wait some more.
 	 */
-	if (sem->count+sem->waiting <= 0) {
-		/*
-		 * If "count+waiting" <= 0, we have to wait
-		 * for a up(), which will normalize the count.
-		 * Remember, at this point we have decremented
-		 * count, and incremented up, so if count is
-		 * zero or positive we need to return to re-try
-		 * the lock.  It _may_ be that both count and
-		 * waiting is zero and that it is still locked,
-		 * but we still want to re-try the lock in that
-		 * case to make count go negative again so that
-		 * the optimized "up()" wake_up sequence works.
-		 */
-		do {
-			schedule();
-			tsk->state = TASK_UNINTERRUPTIBLE;
-		} while (sem->count < 0);
+	for (;;)
+	{
+		if (waking_non_zero(sem))	/* are we waking up?  */
+		    break ;			/* yes, exit loop */
+
+		if (   task_state == TASK_INTERRUPTIBLE
+		    && (tsk->signal & ~tsk->blocked)	/* signalled */
+		   )
+		{
+		    ret = -EINTR ;		/* interrupted */
+		    atomic_inc(&sem->count) ;	/* give up on down operation */
+		    break ;
+		}
+
+		schedule();
+		tsk->state = task_state;
 	}
+
 	tsk->state = TASK_RUNNING;
 	remove_wait_queue(&sem->wait, &wait);
-	normalize_semaphore(sem);
+	return(ret) ;
+
+} /* __do_down */
+
+void __down(struct semaphore * sem)
+{
+	__do_down(sem,TASK_UNINTERRUPTIBLE) ; 
+}
+
+int __down_interruptible(struct semaphore * sem)
+{
+	return(__do_down(sem,TASK_INTERRUPTIBLE)) ; 
 }
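
For reference, a typical caller of the two entry points looks roughly like the sketch below. The mydev_* names are made up for illustration, and the MUTEX initializer is the usual 2.0-era way of declaring an initially free semaphore; this is a rough sketch, not code from this patch.

	#include <linux/errno.h>
	#include <asm/semaphore.h>

	static struct semaphore mydev_sem = MUTEX;	/* starts out free */

	static int mydev_ioctl_sketch(void)
	{
		/* Interruptible wait: a pending signal aborts the wait with -EINTR. */
		if (down_interruptible(&mydev_sem))
			return -EINTR;
		/* ... critical section ... */
		up(&mydev_sem);
		return 0;
	}

	static void mydev_release_sketch(void)
	{
		down(&mydev_sem);	/* sleeps, uninterruptibly, until an up() */
		/* ... critical section ... */
		up(&mydev_sem);
	}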
 
+
 static inline void __sleep_on(struct wait_queue **p, int state)
 {
 	unsigned long flags;
@@ -596,70 +645,170 @@
 	__sleep_on(p,TASK_UNINTERRUPTIBLE);
 }
 
-/*
- * The head for the timer-list has a "expires" field of MAX_UINT,
- * and the sorting routine counts on this..
- */
-static struct timer_list timer_head = { &timer_head, &timer_head, ~0, 0, NULL };
+#define TVN_BITS 6
+#define TVR_BITS 8
+#define TVN_SIZE (1 << TVN_BITS)
+#define TVR_SIZE (1 << TVR_BITS)
+#define TVN_MASK (TVN_SIZE - 1)
+#define TVR_MASK (TVR_SIZE - 1)
+
 #define SLOW_BUT_DEBUGGING_TIMERS 0
 
-void add_timer(struct timer_list * timer)
+struct timer_vec {
+        int index;
+        struct timer_list *vec[TVN_SIZE];
+};
+
+struct timer_vec_root {
+        int index;
+        struct timer_list *vec[TVR_SIZE];
+};
+
+static struct timer_vec tv5 = { 0 };
+static struct timer_vec tv4 = { 0 };
+static struct timer_vec tv3 = { 0 };
+static struct timer_vec tv2 = { 0 };
+static struct timer_vec_root tv1 = { 0 };
+
+static struct timer_vec * const tvecs[] = {
+	(struct timer_vec *)&tv1, &tv2, &tv3, &tv4, &tv5
+};
+
+#define NOOF_TVECS (sizeof(tvecs) / sizeof(tvecs[0]))
+
+static unsigned long timer_jiffies = 0;
+
+static inline void insert_timer(struct timer_list *timer,
+				struct timer_list **vec, int idx)
 {
-	unsigned long flags;
-	struct timer_list *p;
+	if ((timer->next = vec[idx]))
+		vec[idx]->prev = timer;
+	vec[idx] = timer;
+	timer->prev = (struct timer_list *)&vec[idx];
+}
 
-#if SLOW_BUT_DEBUGGING_TIMERS
-	if (timer->next || timer->prev) {
-		printk("add_timer() called with non-zero list from %p\n",
-			__builtin_return_address(0));
-		return;
+static inline void internal_add_timer(struct timer_list *timer)
+{
+	/*
+	 * must be cli-ed when calling this
+	 */
+	unsigned long expires = timer->expires;
+	unsigned long idx = expires - timer_jiffies;
+
+	if (idx < TVR_SIZE) {
+		int i = expires & TVR_MASK;
+		insert_timer(timer, tv1.vec, i);
+	} else if (idx < 1 << (TVR_BITS + TVN_BITS)) {
+		int i = (expires >> TVR_BITS) & TVN_MASK;
+		insert_timer(timer, tv2.vec, i);
+	} else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) {
+		int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK;
+		insert_timer(timer, tv3.vec, i);
+	} else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) {
+		int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK;
+		insert_timer(timer, tv4.vec, i);
+	} else if (expires < timer_jiffies) {
+		/* can happen if you add a timer with expires == jiffies,
+		 * or you set a timer to go off in the past
+		 */
+		insert_timer(timer, tv1.vec, tv1.index);
+	} else if (idx < 0xffffffffUL) {
+		int i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
+		insert_timer(timer, tv5.vec, i);
+	} else {
+		/* Can only get here on architectures with 64-bit jiffies */
+		timer->next = timer->prev = timer;
 	}
-#endif
-	p = &timer_head;
+}
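
The bucket arithmetic can be exercised on its own. The stand-alone sketch below (pick_level is a made-up name) reuses the same constants and the same idx = expires - timer_jiffies computation: tv1 holds the next 256 jiffies at one slot per jiffy, while tv2 through tv5 hold 64 progressively coarser slots of 2^8, 2^14, 2^20 and 2^26 jiffies each.

	#include <stdio.h>

	#define TVN_BITS 6
	#define TVR_BITS 8
	#define TVR_SIZE (1 << TVR_BITS)

	static const char *pick_level(unsigned long expires, unsigned long timer_jiffies)
	{
		unsigned long idx = expires - timer_jiffies;

		if (idx < TVR_SIZE)
			return "tv1 (one slot per jiffy)";
		if (idx < 1UL << (TVR_BITS + TVN_BITS))
			return "tv2 (slots of 2^8 jiffies)";
		if (idx < 1UL << (TVR_BITS + 2 * TVN_BITS))
			return "tv3 (slots of 2^14 jiffies)";
		if (idx < 1UL << (TVR_BITS + 3 * TVN_BITS))
			return "tv4 (slots of 2^20 jiffies)";
		if (expires < timer_jiffies)
			return "tv1, current slot (already expired)";
		return "tv5 (slots of 2^26 jiffies)";
	}

	int main(void)
	{
		unsigned long now = 1000;	/* pretend timer_jiffies == 1000 */

		printf("%s\n", pick_level(now + 10, now));	/* tv1 */
		printf("%s\n", pick_level(now + 300, now));	/* tv2 */
		printf("%s\n", pick_level(now + 100000, now));	/* tv3 */
		return 0;
	}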
+
+void add_timer(struct timer_list *timer)
+{
+	unsigned long flags;
 	save_flags(flags);
 	cli();
-	do {
-		p = p->next;
-	} while (timer->expires > p->expires);
-	timer->next = p;
-	timer->prev = p->prev;
-	p->prev = timer;
-	timer->prev->next = timer;
+#if SLOW_BUT_DEBUGGING_TIMERS
+        if (timer->next || timer->prev) {
+                printk("add_timer() called with non-zero list from %p\n",
+		       __builtin_return_address(0));
+		goto out;
+        }
+#endif
+	internal_add_timer(timer);
+#if SLOW_BUT_DEBUGGING_TIMERS
+out:
+#endif
 	restore_flags(flags);
 }
 
-int del_timer(struct timer_list * timer)
+static inline int detach_timer(struct timer_list *timer)
 {
 	int ret = 0;
-	if (timer->next) {
-		unsigned long flags;
-		struct timer_list * next;
-		save_flags(flags);
-		cli();
-		if ((next = timer->next) != NULL) {
-			(next->prev = timer->prev)->next = next;
-			timer->next = timer->prev = NULL;
-			ret = 1;
-		}
-		restore_flags(flags);
+	struct timer_list *next, *prev;
+	next = timer->next;
+	prev = timer->prev;
+	if (next) {
+		next->prev = prev;
+	}
+	if (prev) {
+		ret = 1;
+		prev->next = next;
 	}
 	return ret;
 }
 
-static inline void run_timer_list(void)
+
+int del_timer(struct timer_list * timer)
 {
-	struct timer_list * timer;
+	int ret;
+	unsigned long flags;
+	save_flags(flags);
+	cli();
+	ret = detach_timer(timer);
+	timer->next = timer->prev = 0;
+	restore_flags(flags);
+	return ret;
+}
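
For callers, the return value matters: del_timer() returns 1 if the timer was still pending when it was removed and 0 if it had already run or was never added, so it doubles as a "did we race with the timeout?" test. A minimal sketch, with the made-up names my_timer and stop_my_timeout:

	#include <linux/timer.h>
	#include <linux/kernel.h>

	static struct timer_list my_timer;

	static void stop_my_timeout(void)
	{
		if (!del_timer(&my_timer))
			printk("my_timer had already fired or was never started\n");
	}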
 
+static inline void cascade_timers(struct timer_vec *tv)
+{
+        /* cascade all the timers from tv up one level */
+        struct timer_list *timer;
+        timer = tv->vec[tv->index];
+        /*
+         * We are removing _all_ timers from the list, so we don't  have to
+         * detach them individually, just clear the list afterwards.
+         */
+        while (timer) {
+                struct timer_list *tmp = timer;
+                timer = timer->next;
+                internal_add_timer(tmp);
+        }
+        tv->vec[tv->index] = NULL;
+        tv->index = (tv->index + 1) & TVN_MASK;
+}
+
+static inline void run_timer_list(void)
+{
 	cli();
-	while ((timer = timer_head.next) != &timer_head && timer->expires <= jiffies) {
-		void (*fn)(unsigned long) = timer->function;
-		unsigned long data = timer->data;
-		timer->next->prev = timer->prev;
-		timer->prev->next = timer->next;
-		timer->next = timer->prev = NULL;
-		sti();
-		fn(data);
-		cli();
+	while ((long)(jiffies - timer_jiffies) >= 0) {
+		struct timer_list *timer;
+		if (!tv1.index) {
+			int n = 1;
+			do {
+				cascade_timers(tvecs[n]);
+			} while (tvecs[n]->index == 1 && ++n < NOOF_TVECS);
+		}
+		while ((timer = tv1.vec[tv1.index])) {
+			void (*fn)(unsigned long) = timer->function;
+			unsigned long data = timer->data;
+			detach_timer(timer);
+			timer->next = timer->prev = NULL;
+			sti();
+			fn(data);
+			cli();
+		}
+		++timer_jiffies; 
+		tv1.index = (tv1.index + 1) & TVR_MASK;
 	}
 	sti();
 }
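
To put concrete numbers on the loop above (assuming the usual HZ=100 clock tick): tv1 covers the next 256 jiffies, about 2.5 seconds, at one slot per jiffy. run_timer_list() advances timer_jiffies one jiffy at a time, and whenever tv1.index wraps back to 0, once every 256 jiffies, cascade_timers(&tv2) empties tv2's current slot; its 256-jiffy span is redistributed into exact tv1 slots by internal_add_timer(). When tv2's own index wraps, every 2^14 = 16384 jiffies (roughly 2.7 minutes), tv3 cascades one slot down in the same way, then tv4 every 2^20 jiffies (about 2.9 hours) and tv5 every 2^26 jiffies (close to 8 days), with tv5's 64 slots covering the rest of the 32-bit jiffies range. The result is that add_timer() and del_timer() are O(1), and the sorting work the old single ordered list did on every insertion is spread over these occasional cascades.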
@@ -1255,8 +1404,7 @@
 	if (p->next_run)
 		move_last_runqueue(p);
 	sti();
-	schedule();
-
+	need_resched = 1;
 	return 0;
 }
 
@@ -1313,6 +1461,7 @@
 	cli();
 	move_last_runqueue(current);
 	sti();
+	need_resched = 1;
 	return 0;
 }
 
