patch-2.3.9 linux/fs/buffer.c

diff -u --recursive --new-file v2.3.8/linux/fs/buffer.c linux/fs/buffer.c
@@ -24,6 +24,8 @@
  * - RMK
  */
 
+/* Thread it... -DaveM */
+
 #include <linux/sched.h>
 #include <linux/fs.h>
 #include <linux/malloc.h>
@@ -57,31 +59,39 @@
 #define MAX_UNUSED_BUFFERS NR_RESERVED+20 /* don't ever have more than this 
 					     number of unused buffer heads */
 
-/*
- * Hash table mask..
+/* Anti-deadlock ordering:
+ *	lru_list_lock > hash_table_lock > free_list_lock > unused_list_lock
  */
-static unsigned long bh_hash_mask = 0;
-
-static int grow_buffers(int size);
 
-static struct buffer_head ** hash_table;
-static struct buffer_head * lru_list[NR_LIST] = {NULL, };
-static struct buffer_head * free_list[NR_SIZES] = {NULL, };
+/*
+ * Hash table gook..
+ */
+static unsigned int bh_hash_mask = 0;
+static unsigned int bh_hash_shift = 0;
+static struct buffer_head **hash_table;
+static rwlock_t hash_table_lock = RW_LOCK_UNLOCKED;
 
-static kmem_cache_t *bh_cachep;
+static struct buffer_head *lru_list[NR_LIST];
+static spinlock_t lru_list_lock = SPIN_LOCK_UNLOCKED;
+static int nr_buffers_type[NR_LIST] = {0,};
 
 static struct buffer_head * unused_list = NULL;
-static struct buffer_head * reuse_list = NULL;
+static int nr_unused_buffer_heads = 0;
+static spinlock_t unused_list_lock = SPIN_LOCK_UNLOCKED;
 static DECLARE_WAIT_QUEUE_HEAD(buffer_wait);
 
-static int nr_buffers = 0;
-static int nr_buffers_type[NR_LIST] = {0,};
-static int nr_buffer_heads = 0;
-static int nr_unused_buffer_heads = 0;
-static int nr_hashed_buffers = 0;
+struct bh_free_head {
+	struct buffer_head *list;
+	spinlock_t lock;
+};
+static struct bh_free_head free_list[NR_SIZES];
+
+static kmem_cache_t *bh_cachep;
+
+static int grow_buffers(int size);
 
 /* This is used by some architectures to estimate available memory. */
-int buffermem = 0;
+atomic_t buffermem = ATOMIC_INIT(0);
 
 /* Here is the parameter block for the bdflush process. If you add or
  * remove any of the parameters, make sure to update kernel/sysctl.c.
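
The lock-ordering comment above (lru_list_lock > hash_table_lock > free_list_lock > unused_list_lock) is the rule the rest of the patch is built around. A minimal sketch of the nesting it prescribes, using the identifiers introduced in this patch (kernel context assumed; this fragment is illustrative, not part of the patch):

	spin_lock(&lru_list_lock);		/* outermost */
	write_lock(&hash_table_lock);
	/* ... relink the buffer_head on its hash chain and LRU list ... */
	write_unlock(&hash_table_lock);
	spin_unlock(&lru_list_lock);

insert_into_queues() and try_to_free_buffers() below follow exactly this nesting; free_list[].lock and unused_list_lock, where needed, are taken innermost.
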
@@ -131,7 +141,7 @@
 	struct task_struct *tsk = current;
 	DECLARE_WAITQUEUE(wait, tsk);
 
-	bh->b_count++;
+	atomic_inc(&bh->b_count);
 	add_wait_queue(&bh->b_wait, &wait);
 repeat:
 	tsk->state = TASK_UNINTERRUPTIBLE;
@@ -142,7 +152,7 @@
 	}
 	tsk->state = TASK_RUNNING;
 	remove_wait_queue(&bh->b_wait, &wait);
-	bh->b_count--;
+	atomic_dec(&bh->b_count);
 }
 
 /* Call sync_buffers with wait!=0 to ensure that the call does not
@@ -167,17 +177,19 @@
 	 */
 	do {
 		retry = 0;
-repeat:
+
 		/* We search all lists as a failsafe mechanism, not because we expect
 		 * there to be dirty buffers on any of the other lists.
 		 */
+repeat:
+		spin_lock(&lru_list_lock);
 		bh = lru_list[BUF_DIRTY];
 		if (!bh)
 			goto repeat2;
+
 		for (i = nr_buffers_type[BUF_DIRTY]*2 ; i-- > 0 ; bh = next) {
-			if (bh->b_list != BUF_DIRTY)
-				goto repeat;
 			next = bh->b_next_free;
+
 			if (!lru_list[BUF_DIRTY])
 				break;
 			if (dev && bh->b_dev != dev)
@@ -190,7 +202,10 @@
 					retry = 1;
 					continue;
 				}
+				atomic_inc(&bh->b_count);
+				spin_unlock(&lru_list_lock);
 				wait_on_buffer (bh);
+				atomic_dec(&bh->b_count);
 				goto repeat;
 			}
 
@@ -209,30 +224,24 @@
 			if (!buffer_dirty(bh) || pass >= 2)
 				continue;
 
-			/* Don't bother about locked buffers.
-			 *
-			 * XXX We checked if it was locked above and there is no
-			 * XXX way we could have slept in between. -DaveM
-			 */
-			if (buffer_locked(bh))
-				continue;
-			bh->b_count++;
-			next->b_count++;
+			atomic_inc(&bh->b_count);
 			bh->b_flushtime = 0;
+			spin_unlock(&lru_list_lock);
 			ll_rw_block(WRITE, 1, &bh);
-			bh->b_count--;
-			next->b_count--;
+			atomic_dec(&bh->b_count);
 			retry = 1;
+			goto repeat;
 		}
 
     repeat2:
 		bh = lru_list[BUF_LOCKED];
-		if (!bh)
+		if (!bh) {
+			spin_unlock(&lru_list_lock);
 			break;
+		}
 		for (i = nr_buffers_type[BUF_LOCKED]*2 ; i-- > 0 ; bh = next) {
-			if (bh->b_list != BUF_LOCKED)
-				goto repeat2;
 			next = bh->b_next_free;
+
 			if (!lru_list[BUF_LOCKED])
 				break;
 			if (dev && bh->b_dev != dev)
@@ -245,10 +254,15 @@
 					retry = 1;
 					continue;
 				}
+				atomic_inc(&bh->b_count);
+				spin_unlock(&lru_list_lock);
 				wait_on_buffer (bh);
+				spin_lock(&lru_list_lock);
+				atomic_dec(&bh->b_count);
 				goto repeat2;
 			}
 		}
+		spin_unlock(&lru_list_lock);
 
 		/* If we are waiting for the sync to succeed, and if any dirty
 		 * blocks were written, then repeat; on the second pass, only
@@ -282,17 +296,19 @@
 int fsync_dev(kdev_t dev)
 {
 	sync_buffers(dev, 0);
+
+	lock_kernel();
 	sync_supers(dev);
 	sync_inodes(dev);
 	DQUOT_SYNC(dev);
+	unlock_kernel();
+
 	return sync_buffers(dev, 1);
 }
 
 asmlinkage int sys_sync(void)
 {
-	lock_kernel();
 	fsync_dev(0);
-	unlock_kernel();
 	return 0;
 }
 
@@ -396,19 +412,28 @@
 
 void invalidate_buffers(kdev_t dev)
 {
-	int i;
 	int nlist;
-	struct buffer_head * bh;
 
+	spin_lock(&lru_list_lock);
 	for(nlist = 0; nlist < NR_LIST; nlist++) {
+		struct buffer_head * bh;
+		int i;
+	retry:
 		bh = lru_list[nlist];
+		if (!bh)
+			continue;
 		for (i = nr_buffers_type[nlist]*2 ; --i > 0 ; bh = bh->b_next_free) {
 			if (bh->b_dev != dev)
 				continue;
-			wait_on_buffer(bh);
-			if (bh->b_dev != dev)
-				continue;
-			if (bh->b_count)
+			if (buffer_locked(bh)) {
+				atomic_inc(&bh->b_count);
+				spin_unlock(&lru_list_lock);
+				wait_on_buffer(bh);
+				spin_lock(&lru_list_lock);
+				atomic_dec(&bh->b_count);
+				goto retry;
+			}
+			if (atomic_read(&bh->b_count))
 				continue;
 			bh->b_flushtime = 0;
 			clear_bit(BH_Protected, &bh->b_state);
@@ -417,157 +442,119 @@
 			clear_bit(BH_Req, &bh->b_state);
 		}
 	}
+	spin_unlock(&lru_list_lock);
 }
 
-#define _hashfn(dev,block) (((unsigned)(HASHDEV(dev)^block)) & bh_hash_mask)
-#define hash(dev,block) hash_table[_hashfn(dev,block)]
+/* After several hours of tedious analysis, the following hash
+ * function won.  Do not mess with it... -DaveM
+ */
+#define _hashfn(dev,block)	\
+	((((dev)<<(bh_hash_shift - 6)) ^ ((dev)<<(bh_hash_shift - 9))) ^ \
+	 (((block)<<(bh_hash_shift - 6)) ^ ((block) >> 13) ^ ((block) << (bh_hash_shift - 12))))
+#define hash(dev,block) hash_table[(_hashfn(dev,block) & bh_hash_mask)]
 
-static void insert_into_hash_list(struct buffer_head * bh)
+static __inline__ void __hash_link(struct buffer_head *bh, struct buffer_head **head)
 {
-	bh->b_next = NULL;
-	bh->b_pprev = NULL;
-	if (bh->b_dev) {
-		struct buffer_head **bhp = &hash(bh->b_dev, bh->b_blocknr);
-		struct buffer_head *next = *bhp;
-
-		if (next) {
-			bh->b_next = next;
-			next->b_pprev = &bh->b_next;
-		}
-		*bhp = bh;
-		bh->b_pprev = bhp;
-		nr_hashed_buffers++;
-	}
+	if ((bh->b_next = *head) != NULL)
+		bh->b_next->b_pprev = &bh->b_next;
+	*head = bh;
+	bh->b_pprev = head;
 }
 
-static void remove_from_hash_queue(struct buffer_head * bh)
+static __inline__ void __hash_unlink(struct buffer_head *bh)
 {
-	struct buffer_head **pprev = bh->b_pprev;
-	if (pprev) {
-		struct buffer_head * next = bh->b_next;
-		if (next) {
-			next->b_pprev = pprev;
-			bh->b_next = NULL;
-		}
-		*pprev = next;
-		bh->b_pprev = NULL;
-		nr_hashed_buffers--;
-	}
+	if (bh->b_next)
+		bh->b_next->b_pprev = bh->b_pprev;
+	*(bh->b_pprev) = bh->b_next;
+	bh->b_pprev = NULL;
 }
 
-static void insert_into_lru_list(struct buffer_head * bh)
+static void __insert_into_lru_list(struct buffer_head * bh, int blist)
 {
-	struct buffer_head **bhp = &lru_list[bh->b_list];
-
-	if (bh->b_dev == B_FREE)
-		BUG();
+	struct buffer_head **bhp = &lru_list[blist];
 
 	if(!*bhp) {
 		*bhp = bh;
 		bh->b_prev_free = bh;
 	}
-
-	if (bh->b_next_free)
-		panic("VFS: buffer LRU pointers corrupted");
-
 	bh->b_next_free = *bhp;
 	bh->b_prev_free = (*bhp)->b_prev_free;
 	(*bhp)->b_prev_free->b_next_free = bh;
 	(*bhp)->b_prev_free = bh;
-
-	nr_buffers++;
-	nr_buffers_type[bh->b_list]++;
+	nr_buffers_type[blist]++;
 }
 
-static void remove_from_lru_list(struct buffer_head * bh)
+static void __remove_from_lru_list(struct buffer_head * bh, int blist)
 {
-	if (!(bh->b_prev_free) || !(bh->b_next_free))
-		return;
-
-	if (bh->b_dev == B_FREE) {
-		printk("LRU list corrupted");
-		*(int*)0 = 0;
-	}
-	bh->b_prev_free->b_next_free = bh->b_next_free;
-	bh->b_next_free->b_prev_free = bh->b_prev_free;
-
-	if (lru_list[bh->b_list] == bh)
-		 lru_list[bh->b_list] = bh->b_next_free;
-	if (lru_list[bh->b_list] == bh)
-		 lru_list[bh->b_list] = NULL;
-	bh->b_next_free = bh->b_prev_free = NULL;
-
-	nr_buffers--;
-	nr_buffers_type[bh->b_list]--;
+	if (bh->b_prev_free || bh->b_next_free) {
+		bh->b_prev_free->b_next_free = bh->b_next_free;
+		bh->b_next_free->b_prev_free = bh->b_prev_free;
+		if (lru_list[blist] == bh)
+			lru_list[blist] = bh->b_next_free;
+		if (lru_list[blist] == bh)
+			lru_list[blist] = NULL;
+		bh->b_next_free = bh->b_prev_free = NULL;
+		nr_buffers_type[blist]--;
+	}
 }
 
-static void remove_from_free_list(struct buffer_head * bh)
+static void __remove_from_free_list(struct buffer_head * bh, int index)
 {
-	int isize = BUFSIZE_INDEX(bh->b_size);
-	if (!(bh->b_prev_free) || !(bh->b_next_free))
-		panic("VFS: Free block list corrupted");
-	if(bh->b_dev != B_FREE)
-		panic("Free list corrupted");
-	if(!free_list[isize])
-		panic("Free list empty");
 	if(bh->b_next_free == bh)
-		 free_list[isize] = NULL;
+		 free_list[index].list = NULL;
 	else {
 		bh->b_prev_free->b_next_free = bh->b_next_free;
 		bh->b_next_free->b_prev_free = bh->b_prev_free;
-		if (free_list[isize] == bh)
-			 free_list[isize] = bh->b_next_free;
+		if (free_list[index].list == bh)
+			 free_list[index].list = bh->b_next_free;
 	}
 	bh->b_next_free = bh->b_prev_free = NULL;
 }
 
-static void remove_from_queues(struct buffer_head * bh)
+/* The following two functions must operate atomically
+ * because they control the visibility of a buffer head
+ * to the rest of the kernel.
+ */
+static __inline__ void __remove_from_queues(struct buffer_head *bh)
 {
-	if (bh->b_dev == B_FREE)
-		BUG();
-	remove_from_hash_queue(bh);
-	remove_from_lru_list(bh);
+	write_lock(&hash_table_lock);
+	if (bh->b_pprev)
+		__hash_unlink(bh);
+	__remove_from_lru_list(bh, bh->b_list);
+	write_unlock(&hash_table_lock);
 }
 
-static void put_last_free(struct buffer_head * bh)
+static void insert_into_queues(struct buffer_head *bh)
 {
-	if (bh) {
-		struct buffer_head **bhp = &free_list[BUFSIZE_INDEX(bh->b_size)];
+	struct buffer_head **head = &hash(bh->b_dev, bh->b_blocknr);
 
-		if (bh->b_count)
-			BUG();
-
-		bh->b_dev = B_FREE;  /* So it is obvious we are on the free list. */
-
-		/* Add to back of free list. */
-		if(!*bhp) {
-			*bhp = bh;
-			bh->b_prev_free = bh;
-		}
-
-		bh->b_next_free = *bhp;
-		bh->b_prev_free = (*bhp)->b_prev_free;
-		(*bhp)->b_prev_free->b_next_free = bh;
-		(*bhp)->b_prev_free = bh;
-	}
+	spin_lock(&lru_list_lock);
+	write_lock(&hash_table_lock);
+	__hash_link(bh, head);
+	__insert_into_lru_list(bh, bh->b_list);
+	write_unlock(&hash_table_lock);
+	spin_unlock(&lru_list_lock);
 }
 
-struct buffer_head * find_buffer(kdev_t dev, int block, int size)
-{		
-	struct buffer_head * next;
+/* This function must only run if there are no other
+ * references _anywhere_ to this buffer head.
+ */
+static void put_last_free(struct buffer_head * bh)
+{
+	struct bh_free_head *head = &free_list[BUFSIZE_INDEX(bh->b_size)];
+	struct buffer_head **bhp = &head->list;
 
-	next = hash(dev,block);
-	for (;;) {
-		struct buffer_head *tmp = next;
-		if (!next)
-			break;
-		next = tmp->b_next;
-		if (tmp->b_blocknr != block || tmp->b_size != size || tmp->b_dev != dev)
-			continue;
-		next = tmp;
-		break;
+	spin_lock(&head->lock);
+	bh->b_dev = B_FREE;
+	if(!*bhp) {
+		*bhp = bh;
+		bh->b_prev_free = bh;
 	}
-	return next;
+	bh->b_next_free = *bhp;
+	bh->b_prev_free = (*bhp)->b_prev_free;
+	(*bhp)->b_prev_free->b_next_free = bh;
+	(*bhp)->b_prev_free = bh;
+	spin_unlock(&head->lock);
 }
 
 /*
@@ -579,10 +566,19 @@
  */
 struct buffer_head * get_hash_table(kdev_t dev, int block, int size)
 {
-	struct buffer_head * bh;
-	bh = find_buffer(dev,block,size);
+	struct buffer_head **head = &hash(dev, block);
+	struct buffer_head *bh;
+
+	read_lock(&hash_table_lock);
+	for(bh = *head; bh; bh = bh->b_next)
+		if (bh->b_blocknr == block	&&
+		    bh->b_size    == size	&&
+		    bh->b_dev     == dev)
+			break;
 	if (bh)
-		bh->b_count++;
+		atomic_inc(&bh->b_count);
+	read_unlock(&hash_table_lock);
+
 	return bh;
 }
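
Since get_hash_table() now takes hash_table_lock and grabs the reference itself, callers never see the locking; the usual calling pattern stays the same (a sketch, not taken from this patch):

	struct buffer_head *bh = get_hash_table(dev, block, size);
	if (bh) {
		/* ... b_count pins the buffer for us ... */
		brelse(bh);	/* drop the reference get_hash_table() took */
	}
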
 
@@ -631,6 +627,8 @@
 	 * around on the free list, and we can get in a loop if we are not careful.
 	 */
 	for(nlist = 0; nlist < NR_LIST; nlist++) {
+	repeat:
+		spin_lock(&lru_list_lock);
 		bh = lru_list[nlist];
 		for (i = nr_buffers_type[nlist]*2 ; --i > 0 ; bh = bhnext) {
 			if(!bh)
@@ -641,21 +639,25 @@
 				 continue;
 			if (bh->b_size == size)
 				 continue;
-			bhnext->b_count++;
-			bh->b_count++;
-			wait_on_buffer(bh);
-			bhnext->b_count--;
+			if (buffer_locked(bh)) {
+				atomic_inc(&bh->b_count);
+				spin_unlock(&lru_list_lock);
+				wait_on_buffer(bh);
+				atomic_dec(&bh->b_count);
+				goto repeat;
+			}
 			if (bh->b_dev == dev && bh->b_size != size) {
 				clear_bit(BH_Dirty, &bh->b_state);
 				clear_bit(BH_Uptodate, &bh->b_state);
 				clear_bit(BH_Req, &bh->b_state);
 				bh->b_flushtime = 0;
 			}
-			if (--bh->b_count)
-				continue;
-			remove_from_queues(bh);
-			put_last_free(bh);
+			if (atomic_read(&bh->b_count) == 0) {
+				__remove_from_queues(bh);
+				put_last_free(bh);
+			}
 		}
+		spin_unlock(&lru_list_lock);
 	}
 }
 
@@ -671,13 +673,10 @@
 	}
 }
 
-void init_buffer(struct buffer_head *bh, kdev_t dev, int block,
-		 bh_end_io_t *handler, void *dev_id)
+void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *dev_id)
 {
 	bh->b_list = BUF_CLEAN;
 	bh->b_flushtime = 0;
-	bh->b_dev = dev;
-	bh->b_blocknr = block;
 	bh->b_end_io = handler;
 	bh->b_dev_id = dev_id;
 }
@@ -688,6 +687,92 @@
 	unlock_buffer(bh);
 }
 
+static void end_buffer_io_bad(struct buffer_head *bh, int uptodate)
+{
+	mark_buffer_uptodate(bh, uptodate);
+	unlock_buffer(bh);
+	BUG();
+}
+
+static void end_buffer_io_async(struct buffer_head * bh, int uptodate)
+{
+	static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
+	unsigned long flags;
+	struct buffer_head *tmp;
+	struct page *page;
+	int free;
+
+	mark_buffer_uptodate(bh, uptodate);
+
+	/* This is a temporary buffer used for page I/O. */
+	page = mem_map + MAP_NR(bh->b_data);
+
+	if (!uptodate)
+		SetPageError(page);
+
+	/*
+	 * Be _very_ careful from here on. Bad things can happen if
+	 * two buffer heads end IO at almost the same time and both
+	 * decide that the page is now completely done.
+	 *
+	 * Async buffer_heads are here only as labels for IO, and get
+	 * thrown away once the IO for this page is complete.  IO is
+	 * deemed complete once all buffers have been visited
+	 * (b_count==0) and are now unlocked. We must make sure that
+	 * only the _last_ buffer that decrements its count is the one
+	 * that free's the page..
+	 */
+	spin_lock_irqsave(&page_uptodate_lock, flags);
+	unlock_buffer(bh);
+	atomic_dec(&bh->b_count);
+	tmp = bh->b_this_page;
+	while (tmp != bh) {
+		if (atomic_read(&tmp->b_count) &&
+		    (tmp->b_end_io == end_buffer_io_async))
+			goto still_busy;
+		tmp = tmp->b_this_page;
+	}
+
+	/* OK, the async IO on this page is complete. */
+	spin_unlock_irqrestore(&page_uptodate_lock, flags);
+
+	/*
+	 * if none of the buffers had errors then we can set the
+	 * page uptodate:
+	 */
+	if (!PageError(page))
+		SetPageUptodate(page);
+
+	/*
+	 * Run the hooks that have to be done when a page I/O has completed.
+	 *
+	 * Note - we need to test the flags before we unlock the page, but
+	 * we must not actually free the page until after the unlock!
+	 */
+	if (test_and_clear_bit(PG_decr_after, &page->flags))
+		atomic_dec(&nr_async_pages);
+
+	if (test_and_clear_bit(PG_free_swap_after, &page->flags))
+		swap_free(page->offset);
+
+	free = test_and_clear_bit(PG_free_after, &page->flags);
+
+	if (page->owner != -1)
+		PAGE_BUG(page);
+	page->owner = (int)current;
+	UnlockPage(page);
+
+	if (free)
+		__free_page(page);
+
+	return;
+
+still_busy:
+	spin_unlock_irqrestore(&page_uptodate_lock, flags);
+	return;
+}
+
+
 /*
  * Ok, this is getblk, and it isn't very clear, again to hinder
  * race-conditions. Most of the code is seldom used, (ie repeating),
@@ -713,22 +798,26 @@
 	}
 
 	isize = BUFSIZE_INDEX(size);
-get_free:
-	bh = free_list[isize];
+	spin_lock(&free_list[isize].lock);
+	bh = free_list[isize].list;
+	if (bh) {
+		__remove_from_free_list(bh, isize);
+		atomic_set(&bh->b_count, 1);
+	}
+	spin_unlock(&free_list[isize].lock);
 	if (!bh)
 		goto refill;
-	remove_from_free_list(bh);
 
 	/* OK, FINALLY we know that this buffer is the only one of its kind,
-	 * and that it's unused (b_count=0), unlocked, and clean.
+	 * we hold a reference (b_count>0), it is unlocked, and it is clean.
 	 */
-	init_buffer(bh, dev, block, end_buffer_io_sync, NULL);
-	bh->b_count = 1;
-	bh->b_state = 0;
+	init_buffer(bh, end_buffer_io_sync, NULL);
+	bh->b_dev = dev;
+	bh->b_blocknr = block;
+	bh->b_state = 1 << BH_Mapped;
 
 	/* Insert the buffer into the regular lists */
-	insert_into_lru_list(bh);
-	insert_into_hash_list(bh);
+	insert_into_queues(bh);
 	goto out;
 
 	/*
@@ -737,24 +826,12 @@
 	 */
 refill:
 	refill_freelist(size);
-	if (!find_buffer(dev,block,size))
-		goto get_free;
 	goto repeat;
 out:
 	return bh;
 }
 
 /*
- * Put a buffer into the appropriate list, without side-effects.
- */
-static void file_buffer(struct buffer_head *bh, int list)
-{
-	remove_from_lru_list(bh);
-	bh->b_list = list;
-	insert_into_lru_list(bh);
-}
-
-/*
  * if a new dirty buffer is created we need to balance bdflush.
  *
  * in the future we might want to make bdflush aware of different
@@ -783,6 +860,7 @@
 static inline void __mark_dirty(struct buffer_head *bh, int flag)
 {
 	bh->b_flushtime = jiffies + (flag ? bdf_prm.b_un.age_super : bdf_prm.b_un.age_buffer);
+	clear_bit(BH_New, &bh->b_state);
 	refile_buffer(bh);
 }
 
@@ -791,34 +869,29 @@
 	__mark_dirty(bh, flag);
 }
 
-void __atomic_mark_buffer_dirty(struct buffer_head *bh, int flag)
-{
-	lock_kernel();
-	__mark_dirty(bh, flag);
-	unlock_kernel();
-}
-
 /*
  * A buffer may need to be moved from one buffer list to another
  * (e.g. in case it is not shared any more). Handle this.
  */
-void refile_buffer(struct buffer_head * buf)
+static __inline__ void __refile_buffer(struct buffer_head *bh)
 {
-	int dispose;
-
-	if (buf->b_dev == B_FREE) {
-		printk("Attempt to refile free buffer\n");
-		return;
-	}
-
-	dispose = BUF_CLEAN;
-	if (buffer_locked(buf))
+	int dispose = BUF_CLEAN;
+	if (buffer_locked(bh))
 		dispose = BUF_LOCKED;
-	if (buffer_dirty(buf))
+	if (buffer_dirty(bh))
 		dispose = BUF_DIRTY;
+	if (dispose != bh->b_list) {
+		__remove_from_lru_list(bh, bh->b_list);
+		bh->b_list = dispose;
+		__insert_into_lru_list(bh, dispose);
+	}
+}
 
-	if (dispose != buf->b_list)
-		file_buffer(buf, dispose);
+void refile_buffer(struct buffer_head *bh)
+{
+	spin_lock(&lru_list_lock);
+	__refile_buffer(bh);
+	spin_unlock(&lru_list_lock);
 }
 
 /*
@@ -828,9 +901,8 @@
 {
 	touch_buffer(buf);
 
-	if (buf->b_count) {
-		buf->b_count--;
-		wake_up(&buffer_wait);
+	if (atomic_read(&buf->b_count)) {
+		atomic_dec(&buf->b_count);
 		return;
 	}
 	printk("VFS: brelse: Trying to free free buffer\n");
@@ -844,14 +916,21 @@
  */
 void __bforget(struct buffer_head * buf)
 {
-	if (buf->b_count != 1 || buffer_locked(buf)) {
-		__brelse(buf);
-		return;
+	spin_lock(&lru_list_lock);
+	write_lock(&hash_table_lock);
+	if (atomic_read(&buf->b_count) != 1 || buffer_locked(buf)) {
+		touch_buffer(buf);
+		atomic_dec(&buf->b_count);
+	} else {
+		atomic_set(&buf->b_count, 0);
+		buf->b_state = 0;
+		if (buf->b_pprev)
+			__hash_unlink(buf);
+		__remove_from_lru_list(buf, buf->b_list);
+		put_last_free(buf);
 	}
-	buf->b_count = 0;
-	buf->b_state = 0;
-	remove_from_queues(buf);
-	put_last_free(buf);
+	write_unlock(&hash_table_lock);
+	spin_unlock(&lru_list_lock);
 }
 
 /*
@@ -941,49 +1020,25 @@
 /*
  * Note: the caller should wake up the buffer_wait list if needed.
  */
-static void put_unused_buffer_head(struct buffer_head * bh)
+static __inline__ void __put_unused_buffer_head(struct buffer_head * bh)
 {
 	if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) {
-		nr_buffer_heads--;
 		kmem_cache_free(bh_cachep, bh);
-		return;
+	} else {
+		bh->b_blocknr = -1;
+		init_waitqueue_head(&bh->b_wait);
+		nr_unused_buffer_heads++;
+		bh->b_next_free = unused_list;
+		bh->b_this_page = NULL;
+		unused_list = bh;
 	}
-
-//	memset(bh, 0, sizeof(*bh));
-	bh->b_blocknr = -1;
-	init_waitqueue_head(&bh->b_wait);
-	nr_unused_buffer_heads++;
-	bh->b_next_free = unused_list;
-	unused_list = bh;
 }
 
-/* 
- * We can't put completed temporary IO buffer_heads directly onto the
- * unused_list when they become unlocked, since the device driver
- * end_request routines still expect access to the buffer_head's
- * fields after the final unlock.  So, the device driver puts them on
- * the reuse_list instead once IO completes, and we recover these to
- * the unused_list here.
- *
- * Note that we don't do a wakeup here, but return a flag indicating
- * whether we got any buffer heads. A task ready to sleep can check
- * the returned value, and any tasks already sleeping will have been
- * awakened when the buffer heads were added to the reuse list.
- */
-static inline int recover_reusable_buffer_heads(void)
-{
-	struct buffer_head *head = xchg(&reuse_list, NULL);
-	int found = 0;
-	
-	if (head) {
-		do {
-			struct buffer_head *bh = head;
-			head = head->b_next_free;
-			put_unused_buffer_head(bh);
-		} while (head);
-		found = 1;
-	}
-	return found;
+static void put_unused_buffer_head(struct buffer_head *bh)
+{
+	spin_lock(&unused_list_lock);
+	__put_unused_buffer_head(bh);
+	spin_unlock(&unused_list_lock);
 }
 
 /*
@@ -995,13 +1050,15 @@
 {
 	struct buffer_head * bh;
 
-	recover_reusable_buffer_heads();
+	spin_lock(&unused_list_lock);
 	if (nr_unused_buffer_heads > NR_RESERVED) {
 		bh = unused_list;
 		unused_list = bh->b_next_free;
 		nr_unused_buffer_heads--;
+		spin_unlock(&unused_list_lock);
 		return bh;
 	}
+	spin_unlock(&unused_list_lock);
 
 	/* This is critical.  We can't swap out pages to get
 	 * more buffer heads, because the swap-out may need
@@ -1010,20 +1067,23 @@
 	if((bh = kmem_cache_alloc(bh_cachep, SLAB_BUFFER)) != NULL) {
 		memset(bh, 0, sizeof(*bh));
 		init_waitqueue_head(&bh->b_wait);
-		nr_buffer_heads++;
 		return bh;
 	}
 
 	/*
 	 * If we need an async buffer, use the reserved buffer heads.
 	 */
-	if (async && unused_list) {
-		bh = unused_list;
-		unused_list = bh->b_next_free;
-		nr_unused_buffer_heads--;
-		return bh;
+	if (async) {
+		spin_lock(&unused_list_lock);
+		if (unused_list) {
+			bh = unused_list;
+			unused_list = bh->b_next_free;
+			nr_unused_buffer_heads--;
+			spin_unlock(&unused_list_lock);
+			return bh;
+		}
+		spin_unlock(&unused_list_lock);
 	}
-
 #if 0
 	/*
 	 * (Pending further analysis ...)
@@ -1035,7 +1095,6 @@
 	   (bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL)) != NULL) {
 		memset(bh, 0, sizeof(*bh));
 		init_waitqueue_head(&bh->b_wait);
-		nr_buffer_heads++;
 		return bh;
 	}
 #endif
@@ -1052,8 +1111,7 @@
  * from ordinary buffer allocations, and only async requests are allowed
  * to sleep waiting for buffer heads. 
  */
-static struct buffer_head * create_buffers(unsigned long page, 
-						unsigned long size, int async)
+static struct buffer_head * create_buffers(unsigned long page, unsigned long size, int async)
 {
 	DECLARE_WAITQUEUE(wait, current);
 	struct buffer_head *bh, *head;
@@ -1073,11 +1131,14 @@
 
 		bh->b_state = 0;
 		bh->b_next_free = NULL;
-		bh->b_count = 0;
+		bh->b_pprev = NULL;
+		atomic_set(&bh->b_count, 0);
 		bh->b_size = size;
 
 		bh->b_data = (char *) (page+offset);
-		bh->b_list = 0;
+		bh->b_list = BUF_CLEAN;
+		bh->b_flushtime = 0;
+		bh->b_end_io = end_buffer_io_bad;
 	}
 	return head;
 /*
@@ -1118,115 +1179,16 @@
 	 */
 	add_wait_queue(&buffer_wait, &wait);
 	current->state = TASK_UNINTERRUPTIBLE;
-	if (!recover_reusable_buffer_heads())
+	if (nr_unused_buffer_heads < MAX_BUF_PER_PAGE) {
+		current->policy |= SCHED_YIELD;
 		schedule();
+	}
 	remove_wait_queue(&buffer_wait, &wait);
 	current->state = TASK_RUNNING;
 	goto try_again;
 }
 
-/* Run the hooks that have to be done when a page I/O has completed. */
-static inline void after_unlock_page (struct page * page)
-{
-	if (test_and_clear_bit(PG_decr_after, &page->flags)) {
-		atomic_dec(&nr_async_pages);
-#ifdef DEBUG_SWAP
-		printk ("DebugVM: Finished IO on page %p, nr_async_pages %d\n",
-			(char *) page_address(page), 
-			atomic_read(&nr_async_pages));
-#endif
-	}
-	if (test_and_clear_bit(PG_swap_unlock_after, &page->flags))
-		swap_after_unlock_page(page->offset);
-	if (test_and_clear_bit(PG_free_after, &page->flags))
-		__free_page(page);
-}
-
-/*
- * Free all temporary buffers belonging to a page.
- * This needs to be called with interrupts disabled.
- */
-static inline void free_async_buffers (struct buffer_head * bh)
-{
-	struct buffer_head *tmp, *tail;
-
-	/*
-	 * Link all the buffers into the b_next_free list,
-	 * so we only have to do one xchg() operation ...
-	 */
-	tail = bh;
-	while ((tmp = tail->b_this_page) != bh) {
-		tail->b_next_free = tmp;
-		tail = tmp;
-	};
-
-	/* Update the reuse list */
-	tail->b_next_free = xchg(&reuse_list, NULL);
-	reuse_list = bh;
-
-	/* Wake up any waiters ... */
-	wake_up(&buffer_wait);
-}
-
-static void end_buffer_io_async(struct buffer_head * bh, int uptodate)
-{
-	unsigned long flags;
-	struct buffer_head *tmp;
-	struct page *page;
-
-	mark_buffer_uptodate(bh, uptodate);
-
-	/* This is a temporary buffer used for page I/O. */
-	page = mem_map + MAP_NR(bh->b_data);
-
-	if (!uptodate)
-		SetPageError(page);
-
-	/*
-	 * Be _very_ careful from here on. Bad things can happen if
-	 * two buffer heads end IO at almost the same time and both
-	 * decide that the page is now completely done.
-	 *
-	 * Async buffer_heads are here only as labels for IO, and get
-	 * thrown away once the IO for this page is complete.  IO is
-	 * deemed complete once all buffers have been visited
-	 * (b_count==0) and are now unlocked. We must make sure that
-	 * only the _last_ buffer that decrements its count is the one
-	 * that free's the page..
-	 */
-	save_flags(flags);
-	cli();
-	unlock_buffer(bh);
-	tmp = bh->b_this_page;
-	while (tmp != bh) {
-		if (buffer_locked(tmp))
-			goto still_busy;
-		tmp = tmp->b_this_page;
-	}
-
-	/* OK, the async IO on this page is complete. */
-	restore_flags(flags);
-
-	after_unlock_page(page);
-	/*
-	 * if none of the buffers had errors then we can set the
-	 * page uptodate:
-	 */
-	if (!PageError(page))
-		SetPageUptodate(page);
-	if (page->owner != -1)
-		PAGE_BUG(page);
-	page->owner = (int)current;
-	UnlockPage(page);
-
-	return;
-
-still_busy:
-	restore_flags(flags);
-	return;
-}
-
-static int create_page_buffers (int rw, struct page *page, kdev_t dev, int b[], int size, int bmap)
+static int create_page_buffers(int rw, struct page *page, kdev_t dev, int b[], int size, int bmap)
 {
 	struct buffer_head *head, *bh, *tail;
 	int block;
@@ -1240,9 +1202,7 @@
 	 * They show up in the buffer hash table and are registered in
 	 * page->buffers.
 	 */
-	lock_kernel();
 	head = create_buffers(page_address(page), size, 1);
-	unlock_kernel();
 	if (page->buffers)
 		BUG();
 	if (!head)
@@ -1252,7 +1212,9 @@
 		block = *(b++);
 
 		tail = bh;
-		init_buffer(bh, dev, block, end_buffer_io_async, NULL);
+		init_buffer(bh, end_buffer_io_async, NULL);
+		bh->b_dev = dev;
+		bh->b_blocknr = block;
 
 		/*
 		 * When we use bmap, we define block zero to represent
@@ -1261,9 +1223,11 @@
 		 * two cases.
 		 */
 		if (bmap && !block) {
-			set_bit(BH_Uptodate, &bh->b_state);
 			memset(bh->b_data, 0, size);
+			set_bit(BH_Uptodate, &bh->b_state);
+			continue;
 		}
+		set_bit(BH_Mapped, &bh->b_state);
 	}
 	tail->b_this_page = head;
 	get_page(page);
@@ -1287,7 +1251,6 @@
 		BUG();
 	if (!page->buffers)
 		return 0;
-	lock_kernel();
 
 	head = page->buffers;
 	bh = head;
@@ -1299,14 +1262,16 @@
 		 * is this block fully flushed?
 		 */
 		if (offset <= curr_off) {
-			if (bh->b_blocknr) {
-				bh->b_count++;
+			if (buffer_mapped(bh)) {
+				atomic_inc(&bh->b_count);
 				wait_on_buffer(bh);
 				if (bh->b_dev == B_FREE)
 					BUG();
 				mark_buffer_clean(bh);
+				clear_bit(BH_Uptodate, &bh->b_state);
+				clear_bit(BH_Mapped, &bh->b_state);
 				bh->b_blocknr = 0;
-				bh->b_count--;
+				atomic_dec(&bh->b_count);
 			}
 		}
 		curr_off = next_off;
@@ -1318,22 +1283,24 @@
 	 * the 'final' flushpage. We have invalidated the bmap
 	 * cached value unconditionally, so real IO is not
 	 * possible anymore.
-	 */
-	if (!offset)
-		try_to_free_buffers(page);
+	 *
+	 * If the free doesn't work out, the buffers can be
+	 * left around - they just turn into anonymous buffers
+	 * instead.
+	 */
+	if (!offset) {
+		if (!try_to_free_buffers(page))
+			atomic_add(PAGE_CACHE_SIZE, &buffermem);
+	}
 
-	unlock_kernel();
 	return 0;
 }
 
-static void create_empty_buffers (struct page *page,
-			struct inode *inode, unsigned long blocksize)
+static void create_empty_buffers(struct page *page, struct inode *inode, unsigned long blocksize)
 {
 	struct buffer_head *bh, *head, *tail;
 
-	lock_kernel();
 	head = create_buffers(page_address(page), blocksize, 1);
-	unlock_kernel();
 	if (page->buffers)
 		BUG();
 
@@ -1341,6 +1308,7 @@
 	do {
 		bh->b_dev = inode->i_dev;
 		bh->b_blocknr = 0;
+		bh->b_end_io = end_buffer_io_bad;
 		tail = bh;
 		bh = bh->b_this_page;
 	} while (bh);
@@ -1353,12 +1321,12 @@
  * block_write_full_page() is SMP-safe - currently it's still
  * being called with the kernel lock held, but the code is ready.
  */
-int block_write_full_page (struct file *file, struct page *page, fs_getblock_t fs_get_block)
+int block_write_full_page(struct file *file, struct page *page)
 {
 	struct dentry *dentry = file->f_dentry;
 	struct inode *inode = dentry->d_inode;
-	int err, created, i;
-	unsigned long block, phys, offset;
+	int err, i;
+	unsigned long block, offset;
 	struct buffer_head *bh, *head;
 
 	if (!PageLocked(page))
@@ -1381,23 +1349,22 @@
 		if (!bh)
 			BUG();
 
-		if (!bh->b_blocknr) {
-			err = -EIO;
-			phys = fs_get_block (inode, block, 1, &err, &created);
-			if (!phys)
+		/*
+		 * If the buffer isn't up-to-date, we can't be sure
+		 * that the buffer has been initialized with the proper
+		 * block number information etc..
+		 *
+		 * Leave it to the low-level FS to make all those
+		 * decisions (block #0 may actually be a valid block)
+		 */
+		bh->b_end_io = end_buffer_io_sync;
+		if (!buffer_mapped(bh)) {
+			err = inode->i_op->get_block(inode, block, bh, 1);
+			if (err)
 				goto out;
-
-			init_buffer(bh, inode->i_dev, phys, end_buffer_io_sync, NULL);
-			bh->b_state = (1<<BH_Uptodate);
-		} else {
-			/*
-			 * block already exists, just mark it uptodate and
-			 * dirty:
-			 */
-			bh->b_end_io = end_buffer_io_sync;
-			set_bit(BH_Uptodate, &bh->b_state);
 		}
-		atomic_mark_buffer_dirty(bh,0);
+		set_bit(BH_Uptodate, &bh->b_state);
+		mark_buffer_dirty(bh,0);
 
 		bh = bh->b_this_page;
 		block++;
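
The comment in the hunk above spells out the new division of labour: the generic layer no longer calls a bmap-style fs_get_block() that returns a physical block number, it hands the buffer_head to inode->i_op->get_block() and afterwards trusts only the buffer state bits. A hedged sketch of the contract this code appears to assume of the low-level filesystem (wording is illustrative, not part of the patch):

	/* get_block(inode, block, bh, create):
	 *   - on success fill in bh->b_dev and bh->b_blocknr and set BH_Mapped;
	 *   - if "create" had to allocate a fresh block, also set BH_New so the
	 *     caller knows the on-disk contents are garbage (buffer_new() is
	 *     tested in block_write_partial_page below);
	 *   - a hole found with create == 0 is reported by leaving BH_Mapped
	 *     clear (block_read_full_page then zero-fills the buffer);
	 *   - return 0, or a negative errno on failure.
	 */
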
@@ -1410,15 +1377,15 @@
 	return err;
 }
 
-int block_write_partial_page (struct file *file, struct page *page, unsigned long offset, unsigned long bytes, const char * buf, fs_getblock_t fs_get_block)
+int block_write_partial_page(struct file *file, struct page *page, unsigned long offset, unsigned long bytes, const char * buf)
 {
 	struct dentry *dentry = file->f_dentry;
 	struct inode *inode = dentry->d_inode;
 	unsigned long block;
-	int err, created, partial;
+	int err, partial;
 	unsigned long blocksize, start_block, end_block;
 	unsigned long start_offset, start_bytes, end_bytes;
-	unsigned long bbits, phys, blocks, i, len;
+	unsigned long bbits, blocks, i, len;
 	struct buffer_head *bh, *head;
 	char * target_buf;
 
@@ -1469,46 +1436,35 @@
 				partial = 1;
 			goto skip;
 		}
-		if (!bh->b_blocknr) {
-			err = -EIO;
-			phys = fs_get_block (inode, block, 1, &err, &created);
-			if (!phys)
-				goto out;
 
-			init_buffer(bh, inode->i_dev, phys, end_buffer_io_sync, NULL);
+		/*
+		 * If the buffer is not up-to-date, we need to ask the low-level
+		 * FS to do something for us (we used to have assumptions about
+		 * the meaning of b_blocknr etc, that's bad).
+		 *
+		 * If "update" is set, that means that the low-level FS should
+		 * try to make sure that the block is up-to-date because we're
+		 * not going to fill it completely.
+		 */
+		bh->b_end_io = end_buffer_io_sync;
+		if (!buffer_mapped(bh)) {
+			err = inode->i_op->get_block(inode, block, bh, 1);
+			if (err)
+				goto out;
+		}
 
-			/*
-			 * if partially written block which has contents on
-			 * disk, then we have to read it first.
-			 * We also rely on the fact that filesystem holes
-			 * cannot be written.
-			 */
-			if (start_offset || (end_bytes && (i == end_block))) {
-				if (created) {
-					memset(bh->b_data, 0, bh->b_size);
-				} else {
-					bh->b_state = 0;
-					ll_rw_block(READ, 1, &bh);
-					lock_kernel();
-					wait_on_buffer(bh);
-					unlock_kernel();
-					err = -EIO;
-					if (!buffer_uptodate(bh))
-						goto out;
-				}
+		if (!buffer_uptodate(bh) && (start_offset || (end_bytes && (i == end_block)))) {
+			if (buffer_new(bh)) {
+				memset(bh->b_data, 0, bh->b_size);
+			} else {
+				ll_rw_block(READ, 1, &bh);
+				wait_on_buffer(bh);
+				err = -EIO;
+				if (!buffer_uptodate(bh))
+					goto out;
 			}
-
-			bh->b_state = (1<<BH_Uptodate);
-		} else {
-			/*
-			 * block already exists, just mark it uptodate:
-			 */
-			bh->b_end_io = end_buffer_io_sync;
-			set_bit(BH_Uptodate, &bh->b_state);
-			created = 0;
 		}
 
-		err = -EFAULT;
 		len = blocksize;
 		if (start_offset) {
 			len = start_bytes;
@@ -1517,8 +1473,7 @@
 			len = end_bytes;
 			end_bytes = 0;
 		}
-		if (copy_from_user(target_buf, buf, len))
-			goto out;
+		err = copy_from_user(target_buf, buf, len);
 		target_buf += len;
 		buf += len;
 
@@ -1538,12 +1493,18 @@
 		 * should not penalize them for somebody else writing
 		 * lots of dirty pages.
 		 */
+		set_bit(BH_Uptodate, &bh->b_state);
 		if (!test_and_set_bit(BH_Dirty, &bh->b_state)) {
-			__atomic_mark_buffer_dirty(bh, bdf_prm.b_un.age_buffer);
+			__mark_dirty(bh, 0);
 			if (too_many_dirty_buffers)
 				balance_dirty(bh->b_dev);
 		}
 
+		if (err) {
+			err = -EFAULT;
+			goto out;
+		}
+
 skip:
 		i++;
 		block++;
@@ -1572,6 +1533,9 @@
  *
  * brw_page() is SMP-safe, although it's being called with the
  * kernel lock held - but the code is ready.
+ *
+ * FIXME: we need a swapper_inode->get_block function to remove
+ *        some of the bmap kludges and interface ugliness here.
  */
 int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size, int bmap)
 {
@@ -1600,7 +1564,7 @@
 	do {
 		block = *(b++);
 
-		if (fresh && (bh->b_count != 0))
+		if (fresh && (atomic_read(&bh->b_count) != 0))
 			BUG();
 		if (rw == READ) {
 			if (!fresh)
@@ -1613,6 +1577,7 @@
 					BUG();
 				if (!buffer_uptodate(bh)) {
 					arr[nr++] = bh;
+					atomic_inc(&bh->b_count);
 				}
 			}
 		} else { /* WRITE */
@@ -1625,8 +1590,9 @@
 					BUG();
 			}
 			set_bit(BH_Uptodate, &bh->b_state);
-			atomic_mark_buffer_dirty(bh, 0);
+			set_bit(BH_Dirty, &bh->b_state);
 			arr[nr++] = bh;
+			atomic_inc(&bh->b_count);
 		}
 		bh = bh->b_this_page;
 	} while (bh != head);
@@ -1649,30 +1615,7 @@
 }
 
 /*
- * This is called by end_request() when I/O has completed.
- */
-void mark_buffer_uptodate(struct buffer_head * bh, int on)
-{
-	if (on) {
-		struct buffer_head *tmp = bh;
-		struct page *page;
-		set_bit(BH_Uptodate, &bh->b_state);
-		/* If a page has buffers and all these buffers are uptodate,
-		 * then the page is uptodate. */
-		do {
-			if (!test_bit(BH_Uptodate, &tmp->b_state))
-				return;
-			tmp=tmp->b_this_page;
-		} while (tmp && tmp != bh);
-		page = mem_map + MAP_NR(bh->b_data);
-		SetPageUptodate(page);
-		return;
-	}
-	clear_bit(BH_Uptodate, &bh->b_state);
-}
-
-/*
- * Generic "readpage" function for block devices that have the normal
+ * Generic "read page" function for block devices that have the normal
  * bmap functionality. This is most of the block device filesystems.
  * Reads the page asynchronously --- the unlock_buffer() and
  * mark_buffer_uptodate() functions propagate buffer state into the
@@ -1682,7 +1625,7 @@
 {
 	struct dentry *dentry = file->f_dentry;
 	struct inode *inode = dentry->d_inode;
-	unsigned long iblock, phys_block;
+	unsigned long iblock;
 	struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
 	unsigned int blocksize, blocks;
 	int nr;
@@ -1700,33 +1643,25 @@
 	head = page->buffers;
 	bh = head;
 	nr = 0;
+
 	do {
-		phys_block = bh->b_blocknr;
-		/*
-		 * important, we have to retry buffers that already have
-		 * their bnr cached but had an IO error!
-		 */
-		if (!buffer_uptodate(bh)) {
-			phys_block = inode->i_op->bmap(inode, iblock);
-			/*
-			 * this is safe to do because we hold the page lock:
-			 */
-			if (phys_block) {
-				init_buffer(bh, inode->i_dev, phys_block,
-						end_buffer_io_async, NULL);
-				arr[nr] = bh;
-				nr++;
-			} else {
-				/*
-				 * filesystem 'hole' represents zero-contents:
-				 */
+		if (buffer_uptodate(bh))
+			continue;
+
+		if (!buffer_mapped(bh)) {
+			inode->i_op->get_block(inode, iblock, bh, 0);
+			if (!buffer_mapped(bh)) {
 				memset(bh->b_data, 0, blocksize);
 				set_bit(BH_Uptodate, &bh->b_state);
+				continue;
 			}
 		}
-		iblock++;
-		bh = bh->b_this_page;
-	} while (bh != head);
+
+		init_buffer(bh, end_buffer_io_async, NULL);
+		atomic_inc(&bh->b_count);
+		arr[nr] = bh;
+		nr++;
+	} while (iblock++, (bh = bh->b_this_page) != head);
 
 	++current->maj_flt;
 	if (nr) {
@@ -1770,8 +1705,9 @@
 	}
 
 	isize = BUFSIZE_INDEX(size);
-	insert_point = free_list[isize];
 
+	spin_lock(&free_list[isize].lock);
+	insert_point = free_list[isize].list;
 	tmp = bh;
 	while (1) {
 		if (insert_point) {
@@ -1790,9 +1726,11 @@
 			break;
 	}
 	tmp->b_this_page = bh;
-	free_list[isize] = bh;
+	free_list[isize].list = bh;
+	spin_unlock(&free_list[isize].lock);
+
 	mem_map[MAP_NR(page)].buffers = bh;
-	buffermem += PAGE_SIZE;
+	atomic_add(PAGE_SIZE, &buffermem);
 	return 1;
 }
 
@@ -1800,7 +1738,7 @@
  * Can the buffer be thrown out?
  */
 #define BUFFER_BUSY_BITS	((1<<BH_Dirty) | (1<<BH_Lock) | (1<<BH_Protected))
-#define buffer_busy(bh)	((bh)->b_count || ((bh)->b_state & BUFFER_BUSY_BITS))
+#define buffer_busy(bh)		(atomic_read(&(bh)->b_count) | ((bh)->b_state & BUFFER_BUSY_BITS))
 
 /*
  * try_to_free_buffers() checks if all the buffers on this particular page
@@ -1808,90 +1746,70 @@
  *
  * Wake up bdflush() if this fails - if we're running low on memory due
  * to dirty buffers, we need to flush them out as quickly as possible.
+ *
+ * NOTE: There are quite a number of ways that threads of control can
+ *       obtain a reference to a buffer head within a page.  So we must
+ *	 lock out all of these paths to cleanly toss the page.
  */
 int try_to_free_buffers(struct page * page)
 {
 	struct buffer_head * tmp, * bh = page->buffers;
+	int index = BUFSIZE_INDEX(bh->b_size);
+	int ret;
 
+	spin_lock(&lru_list_lock);
+	write_lock(&hash_table_lock);
+	spin_lock(&free_list[index].lock);
 	tmp = bh;
 	do {
 		struct buffer_head * p = tmp;
 
 		tmp = tmp->b_this_page;
-		if (!buffer_busy(p))
-			continue;
-
-		too_many_dirty_buffers = 1;
-		wakeup_bdflush(0);
-		return 0;
+		if (buffer_busy(p))
+			goto busy_buffer_page;
 	} while (tmp != bh);
 
+	spin_lock(&unused_list_lock);
 	tmp = bh;
 	do {
 		struct buffer_head * p = tmp;
 		tmp = tmp->b_this_page;
 
-		/* The buffer can be either on the regular queues or on the free list.. */		
-		if (p->b_dev == B_FREE)
-			remove_from_free_list(p);
-		else
-			remove_from_queues(p);
-
-		put_unused_buffer_head(p);
+		/* The buffer can be either on the regular
+		 * queues or on the free list..
+		 */
+		if (p->b_dev == B_FREE) {
+			__remove_from_free_list(p, index);
+		} else {
+			if (p->b_pprev)
+				__hash_unlink(p);
+			__remove_from_lru_list(p, p->b_list);
+		}
+		__put_unused_buffer_head(p);
 	} while (tmp != bh);
+	spin_unlock(&unused_list_lock);
 
 	/* Wake up anyone waiting for buffer heads */
 	wake_up(&buffer_wait);
 
 	/* And free the page */
 	page->buffers = NULL;
-	if (__free_page(page)) {
-		buffermem -= PAGE_SIZE;
-		return 1;
-	}
-	return 0;
-}
-
-/* ================== Debugging =================== */
-
-void show_buffers(void)
-{
-	struct buffer_head * bh;
-	int found = 0, locked = 0, dirty = 0, used = 0, lastused = 0;
-	int protected = 0;
-	int nlist;
-	static char *buf_types[NR_LIST] = {"CLEAN","LOCKED","DIRTY"};
-
-	printk("Buffer memory:   %6dkB\n",buffermem>>10);
-	printk("Buffer heads:    %6d\n",nr_buffer_heads);
-	printk("Buffer blocks:   %6d\n",nr_buffers);
-	printk("Buffer hashed:   %6d\n",nr_hashed_buffers);
-
-	for(nlist = 0; nlist < NR_LIST; nlist++) {
-	  found = locked = dirty = used = lastused = protected = 0;
-	  bh = lru_list[nlist];
-	  if(!bh) continue;
-
-	  do {
-		found++;
-		if (buffer_locked(bh))
-			locked++;
-		if (buffer_protected(bh))
-			protected++;
-		if (buffer_dirty(bh))
-			dirty++;
-		if (bh->b_count)
-			used++, lastused = found;
-		bh = bh->b_next_free;
-	  } while (bh != lru_list[nlist]);
-	  printk("%8s: %d buffers, %d used (last=%d), "
-		 "%d locked, %d protected, %d dirty\n",
-		 buf_types[nlist], found, used, lastused,
-		 locked, protected, dirty);
-	};
+	__free_page(page);
+	ret = 1;
+out:
+	spin_unlock(&free_list[index].lock);
+	write_unlock(&hash_table_lock);
+	spin_unlock(&lru_list_lock);
+	return ret;
+
+busy_buffer_page:
+	/* Uhhuh, start writeback so that we don't end up with all dirty pages */
+	too_many_dirty_buffers = 1;
+	wakeup_bdflush(0);
+	ret = 0;
+	goto out;
 }
 
-
 /* ===================== Init ======================= */
 
 /*
@@ -1901,31 +1819,53 @@
  */
 void __init buffer_init(unsigned long memory_size)
 {
-	int order;
+	int order, i;
 	unsigned int nr_hash;
 
-	/* we need to guess at the right sort of size for a buffer cache.
-	   the heuristic from working with large databases and getting
-	   fsync times (ext2) manageable, is the following */
-
-	memory_size >>= 22;
-	for (order = 5; (1UL << order) < memory_size; order++);
+	/* The buffer cache hash table is less important these days,
+	 * trim it a bit.
+	 */
+	memory_size >>= 14;
+	memory_size *= sizeof(struct buffer_head *);
+	for (order = 0; (PAGE_SIZE << order) < memory_size; order++)
+		;
 
 	/* try to allocate something until we get it or we're asking
 	   for something that is really too small */
 
 	do {
-		nr_hash = (1UL << order) * PAGE_SIZE /
-		    sizeof(struct buffer_head *);
+		unsigned long tmp;
+
+		nr_hash = (PAGE_SIZE << order) / sizeof(struct buffer_head *);
+		bh_hash_mask = (nr_hash - 1);
+
+		tmp = nr_hash;
+		bh_hash_shift = 0;
+		while((tmp >>= 1UL) != 0UL)
+			bh_hash_shift++;
+
 		hash_table = (struct buffer_head **)
 		    __get_free_pages(GFP_ATOMIC, order);
-	} while (hash_table == NULL && --order > 4);
-	printk("buffer-cache hash table entries: %d (order: %d, %ld bytes)\n", nr_hash, order, (1UL<<order) * PAGE_SIZE);
-	
+	} while (hash_table == NULL && --order > 0);
+	printk("Buffer-cache hash table entries: %d (order: %d, %ld bytes)\n",
+	       nr_hash, order, (1UL<<order) * PAGE_SIZE);
+
 	if (!hash_table)
 		panic("Failed to allocate buffer hash table\n");
-	memset(hash_table, 0, nr_hash * sizeof(struct buffer_head *));
-	bh_hash_mask = nr_hash-1;
+
+	/* Setup hash chains. */
+	for(i = 0; i < nr_hash; i++)
+		hash_table[i] = NULL;
+
+	/* Setup free lists. */
+	for(i = 0; i < NR_SIZES; i++) {
+		free_list[i].list = NULL;
+		free_list[i].lock = SPIN_LOCK_UNLOCKED;
+	}
+
+	/* Setup lru lists. */
+	for(i = 0; i < NR_LIST; i++)
+		lru_list[i] = NULL;
 
 	bh_cachep = kmem_cache_create("buffer_head",
 				      sizeof(struct buffer_head),
@@ -1933,21 +1873,6 @@
 				      SLAB_HWCACHE_ALIGN, NULL, NULL);
 	if(!bh_cachep)
 		panic("Cannot create buffer head SLAB cache\n");
-	/*
-	 * Allocate the reserved buffer heads.
-	 */
-	while (nr_buffer_heads < NR_RESERVED) {
-		struct buffer_head * bh;
-
-		bh = kmem_cache_alloc(bh_cachep, SLAB_ATOMIC);
-		if (!bh)
-			break;
-		put_unused_buffer_head(bh);
-		nr_buffer_heads++;
-	}
-
-	lru_list[BUF_CLEAN] = 0;
-	grow_buffers(BLOCK_SIZE);
 }
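
To put numbers on the new hash sizing: assuming memory_size arrives in bytes, a 4 KB PAGE_SIZE and 4-byte pointers (i386), a 64 MB machine gives memory_size >> 14 = 4096, times sizeof(struct buffer_head *) = 16 KB, so the loop settles on order 2. That yields nr_hash = 4096 hash chains, bh_hash_mask = 4095 and bh_hash_shift = 12, which is the smallest shift for which the (bh_hash_shift - 12) term in _hashfn() stays non-negative.
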
 
 
@@ -1983,70 +1908,49 @@
 
 static int sync_old_buffers(void)
 {
-	int i;
-	int ndirty, nwritten;
 	int nlist;
-	int ncount;
-	struct buffer_head * bh, *next;
 
+	lock_kernel();
 	sync_supers(0);
 	sync_inodes(0);
+	unlock_kernel();
 
-	ncount = 0;
-#ifdef DEBUG
-	for(nlist = 0; nlist < NR_LIST; nlist++)
-#else
-	for(nlist = BUF_LOCKED; nlist <= BUF_DIRTY; nlist++)
-#endif
-	{
-		ndirty = 0;
-		nwritten = 0;
+	for(nlist = BUF_LOCKED; nlist <= BUF_DIRTY; nlist++) {
+		struct buffer_head *bh;
 	repeat:
-
+		spin_lock(&lru_list_lock);
 		bh = lru_list[nlist];
-		if(bh) 
-			 for (i = nr_buffers_type[nlist]; i-- > 0; bh = next) {
-				 /* We may have stalled while waiting for I/O to complete. */
-				 if(bh->b_list != nlist) goto repeat;
-				 next = bh->b_next_free;
-				 if(!lru_list[nlist]) {
-					 printk("Dirty list empty %d\n", i);
-					 break;
-				 }
-				 
-				 /* Clean buffer on dirty list?  Refile it */
-				 if (nlist == BUF_DIRTY && !buffer_dirty(bh) && !buffer_locked(bh)) {
-					 refile_buffer(bh);
-					 continue;
-				 }
-				  
-				  /* Unlocked buffer on locked list?  Refile it */
-				  if (nlist == BUF_LOCKED && !buffer_locked(bh)) {
-					  refile_buffer(bh);
-					  continue;
-				  }
+		if(bh) {
+			struct buffer_head *next;
+			int i;
+			for (i = nr_buffers_type[nlist]; i-- > 0; bh = next) {
+				next = bh->b_next_free;
+
+				/* If the buffer is not on the proper list,
+				 * then refile it.
+				 */
+				if ((nlist == BUF_DIRTY &&
+				     (!buffer_dirty(bh) && !buffer_locked(bh))) ||
+				    (nlist == BUF_LOCKED && !buffer_locked(bh))) {
+					__refile_buffer(bh);
+					continue;
+				}
 				 
-				 if (buffer_locked(bh) || !buffer_dirty(bh))
-					  continue;
-				 ndirty++;
-				 nwritten++;
-				 next->b_count++;
-				 bh->b_count++;
-				 bh->b_flushtime = 0;
-#ifdef DEBUG
-				 if(nlist != BUF_DIRTY) ncount++;
-#endif
-				 ll_rw_block(WRITE, 1, &bh);
-				 bh->b_count--;
-				 next->b_count--;
-			 }
+				if (buffer_locked(bh) || !buffer_dirty(bh))
+					continue;
+
+				/* OK, now we are committed to write it out. */
+				bh->b_flushtime = 0;
+				atomic_inc(&bh->b_count);
+				spin_unlock(&lru_list_lock);
+				ll_rw_block(WRITE, 1, &bh);
+				atomic_dec(&bh->b_count);
+				goto repeat;
+			}
+		}
+		spin_unlock(&lru_list_lock);
 	}
 	run_task_queue(&tq_disk);
-#ifdef DEBUG
-	if (ncount) printk("sync_old_buffers: %d dirty buffers not on dirty list\n", ncount);
-	printk("Wrote %d/%d buffers\n", nwritten, ndirty);
-#endif
-	run_task_queue(&tq_disk);
 	return 0;
 }
 
@@ -2060,7 +1964,6 @@
 {
 	int i, error = -EPERM;
 
-	lock_kernel();
 	if (!capable(CAP_SYS_ADMIN))
 		goto out;
 
@@ -2092,7 +1995,6 @@
 	 */
 	error = 0;
 out:
-	unlock_kernel();
 	return error;
 }
 
@@ -2114,52 +2016,37 @@
 	sprintf(current->comm, "kflushd");
 	bdflush_tsk = current;
 
-	/*
-	 *	As a kernel thread we want to tamper with system buffers
-	 *	and other internals and thus be subject to the SMP locking
-	 *	rules. (On a uniprocessor box this does nothing).
-	 */
-	lock_kernel();
-		 
 	for (;;) {
 		int nlist;
 
 		CHECK_EMERGENCY_SYNC
 
-		for(nlist = BUF_LOCKED; nlist <= BUF_DIRTY; nlist++)
-		{
-			int nr;
-			int written = 0;
+		for(nlist = BUF_LOCKED; nlist <= BUF_DIRTY; nlist++) {
+			int nr, major, written = 0;
 			struct buffer_head *next;
-			int major;
 
 		repeat:
+			spin_lock(&lru_list_lock);
 			next = lru_list[nlist];
 			nr = nr_buffers_type[nlist];
-
 			while (nr-- > 0) {
 				struct buffer_head *bh = next;
-				/* We may have stalled while waiting for I/O to complete. */
-				if (next->b_list != nlist)
-					goto repeat;
+
 				next = next->b_next_free;
 					
-				/* Clean buffer on dirty list?  Refile it */
-				if (nlist == BUF_DIRTY && !buffer_dirty(bh)) {
-					refile_buffer(bh);
-					continue;
-				}
-					
-				/* Unlocked buffer on locked list?  Refile it */
-				if (nlist == BUF_LOCKED && !buffer_locked(bh)) {
-					refile_buffer(bh);
+				/* If the buffer is not on the correct list,
+				 * then refile it.
+				 */
+				if ((nlist == BUF_DIRTY &&
+				     (!buffer_dirty(bh) && !buffer_locked(bh))) ||
+				    (nlist == BUF_LOCKED && !buffer_locked(bh))) {
+					__refile_buffer(bh);
 					continue;
 				}
 
-				/*
-				 * If we aren't in panic mode, don't write out too much
-				 * at a time. Also, don't write out buffers we don't really
-				 * have to write out yet..
+				/* If we aren't in panic mode, don't write out too much
+				 * at a time. Also, don't write out buffers we don't
+				 * really have to write out yet..
 				 */
 				if (!too_many_dirty_buffers) {
 					if (written > bdf_prm.b_un.ndirty)
@@ -2172,9 +2059,6 @@
 					 continue;
 
 				major = MAJOR(bh->b_dev);
-				if (next)
-					next->b_count++;
-				bh->b_count++;
 				written++;
 				bh->b_flushtime = 0;
 
@@ -2182,18 +2066,18 @@
 				 * For the loop major we can try to do asynchronous writes,
 				 * but we have to guarantee that we're making some progress..
 				 */
+				atomic_inc(&bh->b_count);
+				spin_unlock(&lru_list_lock);
 				if (major == LOOP_MAJOR && written > 1) {
 					ll_rw_block(WRITEA, 1, &bh);
 					if (buffer_dirty(bh))
 						--written;
 				} else
 					ll_rw_block(WRITE, 1, &bh);
-
-				bh->b_count--;
-				if (next)
-					next->b_count--;
-				wake_up(&buffer_wait);
+				atomic_dec(&bh->b_count);
+				goto repeat;
 			}
+			spin_unlock(&lru_list_lock);
 		}
 		run_task_queue(&tq_disk);
 		wake_up(&bdflush_done);
@@ -2206,7 +2090,10 @@
 		 */
 		if (!too_many_dirty_buffers || nr_buffers_type[BUF_DIRTY] < bdf_prm.b_un.ndirty) {
 			too_many_dirty_buffers = 0;
-			sleep_on_timeout(&bdflush_wait, 5*HZ);
+			spin_lock_irq(&current->sigmask_lock);
+			flush_signals(current);
+			spin_unlock_irq(&current->sigmask_lock);
+			interruptible_sleep_on_timeout(&bdflush_wait, 5*HZ);
 		}
 	}
 }
