
From: Neil Brown <neilb@cse.unsw.edu.au>

When an NFS request arrives, it contains a filehandle which needs to be
converted to a dentry.  Many filesystems use find_exported_dentry in
fs/exportfs/expfs.c.  A key part of this on filesystem where a 32bit inode
number uniquely locates a file is export_iget which calls iget(sb, inum).

iget will either:

   1/ find the inode in the inode cache and return it

 or

   2/ create a new inode and call ->read_inode to load it from the
      storage device.

export_iget then verifies the inode is really a good inode (->read_inode
didn't detect any problems) and the right inode (base on generation number
from the file handle).

For this to work reliably, it is important that whenever an inode is *not* in
the cache, the on-device version is up-to-date.  Otherwise, when read_inode
loads the inode it will get bad data.

For a file that has not been deleted, this condition always holds: a dirty
inode is always flushed to disc before the inode is unhashed.

However for a file that is being deleted this condition doesn't (didn't)
hold.  When iput -> iput_final -> generic_drop_inode -> generic_delete_inode
is called we would unhash the inode before calling into the filesytem through
->delete_inode.

So there is a small window between when generic_delete_inode unhashes the
inode, and when ->delete_inode writes something to disc, where a call to
->read_inode (for export_iget) might discover what it thinks is a valid
inode, but is really one that is in the process of being destroyed.

It is this window that I want to close by moving the unhashing to the end of
generic_delete_inode.



 fs/fs-writeback.c |    3 ++-
 fs/inode.c        |   54 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 55 insertions(+), 2 deletions(-)

diff -puN fs/fs-writeback.c~inode-unhashing-fix-2 fs/fs-writeback.c
--- 25/fs/fs-writeback.c~inode-unhashing-fix-2	2003-05-13 00:30:26.000000000 -0700
+++ 25-akpm/fs/fs-writeback.c	2003-05-13 00:30:26.000000000 -0700
@@ -90,7 +90,8 @@ void __mark_inode_dirty(struct inode *in
 		 * Only add valid (hashed) inodes to the superblock's
 		 * dirty list.  Add blockdev inodes as well.
 		 */
-		if (hlist_unhashed(&inode->i_hash) && !S_ISBLK(inode->i_mode))
+		if ((hlist_unhashed(&inode->i_hash) || (inode->i_state & (I_FREEING|I_CLEAR)))
+		    && !S_ISBLK(inode->i_mode))
 			goto out;
 
 		/*
diff -puN fs/inode.c~inode-unhashing-fix-2 fs/inode.c
--- 25/fs/inode.c~inode-unhashing-fix-2	2003-05-13 00:30:26.000000000 -0700
+++ 25-akpm/fs/inode.c	2003-05-13 00:32:45.000000000 -0700
@@ -466,6 +466,7 @@ static int shrink_icache_memory(int nr, 
 	return inodes_stat.nr_unused;
 }
 
+static void __wait_on_freeing_inode(struct inode *inode);
 /*
  * Called with the inode lock held.
  * NOTE: we are not increasing the inode-refcount, you must call __iget()
@@ -477,6 +478,7 @@ static struct inode * find_inode(struct 
 	struct hlist_node *node;
 	struct inode * inode = NULL;
 
+repeat:
 	hlist_for_each (node, head) { 
 		prefetch(node->next);
 		inode = hlist_entry(node, struct inode, i_hash);
@@ -484,6 +486,10 @@ static struct inode * find_inode(struct 
 			continue;
 		if (!test(inode, data))
 			continue;
+		if (inode->i_state & (I_FREEING|I_CLEAR)) {
+			__wait_on_freeing_inode(inode);
+			goto repeat;
+		}
 		break;
 	}
 	return node ? inode : NULL;
@@ -498,6 +504,7 @@ static struct inode * find_inode_fast(st
 	struct hlist_node *node;
 	struct inode * inode = NULL;
 
+repeat:
 	hlist_for_each (node, head) {
 		prefetch(node->next);
 		inode = list_entry(node, struct inode, i_hash);
@@ -505,6 +512,10 @@ static struct inode * find_inode_fast(st
 			continue;
 		if (inode->i_sb != sb)
 			continue;
+		if (inode->i_state & (I_FREEING|I_CLEAR)) {
+			__wait_on_freeing_inode(inode);
+			goto repeat;
+		}
 		break;
 	}
 	return node ? inode : NULL;
@@ -933,11 +944,22 @@ void remove_inode_hash(struct inode *ino
 	spin_unlock(&inode_lock);
 }
 
+/*
+ * Tell the filesystem that this inode is no longer of any interest and should
+ * be completely destroyed.
+ *
+ * We leave the inode in the inode hash table until *after* the filesystem's
+ * ->delete_inode completes.  This ensures that an iget (such as nfsd might
+ * instigate) will always find up-to-date information either in the hash or on
+ * disk.
+ *
+ * I_FREEING is set so that no-one will take a new reference to the inode while
+ * it is being deleted.
+ */
 void generic_delete_inode(struct inode *inode)
 {
 	struct super_operations *op = inode->i_sb->s_op;
 
-	hlist_del_init(&inode->i_hash);
 	list_del_init(&inode->i_list);
 	inode->i_state|=I_FREEING;
 	inodes_stat.nr_inodes--;
@@ -956,6 +978,10 @@ void generic_delete_inode(struct inode *
 		delete(inode);
 	} else
 		clear_inode(inode);
+	spin_lock(&inode_lock);
+	hlist_del_init(&inode->i_hash);
+	spin_unlock(&inode_lock);
+	wake_up_inode(inode);
 	if (inode->i_state != I_CLEAR)
 		BUG();
 	destroy_inode(inode);
@@ -1229,6 +1255,32 @@ repeat:
 	__set_current_state(TASK_RUNNING);
 }
 
+/*
+ * If we try to find an inode in the inode hash while it is being deleted, we
+ * have to wait until the filesystem completes its deletion before reporting
+ * that it isn't found.  This is because iget will immediately call
+ * ->read_inode, and we want to be sure that evidence of the deletion is found
+ * by ->read_inode.
+ *
+ * This call might return early if an inode which shares the waitq is woken up.
+ * This is most easily handled by the caller which will loop around again
+ * looking for the inode.
+ *
+ * This is called with inode_lock held.
+ */
+static void __wait_on_freeing_inode(struct inode *inode)
+{
+	DECLARE_WAITQUEUE(wait, current);
+	wait_queue_head_t *wq = i_waitq_head(inode);
+
+	add_wait_queue(wq, &wait);
+	set_current_state(TASK_UNINTERRUPTIBLE);
+	spin_unlock(&inode_lock);
+	schedule();
+	remove_wait_queue(wq, &wait);
+	spin_lock(&inode_lock);
+}
+
 void wake_up_inode(struct inode *inode)
 {
 	wait_queue_head_t *wq = i_waitq_head(inode);

_
