

__unhash_process acquires the dcache_lock while holding the
tasklist_lock for writing. This can deadlock. Additionally,
fs/proc/base.c incorrectly assumed that p->pid would be set to 0 during
release_task.

The patch fixes that by adding a new spinlock to the task structure and
fixing all references to (!p->pid).

The alternative to the new spinlock would be to hold dcache_lock around
__unhash_process.

- fs/proc/base.c assumed that p->pid is reset to 0 during exit.  This is
  not the case anymore.  I now look at the count of the pid structure for
  PIDTYPE_PID.

- de_thread now tested - as broken as it was before: open handles to
  /proc/<pid> are either stale or invalid after an exec of a nptl process,
  if the exec was call from a secondary thread.

- a few lock_kernels removed - that part of /proc doesn't need it.

- additional instances of 'if(current->pid)' replaced with pid_alive.



 fs/exec.c                 |   37 ++---------
 fs/proc/base.c            |  154 ++++++++++++++++++++++++++++++++--------------
 fs/proc/root.c            |    2 
 include/linux/init_task.h |    1 
 include/linux/proc_fs.h   |    2 
 include/linux/sched.h     |    2 
 kernel/exit.c             |   40 +++--------
 7 files changed, 135 insertions(+), 103 deletions(-)

diff -puN fs/exec.c~dcache_lock-vs-tasklist_lock-take-3 fs/exec.c
--- 25/fs/exec.c~dcache_lock-vs-tasklist_lock-take-3	2003-05-11 21:36:07.000000000 -0700
+++ 25-akpm/fs/exec.c	2003-05-11 21:36:07.000000000 -0700
@@ -529,30 +529,6 @@ static int exec_mmap(struct mm_struct *m
 	return 0;
 }
 
-static struct dentry *clean_proc_dentry(struct task_struct *p)
-{
-	struct dentry *proc_dentry = p->proc_dentry;
-
-	if (proc_dentry) {
-		spin_lock(&dcache_lock);
-		if (!d_unhashed(proc_dentry)) {
-			dget_locked(proc_dentry);
-			__d_drop(proc_dentry);
-		} else
-			proc_dentry = NULL;
-		spin_unlock(&dcache_lock);
-	}
-	return proc_dentry;
-}
-
-static inline void put_proc_dentry(struct dentry *dentry)
-{
-	if (dentry) {
-		shrink_dcache_parent(dentry);
-		dput(dentry);
-	}
-}
-
 /*
  * This function makes sure the current process has its own signal table,
  * so that flush_signal_handlers can later reset the handlers without
@@ -660,9 +636,11 @@ static inline int de_thread(struct task_
 		while (leader->state != TASK_ZOMBIE)
 			yield();
 
+		spin_lock(&leader->proc_lock);
+		spin_lock(&current->proc_lock);
+		proc_dentry1 = proc_pid_unhash(current);
+		proc_dentry2 = proc_pid_unhash(leader);
 		write_lock_irq(&tasklist_lock);
-		proc_dentry1 = clean_proc_dentry(current);
-		proc_dentry2 = clean_proc_dentry(leader);
 
 		if (leader->tgid != current->tgid)
 			BUG();
@@ -702,9 +680,10 @@ static inline int de_thread(struct task_
 		state = leader->state;
 
 		write_unlock_irq(&tasklist_lock);
-
-		put_proc_dentry(proc_dentry1);
-		put_proc_dentry(proc_dentry2);
+		spin_unlock(&leader->proc_lock);
+		spin_unlock(&current->proc_lock);
+		proc_pid_flush(proc_dentry1);
+		proc_pid_flush(proc_dentry2);
 
 		if (state != TASK_ZOMBIE)
 			BUG();
diff -puN fs/proc/base.c~dcache_lock-vs-tasklist_lock-take-3 fs/proc/base.c
--- 25/fs/proc/base.c~dcache_lock-vs-tasklist_lock-take-3	2003-05-11 21:36:07.000000000 -0700
+++ 25-akpm/fs/proc/base.c	2003-05-11 21:57:42.000000000 -0700
@@ -622,6 +622,12 @@ static struct inode_operations proc_pid_
 	.follow_link	= proc_pid_follow_link
 };
 
+static int pid_alive(struct task_struct *p)
+{
+	BUG_ON(p->pids[PIDTYPE_PID].pidptr != &p->pids[PIDTYPE_PID].pid);
+	return atomic_read(&p->pids[PIDTYPE_PID].pid.count);
+}
+
 #define NUMBUF 10
 
 static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir)
@@ -633,6 +639,9 @@ static int proc_readfd(struct file * fil
 	char buf[NUMBUF];
 	struct files_struct * files;
 
+	retval = -ENOENT;
+	if (!pid_alive(p))
+		goto out;
 	retval = 0;
 	pid = p->pid;
 
@@ -694,48 +703,46 @@ static int proc_base_readdir(struct file
 	int pid;
 	struct inode *inode = filp->f_dentry->d_inode;
 	struct pid_entry *p;
-	int ret = 0;
+	int ret;
 
-	lock_kernel();
+	ret = -ENOENT;
+	if (!pid_alive(proc_task(inode)))
+		goto out;
 
+	ret = 0;
 	pid = proc_task(inode)->pid;
-	if (!pid) {
-		ret = -ENOENT;
-		goto out;
-	}
 	i = filp->f_pos;
 	switch (i) {
-		case 0:
-			if (filldir(dirent, ".", 1, i, inode->i_ino, DT_DIR) < 0)
-				goto out;
-			i++;
-			filp->f_pos++;
-			/* fall through */
-		case 1:
-			if (filldir(dirent, "..", 2, i, PROC_ROOT_INO, DT_DIR) < 0)
+	case 0:
+		if (filldir(dirent, ".", 1, i, inode->i_ino, DT_DIR) < 0)
+			goto out;
+		i++;
+		filp->f_pos++;
+		/* fall through */
+	case 1:
+		if (filldir(dirent, "..", 2, i, PROC_ROOT_INO, DT_DIR) < 0)
+			goto out;
+		i++;
+		filp->f_pos++;
+		/* fall through */
+	default:
+		i -= 2;
+		if (i >= ARRAY_SIZE(base_stuff)) {
+			ret = 1;
+			goto out;
+		}
+		p = base_stuff + i;
+		while (p->name) {
+			if (filldir(dirent, p->name, p->len, filp->f_pos,
+				    fake_ino(pid, p->type), p->mode >> 12) < 0)
 				goto out;
-			i++;
 			filp->f_pos++;
-			/* fall through */
-		default:
-			i -= 2;
-			if (i>=sizeof(base_stuff)/sizeof(base_stuff[0])) {
-				ret = 1;
-				goto out;
-			}
-			p = base_stuff + i;
-			while (p->name) {
-				if (filldir(dirent, p->name, p->len, filp->f_pos,
-					    fake_ino(pid, p->type), p->mode >> 12) < 0)
-					goto out;
-				filp->f_pos++;
-				p++;
-			}
+			p++;
+		}
 	}
 
 	ret = 1;
 out:
-	unlock_kernel();
 	return ret;
 }
 
@@ -772,7 +779,7 @@ static struct inode *proc_pid_make_inode
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
 	inode->i_ino = fake_ino(task->pid, ino);
 
-	if (!task->pid)
+	if (!pid_alive(task))
 		goto out_unlock;
 
 	/*
@@ -806,7 +813,7 @@ out_unlock:
  */
 static int pid_revalidate(struct dentry * dentry, int flags)
 {
-	if (proc_task(dentry->d_inode)->pid)
+	if (pid_alive(proc_task(dentry->d_inode)))
 		return 1;
 	d_drop(dentry);
 	return 0;
@@ -840,18 +847,23 @@ static int pid_fd_revalidate(struct dent
 static void pid_base_iput(struct dentry *dentry, struct inode *inode)
 {
 	struct task_struct *task = proc_task(inode);
-	write_lock_irq(&tasklist_lock);
+	spin_lock(&task->proc_lock);
 	if (task->proc_dentry == dentry)
 		task->proc_dentry = NULL;
-	write_unlock_irq(&tasklist_lock);
+	spin_unlock(&task->proc_lock);
 	iput(inode);
 }
 
 static int pid_delete_dentry(struct dentry * dentry)
 {
-	return proc_task(dentry->d_inode)->pid == 0;
+	/* Is the task we represent dead?
+	 * If so, then don't put the dentry on the lru list,
+	 * kill it immediately.
+	 */
+	return !pid_alive(proc_task(dentry->d_inode));
 }
 
+
 static struct dentry_operations pid_fd_dentry_operations =
 {
 	.d_revalidate	= pid_fd_revalidate,
@@ -907,6 +919,8 @@ static struct dentry *proc_lookupfd(stru
 
 	if (fd == ~0U)
 		goto out;
+	if (!pid_alive(task))
+		goto out;
 
 	inode = proc_pid_make_inode(dir->i_sb, task, PROC_PID_FD_DIR+fd);
 	if (!inode)
@@ -935,8 +949,6 @@ static struct dentry *proc_lookupfd(stru
 	ei->op.proc_get_link = proc_fd_link;
 	dentry->d_op = &pid_fd_dentry_operations;
 	d_add(dentry, inode);
-	if (!proc_task(dentry->d_inode)->pid)
-		d_drop(dentry);
 	return NULL;
 
 out_unlock2:
@@ -973,6 +985,9 @@ static struct dentry *proc_base_lookup(s
 	error = -ENOENT;
 	inode = NULL;
 
+	if (!pid_alive(task))
+		goto out;
+
 	for (p = base_stuff; p->name; p++) {
 		if (p->len != dentry->d_name.len)
 			continue;
@@ -1054,8 +1069,6 @@ static struct dentry *proc_base_lookup(s
 	}
 	dentry->d_op = &pid_dentry_operations;
 	d_add(dentry, inode);
-	if (!proc_task(dentry->d_inode)->pid)
-		d_drop(dentry);
 	return NULL;
 
 out:
@@ -1093,6 +1106,55 @@ static struct inode_operations proc_self
 	.follow_link	= proc_self_follow_link,
 };
 
+/**
+ * proc_pid_unhash -  Unhash /proc/<pid> entry from the dcache.
+ * @p: task that should be flushed.
+ *
+ * Drops the /proc/<pid> dcache entry from the hash chains.
+ *
+ * Dropping /proc/<pid> entries and detach_pid must be synchroneous,
+ * otherwise e.g. /proc/<pid>/exe might point to the wrong executable,
+ * if the pid value is immediately reused. This is enforced by
+ * - caller must acquire spin_lock(p->proc_lock)
+ * - must be called before detach_pid()
+ * - proc_pid_lookup acquires proc_lock, and checks that
+ *   the target is not dead by looking at the attach count
+ *   of PIDTYPE_PID.
+ */
+
+struct dentry *proc_pid_unhash(struct task_struct *p)
+{
+	struct dentry *proc_dentry;
+
+	proc_dentry = p->proc_dentry;
+	if (proc_dentry != NULL) {
+
+		spin_lock(&dcache_lock);
+		if (!d_unhashed(proc_dentry)) {
+			dget_locked(proc_dentry);
+			__d_drop(proc_dentry);
+		} else
+			proc_dentry = NULL;
+		spin_unlock(&dcache_lock);
+	}
+	return proc_dentry;
+}
+
+/**
+ * proc_pid_flush - recover memory used by stale /proc/<pid>/x entries
+ * @proc_entry: directoy to prune.
+ *
+ * Shrink the /proc directory that was used by the just killed thread.
+ */
+	
+void proc_pid_flush(struct dentry *proc_dentry)
+{
+	if(proc_dentry != NULL) {
+		shrink_dcache_parent(proc_dentry);
+		dput(proc_dentry);
+	}
+}
+
 /* SMP-safe */
 struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry)
 {
@@ -1141,12 +1203,12 @@ struct dentry *proc_pid_lookup(struct in
 	inode->i_flags|=S_IMMUTABLE;
 
 	dentry->d_op = &pid_base_dentry_operations;
+
+	spin_lock(&task->proc_lock);
+	task->proc_dentry = dentry;
 	d_add(dentry, inode);
-	read_lock(&tasklist_lock);
-	proc_task(dentry->d_inode)->proc_dentry = dentry;
-	read_unlock(&tasklist_lock);
-	if (!proc_task(dentry->d_inode)->pid)
-		d_drop(dentry);
+	spin_unlock(&task->proc_lock);
+
 	return NULL;
 out:
 	return ERR_PTR(-ENOENT);
@@ -1169,7 +1231,7 @@ static int get_pid_list(int index, unsig
 	read_lock(&tasklist_lock);
 	for_each_process(p) {
 		int pid = p->pid;
-		if (!pid)
+		if (!pid_alive(p))
 			continue;
 		if (--index >= 0)
 			continue;
diff -puN fs/proc/root.c~dcache_lock-vs-tasklist_lock-take-3 fs/proc/root.c
--- 25/fs/proc/root.c~dcache_lock-vs-tasklist_lock-take-3	2003-05-11 21:36:07.000000000 -0700
+++ 25-akpm/fs/proc/root.c	2003-05-11 21:36:07.000000000 -0700
@@ -110,9 +110,9 @@ static int proc_root_readdir(struct file
 		}
 		filp->f_pos = FIRST_PROCESS_ENTRY;
 	}
+	unlock_kernel();
 
 	ret = proc_pid_readdir(filp, dirent, filldir);
-	unlock_kernel();
 	return ret;
 }
 
diff -puN include/linux/init_task.h~dcache_lock-vs-tasklist_lock-take-3 include/linux/init_task.h
--- 25/include/linux/init_task.h~dcache_lock-vs-tasklist_lock-take-3	2003-05-11 21:36:07.000000000 -0700
+++ 25-akpm/include/linux/init_task.h	2003-05-11 21:36:07.000000000 -0700
@@ -101,6 +101,7 @@
 	.blocked	= {{0}},					\
 	 .posix_timers	 = LIST_HEAD_INIT(tsk.posix_timers),		   \
 	.alloc_lock	= SPIN_LOCK_UNLOCKED,				\
+	.proc_lock	= SPIN_LOCK_UNLOCKED,				\
 	.switch_lock	= SPIN_LOCK_UNLOCKED,				\
 	.journal_info	= NULL,						\
 }
diff -puN include/linux/proc_fs.h~dcache_lock-vs-tasklist_lock-take-3 include/linux/proc_fs.h
--- 25/include/linux/proc_fs.h~dcache_lock-vs-tasklist_lock-take-3	2003-05-11 21:36:07.000000000 -0700
+++ 25-akpm/include/linux/proc_fs.h	2003-05-11 21:36:07.000000000 -0700
@@ -87,6 +87,8 @@ extern void proc_root_init(void);
 extern void proc_misc_init(void);
 
 struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry);
+struct dentry *proc_pid_unhash(struct task_struct *p);
+void proc_pid_flush(struct dentry *proc_dentry);
 int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir);
 
 extern struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode,
diff -puN include/linux/sched.h~dcache_lock-vs-tasklist_lock-take-3 include/linux/sched.h
--- 25/include/linux/sched.h~dcache_lock-vs-tasklist_lock-take-3	2003-05-11 21:36:07.000000000 -0700
+++ 25-akpm/include/linux/sched.h	2003-05-11 21:38:31.000000000 -0700
@@ -429,6 +429,8 @@ struct task_struct {
    	u32 self_exec_id;
 /* Protection of (de-)allocation: mm, files, fs, tty */
 	spinlock_t alloc_lock;
+/* Protection of proc_dentry: nesting proc_lock, dcache_lock, write_lock_irq(&tasklist_lock); */
+	spinlock_t proc_lock;
 /* context-switch lock */
 	spinlock_t switch_lock;
 
diff -puN kernel/exit.c~dcache_lock-vs-tasklist_lock-take-3 kernel/exit.c
--- 25/kernel/exit.c~dcache_lock-vs-tasklist_lock-take-3	2003-05-11 21:36:07.000000000 -0700
+++ 25-akpm/kernel/exit.c	2003-05-11 21:38:31.000000000 -0700
@@ -21,6 +21,7 @@
 #include <linux/ptrace.h>
 #include <linux/profile.h>
 #include <linux/mount.h>
+#include <linux/proc_fs.h>
 
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
@@ -31,10 +32,8 @@ extern struct task_struct *child_reaper;
 
 int getrusage(struct task_struct *, int, struct rusage *);
 
-static struct dentry * __unhash_process(struct task_struct *p)
+static void __unhash_process(struct task_struct *p)
 {
-	struct dentry *proc_dentry;
-
 	nr_threads--;
 	detach_pid(p, PIDTYPE_PID);
 	detach_pid(p, PIDTYPE_TGID);
@@ -46,34 +45,25 @@ static struct dentry * __unhash_process(
 	}
 
 	REMOVE_LINKS(p);
-	proc_dentry = p->proc_dentry;
-	if (unlikely(proc_dentry != NULL)) {
-		spin_lock(&dcache_lock);
-		if (!d_unhashed(proc_dentry)) {
-			dget_locked(proc_dentry);
-			__d_drop(proc_dentry);
-		} else
-			proc_dentry = NULL;
-		spin_unlock(&dcache_lock);
-	}
-	return proc_dentry;
 }
 
 void release_task(struct task_struct * p)
 {
-	struct dentry *proc_dentry;
 	task_t *leader;
+	struct dentry *proc_dentry;
  
 	BUG_ON(p->state < TASK_ZOMBIE);
  
 	atomic_dec(&p->user->processes);
+	spin_lock(&p->proc_lock);
+	proc_dentry = proc_pid_unhash(p);
 	write_lock_irq(&tasklist_lock);
 	if (unlikely(p->ptrace))
 		__ptrace_unlink(p);
 	BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children));
 	__exit_signal(p);
 	__exit_sighand(p);
-	proc_dentry = __unhash_process(p);
+	__unhash_process(p);
 
 	/*
 	 * If we are the last non-leader member of the thread
@@ -92,11 +82,8 @@ void release_task(struct task_struct * p
 	p->parent->cnswap += p->nswap + p->cnswap;
 	sched_exit(p);
 	write_unlock_irq(&tasklist_lock);
-
-	if (unlikely(proc_dentry != NULL)) {
-		shrink_dcache_parent(proc_dentry);
-		dput(proc_dentry);
-	}
+	spin_unlock(&p->proc_lock);
+	proc_pid_flush(proc_dentry);
 	release_thread(p);
 	put_task_struct(p);
 }
@@ -107,14 +94,13 @@ void unhash_process(struct task_struct *
 {
 	struct dentry *proc_dentry;
 
+	spin_lock(&p->proc_lock);
+	proc_dentry = proc_pid_unhash(p);
 	write_lock_irq(&tasklist_lock);
-	proc_dentry = __unhash_process(p);
+	__unhash_process(p);
 	write_unlock_irq(&tasklist_lock);
-
-	if (unlikely(proc_dentry != NULL)) {
-		shrink_dcache_parent(proc_dentry);
-		dput(proc_dentry);
-	}
+	spin_unlock(&p->proc_lock);
+	proc_pid_flush(proc_dentry);
 }
 
 /*

_
