From 64fd72e0a44bdd62c5ca277cb24d0d02b2d8e9dc Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 28 May 2014 09:48:44 -0400
Subject: [PATCH 1/6] lift the "already marked killed" case into
 shrink_dentry_list()

It can happen only when dentry_kill() is called with unlock_on_failure
equal to 0 - other callers had dentry pinned until the moment they've
got ->d_lock and DCACHE_DENTRY_KILLED is set only after lockref_mark_dead().

IOW, only one of three call sites of dentry_kill() might end up reaching
that code.  Just move it there.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index 42ae01eefc07..6888dde4d568 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -455,12 +455,6 @@ dentry_kill(struct dentry *dentry, int unlock_on_failure)
 	struct dentry *parent = NULL;
 	bool can_free = true;
 
-	if (unlikely(dentry->d_flags & DCACHE_DENTRY_KILLED)) {
-		can_free = dentry->d_flags & DCACHE_MAY_FREE;
-		spin_unlock(&dentry->d_lock);
-		goto out;
-	}
-
 	inode = dentry->d_inode;
 	if (inode && !spin_trylock(&inode->i_lock)) {
 relock:
@@ -815,6 +809,15 @@ static void shrink_dentry_list(struct list_head *list)
 			continue;
 		}
 
+
+		if (unlikely(dentry->d_flags & DCACHE_DENTRY_KILLED)) {
+			bool can_free = dentry->d_flags & DCACHE_MAY_FREE;
+			spin_unlock(&dentry->d_lock);
+			if (can_free)
+				dentry_free(dentry);
+			continue;
+		}
+
 		parent = dentry_kill(dentry, 0);
 		/*
 		 * If dentry_kill returns NULL, we have nothing more to do.

From e55fd011549eae01a230e3cace6f4d031b6a3453 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 28 May 2014 13:51:12 -0400
Subject: [PATCH 2/6] split dentry_kill()

... into trylocks and everything else.  The latter (actual killing)
is __dentry_kill().

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 62 +++++++++++++++++++++++++++++++----------------------
 1 file changed, 36 insertions(+), 26 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index 6888dde4d568..1577c14dfb4e 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -441,36 +441,12 @@ void d_drop(struct dentry *dentry)
 }
 EXPORT_SYMBOL(d_drop);
 
-/*
- * Finish off a dentry we've decided to kill.
- * dentry->d_lock must be held, returns with it unlocked.
- * If ref is non-zero, then decrement the refcount too.
- * Returns dentry requiring refcount drop, or NULL if we're done.
- */
-static struct dentry *
-dentry_kill(struct dentry *dentry, int unlock_on_failure)
-	__releases(dentry->d_lock)
+static void __dentry_kill(struct dentry *dentry)
 {
-	struct inode *inode;
 	struct dentry *parent = NULL;
 	bool can_free = true;
-
-	inode = dentry->d_inode;
-	if (inode && !spin_trylock(&inode->i_lock)) {
-relock:
-		if (unlock_on_failure) {
-			spin_unlock(&dentry->d_lock);
-			cpu_relax();
-		}
-		return dentry; /* try again with same dentry */
-	}
 	if (!IS_ROOT(dentry))
 		parent = dentry->d_parent;
-	if (parent && !spin_trylock(&parent->d_lock)) {
-		if (inode)
-			spin_unlock(&inode->i_lock);
-		goto relock;
-	}
 
 	/*
 	 * The dentry is now unrecoverably dead to the world.
@@ -514,10 +490,44 @@ relock:
 		can_free = false;
 	}
 	spin_unlock(&dentry->d_lock);
-out:
 	if (likely(can_free))
 		dentry_free(dentry);
+}
+
+/*
+ * Finish off a dentry we've decided to kill.
+ * dentry->d_lock must be held, returns with it unlocked.
+ * If ref is non-zero, then decrement the refcount too.
+ * Returns dentry requiring refcount drop, or NULL if we're done.
+ */
+static struct dentry *
+dentry_kill(struct dentry *dentry, int unlock_on_failure)
+	__releases(dentry->d_lock)
+{
+	struct inode *inode = dentry->d_inode;
+	struct dentry *parent = NULL;
+
+	if (inode && unlikely(!spin_trylock(&inode->i_lock)))
+		goto failed;
+
+	if (!IS_ROOT(dentry)) {
+		parent = dentry->d_parent;
+		if (unlikely(!spin_trylock(&parent->d_lock))) {
+			if (inode)
+				spin_unlock(&inode->i_lock);
+			goto failed;
+		}
+	}
+
+	__dentry_kill(dentry);
 	return parent;
+
+failed:
+	if (unlock_on_failure) {
+		spin_unlock(&dentry->d_lock);
+		cpu_relax();
+	}
+	return dentry; /* try again with same dentry */
 }
 
 /* 

From ff2fde9929feb2aef45377ce56b8b12df85dda69 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 28 May 2014 13:59:13 -0400
Subject: [PATCH 3/6] expand dentry_kill(dentry, 0) in shrink_dentry_list()

Result will be massaged to saner shape in the next commits.  It is
ugly, no questions - the point of that one is to be a provably
equivalent transformation (and it might be worth splitting a bit
more).

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 30 +++++++++++++++++-------------
 1 file changed, 17 insertions(+), 13 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index 1577c14dfb4e..c23f78a9d156 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -801,6 +801,7 @@ static void shrink_dentry_list(struct list_head *list)
 	struct dentry *dentry, *parent;
 
 	while (!list_empty(list)) {
+		struct inode *inode;
 		dentry = list_entry(list->prev, struct dentry, d_lru);
 		spin_lock(&dentry->d_lock);
 		/*
@@ -828,23 +829,26 @@ static void shrink_dentry_list(struct list_head *list)
 			continue;
 		}
 
-		parent = dentry_kill(dentry, 0);
-		/*
-		 * If dentry_kill returns NULL, we have nothing more to do.
-		 */
-		if (!parent)
-			continue;
-
-		if (unlikely(parent == dentry)) {
-			/*
-			 * trylocks have failed and d_lock has been held the
-			 * whole time, so it could not have been added to any
-			 * other lists. Just add it back to the shrink list.
-			 */
+		inode = dentry->d_inode;
+		if (inode && unlikely(!spin_trylock(&inode->i_lock))) {
 			d_shrink_add(dentry, list);
 			spin_unlock(&dentry->d_lock);
 			continue;
 		}
+
+		parent = NULL;
+		if (!IS_ROOT(dentry)) {
+			parent = dentry->d_parent;
+			if (unlikely(!spin_trylock(&parent->d_lock))) {
+				if (inode)
+					spin_unlock(&inode->i_lock);
+				d_shrink_add(dentry, list);
+				spin_unlock(&dentry->d_lock);
+				continue;
+			}
+		}
+
+		__dentry_kill(dentry);
 		/*
 		 * We need to prune ancestors too. This is necessary to prevent
 		 * quadratic behavior of shrink_dcache_parent(), but is also

From 046b961b45f93a92e4c70525a12f3d378bced130 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 29 May 2014 08:54:52 -0400
Subject: [PATCH 4/6] shrink_dentry_list(): take parent's ->d_lock earlier

The cause of livelocks there is that we are taking ->d_lock on
dentry and its parent in the wrong order, forcing us to use
trylock on the parent's one.  d_walk() takes them in the right
order, and unfortunately it's not hard to create a situation
when shrink_dentry_list() can't make progress since trylock
keeps failing, and shrink_dcache_parent() or check_submounts_and_drop()
keeps calling d_walk() disrupting the very shrink_dentry_list() it's
waiting for.

Solution is straightforward - if that trylock fails, let's unlock
the dentry itself and take locks in the right order.  We need to
stabilize ->d_parent without holding ->d_lock, but that's doable
using RCU.  And we'd better do that in the very beginning of the
loop in shrink_dentry_list(), since the checks on refcount, etc.
would need to be redone anyway.

That deals with a half of the problem - killing dentries on the
shrink list itself.  Another one (dropping their parents) is
in the next commit.

locking parent is interesting - it would be easy to do rcu_read_lock(),
lock whatever we think is a parent, lock dentry itself and check
if the parent is still the right one.  Except that we need to check
that *before* locking the dentry, or we are risking taking ->d_lock
out of order.  Fortunately, once the D1 is locked, we can check if
D2->d_parent is equal to D1 without the need to lock D2; D2->d_parent
can start or stop pointing to D1 only under D1->d_lock, so taking
D1->d_lock is enough.  In other words, the right solution is
rcu_read_lock/lock what looks like parent right now/check if it's
still our parent/rcu_read_unlock/lock the child.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 53 +++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 41 insertions(+), 12 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index c23f78a9d156..d54a99baf4f3 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -530,6 +530,38 @@ failed:
 	return dentry; /* try again with same dentry */
 }
 
+static inline struct dentry *lock_parent(struct dentry *dentry)
+{
+	struct dentry *parent = dentry->d_parent;
+	if (IS_ROOT(dentry))
+		return NULL;
+	if (likely(spin_trylock(&parent->d_lock)))
+		return parent;
+	spin_unlock(&dentry->d_lock);
+	rcu_read_lock();
+again:
+	parent = ACCESS_ONCE(dentry->d_parent);
+	spin_lock(&parent->d_lock);
+	/*
+	 * We can't blindly lock dentry until we are sure
+	 * that we won't violate the locking order.
+	 * Any changes of dentry->d_parent must have
+	 * been done with parent->d_lock held, so
+	 * spin_lock() above is enough of a barrier
+	 * for checking if it's still our child.
+	 */
+	if (unlikely(parent != dentry->d_parent)) {
+		spin_unlock(&parent->d_lock);
+		goto again;
+	}
+	rcu_read_unlock();
+	if (parent != dentry)
+		spin_lock(&dentry->d_lock);
+	else
+		parent = NULL;
+	return parent;
+}
+
 /* 
  * This is dput
  *
@@ -804,6 +836,8 @@ static void shrink_dentry_list(struct list_head *list)
 		struct inode *inode;
 		dentry = list_entry(list->prev, struct dentry, d_lru);
 		spin_lock(&dentry->d_lock);
+		parent = lock_parent(dentry);
+
 		/*
 		 * The dispose list is isolated and dentries are not accounted
 		 * to the LRU here, so we can simply remove it from the list
@@ -817,6 +851,8 @@ static void shrink_dentry_list(struct list_head *list)
 		 */
 		if ((int)dentry->d_lockref.count > 0) {
 			spin_unlock(&dentry->d_lock);
+			if (parent)
+				spin_unlock(&parent->d_lock);
 			continue;
 		}
 
@@ -824,6 +860,8 @@ static void shrink_dentry_list(struct list_head *list)
 		if (unlikely(dentry->d_flags & DCACHE_DENTRY_KILLED)) {
 			bool can_free = dentry->d_flags & DCACHE_MAY_FREE;
 			spin_unlock(&dentry->d_lock);
+			if (parent)
+				spin_unlock(&parent->d_lock);
 			if (can_free)
 				dentry_free(dentry);
 			continue;
@@ -833,22 +871,13 @@ static void shrink_dentry_list(struct list_head *list)
 		if (inode && unlikely(!spin_trylock(&inode->i_lock))) {
 			d_shrink_add(dentry, list);
 			spin_unlock(&dentry->d_lock);
+			if (parent)
+				spin_unlock(&parent->d_lock);
 			continue;
 		}
 
-		parent = NULL;
-		if (!IS_ROOT(dentry)) {
-			parent = dentry->d_parent;
-			if (unlikely(!spin_trylock(&parent->d_lock))) {
-				if (inode)
-					spin_unlock(&inode->i_lock);
-				d_shrink_add(dentry, list);
-				spin_unlock(&dentry->d_lock);
-				continue;
-			}
-		}
-
 		__dentry_kill(dentry);
+
 		/*
 		 * We need to prune ancestors too. This is necessary to prevent
 		 * quadratic behavior of shrink_dcache_parent(), but is also

From b2b80195d8829921506880f6dccd21cabd163d0d Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 29 May 2014 09:11:45 -0400
Subject: [PATCH 5/6] dealing with the rest of shrink_dentry_list() livelock

We have the same problem with ->d_lock order in the inner loop, where
we are dropping references to ancestors.  Same solution, basically -
instead of using dentry_kill() we use lock_parent() (introduced in the
previous commit) to get that lock in a safe way, recheck ->d_count
(in case if lock_parent() has ended up dropping and retaking ->d_lock
and somebody managed to grab a reference during that window), trylock
the inode->i_lock and use __dentry_kill() to do the rest.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index d54a99baf4f3..eb7c7255470c 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -885,8 +885,26 @@ static void shrink_dentry_list(struct list_head *list)
 		 * fragmentation.
 		 */
 		dentry = parent;
-		while (dentry && !lockref_put_or_lock(&dentry->d_lockref))
-			dentry = dentry_kill(dentry, 1);
+		while (dentry && !lockref_put_or_lock(&dentry->d_lockref)) {
+			parent = lock_parent(dentry);
+			if (dentry->d_lockref.count != 1) {
+				dentry->d_lockref.count--;
+				spin_unlock(&dentry->d_lock);
+				if (parent)
+					spin_unlock(&parent->d_lock);
+				break;
+			}
+			inode = dentry->d_inode;	/* can't be NULL */
+			if (unlikely(!spin_trylock(&inode->i_lock))) {
+				spin_unlock(&dentry->d_lock);
+				if (parent)
+					spin_unlock(&parent->d_lock);
+				cpu_relax();
+				continue;
+			}
+			__dentry_kill(dentry);
+			dentry = parent;
+		}
 	}
 }
 

From 8cbf74da435d1bd13dbb790f94c7ff67b2fb6af4 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 29 May 2014 09:18:26 -0400
Subject: [PATCH 6/6] dentry_kill() doesn't need the second argument now

it's 1 in the only remaining caller.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index eb7c7255470c..bce851dc03ef 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -500,8 +500,7 @@ static void __dentry_kill(struct dentry *dentry)
  * If ref is non-zero, then decrement the refcount too.
  * Returns dentry requiring refcount drop, or NULL if we're done.
  */
-static struct dentry *
-dentry_kill(struct dentry *dentry, int unlock_on_failure)
+static struct dentry *dentry_kill(struct dentry *dentry)
 	__releases(dentry->d_lock)
 {
 	struct inode *inode = dentry->d_inode;
@@ -523,10 +522,8 @@ dentry_kill(struct dentry *dentry, int unlock_on_failure)
 	return parent;
 
 failed:
-	if (unlock_on_failure) {
-		spin_unlock(&dentry->d_lock);
-		cpu_relax();
-	}
+	spin_unlock(&dentry->d_lock);
+	cpu_relax();
 	return dentry; /* try again with same dentry */
 }
 
@@ -615,7 +612,7 @@ repeat:
 	return;
 
 kill_it:
-	dentry = dentry_kill(dentry, 1);
+	dentry = dentry_kill(dentry);
 	if (dentry)
 		goto repeat;
 }