diff mbox series

[2/2] fsnotify: allow sleepable child flag update

Message ID 20221018041233.376977-3-stephen.s.brennan@oracle.com (mailing list archive)
State New, archived
Headers show
Series fsnotify: fix softlockups iterating over d_subdirs | expand

Commit Message

Stephen Brennan Oct. 18, 2022, 4:12 a.m. UTC
With very large d_subdirs lists, iteration can take a long time. Since
iteration needs to hold parent->d_lock, this can trigger soft lockups.
It would be best to make this iteration sleepable. Since we have the
inode locked exclusive, we can drop the parent->d_lock and sleep,
holding a reference to a child dentry, and continue iteration once we
wake.

Signed-off-by: Stephen Brennan <stephen.s.brennan@oracle.com>
---
 fs/notify/fsnotify.c | 72 ++++++++++++++++++++++++++++++--------------
 1 file changed, 50 insertions(+), 22 deletions(-)

Comments

Amir Goldstein Oct. 18, 2022, 5:36 a.m. UTC | #1
On Tue, Oct 18, 2022 at 7:12 AM Stephen Brennan
<stephen.s.brennan@oracle.com> wrote:
>
> With very large d_subdirs lists, iteration can take a long time. Since
> iteration needs to hold parent->d_lock, this can trigger soft lockups.
> It would be best to make this iteration sleepable. Since we have the
> inode locked exclusive, we can drop the parent->d_lock and sleep,
> holding a reference to a child dentry, and continue iteration once we
> wake.
>
> Signed-off-by: Stephen Brennan <stephen.s.brennan@oracle.com>
> ---
>  fs/notify/fsnotify.c | 72 ++++++++++++++++++++++++++++++--------------
>  1 file changed, 50 insertions(+), 22 deletions(-)
>
> diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
> index e887a195983b..499b19272b32 100644
> --- a/fs/notify/fsnotify.c
> +++ b/fs/notify/fsnotify.c
> @@ -102,10 +102,13 @@ void fsnotify_sb_delete(struct super_block *sb)
>   * on a child we run all of our children and set a dentry flag saying that the
>   * parent cares.  Thus when an event happens on a child it can quickly tell
>   * if there is a need to find a parent and send the event to the parent.
> + *
> + * Context: inode locked exclusive

Please add a code assertion:

        WARN_ON_ONCE(!inode_is_locked(inode));

and it probably wouldn't hurt to add an inline wrapper
fsnotify_update_child_dentry_flags()
that locks the inode and calls this helper.

>   */
>  void __fsnotify_update_child_dentry_flags(struct inode *inode)
>  {
> -       struct dentry *alias;
> +       struct dentry *child, *alias, *last_ref = NULL;
> +       struct list_head *p;
>         int watched;
>
>         if (!S_ISDIR(inode->i_mode))
> @@ -114,30 +117,55 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode)
>         /* determine if the children should tell inode about their events */
>         watched = fsnotify_inode_watches_children(inode);
>
> -       spin_lock(&inode->i_lock);
> -       /* run all of the dentries associated with this inode.  Since this is a
> -        * directory, there damn well better only be one item on this list */
> -       hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
> -               struct dentry *child;
> -
> -               /* run all of the children of the original inode and fix their
> -                * d_flags to indicate parental interest (their parent is the
> -                * original inode) */
> -               spin_lock(&alias->d_lock);
> -               list_for_each_entry(child, &alias->d_subdirs, d_child) {
> -                       if (!child->d_inode)
> -                               continue;
> +       alias = d_find_any_alias(inode);

Please make the alias change in a separate patch.
It is not explained in the commit message and it clutters
the diff, which makes reviewing the actual logic changes
harder.

> +
> +       /*
> +        * These lists can get very long, so we may need to sleep during
> +        * iteration. Normally this would be impossible without a cursor,
> +        * but since we have the inode locked exclusive, we're guaranteed
> +        * that the directory won't be modified, so whichever dentry we
> +        * pick to sleep on won't get moved. So, start a manual iteration
> +        * over d_subdirs which will allow us to sleep.
> +        */
> +       spin_lock(&alias->d_lock);
> +       p = alias->d_subdirs.next;
>
> -                       spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
> -                       if (watched)
> -                               child->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED;
> -                       else
> -                               child->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED;
> -                       spin_unlock(&child->d_lock);
> +       while (p != &alias->d_subdirs) {
> +               child = list_entry(p, struct dentry, d_child);

IMO it would be better to use list iterator helpers.
What was wrong with list_for_each_entry()?
Why did you feel that you need to open code it?

> +               if (need_resched()) {
> +                       /*
> +                        * We need to hold a reference while we sleep. But when
> +                        * we wake, dput() could free the dentry, invalidating
> +                        * the list pointers. We can't look at the list pointers
> +                        * until we re-lock the parent, and we can't dput() once
> +                        * we have the parent locked. So the solution is to hold
> +                        * onto our reference and free it the *next* time we drop
> +                        * alias->d_lock: either at the end of the function, or
> +                        * at the time of the next sleep.
> +                        */
> +                       dget(child);
> +                       spin_unlock(&alias->d_lock);
> +                       if (last_ref)
> +                               dput(last_ref);
> +                       last_ref = child;
> +                       cond_resched();
> +                       spin_lock(&alias->d_lock);
>                 }
> -               spin_unlock(&alias->d_lock);
> +               p = p->next;
> +
> +               if (!child->d_inode)
> +                       continue;
> +
> +               spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
> +               if (watched)
> +                       child->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED;
> +               else
> +                       child->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED;
> +               spin_unlock(&child->d_lock);
>         }
> -       spin_unlock(&inode->i_lock);
> +       spin_unlock(&alias->d_lock);
> +       if (last_ref)
> +               dput(last_ref);

Nit: the "if" is not needed. dput(NULL) works just fine.

Thanks,
Amir.
Yujie Liu Oct. 27, 2022, 7:50 a.m. UTC | #2
Greetings,

FYI, we noticed WARNING:possible_recursive_locking_detected due to commit (built with clang-14):

commit: bed2685d9557ff9a7705f4172651a138e5f705af ("[PATCH 2/2] fsnotify: allow sleepable child flag update")
url: https://github.com/intel-lab-lkp/linux/commits/Stephen-Brennan/fsnotify-Protect-i_fsnotify_mask-and-child-flags-with-inode-rwsem/20221018-131326
base: https://git.kernel.org/cgit/linux/kernel/git/jack/linux-fs.git fsnotify
patch link: https://lore.kernel.org/linux-fsdevel/20221018041233.376977-3-stephen.s.brennan@oracle.com
patch subject: [PATCH 2/2] fsnotify: allow sleepable child flag update

in testcase: boot

on test machine: qemu-system-x86_64 -enable-kvm -cpu SandyBridge -smp 2 -m 16G

caused below changes (please refer to attached dmesg/kmsg for entire log/backtrace):


[   31.979147][    T1]
[   31.979446][    T1] ============================================
[   31.980051][    T1] WARNING: possible recursive locking detected
[   31.980674][    T1] 6.0.0-rc4-00066-gbed2685d9557 #1 Not tainted
[   31.981286][    T1] --------------------------------------------
[   31.981889][    T1] systemd/1 is trying to acquire lock:
[   31.982432][    T1] ffff88813f542510 (&dentry->d_lock){+.+.}-{2:2}, at: lockref_get+0xd/0x80
[   31.983314][    T1]
[   31.983314][    T1] but task is already holding lock:
[   31.984040][    T1] ffff888100441b18 (&dentry->d_lock){+.+.}-{2:2}, at: __fsnotify_update_child_dentry_flags+0x85/0x2c0
[   31.985132][    T1]
[   31.985132][    T1] other info that might help us debug this:
[   31.985967][    T1]  Possible unsafe locking scenario:
[   31.985967][    T1]
[   31.986694][    T1]        CPU0
[   31.987025][    T1]        ----
[   31.987366][    T1]   lock(&dentry->d_lock);
[   31.987828][    T1]   lock(&dentry->d_lock);
[   31.988283][    T1]
[   31.988283][    T1]  *** DEADLOCK ***
[   31.988283][    T1]
[   31.989061][    T1]  May be due to missing lock nesting notation
[   31.989061][    T1]
[   31.989888][    T1] 3 locks held by systemd/1:
[   31.990361][    T1]  #0: ffff88815249e128 (&group->mark_mutex){+.+.}-{3:3}, at: __x64_sys_inotify_add_watch+0x2fc/0xc00
[   31.991473][    T1]  #1: ffff888100480af8 (&sb->s_type->i_mutex_key){++++}-{3:3}, at: fsnotify_recalc_mask+0xf1/0x1c0
[   31.992528][    T1]  #2: ffff888100441b18 (&dentry->d_lock){+.+.}-{2:2}, at: __fsnotify_update_child_dentry_flags+0x85/0x2c0
[   31.993671][    T1]
[   31.993671][    T1] stack backtrace:
[   31.994260][    T1] CPU: 0 PID: 1 Comm: systemd Not tainted 6.0.0-rc4-00066-gbed2685d9557 #1 1afcec0fe797aeed18cb95313bac4a75fb6852d3
[   31.995440][    T1] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.0-debian-1.16.0-4 04/01/2014
[   31.996441][    T1] Call Trace:
[   31.996791][    T1]  <TASK>
[   31.997101][    T1]  dump_stack_lvl+0x6a/0x100
[   31.997590][    T1]  __lock_acquire+0x1110/0x7480
[   31.998105][    T1]  ? mark_lock+0x9a/0x380
[   31.998560][    T1]  ? mark_held_locks+0xad/0x1c0
[   31.999056][    T1]  ? lockdep_hardirqs_on_prepare+0x1a8/0x400
[   31.999650][    T1]  ? asm_sysvec_apic_timer_interrupt+0x1a/0x20
[   32.000276][    T1]  lock_acquire+0x177/0x480
[   32.000739][    T1]  ? lockref_get+0xd/0x80
[   32.001178][    T1]  _raw_spin_lock+0x2f/0x40
[   32.001656][    T1]  ? lockref_get+0xd/0x80
[   32.002093][    T1]  lockref_get+0xd/0x80
[   32.002529][    T1]  __fsnotify_update_child_dentry_flags+0x142/0x2c0
[   32.003178][    T1]  fsnotify_recalc_mask+0x126/0x1c0
[   32.003711][    T1]  fsnotify_add_mark_locked+0xd9e/0x1280
[   32.004292][    T1]  __x64_sys_inotify_add_watch+0x755/0xc00
[   32.004898][    T1]  ? syscall_enter_from_user_mode+0x26/0x180
[   32.005660][    T1]  do_syscall_64+0x6d/0xc0
[   32.006125][    T1]  entry_SYSCALL_64_after_hwframe+0x46/0xb0
[   32.006735][    T1] RIP: 0033:0x7f839dd0a8f7
[   32.007188][    T1] Code: f0 ff ff 73 01 c3 48 8b 0d 96 f5 0b 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 b8 fe 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 69 f5 0b 00 f7 d8 64 89 01 48
[   32.009103][    T1] RSP: 002b:00007ffe52095c98 EFLAGS: 00000202 ORIG_RAX: 00000000000000fe
[   32.009945][    T1] RAX: ffffffffffffffda RBX: 0000555bc72cf930 RCX: 00007f839dd0a8f7
[   32.010685][    T1] RDX: 0000000000000d84 RSI: 0000555bc72cf930 RDI: 000000000000001a
[   32.011469][    T1] RBP: 0000555bc72cf931 R08: 00000000fe000000 R09: 0000555bc72a1e90
[   32.012266][    T1] R10: 00007ffe52095c2c R11: 0000000000000202 R12: 0000000000000000
[   32.012976][    T1] R13: 0000555bc72a1e90 R14: 0000000000000d84 R15: 0000555bc72cf930
[   32.013705][    T1]  </TASK>


If you fix the issue, kindly add the following tags:
| Reported-by: kernel test robot <yujie.liu@intel.com>
| Link: https://lore.kernel.org/oe-lkp/202210271500.731e3808-yujie.liu@intel.com


To reproduce:

        # build kernel
	cd linux
	cp config-6.0.0-rc4-00066-gbed2685d9557 .config
	make HOSTCC=clang-14 CC=clang-14 ARCH=x86_64 olddefconfig prepare modules_prepare bzImage modules
	make HOSTCC=clang-14 CC=clang-14 ARCH=x86_64 INSTALL_MOD_PATH=<mod-install-dir> modules_install
	cd <mod-install-dir>
	find lib/ | cpio -o -H newc --quiet | gzip > modules.cgz


        git clone https://github.com/intel/lkp-tests.git
        cd lkp-tests
        bin/lkp qemu -k <bzImage> -m modules.cgz job-script # job-script is attached in this email

        # if you come across any failure that blocks the test,
        # please remove ~/.lkp and /lkp dir to run from a clean state.
Yujie Liu Oct. 27, 2022, 8:44 a.m. UTC | #3
On Thu, Oct 27, 2022 at 03:50:17PM +0800, kernel test robot wrote:
> Greeting,
> 
> FYI, we noticed WARNING:possible_recursive_locking_detected due to commit (built with clang-14):
> 
> commit: bed2685d9557ff9a7705f4172651a138e5f705af ("[PATCH 2/2] fsnotify: allow sleepable child flag update")
> url: https://github.com/intel-lab-lkp/linux/commits/Stephen-Brennan/fsnotify-Protect-i_fsnotify_mask-and-child-flags-with-inode-rwsem/20221018-131326
> base: https://git.kernel.org/cgit/linux/kernel/git/jack/linux-fs.git fsnotify
> patch link: https://lore.kernel.org/linux-fsdevel/20221018041233.376977-3-stephen.s.brennan@oracle.com
> patch subject: [PATCH 2/2] fsnotify: allow sleepable child flag update
> 
> in testcase: boot
> 
> on test machine: qemu-system-x86_64 -enable-kvm -cpu SandyBridge -smp 2 -m 16G
> 
> caused below changes (please refer to attached dmesg/kmsg for entire log/backtrace):

Sorry, this report is for the v1 patch, which seems to be obsolete now.
Please check the details in the report; if the issue has already been
fixed in v2, please ignore this report. Thanks.

--
Best Regards,
Yujie

> [   31.979147][    T1]
> [   31.979446][    T1] ============================================
> [   31.980051][    T1] WARNING: possible recursive locking detected
> [   31.980674][    T1] 6.0.0-rc4-00066-gbed2685d9557 #1 Not tainted
> [   31.981286][    T1] --------------------------------------------
> [   31.981889][    T1] systemd/1 is trying to acquire lock:
> [   31.982432][    T1] ffff88813f542510 (&dentry->d_lock){+.+.}-{2:2}, at: lockref_get+0xd/0x80
> [   31.983314][    T1]
> [   31.983314][    T1] but task is already holding lock:
> [   31.984040][    T1] ffff888100441b18 (&dentry->d_lock){+.+.}-{2:2}, at: __fsnotify_update_child_dentry_flags+0x85/0x2c0
> [   31.985132][    T1]
> [   31.985132][    T1] other info that might help us debug this:
> [   31.985967][    T1]  Possible unsafe locking scenario:
> [   31.985967][    T1]
> [   31.986694][    T1]        CPU0
> [   31.987025][    T1]        ----
> [   31.987366][    T1]   lock(&dentry->d_lock);
> [   31.987828][    T1]   lock(&dentry->d_lock);
> [   31.988283][    T1]
> [   31.988283][    T1]  *** DEADLOCK ***
> [   31.988283][    T1]
> [   31.989061][    T1]  May be due to missing lock nesting notation
> [   31.989061][    T1]
> [   31.989888][    T1] 3 locks held by systemd/1:
> [   31.990361][    T1]  #0: ffff88815249e128 (&group->mark_mutex){+.+.}-{3:3}, at: __x64_sys_inotify_add_watch+0x2fc/0xc00
> [   31.991473][    T1]  #1: ffff888100480af8 (&sb->s_type->i_mutex_key){++++}-{3:3}, at: fsnotify_recalc_mask+0xf1/0x1c0
> [   31.992528][    T1]  #2: ffff888100441b18 (&dentry->d_lock){+.+.}-{2:2}, at: __fsnotify_update_child_dentry_flags+0x85/0x2c0
> [   31.993671][    T1]
> [   31.993671][    T1] stack backtrace:
> [   31.994260][    T1] CPU: 0 PID: 1 Comm: systemd Not tainted 6.0.0-rc4-00066-gbed2685d9557 #1 1afcec0fe797aeed18cb95313bac4a75fb6852d3
> [   31.995440][    T1] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.0-debian-1.16.0-4 04/01/2014
> [   31.996441][    T1] Call Trace:
> [   31.996791][    T1]  <TASK>
> [   31.997101][    T1]  dump_stack_lvl+0x6a/0x100
> [   31.997590][    T1]  __lock_acquire+0x1110/0x7480
> [   31.998105][    T1]  ? mark_lock+0x9a/0x380
> [   31.998560][    T1]  ? mark_held_locks+0xad/0x1c0
> [   31.999056][    T1]  ? lockdep_hardirqs_on_prepare+0x1a8/0x400
> [   31.999650][    T1]  ? asm_sysvec_apic_timer_interrupt+0x1a/0x20
> [   32.000276][    T1]  lock_acquire+0x177/0x480
> [   32.000739][    T1]  ? lockref_get+0xd/0x80
> [   32.001178][    T1]  _raw_spin_lock+0x2f/0x40
> [   32.001656][    T1]  ? lockref_get+0xd/0x80
> [   32.002093][    T1]  lockref_get+0xd/0x80
> [   32.002529][    T1]  __fsnotify_update_child_dentry_flags+0x142/0x2c0
> [   32.003178][    T1]  fsnotify_recalc_mask+0x126/0x1c0
> [   32.003711][    T1]  fsnotify_add_mark_locked+0xd9e/0x1280
> [   32.004292][    T1]  __x64_sys_inotify_add_watch+0x755/0xc00
> [   32.004898][    T1]  ? syscall_enter_from_user_mode+0x26/0x180
> [   32.005660][    T1]  do_syscall_64+0x6d/0xc0
> [   32.006125][    T1]  entry_SYSCALL_64_after_hwframe+0x46/0xb0
> [   32.006735][    T1] RIP: 0033:0x7f839dd0a8f7
> [   32.007188][    T1] Code: f0 ff ff 73 01 c3 48 8b 0d 96 f5 0b 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 b8 fe 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 69 f5 0b 00 f7 d8 64 89 01 48
> [   32.009103][    T1] RSP: 002b:00007ffe52095c98 EFLAGS: 00000202 ORIG_RAX: 00000000000000fe
> [   32.009945][    T1] RAX: ffffffffffffffda RBX: 0000555bc72cf930 RCX: 00007f839dd0a8f7
> [   32.010685][    T1] RDX: 0000000000000d84 RSI: 0000555bc72cf930 RDI: 000000000000001a
> [   32.011469][    T1] RBP: 0000555bc72cf931 R08: 00000000fe000000 R09: 0000555bc72a1e90
> [   32.012266][    T1] R10: 00007ffe52095c2c R11: 0000000000000202 R12: 0000000000000000
> [   32.012976][    T1] R13: 0000555bc72a1e90 R14: 0000000000000d84 R15: 0000555bc72cf930
> [   32.013705][    T1]  </TASK>
>
>
> If you fix the issue, kindly add following tag
> | Reported-by: kernel test robot <yujie.liu@intel.com>
> | Link: https://lore.kernel.org/oe-lkp/202210271500.731e3808-yujie.liu@intel.com
>
>
> To reproduce:
>
>          # build kernel
> 	cd linux
> 	cp config-6.0.0-rc4-00066-gbed2685d9557 .config
> 	make HOSTCC=clang-14 CC=clang-14 ARCH=x86_64 olddefconfig prepare modules_prepare bzImage modules
> 	make HOSTCC=clang-14 CC=clang-14 ARCH=x86_64 INSTALL_MOD_PATH=<mod-install-dir> modules_install
> 	cd <mod-install-dir>
> 	find lib/ | cpio -o -H newc --quiet | gzip > modules.cgz
>
>
>          git clone https://github.com/intel/lkp-tests.git
>          cd lkp-tests
>          bin/lkp qemu -k <bzImage> -m modules.cgz job-script # job-script is attached in this email
>
>          # if come across any failure that blocks the test,
>          # please remove ~/.lkp and /lkp dir to run from a clean state.
>
>
Stephen Brennan Oct. 27, 2022, 10:12 p.m. UTC | #4
Yujie Liu <yujie.liu@intel.com> writes:
> On Thu, Oct 27, 2022 at 03:50:17PM +0800, kernel test robot wrote:
>> Greeting,
>> 
>> FYI, we noticed WARNING:possible_recursive_locking_detected due to commit (built with clang-14):
>> 
>> commit: bed2685d9557ff9a7705f4172651a138e5f705af ("[PATCH 2/2] fsnotify: allow sleepable child flag update")
>> url: https://github.com/intel-lab-lkp/linux/commits/Stephen-Brennan/fsnotify-Protect-i_fsnotify_mask-and-child-flags-with-inode-rwsem/20221018-131326
>> base: https://git.kernel.org/cgit/linux/kernel/git/jack/linux-fs.git fsnotify
>> patch link: https://lore.kernel.org/linux-fsdevel/20221018041233.376977-3-stephen.s.brennan@oracle.com
>> patch subject: [PATCH 2/2] fsnotify: allow sleepable child flag update
>> 
>> in testcase: boot
>> 
>> on test machine: qemu-system-x86_64 -enable-kvm -cpu SandyBridge -smp 2 -m 16G
>> 
>> caused below changes (please refer to attached dmesg/kmsg for entire log/backtrace):
>
> Sorry, this report is for the v1 patch which seems to be obsolete now.
> Please kindly check the details in report, if the issue has already been
> fixed in v2, please ignore this report. Thanks.

Thanks for the message; I'm looking deeper into it now. If it happens
on v1, it may very well occur on v2 as well.

Thanks,
Stephen
diff mbox series

Patch

diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index e887a195983b..499b19272b32 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -102,10 +102,13 @@  void fsnotify_sb_delete(struct super_block *sb)
  * on a child we run all of our children and set a dentry flag saying that the
  * parent cares.  Thus when an event happens on a child it can quickly tell
  * if there is a need to find a parent and send the event to the parent.
+ *
+ * Context: inode locked exclusive
  */
 void __fsnotify_update_child_dentry_flags(struct inode *inode)
 {
-	struct dentry *alias;
+	struct dentry *child, *alias, *last_ref = NULL;
+	struct list_head *p;
 	int watched;
 
 	if (!S_ISDIR(inode->i_mode))
@@ -114,30 +117,55 @@  void __fsnotify_update_child_dentry_flags(struct inode *inode)
 	/* determine if the children should tell inode about their events */
 	watched = fsnotify_inode_watches_children(inode);
 
-	spin_lock(&inode->i_lock);
-	/* run all of the dentries associated with this inode.  Since this is a
-	 * directory, there damn well better only be one item on this list */
-	hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
-		struct dentry *child;
-
-		/* run all of the children of the original inode and fix their
-		 * d_flags to indicate parental interest (their parent is the
-		 * original inode) */
-		spin_lock(&alias->d_lock);
-		list_for_each_entry(child, &alias->d_subdirs, d_child) {
-			if (!child->d_inode)
-				continue;
+	alias = d_find_any_alias(inode);
+
+	/*
+	 * These lists can get very long, so we may need to sleep during
+	 * iteration. Normally this would be impossible without a cursor,
+	 * but since we have the inode locked exclusive, we're guaranteed
+	 * that the directory won't be modified, so whichever dentry we
+	 * pick to sleep on won't get moved. So, start a manual iteration
+	 * over d_subdirs which will allow us to sleep.
+	 */
+	spin_lock(&alias->d_lock);
+	p = alias->d_subdirs.next;
 
-			spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
-			if (watched)
-				child->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED;
-			else
-				child->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED;
-			spin_unlock(&child->d_lock);
+	while (p != &alias->d_subdirs) {
+		child = list_entry(p, struct dentry, d_child);
+		if (need_resched()) {
+			/*
+			 * We need to hold a reference while we sleep. But when
+			 * we wake, dput() could free the dentry, invalidating
+			 * the list pointers. We can't look at the list pointers
+			 * until we re-lock the parent, and we can't dput() once
+			 * we have the parent locked. So the solution is to hold
+			 * onto our reference and free it the *next* time we drop
+			 * alias->d_lock: either at the end of the function, or
+			 * at the time of the next sleep.
+			 */
+			dget(child);
+			spin_unlock(&alias->d_lock);
+			if (last_ref)
+				dput(last_ref);
+			last_ref = child;
+			cond_resched();
+			spin_lock(&alias->d_lock);
 		}
-		spin_unlock(&alias->d_lock);
+		p = p->next;
+
+		if (!child->d_inode)
+			continue;
+
+		spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
+		if (watched)
+			child->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED;
+		else
+			child->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED;
+		spin_unlock(&child->d_lock);
 	}
-	spin_unlock(&inode->i_lock);
+	spin_unlock(&alias->d_lock);
+	if (last_ref)
+		dput(last_ref);
 }
 
 /* Are inode/sb/mount interested in parent and name info with this event? */