Message ID: 20130122142414.GA15978@liubo (mailing list archive)
State: New, archived
On Tue, 22 Jan 2013 22:24:15 +0800, Liu Bo wrote:
> On Tue, Jan 22, 2013 at 06:49:00PM +0800, Miao Xie wrote:
>> btrfs_start_delalloc_inodes() locks the delalloc_inodes list, fetches the
>> first inode, unlocks the list, triggers btrfs_alloc_delalloc_work/
>> btrfs_queue_worker for this inode, and then locks the list and checks the
>> head of the list again. But because we don't delete the first inode it has
>> just dealt with, it will fetch the same inode. As a result, this function
>> allocates a huge amount of btrfs_delalloc_work structures, and OOM happens.
>>
>> Fix this problem by splicing this delalloc list.
>>
>> Reported-by: Alex Lyakas <alex.btrfs@zadarastorage.com>
>> Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
>> ---
>>  fs/btrfs/inode.c | 55 ++++++++++++++++++++++++++++++++++++++++-------------
>>  1 files changed, 41 insertions(+), 14 deletions(-)
>>
>> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
>> index 67ed24a..86f1d25 100644
>> --- a/fs/btrfs/inode.c
>> +++ b/fs/btrfs/inode.c
>> @@ -7545,41 +7545,61 @@ void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
>>   */
>>  int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
>>  {
>> -	struct list_head *head = &root->fs_info->delalloc_inodes;
>>  	struct btrfs_inode *binode;
>>  	struct inode *inode;
>>  	struct btrfs_delalloc_work *work, *next;
>>  	struct list_head works;
>> +	struct list_head splice;
>>  	int ret = 0;
>>
>>  	if (root->fs_info->sb->s_flags & MS_RDONLY)
>>  		return -EROFS;
>>
>>  	INIT_LIST_HEAD(&works);
>> -
>> +	INIT_LIST_HEAD(&splice);
>> +again:
>>  	spin_lock(&root->fs_info->delalloc_lock);
>> -	while (!list_empty(head)) {
>> -		binode = list_entry(head->next, struct btrfs_inode,
>> +	list_splice_init(&root->fs_info->delalloc_inodes, &splice);
>> +	while (!list_empty(&splice)) {
>> +		binode = list_entry(splice.next, struct btrfs_inode,
>>  				    delalloc_inodes);
>> +
>> +		list_del_init(&binode->delalloc_inodes);
>> +
>
> I believe this patch can work well, but it's a little complex.
>
> How about adding a flag in runtime_flags set?

I have tried adding a flag in runtime_flags, but I found it is not a good
way, because
- it cannot avoid traversing the delalloc list repeatedly when someone
  writes data into the file endlessly. In fact, that is unnecessary,
  because we can treat such data as having been written after the flush
  was done.
- bit operations need to lock the bus, but we already have a spin lock
  protecting all the relevant variables, so the bit is unnecessary.

Besides that, there is something wrong with the following patch.

> We can use the flag instead of the 'delalloc_inodes' list to tell if we
> have cleared the delalloc bytes, and the most important thing is it
> won't touch the original code logic too much.
>
> thanks,
> liubo
>
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index 67ed24a..692ed0e 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -1555,8 +1555,8 @@ static void btrfs_clear_bit_hook(struct inode *inode,
>  		BTRFS_I(inode)->delalloc_bytes -= len;
>
>  		if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 &&
> -		    !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
> -			list_del_init(&BTRFS_I(inode)->delalloc_inodes);
> +		    test_bit(BTRFS_INODE_FLUSH, &BTRFS_I(inode)->runtime_flags)) {
> +			clear_bit(BTRFS_INODE_FLUSH, &BTRFS_I(inode)->runtime_flags);
>  		}

We cannot remove list_del_init(), because a delalloc file can be flushed
not only by btrfs_start_delalloc_inodes(), but also by the flusher thread
or by a task that invokes btrfs_sync_file().

>  		spin_unlock(&root->fs_info->delalloc_lock);
>  	}
> @@ -7562,8 +7562,9 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
>  		binode = list_entry(head->next, struct btrfs_inode,
>  				    delalloc_inodes);
>  		inode = igrab(&binode->vfs_inode);
> -		if (!inode)
> -			list_del_init(&binode->delalloc_inodes);
> +
> +		list_del_init(&binode->delalloc_inodes);
> +
>  		spin_unlock(&root->fs_info->delalloc_lock);
>  		if (inode) {
>  			work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
> @@ -7572,6 +7573,7 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
>  				goto out;
>  			}
>  			list_add_tail(&work->list, &works);
> +			set_bit(BTRFS_INODE_FLUSH, &binode->runtime_flags);

If someone flushes the file before set_bit(), no one will clear the bit.

>  			btrfs_queue_worker(&root->fs_info->flush_workers,
>  					   &work->work);
>  		}
> @@ -7580,6 +7582,18 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
>  	}
>  	spin_unlock(&root->fs_info->delalloc_lock);
>
> +	/* make sure we clear all delalloc bytes we have scheduled */
> +	while (!list_empty(&works)) {
> +		work = list_entry(works.next, struct btrfs_delalloc_work,
> +				  list);
> +		binode = btrfs_ino(work->inode);

		          ^^^^^^ BTRFS_I(), not btrfs_ino()

> +		if (!test_bit(BTRFS_INODE_FLUSH, &binode->runtime_flags)) {
> +			list_del_init(&work->list);
> +			btrfs_wait_and_free_delalloc_work(work);

We must wait for and free all the delalloc work here, or a memory leak
will happen.

Thanks
Miao

> +		}
> +		cond_resched();
> +	}
> +
> 	/* the filemap_flush will queue IO into the worker threads, but
> 	 * we have to make sure the IO is actually started and that
> 	 * ordered extents get created before we return
>
>
>
>>  		inode = igrab(&binode->vfs_inode);
>>  		if (!inode)
>> -			list_del_init(&binode->delalloc_inodes);
>> +			continue;
>> +
>> +		list_add_tail(&binode->delalloc_inodes,
>> +			      &root->fs_info->delalloc_inodes);
>>  		spin_unlock(&root->fs_info->delalloc_lock);
>> -		if (inode) {
>> -			work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
>> -			if (!work) {
>> -				ret = -ENOMEM;
>> -				goto out;
>> -			}
>> -			list_add_tail(&work->list, &works);
>> -			btrfs_queue_worker(&root->fs_info->flush_workers,
>> -					   &work->work);
>> +
>> +		work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
>> +		if (unlikely(!work)) {
>> +			ret = -ENOMEM;
>> +			goto out;
>>  		}
>> +		list_add_tail(&work->list, &works);
>> +		btrfs_queue_worker(&root->fs_info->flush_workers,
>> +				   &work->work);
>> +
>>  		cond_resched();
>>  		spin_lock(&root->fs_info->delalloc_lock);
>>  	}
>>  	spin_unlock(&root->fs_info->delalloc_lock);
>>
>> +	list_for_each_entry_safe(work, next, &works, list) {
>> +		list_del_init(&work->list);
>> +		btrfs_wait_and_free_delalloc_work(work);
>> +	}
>> +
>> +	spin_lock(&root->fs_info->delalloc_lock);
>> +	if (!list_empty(&root->fs_info->delalloc_inodes)) {
>> +		spin_unlock(&root->fs_info->delalloc_lock);
>> +		goto again;
>> +	}
>> +	spin_unlock(&root->fs_info->delalloc_lock);
>> +
>>  	/* the filemap_flush will queue IO into the worker threads, but
>>  	 * we have to make sure the IO is actually started and that
>>  	 * ordered extents get created before we return
>> @@ -7592,11 +7612,18 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
>>  			    atomic_read(&root->fs_info->async_delalloc_pages) == 0));
>>  	}
>>  	atomic_dec(&root->fs_info->async_submit_draining);
>> +	return 0;
>>  out:
>>  	list_for_each_entry_safe(work, next, &works, list) {
>>  		list_del_init(&work->list);
>>  		btrfs_wait_and_free_delalloc_work(work);
>>  	}
>> +
>> +	if (!list_empty_careful(&splice)) {
>> +		spin_lock(&root->fs_info->delalloc_lock);
>> +		list_splice_tail(&splice, &root->fs_info->delalloc_inodes);
>> +		spin_unlock(&root->fs_info->delalloc_lock);
>> +	}
>>  	return ret;
>>  }
>>
>> --
>> 1.6.5.2
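The splice-and-unlink idiom at the heart of the fix under discussion,
distilled into a minimal sketch (simplified names, no igrab()/work-queueing
or error handling; an illustration, not the actual btrfs code):

	static void flush_delalloc_sketch(struct btrfs_fs_info *fs_info)
	{
		struct btrfs_inode *binode;
		LIST_HEAD(splice);

		spin_lock(&fs_info->delalloc_lock);
		/* steal the whole list; concurrent adders start a fresh one */
		list_splice_init(&fs_info->delalloc_inodes, &splice);
		while (!list_empty(&splice)) {
			binode = list_entry(splice.next, struct btrfs_inode,
					    delalloc_inodes);
			/*
			 * Unlink before dropping the lock. The old code left
			 * the inode at the head of the global list, so each
			 * new iteration fetched the same inode and allocated
			 * another btrfs_delalloc_work until OOM.
			 */
			list_del_init(&binode->delalloc_inodes);
			spin_unlock(&fs_info->delalloc_lock);

			/* igrab() the inode and queue the flush work here */

			cond_resched();
			spin_lock(&fs_info->delalloc_lock);
		}
		spin_unlock(&fs_info->delalloc_lock);
	}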
On Wed, Jan 23, 2013 at 10:54:39AM +0800, Miao Xie wrote:
> On Tue, 22 Jan 2013 22:24:15 +0800, Liu Bo wrote:
> > I believe this patch can work well, but it's a little complex.
> >
> > How about adding a flag in runtime_flags set?
>
> I have tried adding a flag in runtime_flags, but I found it is not a good
> way, because
> - it cannot avoid traversing the delalloc list repeatedly when someone
>   writes data into the file endlessly. In fact, that is unnecessary,
>   because we can treat such data as having been written after the flush
>   was done.
> - bit operations need to lock the bus, but we already have a spin lock
>   protecting all the relevant variables, so the bit is unnecessary.
>
> Besides that, there is something wrong with the following patch.

Okay, I see the problem.

But with [PATCH 4/5], I think maybe we can merge these two patches and
simplify things as follows? Just flush them once:

	spin_lock(&root->fs_info->delalloc_lock);
	list_splice_init(&root->fs_info->delalloc_inodes, &splice);
	spin_unlock(&root->fs_info->delalloc_lock);

	while (!list_empty(&splice)) {
		...
	}

thanks,
liubo
On Wed, 23 Jan 2013 11:56:55 +0800, Liu Bo wrote:
> On Wed, Jan 23, 2013 at 10:54:39AM +0800, Miao Xie wrote:
>> Besides that, there is something wrong with the following patch.
>
> Okay, I see the problem.
>
> But with [PATCH 4/5], I think maybe we can merge these two patches and
> simplify things as follows? Just flush them once:
>
> 	spin_lock(&root->fs_info->delalloc_lock);
> 	list_splice_init(&root->fs_info->delalloc_inodes, &splice);
> 	spin_unlock(&root->fs_info->delalloc_lock);
>
> 	while (!list_empty(&splice)) {
> 		...
> 	}

No, we can't. The other tasks which flush the delalloc data may remove the
inode from the delalloc list/splice list. If we release the lock, we will
hit a race between list traversal and list_del().

Thanks
Miao
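To make the race Miao points out concrete: if the walker gives up
delalloc_lock for the whole traversal, a concurrent flusher can unlink
entries from under it. A sketch of the broken variant (illustrative only,
simplified names; not actual btrfs code):

	/* BUGGY: walks the spliced list without holding delalloc_lock */
	static void broken_walk_sketch(struct btrfs_fs_info *fs_info)
	{
		struct btrfs_inode *binode, *tmp;
		LIST_HEAD(splice);

		spin_lock(&fs_info->delalloc_lock);
		list_splice_init(&fs_info->delalloc_inodes, &splice);
		spin_unlock(&fs_info->delalloc_lock);	/* lock gone from here on */

		list_for_each_entry_safe(binode, tmp, &splice, delalloc_inodes) {
			/*
			 * Meanwhile another task (the flusher thread or
			 * btrfs_sync_file()) that clears this inode's last
			 * delalloc byte runs, under delalloc_lock:
			 *
			 *	list_del_init(&binode->delalloc_inodes);
			 *
			 * The entry is unlinked and reinitialized behind the
			 * walker's back, so the saved next pointer can lead
			 * out of 'splice' and the traversal misbehaves.
			 */
		}
	}

This is why the patch re-takes the lock before advancing to the next entry
and unlinks each inode itself.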
On Wed, Jan 23, 2013 at 12:44:49PM +0800, Miao Xie wrote:
> No, we can't. The other tasks which flush the delalloc data may remove the
> inode from the delalloc list/splice list. If we release the lock, we will
> hit a race between list traversal and list_del().

OK, then please merge patch 1 and 4 so that we can backport one less patch
at least.

thanks,
liubo
On Wed, 23 Jan 2013 14:06:21 +0800, Liu Bo wrote:
> On Wed, Jan 23, 2013 at 12:44:49PM +0800, Miao Xie wrote:
>> No, we can't. The other tasks which flush the delalloc data may remove the
>> inode from the delalloc list/splice list. If we release the lock, we will
>> hit a race between list traversal and list_del().
>
> OK, then please merge patch 1 and 4 so that we can backport one less patch
> at least.

I don't think we should merge these two patches, because they do two
different things - one is a bug fix, and the other is just an improvement.
The improvement changes the logic of the code and might be controversial
for some developers. So two patches are better than one, I think.

Thanks
Miao
On Wed, Jan 23, 2013 at 02:33:27PM +0800, Miao Xie wrote:
> On Wed, 23 Jan 2013 14:06:21 +0800, Liu Bo wrote:
>> OK, then please merge patch 1 and 4 so that we can backport one less patch
>> at least.
>
> I don't think we should merge these two patches, because they do two
> different things - one is a bug fix, and the other is just an improvement.
> The improvement changes the logic of the code and might be controversial
> for some developers. So two patches are better than one, I think.

Sorry, this is right only if patch 1 really fixes the problem Alex reported.

But the fact is

1) patch 1 is not enough to fix the bug: it just fixes the OOM from
allocating 'struct btrfs_delalloc_work', while the original OOM of
allocating requests remains. We can still get the same inode over and over
again and then get stuck in btrfs_start_delalloc_inodes(), because we
'goto again' to make sure we flush all inodes listed in
fs_info->delalloc_inodes.

2) patch 4 fixes 1)'s problems by removing 'goto again'.

Am I missing something?

thanks,
liubo
On Wed, 23 Jan 2013 16:17:53 +0800, Liu Bo wrote:
> On Wed, Jan 23, 2013 at 02:33:27PM +0800, Miao Xie wrote:
>> I don't think we should merge these two patches, because they do two
>> different things - one is a bug fix, and the other is just an improvement.
>
> Sorry, this is right only if patch 1 really fixes the problem Alex reported.
>
> But the fact is
>
> 1) patch 1 is not enough to fix the bug: it just fixes the OOM from
> allocating 'struct btrfs_delalloc_work', while the original OOM of
> allocating requests remains. We can still get the same inode over and over
> again and then get stuck in btrfs_start_delalloc_inodes(), because we
> 'goto again' to make sure we flush all inodes listed in
> fs_info->delalloc_inodes.
>
> 2) patch 4 fixes 1)'s problems by removing 'goto again'.
>
> Am I missing something?

In fact, there are two different problems.

One is the OOM bug. It is a serious problem and also a regression, so we
should fix it as soon as possible.

The other one is that we may fetch the same inode again and again if
someone writes data into it endlessly. This problem is not so urgent to
deal with, and perhaps some developers think it is not a problem at all -
we should flush that inode, since there are dirty pages in it. So we need
to split it from the patch for the 1st problem.

Thanks
Miao
Hi Miao,
thank you for addressing the issue. I will try out this 5/5 patchset and
let you know what I see. I will apply it on top of the latest for-linus
branch.

Thanks,
Alex.

On Wed, Jan 23, 2013 at 10:58 AM, Miao Xie <miaox@cn.fujitsu.com> wrote:
> In fact, there are two different problems.
>
> One is the OOM bug. It is a serious problem and also a regression, so we
> should fix it as soon as possible.
>
> The other one is that we may fetch the same inode again and again if
> someone writes data into it endlessly. [...]
On Wed, Jan 23, 2013 at 04:58:27PM +0800, Miao Xie wrote:
> In fact, there are two different problems.
>
> One is the OOM bug. It is a serious problem and also a regression, so we
> should fix it as soon as possible.
>
> The other one is that we may fetch the same inode again and again if
> someone writes data into it endlessly. This problem is not so urgent to
> deal with, and perhaps some developers think it is not a problem at all -
> we should flush that inode, since there are dirty pages in it. So we need
> to split it from the patch for the 1st problem.

All right, I'm ok with this.

But the TWO different problems are both due to fetching the same inode more
than once, the solutions are indeed the same - "fetch every inode on the
list just once" - and they are in the same code.

It is definitely a bug/problem if any user complains about their box
getting stuck. It is the KERNEL that should be blamed.

thanks,
liubo
On Wed, 23 Jan 2013 17:52:14 +0800, Liu Bo wrote:
> On Wed, Jan 23, 2013 at 04:58:27PM +0800, Miao Xie wrote:
>> In fact, there are two different problems. [...]
>
> All right, I'm ok with this.
>
> But the TWO different problems are both due to fetching the same inode more
> than once, the solutions are indeed the same - "fetch every inode on the
> list just once" - and they are in the same code.

I forgot to say that the first problem can happen even when no task writes
data into the file after we start to flush the delalloc inodes.

Thanks
Miao

> It is definitely a bug/problem if any user complains about their box
> getting stuck. It is the KERNEL that should be blamed.
Hi Miao,
I have tested your patch, and the two discussed issues (OOM and handling
the same inode more than once) are solved with it.

However, snap creation under IO still takes 5-10 minutes for me.
Basically, the situation is now similar to kernel 3.6, before your change
to push the work to the flush workers. I see that the flush worker is
stuck in one of two stacks like this:

[<ffffffff81301f1d>] get_request+0x14d/0x330
[<ffffffff813030a4>] blk_queue_bio+0x84/0x3b0
[<ffffffff812ff9a7>] generic_make_request.part.55+0x77/0xb0
[<ffffffff812fff48>] generic_make_request+0x68/0x70
[<ffffffff812fffcb>] submit_bio+0x7b/0x160
[<ffffffffa02fe76e>] submit_stripe_bio+0x5e/0x80 [btrfs]
[<ffffffffa03050db>] btrfs_map_bio+0x1cb/0x420 [btrfs]
[<ffffffffa02e1b14>] btrfs_submit_bio_hook+0xb4/0x1d0 [btrfs]
[<ffffffffa02f5b1a>] submit_one_bio+0x6a/0xa0 [btrfs]
[<ffffffffa02f99c4>] submit_extent_page.isra.38+0xe4/0x200 [btrfs]
[<ffffffffa02fa83c>] __extent_writepage+0x5dc/0x750 [btrfs]
[<ffffffffa02fac6a>] extent_write_cache_pages.isra.29.constprop.42+0x2ba/0x410 [btrfs]
[<ffffffffa02fb00e>] extent_writepages+0x4e/0x70 [btrfs]
[<ffffffffa02e0898>] btrfs_writepages+0x28/0x30 [btrfs]
[<ffffffff81136510>] do_writepages+0x20/0x40
[<ffffffff8112c341>] __filemap_fdatawrite_range+0x51/0x60
[<ffffffff8112cc3c>] filemap_flush+0x1c/0x20
[<ffffffffa02e416d>] btrfs_run_delalloc_work+0xad/0xe0 [btrfs]
[<ffffffffa0308c1f>] worker_loop+0x16f/0x5d0 [btrfs]
[<ffffffff8107ba50>] kthread+0xc0/0xd0
[<ffffffff8169d66c>] ret_from_fork+0x7c/0xb0
[<ffffffffffffffff>] 0xffffffffffffffff

or

[<ffffffff8112a58e>] sleep_on_page+0xe/0x20
[<ffffffff8112a577>] __lock_page+0x67/0x70
[<ffffffffa02fabe9>] extent_write_cache_pages.isra.29.constprop.42+0x239/0x410 [btrfs]
[<ffffffffa02fb00e>] extent_writepages+0x4e/0x70 [btrfs]
[<ffffffffa02e0898>] btrfs_writepages+0x28/0x30 [btrfs]
[<ffffffff81136510>] do_writepages+0x20/0x40
[<ffffffff8112c341>] __filemap_fdatawrite_range+0x51/0x60
[<ffffffff8112cc3c>] filemap_flush+0x1c/0x20
[<ffffffffa02e416d>] btrfs_run_delalloc_work+0xad/0xe0 [btrfs]
[<ffffffffa0308c1f>] worker_loop+0x16f/0x5d0 [btrfs]
[<ffffffff8107ba50>] kthread+0xc0/0xd0
[<ffffffff8169d66c>] ret_from_fork+0x7c/0xb0
[<ffffffffffffffff>] 0xffffffffffffffff

while btrfs_start_delalloc_inodes() waits for it to complete in the commit
thread. Also, for some reason I have only one "flush_workers" thread, so
switching to another thread does not improve things for me.

Another operation that takes time (up to one minute) is
btrfs_wait_ordered_extents(), which does a similar thing by switching the
work to the flush worker. In this case, btrfs_start_ordered_extent() is
stuck in stacks like this:

[<ffffffffa04e6fc5>] btrfs_start_ordered_extent+0x85/0x130 [btrfs]
[<ffffffffa04e70e1>] btrfs_run_ordered_extent_work+0x71/0xd0 [btrfs]
[<ffffffffa04facef>] worker_loop+0x16f/0x5d0 [btrfs]
[<ffffffff8107ba50>] kthread+0xc0/0xd0
[<ffffffff8169d66c>] ret_from_fork+0x7c/0xb0
[<ffffffffffffffff>] 0xffffffffffffffff

where it waits for BTRFS_ORDERED_COMPLETE.

I have several questions on how to possibly address this issue:

- I see that btrfs_flush_all_pending_stuffs() is invoked at least twice
during each transaction commit. It may be invoked more than twice if the
do { } while loop that waits for writes performs more than one iteration.
For me, each invocation takes a lot of time because of
btrfs_start_delalloc_inodes() and btrfs_wait_ordered_extents(). Given the
fixes you have made (handling each inode only once), is it still needed to
call these functions several times during the same commit?

- I see that during a commit without pending snapshots, these two
functions are not invoked at all:

	if (flush_on_commit || snap_pending) {
		ret = btrfs_start_delalloc_inodes(root, 1);
		if (ret)
			return ret;
		btrfs_wait_ordered_extents(root, 1);
	}

The FLUSHONCOMMIT mount option is normally *not set*; I see in the wiki
that it is "Not needed with recent kernels, as it's the normal mode of
operation." Can you pls explain why the delalloc flush is needed when
there is a pending snap, but not with a non-snap commit? Or pls point me
to the code where I can better understand what a delalloc inode is and how
it is related to FLUSHONCOMMIT or snapshots? Can you pls explain what the
FLUSHONCOMMIT mount option is about?

I understand from your explanation that without flushing the delalloc,
the data in the snap may be different from the subvolume data. But this
may happen anyways, since the IO to the subvolume is not blocked during
transaction commit (at least it doesn't look like it's blocked).

- Is there something that user-space can do to avoid flushing the delalloc
during snap-commit? For example, if user-space stops the IO and does a
normal commit, this will not call btrfs_start_delalloc_inodes(), but it
should ensure that *some* data is safe on disk. Then user-space triggers
snap creation (which is another commit), and then resumes the IO. Because
without the ongoing IO, snap creation is very fast.

Thanks for your help,
Alex.

P.S.: As I am writing this email, snap creation has been stuck for 30
minutes, calling btrfs_flush_all_pending_stuffs() again and again...

On Wed, Jan 23, 2013 at 12:20 PM, Miao Xie <miaox@cn.fujitsu.com> wrote:
> I forgot to say that the first problem can happen even when no task writes
> data into the file after we start to flush the delalloc inodes.
> [...]
On Wed, 23 Jan 2013 19:02:30 +0200, Alex Lyakas wrote:
> Hi Miao,
> I have tested your patch, and the two discussed issues (OOM and handling
> the same inode more than once) are solved with it.
>
> However, snap creation under IO still takes 5-10 minutes for me.
> [...]
>
> while btrfs_start_delalloc_inodes() waits for it to complete in the commit
> thread. Also, for some reason I have only one "flush_workers" thread, so
> switching to another thread does not improve things for me.

It is because the number of the inodes is very small, so the worker doesn't
create new workers to deal with the new work items. Maybe we can change
flush_workers->idle_thresh to improve this problem.

> Another operation that takes time (up to one minute) is
> btrfs_wait_ordered_extents(), which does a similar thing by switching the
> work to the flush worker. [...]
>
> I have several questions on how to possibly address this issue:
>
> - I see that btrfs_flush_all_pending_stuffs() is invoked at least twice
> during each transaction commit. [...] Given the fixes you have made
> (handling each inode only once), is it still needed to call these
> functions several times during the same commit?

We call it in the while loop because we don't know how long we need to
wait for the other transaction handles to end, so we flush the delalloc
inodes instead of just waiting.

Maybe we need not call btrfs_wait_ordered_extents() in the while loop, and
can just call btrfs_start_delalloc_inodes() there to start the flush.

> - I see that during a commit without pending snapshots, these two
> functions are not invoked at all. [...]
> Can you pls explain why the delalloc flush is needed when there is a
> pending snap, but not with a non-snap commit?
>
> I understand from your explanation that without flushing the delalloc,
> the data in the snap may be different from the subvolume data. But this
> may happen anyways, since the IO to the subvolume is not blocked during
> transaction commit (at least it doesn't look like it's blocked).

There are two cases where the data in the snap may differ from the
subvolume data without flushing the delalloc:

1st:
	User0
	  write(fd, buf, len)
	  create snap

2nd:
	User0				User1
	create snap
	  commit_transaction
					write(fd, buf, len)
	  create_pending_snapshot

For the 1st case, the user doesn't expect the data to be different between
the source subvolume and the snapshot.

For the 2nd case, it doesn't matter.

So in order to avoid the 1st case, we need to flush the delalloc data
before snapshot creation.

> - Is there something that user-space can do to avoid flushing the delalloc
> during snap-commit? For example, if user-space stops the IO and does a
> normal commit, this will not call btrfs_start_delalloc_inodes(), but it
> should ensure that *some* data is safe on disk. Then user-space triggers
> snap creation (which is another commit), and then resumes the IO. Because
> without the ongoing IO, snap creation is very fast.

We can sync the fs before creating the snapshot; in this way, we needn't
flush the delalloc data while committing the transaction. But I don't
think it is a good way, because for the users the page cache is
transparent.

I have an idea to improve this problem:
- introduce a per-subvolume delalloc inode list
- flush the delalloc inode list of the source subvolume in
  create_snapshot(), not in commit_transaction(). In this way, we just
  flush once.
- don't do the delalloc flush in commit_transaction() if there are pending
  snapshots
- if FLUSHONCOMMIT is set, just call btrfs_start_delalloc_inodes() before
  the while loop, and call btrfs_wait_ordered_extents() after the while
  loop.

What do you think about this idea?

Thanks
Miao
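Miao's proposal, reduced to a sketch. The per-root fields are hypothetical
here (at this point delalloc tracking lived in fs_info), and the function
bodies are only indicative of the intended flow, not real btrfs code:

	struct btrfs_root {
		/* ... existing fields ... */

		/* proposed: per-subvolume delalloc tracking */
		spinlock_t delalloc_lock;
		struct list_head delalloc_inodes;
	};

	/* snapshot creation flushes only the source subvolume, once */
	static int create_snapshot_sketch(struct btrfs_root *src_root)
	{
		int ret;

		ret = btrfs_start_delalloc_inodes(src_root, 1);
		if (ret)
			return ret;
		btrfs_wait_ordered_extents(src_root, 1);

		/*
		 * ...queue the pending snapshot and commit; with pending
		 * snapshots, commit_transaction() itself no longer flushes.
		 */
		return 0;
	}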
Hi Miao, On Thu, Jan 24, 2013 at 4:14 AM, Miao Xie <miaox@cn.fujitsu.com> wrote: > On wed, 23 Jan 2013 19:02:30 +0200, Alex Lyakas wrote: >> Hi Miao, >> I have tested your patch, and the two discussed issues (OOM and >> handling the same inode more than once) are solved with it. >> >> However, snap creation under IO still takes 5-10 minutes for me. >> Basically, now the situation is similar to kernel 3.6, before your >> change to push the work to the flush workers. I see that flush worker >> is stuck in one of two stacks like this: >> >> [<ffffffff81301f1d>] get_request+0x14d/0x330 >> [<ffffffff813030a4>] blk_queue_bio+0x84/0x3b0 >> [<ffffffff812ff9a7>] generic_make_request.part.55+0x77/0xb0 >> [<ffffffff812fff48>] generic_make_request+0x68/0x70 >> [<ffffffff812fffcb>] submit_bio+0x7b/0x160 >> [<ffffffffa02fe76e>] submit_stripe_bio+0x5e/0x80 [btrfs] >> [<ffffffffa03050db>] btrfs_map_bio+0x1cb/0x420 [btrfs] >> [<ffffffffa02e1b14>] btrfs_submit_bio_hook+0xb4/0x1d0 [btrfs] >> [<ffffffffa02f5b1a>] submit_one_bio+0x6a/0xa0 [btrfs] >> [<ffffffffa02f99c4>] submit_extent_page.isra.38+0xe4/0x200 [btrfs] >> [<ffffffffa02fa83c>] __extent_writepage+0x5dc/0x750 [btrfs] >> [<ffffffffa02fac6a>] >> extent_write_cache_pages.isra.29.constprop.42+0x2ba/0x410 [btrfs] >> [<ffffffffa02fb00e>] extent_writepages+0x4e/0x70 [btrfs] >> [<ffffffffa02e0898>] btrfs_writepages+0x28/0x30 [btrfs] >> [<ffffffff81136510>] do_writepages+0x20/0x40 >> [<ffffffff8112c341>] __filemap_fdatawrite_range+0x51/0x60 >> [<ffffffff8112cc3c>] filemap_flush+0x1c/0x20 >> [<ffffffffa02e416d>] btrfs_run_delalloc_work+0xad/0xe0 [btrfs] >> [<ffffffffa0308c1f>] worker_loop+0x16f/0x5d0 [btrfs] >> [<ffffffff8107ba50>] kthread+0xc0/0xd0 >> [<ffffffff8169d66c>] ret_from_fork+0x7c/0xb0 >> [<ffffffffffffffff>] 0xffffffffffffffff >> >> or >> >> [<ffffffff8112a58e>] sleep_on_page+0xe/0x20 >> [<ffffffff8112a577>] __lock_page+0x67/0x70 >> [<ffffffffa02fabe9>] >> extent_write_cache_pages.isra.29.constprop.42+0x239/0x410 [btrfs] >> [<ffffffffa02fb00e>] extent_writepages+0x4e/0x70 [btrfs] >> [<ffffffffa02e0898>] btrfs_writepages+0x28/0x30 [btrfs] >> [<ffffffff81136510>] do_writepages+0x20/0x40 >> [<ffffffff8112c341>] __filemap_fdatawrite_range+0x51/0x60 >> [<ffffffff8112cc3c>] filemap_flush+0x1c/0x20 >> [<ffffffffa02e416d>] btrfs_run_delalloc_work+0xad/0xe0 [btrfs] >> [<ffffffffa0308c1f>] worker_loop+0x16f/0x5d0 [btrfs] >> [<ffffffff8107ba50>] kthread+0xc0/0xd0 >> [<ffffffff8169d66c>] ret_from_fork+0x7c/0xb0 >> [<ffffffffffffffff>] 0xffffffffffffffff >> >> while btrfs_start_delalloc_inodes() waits for it to complete in the >> commit thread. Also for some reason, I have only one "flush_workers" >> thread, so switching to another thread does not improve for me. > > it is because the number of the inodes is very few, so the worker don't > create new workers to deal with the new works. Maybe we can change > flush_workers->idle_thresh to improve this problem. > >> Another operation that takes time (up to one minute) is >> btrfs_wait_ordered_extents(), which does similar thing by switching >> the work to the flush worker. 
>> Another operation that takes time (up to one minute) is >> btrfs_wait_ordered_extents(), which does a similar thing by switching >> the work to the flush worker. In this case, >> btrfs_start_ordered_extent() is stuck in stacks like the following:

>> [<ffffffffa04e6fc5>] btrfs_start_ordered_extent+0x85/0x130 [btrfs]
>> [<ffffffffa04e70e1>] btrfs_run_ordered_extent_work+0x71/0xd0 [btrfs]
>> [<ffffffffa04facef>] worker_loop+0x16f/0x5d0 [btrfs]
>> [<ffffffff8107ba50>] kthread+0xc0/0xd0
>> [<ffffffff8169d66c>] ret_from_fork+0x7c/0xb0
>> [<ffffffffffffffff>] 0xffffffffffffffff

>> where it waits for BTRFS_ORDERED_COMPLETE. >> >> I have several questions on how to possibly address this issue: >> >> - I see that btrfs_flush_all_pending_stuffs() is invoked at least >> twice during each transaction commit. It may be invoked more than >> twice if the do { } while loop that waits for writers performs more >> than one iteration. For me, each invocation takes a lot of time >> because of btrfs_start_delalloc_inodes() and >> btrfs_wait_ordered_extents(). Given the fixes you have made (handling >> each inode only once), is it still necessary to call these functions >> several times during the same commit? > > We call it in the while loop because we don't know how long we need to > wait for the other transaction handles to end, so we flush the delalloc > inodes instead of just waiting. > > Maybe we don't need to call btrfs_wait_ordered_extents() in the while loop; we could just > call btrfs_start_delalloc_inodes() there to start the flush. > >> - I see that during a commit without pending snapshots, these two >> functions are not invoked at all:

>> if (flush_on_commit || snap_pending) {
>> 	ret = btrfs_start_delalloc_inodes(root, 1);
>> 	if (ret)
>> 		return ret;
>> 	btrfs_wait_ordered_extents(root, 1);
>> }

>> The FLUSHONCOMMIT mount option is normally *not set*; I see in the >> wiki that it is "Not needed with recent kernels, as it's the normal >> mode of operation." >> Can you please explain why the delalloc flush is needed when there is a pending >> snap, but not for a non-snap commit? Or please point me to the code, >> so I can better understand what a delalloc inode is and how it is >> related to FLUSHONCOMMIT or snapshots. Can you please explain what the >> FLUSHONCOMMIT mount option is about? >> >> I understand from your explanation that without flushing the delalloc, >> the data in the snap may be different from the subvolume data. But >> this may happen anyway, since the IO to the subvolume is not blocked >> during transaction commit (at least it doesn't look like it is >> blocked). > > There are two cases in which the data in the snap may be different from the > subvolume data without flushing the delalloc:

> 1st:
> 	User0
> 	write(fd, buf, len)
> 	create snap
>
> 2nd:
> 	User0				User1
> 	create snap
> 	commit_transaction
> 					write(fd, buf, len)
> 	create_pending_snapshot

> For the 1st case, the user doesn't expect the data to be different between the source > subvolume and the snapshot. > > For the 2nd case, it doesn't matter. > > So in order to avoid the 1st case, we need to flush the delalloc before snapshot > creation. Thanks for clarifying, Miao. > >> - Is there something that user-space can do to avoid flushing the >> delalloc during a snap-commit? For example, the user-space stops the >> IO and does a normal commit; this will not call >> btrfs_start_delalloc_inodes(), but it should ensure that *some* data >> is safe on disk. Then the user-space triggers snap creation (which is >> another commit), and then resumes the IO? Because without the ongoing >> IO, snap creation is very fast.
> > We can sync the fs before creating the snapshot; in this way, we needn't flush > the delalloc while committing the transaction. But I don't think it is a good > way, because for the users, the page cache is transparent. I was always under the impression that in Linux you must be aware of the page cache. For example, if a C program writes to a file, it is also responsible for fsync()ing the file (and checking the return value) to make sure that the data has been persisted. However, for snap creation it is perhaps better to do this automatically for the user. (A user-space sketch of the sync-then-snapshot approach appears after this message.) > > I have an idea to improve this problem: I think the idea below is very good: > - introduce per-subvolume delalloc inode list Perfect. (A sketch of a per-subvolume flush also appears after this message.) > - flush the delalloc inode list of the source subvolume in create_snapshot(), > not in commit_transaction(). In this way, we just flush once. Perfect. > - don't do the delalloc flush in commit_transaction() if there are pending snapshots Perfect. > - If FLUSHONCOMMIT is set, just call btrfs_start_delalloc_inodes() before the while loop, > and call btrfs_wait_ordered_extents() after the while loop. In case FLUSHONCOMMIT is not set but there are pending snapshots, is it also needed to call btrfs_wait_ordered_extents(), since we have started the delalloc IO in create_snapshot()? I have run additional tests with your patchset. It definitely improves the two bugs. However, I have a question: if the IO to the filesystem continues during the extent_write_cache_pages() loop, can it add more dirty pages to the flush that this function is doing? Basically, I am looking for some way to separate "old" delallocs that must be flushed from "new" delallocs that were added after we started committing. For example, with your current patches, extent_write_cache_pages() is called at least twice per inode (I know that your new idea will fix this). In my case, the first call submitted 22671 pages (the inode is 10Gb) and the second call submitted 33705 pages. Between those two calls there were 6531 pages that were submitted twice. I noticed that if I stop the IO and then immediately trigger snap creation, many fewer pages are submitted. I played with the freeze_super() API, which also syncs the filesystem, so the delalloc flush is not needed in this case, like you mentioned. However, the snap creation flow also calls mnt_want_write_file(), which blocks if the FS is frozen. Does it make sense to have some kind of light-freeze functionality, which would only block the new writers (and possibly wait for in-flight writer threads to complete), but would not do the sync_filesystem part, only the SB_FREEZE_WRITE part? Thanks for your help, Alex.
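To make the user-space workaround concrete, here is a minimal sketch that syncs the filesystem and then creates the snapshot through the v2 ioctl. It assumes the application has already paused its own writers (otherwise new delalloc shows up again between the two steps) and that the ioctl definitions from the btrfs-progs headers of that era are available:

#define _GNU_SOURCE		/* for syncfs() */
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <btrfs/ioctl.h>	/* BTRFS_IOC_SNAP_CREATE_V2 */

/*
 * src_fd:  open fd of the source subvolume
 * dest_fd: open fd of the directory that will contain the snapshot
 */
static int snapshot_after_sync(int src_fd, int dest_fd, const char *name)
{
	struct btrfs_ioctl_vol_args_v2 args;

	/* persist the dirty pages now, so that the snapshot commit has
	 * (almost) no delalloc left to flush */
	if (syncfs(src_fd) < 0)
		return -1;

	memset(&args, 0, sizeof(args));
	args.fd = src_fd;
	strncpy(args.name, name, BTRFS_SUBVOL_NAME_MAX);
	return ioctl(dest_fd, BTRFS_IOC_SNAP_CREATE_V2, &args);
}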
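And a rough sketch of the per-subvolume idea itself, assuming struct btrfs_root grows its own delalloc_lock/delalloc_inodes pair (hypothetical field placement; this is only the shape of the change, not Miao's actual patch). create_snapshot() could then flush just the source subvolume:

static int start_delalloc_inodes_one_root(struct btrfs_root *root,
					  int delay_iput)
{
	struct btrfs_inode *binode;
	struct inode *inode;
	LIST_HEAD(splice);

	spin_lock(&root->delalloc_lock);
	list_splice_init(&root->delalloc_inodes, &splice);
	while (!list_empty(&splice)) {
		binode = list_entry(splice.next, struct btrfs_inode,
				    delalloc_inodes);
		/* delete before unlocking, so we never race with
		 * btrfs_clear_bit_hook() doing list_del() behind us */
		list_del_init(&binode->delalloc_inodes);
		inode = igrab(&binode->vfs_inode);
		spin_unlock(&root->delalloc_lock);
		if (inode) {
			filemap_flush(inode->i_mapping);
			if (delay_iput)
				btrfs_add_delayed_iput(inode);
			else
				iput(inode);
		}
		cond_resched();
		spin_lock(&root->delalloc_lock);
	}
	spin_unlock(&root->delalloc_lock);
	return 0;
}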
On Thu, 24 Jan 2013 18:20:06 +0200, Alex Lyakas wrote: >>> - Is there something that user-space can do to avoid flushing the >>> delalloc during a snap-commit? For example, the user-space stops the >>> IO and does a normal commit; this will not call >>> btrfs_start_delalloc_inodes(), but it should ensure that *some* data >>> is safe on disk. Then the user-space triggers snap creation (which is >>> another commit), and then resumes the IO? Because without the ongoing >>> IO, snap creation is very fast. >> >> We can sync the fs before creating the snapshot; in this way, we needn't flush >> the delalloc while committing the transaction. But I don't think it is a good >> way, because for the users, the page cache is transparent. > > I was always under the impression that in Linux you must be aware of the > page cache. For example, if a C program writes to a file, it is also > responsible for fsync()ing the file (and checking the return value) to make sure > that the data has been persisted. However, for snap creation it is > perhaps better to do this automatically for the user. > >> >> I have an idea to improve this problem: > I think the idea below is very good: > >> - introduce per-subvolume delalloc inode list > Perfect. > >> - flush the delalloc inode list of the source subvolume in create_snapshot(), >> not in commit_transaction(). In this way, we just flush once. > Perfect. > >> - don't do the delalloc flush in commit_transaction() if there are pending snapshots > Perfect. > >> - If FLUSHONCOMMIT is set, just call btrfs_start_delalloc_inodes() before the while loop, >> and call btrfs_wait_ordered_extents() after the while loop. > In case FLUSHONCOMMIT is not set but there are pending snapshots, is > it also needed to call btrfs_wait_ordered_extents(), since we have > started the delalloc IO in create_snapshot()? Yes, because FLUSHONCOMMIT will flush all delalloc inodes: those of the source subvolumes, and also of the others which are not being snapshotted. > I have run additional tests with your patchset. It definitely improves > the two bugs. However, I have a question: if the IO to the filesystem > continues during the extent_write_cache_pages() loop, can it add more > dirty pages to the flush that this function is doing? Basically, I am > looking for some way to separate "old" delallocs that must be flushed > from "new" delallocs that were added after we started committing. We cannot add dirty pages to the extents which are under disk IO, but we can add dirty pages to the other extents which belong to the same file but are not under disk IO. (Some developers have discussed in the past whether we may re-dirty an extent that is under disk IO, the "unstable page" problem. Their approach was to allocate a new page to store the new data, but other developers disagreed with the idea: disks are becoming faster and faster, so it is unnecessary, and if free memory is not enough, we still need to wait for the IO.) I think flushing the delalloc inodes at the beginning of btrfs_commit_transaction() is a simple way to separate the "old" delallocs from the "new" delallocs. > For example, with your current patches, extent_write_cache_pages() > is called at least twice per inode (I know that your new idea will fix > this). In my case, the first call submitted 22671 pages (the inode is 10Gb) > and the second call submitted 33705 pages. Between those two calls > there were 6531 pages that were submitted twice. I noticed that if I > stop the IO and then immediately trigger snap creation, many fewer > pages are submitted.
> > I played with the freeze_super() API, which also syncs the filesystem, > so the delalloc flush is not needed in this case, like you mentioned. > However, the snap creation flow also calls mnt_want_write_file(), > which blocks if the FS is frozen. > Does it make sense to have some kind of light-freeze functionality, > which would only block the new writers (and possibly wait for > in-flight writer threads to complete), but would not do the > sync_filesystem part, only the SB_FREEZE_WRITE part? I have implemented a light-freeze functionality for the R/W->R/O change of the subvolume (the patch will be sent out later). But I don't think it is necessary to introduce it to the snapshot creation. Thanks Miao
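For illustration, a rough sketch of what such a light freeze might look like, assuming the 3.8-era sb_writers layout in struct super_block (the helper names btrfs_light_freeze/btrfs_light_thaw are hypothetical, and sb_wait_write() in fs/super.c is static there, so this open-codes a simple polling wait instead of reusing it):

static void btrfs_light_freeze(struct super_block *sb)
{
	/* block new writers at the SB_FREEZE_WRITE level only,
	 * without calling sync_filesystem() */
	sb->s_writers.frozen = SB_FREEZE_WRITE;
	smp_wmb();	/* pairs with the frozen check in __sb_start_write() */

	/* wait for the in-flight writers to drain */
	while (percpu_counter_sum(&sb->s_writers.counter[SB_FREEZE_WRITE - 1]) > 0)
		schedule_timeout_uninterruptible(HZ / 50);
}

static void btrfs_light_thaw(struct super_block *sb)
{
	sb->s_writers.frozen = SB_UNFROZEN;
	smp_wmb();
	wake_up(&sb->s_writers.wait_unfrozen);
}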
Hi Miao, thank you for your comments. After some additional testing, a few more observations:

# I see that only the OOM-fix patch was included in the "for-linus" pull request, but the rest of the patches are in btrfs-next. I hope they will also make it into "for-linus" soon.

# For now, I have commented out the call to btrfs_flush_all_pending_stuffs() in the do { } while loop of btrfs_commit_transaction(). For me this improves things, by taking fewer iterations of the do { } while loop and flushing delalloc only once (see the sketch after this message). Of course, your future fix for the per-subvolume delalloc flush is better to have.

# I have tried to implement a quick-and-dirty light-freeze functionality, by simply setting sb->s_writers.frozen = SB_FREEZE_WRITE, thus making new callers of btrfs_file_aio_write() (and others) block. It improves the delalloc time very significantly. On the other hand, it blocks the host IO (for me, up to 20s sometimes), which may not be acceptable. I am looking forward to seeing your light-freeze patch.

# I have increased the block device request queue depth (/sys/block/<name>/queue/nr_requests). It also improves things a little: now extent_write_cache_pages() completes immediately, because it does not have to wait for free requests. The waiting is then pushed to btrfs_wait_ordered_extents().

Thanks again for your help, Alex.
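The first experiment above might look roughly like this against the 3.8-era btrfs_commit_transaction() (loop details reproduced from memory, so treat this as a sketch rather than the exact patch Alex tested):

	/* flush delalloc once, up front, instead of on every iteration */
	btrfs_flush_all_pending_stuffs(trans, root);

	do {
		joined = cur_trans->num_joined;

		WARN_ON(cur_trans != trans->transaction);

		prepare_to_wait(&cur_trans->writer_wait, &wait,
				TASK_UNINTERRUPTIBLE);

		if (atomic_read(&cur_trans->num_writers) > 1)
			schedule_timeout(MAX_SCHEDULE_TIMEOUT);
		else if (should_grow)
			schedule_timeout(1);

		finish_wait(&cur_trans->writer_wait, &wait);
	} while (atomic_read(&cur_trans->num_writers) > 1 ||
		 (should_grow && cur_trans->num_joined != joined));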
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 67ed24a..692ed0e 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1555,8 +1555,8 @@ static void btrfs_clear_bit_hook(struct inode *inode,
 		BTRFS_I(inode)->delalloc_bytes -= len;
 
 		if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 &&
-		    !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
-			list_del_init(&BTRFS_I(inode)->delalloc_inodes);
+		    test_bit(BTRFS_INODE_FLUSH, &BTRFS_I(inode)->runtime_flags)) {
+			clear_bit(BTRFS_INODE_FLUSH, &BTRFS_I(inode)->runtime_flags);
 		}
 		spin_unlock(&root->fs_info->delalloc_lock);
 	}
@@ -7562,8 +7562,9 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
 		binode = list_entry(head->next, struct btrfs_inode,
 				    delalloc_inodes);
 		inode = igrab(&binode->vfs_inode);
-		if (!inode)
-			list_del_init(&binode->delalloc_inodes);
+
+		list_del_init(&binode->delalloc_inodes);
+
 		spin_unlock(&root->fs_info->delalloc_lock);
 		if (inode) {
 			work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
@@ -7572,6 +7573,7 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
 				goto out;
 			}
 			list_add_tail(&work->list, &works);
+			set_bit(BTRFS_INODE_FLUSH, &binode->runtime_flags);
 			btrfs_queue_worker(&root->fs_info->flush_workers,
 					   &work->work);
 		}
@@ -7580,6 +7582,18 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
 	}
 	spin_unlock(&root->fs_info->delalloc_lock);
 
+	/* make sure we clear all delalloc bytes we have scheduled */
+	while (!list_empty(&works)) {
+		work = list_entry(works.next, struct btrfs_delalloc_work,
+				  list);
+		binode = BTRFS_I(work->inode);
+		if (!test_bit(BTRFS_INODE_FLUSH, &binode->runtime_flags)) {
+			list_del_init(&work->list);
+			btrfs_wait_and_free_delalloc_work(work);
+		}
+		cond_resched();
+	}
+
 	/* the filemap_flush will queue IO into the worker threads, but
 	 * we have to make sure the IO is actually started and that
 	 * ordered extents get created before we return