
[07/16] btrfs: Lock extents before folio for read()s

Message ID 5c7c77d0735c18cea82c347eef2ce2eb169681e6.1668530684.git.rgoldwyn@suse.com (mailing list archive)
State New, archived
Series: Lock extents before pages

Commit Message

Goldwyn Rodrigues Nov. 15, 2022, 6 p.m. UTC
Perform extent locking around buffered reads as opposed to while
performing folio reads.

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
---
 fs/btrfs/compression.c |  5 -----
 fs/btrfs/extent_io.c   | 36 +-----------------------------------
 fs/btrfs/file.c        | 17 ++++++++++++++---
 3 files changed, 15 insertions(+), 43 deletions(-)
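
In outline, the change hoists the extent lock out of the per-folio read path
and into btrfs_file_read_iter(), so the whole buffered read runs under a
single extent lock. A condensed sketch of the new read path, abridged from the
patch below (the direct-IO branch and error handling are omitted):

	static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
	{
		struct inode *inode = file_inode(iocb->ki_filp);
		u64 start = round_down(iocb->ki_pos, PAGE_SIZE);
		u64 end = round_up(iocb->ki_pos + iov_iter_count(to), PAGE_SIZE) - 1;
		struct extent_state *cached = NULL;
		ssize_t ret;

		/* Lock the extent range and flush ordered extents up front... */
		btrfs_lock_and_flush_ordered_range(BTRFS_I(inode), start, end, &cached);
		/* ...so btrfs_do_readpage() no longer locks/unlocks per range. */
		ret = filemap_read(iocb, to, 0);
		unlock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached);
		return ret;
	}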

Comments

kernel test robot Nov. 21, 2022, 1:31 p.m. UTC | #1
Greetings,

FYI, we noticed a -99.7% regression of fxmark.ssd_btrfs_DRBM_72_bufferedio.works/sec due to commit:


commit: 94d3d147892ad202ecbd0a373b63469f2d12667a ("[PATCH 07/16] btrfs: Lock extents before folio for read()s")
url: https://github.com/intel-lab-lkp/linux/commits/Goldwyn-Rodrigues/btrfs-check-for-range-correctness-while-locking-or-setting-extent-bits/20221116-030112
base: https://git.kernel.org/cgit/linux/kernel/git/kdave/linux.git for-next
patch link: https://lore.kernel.org/all/5c7c77d0735c18cea82c347eef2ce2eb169681e6.1668530684.git.rgoldwyn@suse.com/
patch subject: [PATCH 07/16] btrfs: Lock extents before folio for read()s

in testcase: fxmark
on test machine: 128 threads 2 sockets Intel(R) Xeon(R) Platinum 8358 CPU @ 2.60GHz (Ice Lake) with 128G memory
with following parameters:

	disk: 1SSD
	media: ssd
	test: DRBM
	fstype: btrfs
	directio: bufferedio
	cpufreq_governor: performance

test-description: FxMark is a filesystem benchmark that tests multicore scalability.
test-url: https://github.com/sslab-gatech/fxmark



If you fix the issue, kindly add the following tag
| Reported-by: kernel test robot <oliver.sang@intel.com>
| Link: https://lore.kernel.org/oe-lkp/202211212140.7f9c0a76-oliver.sang@intel.com


Details are as follows:
-------------------------------------------------------------------------------------------------->


To reproduce:

        git clone https://github.com/intel/lkp-tests.git
        cd lkp-tests
        sudo bin/lkp install job.yaml           # job file is attached in this email
        bin/lkp split-job --compatible job.yaml # generate the yaml file for lkp run
        sudo bin/lkp run generated-yaml-file

        # if you come across any failure that blocks the test,
        # please remove the ~/.lkp and /lkp directories to run from a clean state.

=========================================================================================
compiler/cpufreq_governor/directio/disk/fstype/kconfig/media/rootfs/tbox_group/test/testcase:
  gcc-11/performance/bufferedio/1SSD/btrfs/x86_64-rhel-8.3/ssd/debian-11.1-x86_64-20220510.cgz/lkp-icl-2sp5/DRBM/fxmark

commit: 
  440e8ad2af ("btrfs: Lock extents before pages in writepages")
  94d3d14789 ("btrfs: Lock extents before folio for read()s")

440e8ad2afce1fc9 94d3d147892ad202ecbd0a373b6 
---------------- --------------------------- 
         %stddev     %change         %stddev
             \          |                \  
      0.01 ±  7%     -27.6%       0.00 ± 14%  fxmark.ssd_btrfs_DRBM_18_bufferedio.softirq_util
    795.82           +12.4%     894.79        fxmark.ssd_btrfs_DRBM_18_bufferedio.sys_sec
     88.36           +12.4%      99.35        fxmark.ssd_btrfs_DRBM_18_bufferedio.sys_util
    100.70           -98.3%       1.74 ±  2%  fxmark.ssd_btrfs_DRBM_18_bufferedio.user_sec
     11.18           -98.3%       0.19 ±  2%  fxmark.ssd_btrfs_DRBM_18_bufferedio.user_util
 4.048e+09           -98.4%   64233289 ±  2%  fxmark.ssd_btrfs_DRBM_18_bufferedio.works
  80965674           -98.4%    1284665 ±  2%  fxmark.ssd_btrfs_DRBM_18_bufferedio.works/sec
      5.54 ±  6%     -34.4%       3.63        fxmark.ssd_btrfs_DRBM_1_bufferedio.user_sec
     11.07 ±  6%     -34.5%       7.26        fxmark.ssd_btrfs_DRBM_1_bufferedio.user_util
 2.324e+08           -44.4%  1.292e+08        fxmark.ssd_btrfs_DRBM_1_bufferedio.works
   4648529           -44.4%    2584226        fxmark.ssd_btrfs_DRBM_1_bufferedio.works/sec
     10.14 ± 17%     -63.2%       3.74 ±  7%  fxmark.ssd_btrfs_DRBM_2_bufferedio.user_sec
     10.14 ± 17%     -63.2%       3.73 ±  7%  fxmark.ssd_btrfs_DRBM_2_bufferedio.user_util
 4.647e+08           -68.4%   1.47e+08 ±  5%  fxmark.ssd_btrfs_DRBM_2_bufferedio.works
   9293719           -68.4%    2939729 ±  5%  fxmark.ssd_btrfs_DRBM_2_bufferedio.works/sec
      1199 ± 46%     +49.3%       1790        fxmark.ssd_btrfs_DRBM_36_bufferedio.sys_sec
     87.78           +13.2%      99.38        fxmark.ssd_btrfs_DRBM_36_bufferedio.sys_util
    150.29 ± 47%     -99.1%       1.41        fxmark.ssd_btrfs_DRBM_36_bufferedio.user_sec
     10.88 ±  4%     -99.3%       0.08        fxmark.ssd_btrfs_DRBM_36_bufferedio.user_util
 7.801e+09           -99.4%   47465006        fxmark.ssd_btrfs_DRBM_36_bufferedio.works
  1.56e+08           -99.4%     949300        fxmark.ssd_btrfs_DRBM_36_bufferedio.works/sec
    177.16           +11.3%     197.19        fxmark.ssd_btrfs_DRBM_4_bufferedio.sys_sec
     88.52           +11.3%      98.53        fxmark.ssd_btrfs_DRBM_4_bufferedio.sys_util
     22.14 ±  2%     -90.6%       2.09 ±  5%  fxmark.ssd_btrfs_DRBM_4_bufferedio.user_sec
     11.06 ±  2%     -90.6%       1.04 ±  5%  fxmark.ssd_btrfs_DRBM_4_bufferedio.user_util
 9.216e+08           -91.1%   81989808 ±  2%  fxmark.ssd_btrfs_DRBM_4_bufferedio.works
  18431506           -91.1%    1639796 ±  2%  fxmark.ssd_btrfs_DRBM_4_bufferedio.works/sec
      1993 ± 27%     +34.6%       2682        fxmark.ssd_btrfs_DRBM_54_bufferedio.sys_sec
     88.00           +12.8%      99.30        fxmark.ssd_btrfs_DRBM_54_bufferedio.sys_util
    250.85 ± 27%     -98.9%       2.78 ±  6%  fxmark.ssd_btrfs_DRBM_54_bufferedio.user_sec
     11.10 ±  2%     -99.1%       0.10 ±  6%  fxmark.ssd_btrfs_DRBM_54_bufferedio.user_util
 1.181e+10           -99.7%   40876182        fxmark.ssd_btrfs_DRBM_54_bufferedio.works
 2.362e+08           -99.7%     817522        fxmark.ssd_btrfs_DRBM_54_bufferedio.works/sec
      0.08 ± 32%     -28.9%       0.06        fxmark.ssd_btrfs_DRBM_72_bufferedio.idle_util
      0.10 ± 30%     +33.9%       0.14 ±  2%  fxmark.ssd_btrfs_DRBM_72_bufferedio.softirq_sec
      2693 ± 25%     +32.9%       3579        fxmark.ssd_btrfs_DRBM_72_bufferedio.sys_sec
     88.06           +12.8%      99.36        fxmark.ssd_btrfs_DRBM_72_bufferedio.sys_util
    335.37 ± 26%     -99.5%       1.65 ± 15%  fxmark.ssd_btrfs_DRBM_72_bufferedio.user_sec
     10.93 ±  2%     -99.6%       0.05 ± 15%  fxmark.ssd_btrfs_DRBM_72_bufferedio.user_util
  1.43e+10           -99.7%   38601368        fxmark.ssd_btrfs_DRBM_72_bufferedio.works
 2.859e+08           -99.7%     772027        fxmark.ssd_btrfs_DRBM_72_bufferedio.works/sec
     41.54 ± 11%     -83.4%       6.91 ±  2%  fxmark.time.user_time
     83376 ±  5%     +21.4%     101242 ±  2%  meminfo.Shmem
     11.33 ±  6%     -82.4%       2.00        vmstat.cpu.us
     81.25           +11.0%      90.21        iostat.cpu.system
     11.08 ±  3%     -75.7%       2.69        iostat.cpu.user
     81.55            +8.9       90.46        mpstat.cpu.all.sys%
     11.27 ±  3%      -8.5        2.75        mpstat.cpu.all.usr%
     10620 ±  4%      +9.1%      11586 ±  3%  numa-meminfo.node0.KernelStack
     78499 ±  5%     +21.4%      95276 ±  3%  numa-meminfo.node1.Shmem
     10628 ±  4%      +9.0%      11587 ±  3%  numa-vmstat.node0.nr_kernel_stack
     19488 ±  4%     +22.2%      23816 ±  3%  numa-vmstat.node1.nr_shmem
    124.35 ± 20%     -23.5%      95.07 ±  4%  sched_debug.cfs_rq:/.runnable_avg.stddev
 1.177e+09 ±141%    -100.0%     521129        sched_debug.cpu.max_idle_balance_cost.max
 1.607e+08 ±153%    -100.0%       3301 ± 14%  sched_debug.cpu.max_idle_balance_cost.stddev
      1995 ±  4%     -10.2%       1792        perf-stat.i.context-switches
      2601 ± 16%     +28.0%       3331 ±  7%  perf-stat.i.node-stores
     40.20 ± 18%     -10.3       29.87 ±  7%  perf-stat.overall.node-store-miss-rate%
      2517 ± 20%     +32.7%       3340 ±  7%  perf-stat.ps.node-stores
    103708            +4.5%     108351        proc-vmstat.nr_inactive_anon
     21169 ±  7%     +19.6%      25327 ±  2%  proc-vmstat.nr_shmem
    103708            +4.5%     108351        proc-vmstat.nr_zone_inactive_anon
     55635 ± 16%     +28.5%      71510        proc-vmstat.pgactivate
     28109 ±  4%     +23.0%      34583 ±  2%  proc-vmstat.pgdeactivate
      3237            +1.9%       3297        turbostat.Bzy_MHz
      0.68           -92.6%       0.05        turbostat.IPC
     46.44 ±  4%     -46.4        0.00        turbostat.PKG_%
     47.33 ±  2%      -4.2%      45.33 ±  2%  turbostat.PkgTmp
    316.77 ±  3%     -17.2%     262.32        turbostat.PkgWatt
     63.02           -63.0        0.00        perf-profile.calltrace.cycles-pp.filemap_read.vfs_read.__x64_sys_pread64.do_syscall_64.entry_SYSCALL_64_after_hwframe
     26.81           -26.8        0.00        perf-profile.calltrace.cycles-pp.copy_page_to_iter.filemap_read.vfs_read.__x64_sys_pread64.do_syscall_64
     22.15           -22.2        0.00        perf-profile.calltrace.cycles-pp._copy_to_iter.copy_page_to_iter.filemap_read.vfs_read.__x64_sys_pread64
     15.70           -15.7        0.00        perf-profile.calltrace.cycles-pp.filemap_get_pages.filemap_read.vfs_read.__x64_sys_pread64.do_syscall_64
     13.99           -14.0        0.00        perf-profile.calltrace.cycles-pp.copyout._copy_to_iter.copy_page_to_iter.filemap_read.vfs_read
     13.40           -13.4        0.00        perf-profile.calltrace.cycles-pp.filemap_get_read_batch.filemap_get_pages.filemap_read.vfs_read.__x64_sys_pread64
     11.89           -11.9        0.00        perf-profile.calltrace.cycles-pp.copy_user_enhanced_fast_string.copyout._copy_to_iter.copy_page_to_iter.filemap_read
      9.52 ±  8%      -9.5        0.00        perf-profile.calltrace.cycles-pp.__entry_text_start.__libc_pread
     85.33           +14.4       99.77        perf-profile.calltrace.cycles-pp.entry_SYSCALL_64_after_hwframe.__libc_pread
     83.21           +16.6       99.76        perf-profile.calltrace.cycles-pp.do_syscall_64.entry_SYSCALL_64_after_hwframe.__libc_pread
     80.28           +19.5       99.75        perf-profile.calltrace.cycles-pp.__x64_sys_pread64.do_syscall_64.entry_SYSCALL_64_after_hwframe.__libc_pread
     74.78           +25.0       99.73        perf-profile.calltrace.cycles-pp.vfs_read.__x64_sys_pread64.do_syscall_64.entry_SYSCALL_64_after_hwframe.__libc_pread
      0.00           +48.6       48.62        perf-profile.calltrace.cycles-pp.native_queued_spin_lock_slowpath._raw_spin_lock.__clear_extent_bit.btrfs_file_read_iter.vfs_read
      0.00           +48.6       48.63        perf-profile.calltrace.cycles-pp.native_queued_spin_lock_slowpath._raw_spin_lock.__set_extent_bit.lock_extent.btrfs_lock_and_flush_ordered_range
      0.00           +48.8       48.79        perf-profile.calltrace.cycles-pp._raw_spin_lock.__clear_extent_bit.btrfs_file_read_iter.vfs_read.__x64_sys_pread64
      0.00           +48.8       48.79        perf-profile.calltrace.cycles-pp._raw_spin_lock.__set_extent_bit.lock_extent.btrfs_lock_and_flush_ordered_range.btrfs_file_read_iter
      0.00           +49.4       49.45        perf-profile.calltrace.cycles-pp.__set_extent_bit.lock_extent.btrfs_lock_and_flush_ordered_range.btrfs_file_read_iter.vfs_read
      0.00           +49.5       49.46        perf-profile.calltrace.cycles-pp.lock_extent.btrfs_lock_and_flush_ordered_range.btrfs_file_read_iter.vfs_read.__x64_sys_pread64
      0.00           +49.6       49.65        perf-profile.calltrace.cycles-pp.__clear_extent_bit.btrfs_file_read_iter.vfs_read.__x64_sys_pread64.do_syscall_64
      0.00           +49.7       49.72        perf-profile.calltrace.cycles-pp.btrfs_lock_and_flush_ordered_range.btrfs_file_read_iter.vfs_read.__x64_sys_pread64.do_syscall_64
      0.68           +99.0       99.69        perf-profile.calltrace.cycles-pp.btrfs_file_read_iter.vfs_read.__x64_sys_pread64.do_syscall_64.entry_SYSCALL_64_after_hwframe
     64.40           -64.1        0.31 ± 11%  perf-profile.children.cycles-pp.filemap_read
     27.64           -27.6        0.09 ±  5%  perf-profile.children.cycles-pp.copy_page_to_iter
     22.97           -22.9        0.07        perf-profile.children.cycles-pp._copy_to_iter
     15.97           -15.9        0.11        perf-profile.children.cycles-pp.filemap_get_pages
     13.68           -13.6        0.10        perf-profile.children.cycles-pp.filemap_get_read_batch
     13.44           -13.4        0.00        perf-profile.children.cycles-pp.copyout
     12.76           -12.7        0.05        perf-profile.children.cycles-pp.copy_user_enhanced_fast_string
      5.72            -5.7        0.00        perf-profile.children.cycles-pp.entry_SYSRETQ_unsafe_stack
      5.57            -5.6        0.00        perf-profile.children.cycles-pp.__might_fault
      5.41 ±  7%      -5.4        0.00        perf-profile.children.cycles-pp.__entry_text_start
      5.10            -5.1        0.03 ±100%  perf-profile.children.cycles-pp.touch_atime
      4.16            -4.1        0.03 ±100%  perf-profile.children.cycles-pp.atime_needs_update
      0.11 ± 26%      +0.0        0.14 ±  3%  perf-profile.children.cycles-pp.perf_tp_event
      0.09 ± 29%      +0.0        0.13 ±  5%  perf-profile.children.cycles-pp.perf_callchain
      0.09 ± 29%      +0.0        0.13 ±  5%  perf-profile.children.cycles-pp.get_perf_callchain
      0.11 ± 26%      +0.0        0.15 ±  5%  perf-profile.children.cycles-pp.update_curr
      0.05 ± 71%      +0.0        0.09 ±  7%  perf-profile.children.cycles-pp.perf_callchain_kernel
      0.00            +0.1        0.05        perf-profile.children.cycles-pp.cache_state_if_flags
      0.00            +0.1        0.06 ±  7%  perf-profile.children.cycles-pp.kmem_cache_alloc
      0.00            +0.1        0.07 ±  8%  perf-profile.children.cycles-pp.alloc_extent_state
      0.00            +0.1        0.07        perf-profile.children.cycles-pp.rb_erase
      0.00            +0.1        0.12 ±  6%  perf-profile.children.cycles-pp.rb_next
      0.00            +0.1        0.14 ±  2%  perf-profile.children.cycles-pp.insert_state
      0.00            +0.2        0.23 ±  4%  perf-profile.children.cycles-pp._raw_spin_lock_irq
      0.00            +0.3        0.26 ±  3%  perf-profile.children.cycles-pp.btrfs_lookup_ordered_range
      0.00            +0.3        0.28 ±  2%  perf-profile.children.cycles-pp.free_extent_state
      0.00            +0.4        0.45 ±  2%  perf-profile.children.cycles-pp.clear_state_bit
     98.72            +1.1       99.82        perf-profile.children.cycles-pp.__libc_pread
     85.55           +14.3       99.84        perf-profile.children.cycles-pp.entry_SYSCALL_64_after_hwframe
     83.99           +15.8       99.83        perf-profile.children.cycles-pp.do_syscall_64
     80.69           +19.1       99.75        perf-profile.children.cycles-pp.__x64_sys_pread64
     75.35           +24.4       99.74        perf-profile.children.cycles-pp.vfs_read
      0.00           +49.5       49.45        perf-profile.children.cycles-pp.__set_extent_bit
      0.00           +49.5       49.46        perf-profile.children.cycles-pp.lock_extent
      0.00           +49.6       49.65        perf-profile.children.cycles-pp.__clear_extent_bit
      0.00           +49.7       49.73        perf-profile.children.cycles-pp.btrfs_lock_and_flush_ordered_range
      0.00           +97.3       97.29        perf-profile.children.cycles-pp.native_queued_spin_lock_slowpath
      0.00           +97.6       97.58        perf-profile.children.cycles-pp._raw_spin_lock
      0.81 ±  2%     +98.9       99.69        perf-profile.children.cycles-pp.btrfs_file_read_iter
     13.51 ±  3%     -13.5        0.05 ± 74%  perf-profile.self.cycles-pp.filemap_read
     12.54           -12.5        0.04 ± 44%  perf-profile.self.cycles-pp.copy_user_enhanced_fast_string
     10.46 ±  2%     -10.4        0.08 ±  6%  perf-profile.self.cycles-pp.filemap_get_read_batch
      5.59            -5.6        0.00        perf-profile.self.cycles-pp.entry_SYSRETQ_unsafe_stack
      0.00            +0.1        0.05        perf-profile.self.cycles-pp.cache_state_if_flags
      0.00            +0.1        0.07 ±  5%  perf-profile.self.cycles-pp.rb_erase
      0.00            +0.1        0.08        perf-profile.self.cycles-pp.clear_state_bit
      0.00            +0.1        0.12 ±  4%  perf-profile.self.cycles-pp.insert_state
      0.00            +0.1        0.12 ±  6%  perf-profile.self.cycles-pp.rb_next
      0.00            +0.2        0.22 ±  2%  perf-profile.self.cycles-pp.__clear_extent_bit
      0.00            +0.2        0.22 ±  4%  perf-profile.self.cycles-pp._raw_spin_lock_irq
      0.00            +0.3        0.28 ±  2%  perf-profile.self.cycles-pp.free_extent_state
      0.00            +0.3        0.30 ±  4%  perf-profile.self.cycles-pp._raw_spin_lock
      0.00            +0.4        0.44 ±  2%  perf-profile.self.cycles-pp.__set_extent_bit
      0.00           +96.9       96.94        perf-profile.self.cycles-pp.native_queued_spin_lock_slowpath




Disclaimer:
Results have been estimated based on internal Intel analysis and are provided
for informational purposes only. Any difference in system hardware or software
design or configuration may affect actual performance.
Goldwyn Rodrigues Nov. 22, 2022, 5:11 p.m. UTC | #2
On 21:31 21/11, kernel test robot wrote:
> 
> Greetings,
> 
> FYI, we noticed a -99.7% regression of fxmark.ssd_btrfs_DRBM_72_bufferedio.works/sec due to commit:
> 
> 
> commit: 94d3d147892ad202ecbd0a373b63469f2d12667a ("[PATCH 07/16] btrfs: Lock extents before folio for read()s")

This brings me to question why the extent lock around reads is required.
Aren't the extent locks meant to protect against concurrent write overwrites?
Page locks should be sufficient. I tried running my patchset with the extent
locks removed and the tests ran fine.

What am I missing?

<snipped>
kernel test robot Nov. 27, 2022, 8:48 a.m. UTC | #3
Hi Goldwyn,

Last week we reported
"a -99.7% regression of fxmark.ssd_btrfs_DRBM_72_bufferedio.works/sec"
due to this commit at
https://lore.kernel.org/all/202211212140.7f9c0a76-oliver.sang@intel.com/
and noticed that it prompted some further thinking on your side.

We have now also found that this commit causes functional issues which can be
reproduced reliably in our tests.

440e8ad2afce1fc9 94d3d147892ad202ecbd0a373b6
---------------- ---------------------------
       fail:runs  %reproduction    fail:runs
           |             |             |
           :6          100%           6:6     xfstests.btrfs.020.fail

Below is the detailed report; we hope the attached dmesg provides some
further hints. If you have any questions, please let us know. Thanks


Greetings,

FYI, we noticed xfstests.btrfs.020.fail due to commit (built with gcc-11):

commit: 94d3d147892ad202ecbd0a373b63469f2d12667a ("[PATCH 07/16] btrfs: Lock extents before folio for read()s")
url: https://github.com/intel-lab-lkp/linux/commits/Goldwyn-Rodrigues/btrfs-check-for-range-correctness-while-locking-or-setting-extent-bits/20221116-030112
base: https://git.kernel.org/cgit/linux/kernel/git/kdave/linux.git for-next
patch link: https://lore.kernel.org/all/5c7c77d0735c18cea82c347eef2ce2eb169681e6.1668530684.git.rgoldwyn@suse.com/
patch subject: [PATCH 07/16] btrfs: Lock extents before folio for read()s

in testcase: xfstests
version: xfstests-x86_64-5a5e419-1_20221108
with following parameters:

	disk: 6HDD
	fs: btrfs
	test: btrfs-group-02

test-description: xfstests is a regression test suite for xfs and other filesystems.
test-url: git://git.kernel.org/pub/scm/fs/xfs/xfstests-dev.git


on test machine: 8 threads 1 socket Intel(R) Core(TM) i7-4770 CPU @ 3.40GHz (Haswell) with 8G memory

caused the changes below (please refer to the attached dmesg/kmsg for the entire log/backtrace):




If you fix the issue, kindly add the following tag
| Reported-by: kernel test robot <oliver.sang@intel.com>
| Link: https://lore.kernel.org/oe-lkp/202211271625.4b0c95ff-oliver.sang@intel.com

2022-11-25 04:24:10 export TEST_DIR=/fs/sda1
2022-11-25 04:24:10 export TEST_DEV=/dev/sda1
2022-11-25 04:24:10 export FSTYP=btrfs
2022-11-25 04:24:10 export SCRATCH_MNT=/fs/scratch
2022-11-25 04:24:10 mkdir /fs/scratch -p
2022-11-25 04:24:10 export SCRATCH_DEV_POOL="/dev/sda2 /dev/sda3 /dev/sda4 /dev/sda5 /dev/sda6"
2022-11-25 04:24:10 sed "s:^:btrfs/:" //lkp/benchmarks/xfstests/tests/btrfs-group-02
2022-11-25 04:24:10 ./check btrfs/020 btrfs/021 btrfs/022 btrfs/023 btrfs/024 btrfs/025 btrfs/026 btrfs/027 btrfs/028 btrfs/029
FSTYP         -- btrfs
PLATFORM      -- Linux/x86_64 lkp-hsw-d01 6.1.0-rc4-00281-g94d3d147892a #1 SMP Thu Nov 24 15:07:29 CST 2022
MKFS_OPTIONS  -- /dev/sda2
MOUNT_OPTIONS -- /dev/sda2 /fs/scratch

btrfs/020       [failed, exit status 1]
btrfs/021        14s
btrfs/022        13s
btrfs/023        6s
btrfs/024        2s
btrfs/025        2s
btrfs/026        7s
btrfs/027        10s
btrfs/028        31s
btrfs/029        2s
Ran: btrfs/020 btrfs/021 btrfs/022 btrfs/023 btrfs/024 btrfs/025 btrfs/026 btrfs/027 btrfs/028 btrfs/029
Failures: btrfs/020
Failed 1 of 10 tests




To reproduce:

        git clone https://github.com/intel/lkp-tests.git
        cd lkp-tests
        sudo bin/lkp install job.yaml           # job file is attached in this email
        bin/lkp split-job --compatible job.yaml # generate the yaml file for lkp run
        sudo bin/lkp run generated-yaml-file

        # if you come across any failure that blocks the test,
        # please remove the ~/.lkp and /lkp directories to run from a clean state.
Josef Bacik Dec. 13, 2022, 6:57 p.m. UTC | #4
On Tue, Nov 15, 2022 at 12:00:25PM -0600, Goldwyn Rodrigues wrote:
> Perform extent locking around buffered reads as opposed to while
> performing folio reads.
> 
> Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
> ---
>  fs/btrfs/compression.c |  5 -----
>  fs/btrfs/extent_io.c   | 36 +-----------------------------------
>  fs/btrfs/file.c        | 17 ++++++++++++++---
>  3 files changed, 15 insertions(+), 43 deletions(-)
> 
> diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
> index 30adf430241e..7f6452c4234e 100644
> --- a/fs/btrfs/compression.c
> +++ b/fs/btrfs/compression.c
> @@ -526,11 +526,9 @@ static noinline int add_ra_bio_pages(struct inode *inode,
>  	struct extent_map *em;
>  	struct address_space *mapping = inode->i_mapping;
>  	struct extent_map_tree *em_tree;
> -	struct extent_io_tree *tree;
>  	int sectors_missed = 0;
>  
>  	em_tree = &BTRFS_I(inode)->extent_tree;
> -	tree = &BTRFS_I(inode)->io_tree;
>  
>  	if (isize == 0)
>  		return 0;
> @@ -597,7 +595,6 @@ static noinline int add_ra_bio_pages(struct inode *inode,
>  		}
>  
>  		page_end = (pg_index << PAGE_SHIFT) + PAGE_SIZE - 1;
> -		lock_extent(tree, cur, page_end, NULL);
>  		read_lock(&em_tree->lock);
>  		em = lookup_extent_mapping(em_tree, cur, page_end + 1 - cur);
>  		read_unlock(&em_tree->lock);
> @@ -611,7 +608,6 @@ static noinline int add_ra_bio_pages(struct inode *inode,
>  		    (cur + fs_info->sectorsize > extent_map_end(em)) ||
>  		    (em->block_start >> 9) != cb->orig_bio->bi_iter.bi_sector) {
>  			free_extent_map(em);
> -			unlock_extent(tree, cur, page_end, NULL);
>  			unlock_page(page);
>  			put_page(page);
>  			break;
> @@ -631,7 +627,6 @@ static noinline int add_ra_bio_pages(struct inode *inode,
>  		add_size = min(em->start + em->len, page_end + 1) - cur;
>  		ret = bio_add_page(cb->orig_bio, page, add_size, offset_in_page(cur));
>  		if (ret != add_size) {
> -			unlock_extent(tree, cur, page_end, NULL);
>  			unlock_page(page);
>  			put_page(page);
>  			break;
> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
> index 42bae149f923..c5430cabad62 100644
> --- a/fs/btrfs/extent_io.c
> +++ b/fs/btrfs/extent_io.c
> @@ -891,15 +891,6 @@ static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len)
>  		btrfs_subpage_end_reader(fs_info, page, start, len);
>  }
>  
> -static void end_sector_io(struct page *page, u64 offset, bool uptodate)
> -{
> -	struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
> -	const u32 sectorsize = inode->root->fs_info->sectorsize;
> -
> -	end_page_read(page, uptodate, offset, sectorsize);
> -	unlock_extent(&inode->io_tree, offset, offset + sectorsize - 1, NULL);
> -}
> -
>  static void submit_data_read_repair(struct inode *inode,
>  				    struct btrfs_bio *failed_bbio,
>  				    u32 bio_offset, const struct bio_vec *bvec,
> @@ -960,7 +951,7 @@ static void submit_data_read_repair(struct inode *inode,
>  		 * will not be properly unlocked.
>  		 */
>  next:
> -		end_sector_io(page, start + offset, uptodate);
> +		end_page_read(page, uptodate, start + offset, sectorsize);
>  	}
>  }
>  
> @@ -1072,9 +1063,6 @@ static void endio_readpage_release_extent(struct processed_extent *processed,
>  			      struct btrfs_inode *inode, u64 start, u64 end,
>  			      bool uptodate)
>  {
> -	struct extent_state *cached = NULL;
> -	struct extent_io_tree *tree;
> -
>  	/* The first extent, initialize @processed */
>  	if (!processed->inode)
>  		goto update;
> @@ -1096,13 +1084,6 @@ static void endio_readpage_release_extent(struct processed_extent *processed,
>  		return;
>  	}
>  
> -	tree = &processed->inode->io_tree;
> -	/*
> -	 * Now we don't have range contiguous to the processed range, release
> -	 * the processed range now.
> -	 */
> -	unlock_extent(tree, processed->start, processed->end, &cached);
> -
>  update:
>  	/* Update processed to current range */
>  	processed->inode = inode;
> @@ -1743,11 +1724,9 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
>  	size_t pg_offset = 0;
>  	size_t iosize;
>  	size_t blocksize = inode->i_sb->s_blocksize;
> -	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
>  
>  	ret = set_page_extent_mapped(page);
>  	if (ret < 0) {
> -		unlock_extent(tree, start, end, NULL);
>  		btrfs_page_set_error(fs_info, page, start, PAGE_SIZE);
>  		unlock_page(page);
>  		goto out;
> @@ -1772,14 +1751,12 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
>  		if (cur >= last_byte) {
>  			iosize = PAGE_SIZE - pg_offset;
>  			memzero_page(page, pg_offset, iosize);
> -			unlock_extent(tree, cur, cur + iosize - 1, NULL);
>  			end_page_read(page, true, cur, iosize);
>  			break;
>  		}
>  		em = __get_extent_map(inode, page, pg_offset, cur,
>  				      end - cur + 1, em_cached);
>  		if (IS_ERR(em)) {
> -			unlock_extent(tree, cur, end, NULL);
>  			end_page_read(page, false, cur, end + 1 - cur);
>  			ret = PTR_ERR(em);
>  			break;
> @@ -1849,8 +1826,6 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
>  		/* we've found a hole, just zero and go on */
>  		if (block_start == EXTENT_MAP_HOLE) {
>  			memzero_page(page, pg_offset, iosize);
> -
> -			unlock_extent(tree, cur, cur + iosize - 1, NULL);
>  			end_page_read(page, true, cur, iosize);
>  			cur = cur + iosize;
>  			pg_offset += iosize;
> @@ -1858,7 +1833,6 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
>  		}
>  		/* the get_extent function already copied into the page */
>  		if (block_start == EXTENT_MAP_INLINE) {
> -			unlock_extent(tree, cur, cur + iosize - 1, NULL);
>  			end_page_read(page, true, cur, iosize);
>  			cur = cur + iosize;
>  			pg_offset += iosize;
> @@ -1874,7 +1848,6 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
>  			 * We have to unlock the remaining range, or the page
>  			 * will never be unlocked.
>  			 */
> -			unlock_extent(tree, cur, end, NULL);
>  			end_page_read(page, false, cur, end + 1 - cur);
>  			goto out;
>  		}
> @@ -1888,13 +1861,9 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
>  int btrfs_read_folio(struct file *file, struct folio *folio)
>  {
>  	struct page *page = &folio->page;
> -	struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
> -	u64 start = page_offset(page);
> -	u64 end = start + PAGE_SIZE - 1;
>  	struct btrfs_bio_ctrl bio_ctrl = { 0 };
>  	int ret;
>  
> -	btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
>  
>  	ret = btrfs_do_readpage(page, NULL, &bio_ctrl, 0, NULL);
>  	/*
> @@ -1911,11 +1880,8 @@ static inline void contiguous_readpages(struct page *pages[], int nr_pages,
>  					struct btrfs_bio_ctrl *bio_ctrl,
>  					u64 *prev_em_start)
>  {
> -	struct btrfs_inode *inode = BTRFS_I(pages[0]->mapping->host);
>  	int index;
>  
> -	btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
> -
>  	for (index = 0; index < nr_pages; index++) {
>  		btrfs_do_readpage(pages[index], em_cached, bio_ctrl,
>  				  REQ_RAHEAD, prev_em_start);
> diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
> index 473a0743270b..9266ee6c1a61 100644
> --- a/fs/btrfs/file.c
> +++ b/fs/btrfs/file.c
> @@ -3806,15 +3806,26 @@ static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
>  static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
>  {
>  	ssize_t ret = 0;
> +	struct inode *inode = file_inode(iocb->ki_filp);
> +	loff_t len = iov_iter_count(to);
> +	u64 start = round_down(iocb->ki_pos, PAGE_SIZE);
> +	u64 end = round_up(iocb->ki_pos + len, PAGE_SIZE) - 1;
> +	struct extent_state *cached = NULL;
>  
>  	if (iocb->ki_flags & IOCB_DIRECT) {
>  		ret = btrfs_direct_read(iocb, to);
> -		if (ret < 0 || !iov_iter_count(to) ||
> -		    iocb->ki_pos >= i_size_read(file_inode(iocb->ki_filp)))
> +		if (ret < 0 || !len ||
> +		    iocb->ki_pos >= i_size_read(inode))
>  			return ret;
>  	}
>  
> -	return filemap_read(iocb, to, ret);
> +	btrfs_lock_and_flush_ordered_range(BTRFS_I(inode), start,
> +			end, &cached);
> +

Ok, I see why the buffered IO thing fell apart: we're now flushing literally
everything.

Consider what happened before: we search through the page cache, don't find
pages, send the request down to readpages, and only then do a
btrfs_lock_and_flush_ordered_range().

99.99999999% of the time we find nothing; the only time we would is if there
were an O_DIRECT ordered extent outstanding, which is why we need the
btrfs_lock_and_flush_ordered_range(): we need to make sure the right file
extent is in place so we know what to read.

What you've done now is that we'll flush all the dirty pages in the range
we're reading, instead of just finding an uptodate page and copying it into
the buffer.

What you need here is a btrfs_lock_and_flush_direct_ordered_range(), which only
flushes an ordered extent that has BTRFS_ORDERED_DIRECT set on it.

Additionally, we're now incurring an extent lock even when the pages are
already in cache.  I would really rather add a ->readahead_unlocked or
something like it that is called before we start gathering pages, i.e. only
after we have a cache miss; we would do the
btrfs_lock_and_flush_direct_ordered_range() there and then call into whatever
entry point goes and does the readahead.  This way we get our locking order
with the same performance characteristics that exist now.  Thanks,

Josef
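
For illustration, a minimal sketch of the helper Josef describes might look
like the following. This is hypothetical: it is modeled on the loop in the
existing btrfs_lock_and_flush_ordered_range() and assumes the 6.1-era
signatures of btrfs_lookup_ordered_range() and btrfs_start_ordered_extent().
It waits only for ordered extents flagged BTRFS_ORDERED_DIRECT, so a buffered
read would no longer flush ordinary dirty pages in the range:

	/*
	 * Hypothetical sketch: like btrfs_lock_and_flush_ordered_range(),
	 * but only wait for O_DIRECT ordered extents, leaving buffered
	 * ordered extents (and their dirty pages) alone.
	 */
	static void btrfs_lock_and_flush_direct_ordered_range(struct btrfs_inode *inode,
							      u64 start, u64 end,
							      struct extent_state **cached_state)
	{
		struct btrfs_ordered_extent *ordered;

		while (1) {
			lock_extent(&inode->io_tree, start, end, cached_state);
			ordered = btrfs_lookup_ordered_range(inode, start,
							     end - start + 1);
			if (!ordered)
				break;
			if (!test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) {
				/* Buffered ordered extent: do not flush it. */
				btrfs_put_ordered_extent(ordered);
				break;
			}
			/* O_DIRECT ordered extent: drop the lock, wait, retry. */
			unlock_extent(&inode->io_tree, start, end, cached_state);
			btrfs_start_ordered_extent(ordered, 1);
			btrfs_put_ordered_extent(ordered);
		}
	}

As with btrfs_lock_and_flush_ordered_range() today, the caller would still
hold the extent lock on return and unlock it once the read completes.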

Patch

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 30adf430241e..7f6452c4234e 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -526,11 +526,9 @@  static noinline int add_ra_bio_pages(struct inode *inode,
 	struct extent_map *em;
 	struct address_space *mapping = inode->i_mapping;
 	struct extent_map_tree *em_tree;
-	struct extent_io_tree *tree;
 	int sectors_missed = 0;
 
 	em_tree = &BTRFS_I(inode)->extent_tree;
-	tree = &BTRFS_I(inode)->io_tree;
 
 	if (isize == 0)
 		return 0;
@@ -597,7 +595,6 @@  static noinline int add_ra_bio_pages(struct inode *inode,
 		}
 
 		page_end = (pg_index << PAGE_SHIFT) + PAGE_SIZE - 1;
-		lock_extent(tree, cur, page_end, NULL);
 		read_lock(&em_tree->lock);
 		em = lookup_extent_mapping(em_tree, cur, page_end + 1 - cur);
 		read_unlock(&em_tree->lock);
@@ -611,7 +608,6 @@  static noinline int add_ra_bio_pages(struct inode *inode,
 		    (cur + fs_info->sectorsize > extent_map_end(em)) ||
 		    (em->block_start >> 9) != cb->orig_bio->bi_iter.bi_sector) {
 			free_extent_map(em);
-			unlock_extent(tree, cur, page_end, NULL);
 			unlock_page(page);
 			put_page(page);
 			break;
@@ -631,7 +627,6 @@  static noinline int add_ra_bio_pages(struct inode *inode,
 		add_size = min(em->start + em->len, page_end + 1) - cur;
 		ret = bio_add_page(cb->orig_bio, page, add_size, offset_in_page(cur));
 		if (ret != add_size) {
-			unlock_extent(tree, cur, page_end, NULL);
 			unlock_page(page);
 			put_page(page);
 			break;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 42bae149f923..c5430cabad62 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -891,15 +891,6 @@  static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len)
 		btrfs_subpage_end_reader(fs_info, page, start, len);
 }
 
-static void end_sector_io(struct page *page, u64 offset, bool uptodate)
-{
-	struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
-	const u32 sectorsize = inode->root->fs_info->sectorsize;
-
-	end_page_read(page, uptodate, offset, sectorsize);
-	unlock_extent(&inode->io_tree, offset, offset + sectorsize - 1, NULL);
-}
-
 static void submit_data_read_repair(struct inode *inode,
 				    struct btrfs_bio *failed_bbio,
 				    u32 bio_offset, const struct bio_vec *bvec,
@@ -960,7 +951,7 @@  static void submit_data_read_repair(struct inode *inode,
 		 * will not be properly unlocked.
 		 */
 next:
-		end_sector_io(page, start + offset, uptodate);
+		end_page_read(page, uptodate, start + offset, sectorsize);
 	}
 }
 
@@ -1072,9 +1063,6 @@  static void endio_readpage_release_extent(struct processed_extent *processed,
 			      struct btrfs_inode *inode, u64 start, u64 end,
 			      bool uptodate)
 {
-	struct extent_state *cached = NULL;
-	struct extent_io_tree *tree;
-
 	/* The first extent, initialize @processed */
 	if (!processed->inode)
 		goto update;
@@ -1096,13 +1084,6 @@  static void endio_readpage_release_extent(struct processed_extent *processed,
 		return;
 	}
 
-	tree = &processed->inode->io_tree;
-	/*
-	 * Now we don't have range contiguous to the processed range, release
-	 * the processed range now.
-	 */
-	unlock_extent(tree, processed->start, processed->end, &cached);
-
 update:
 	/* Update processed to current range */
 	processed->inode = inode;
@@ -1743,11 +1724,9 @@  static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
 	size_t pg_offset = 0;
 	size_t iosize;
 	size_t blocksize = inode->i_sb->s_blocksize;
-	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
 
 	ret = set_page_extent_mapped(page);
 	if (ret < 0) {
-		unlock_extent(tree, start, end, NULL);
 		btrfs_page_set_error(fs_info, page, start, PAGE_SIZE);
 		unlock_page(page);
 		goto out;
@@ -1772,14 +1751,12 @@  static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
 		if (cur >= last_byte) {
 			iosize = PAGE_SIZE - pg_offset;
 			memzero_page(page, pg_offset, iosize);
-			unlock_extent(tree, cur, cur + iosize - 1, NULL);
 			end_page_read(page, true, cur, iosize);
 			break;
 		}
 		em = __get_extent_map(inode, page, pg_offset, cur,
 				      end - cur + 1, em_cached);
 		if (IS_ERR(em)) {
-			unlock_extent(tree, cur, end, NULL);
 			end_page_read(page, false, cur, end + 1 - cur);
 			ret = PTR_ERR(em);
 			break;
@@ -1849,8 +1826,6 @@  static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
 		/* we've found a hole, just zero and go on */
 		if (block_start == EXTENT_MAP_HOLE) {
 			memzero_page(page, pg_offset, iosize);
-
-			unlock_extent(tree, cur, cur + iosize - 1, NULL);
 			end_page_read(page, true, cur, iosize);
 			cur = cur + iosize;
 			pg_offset += iosize;
@@ -1858,7 +1833,6 @@  static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
 		}
 		/* the get_extent function already copied into the page */
 		if (block_start == EXTENT_MAP_INLINE) {
-			unlock_extent(tree, cur, cur + iosize - 1, NULL);
 			end_page_read(page, true, cur, iosize);
 			cur = cur + iosize;
 			pg_offset += iosize;
@@ -1874,7 +1848,6 @@  static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
 			 * We have to unlock the remaining range, or the page
 			 * will never be unlocked.
 			 */
-			unlock_extent(tree, cur, end, NULL);
 			end_page_read(page, false, cur, end + 1 - cur);
 			goto out;
 		}
@@ -1888,13 +1861,9 @@  static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
 int btrfs_read_folio(struct file *file, struct folio *folio)
 {
 	struct page *page = &folio->page;
-	struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
-	u64 start = page_offset(page);
-	u64 end = start + PAGE_SIZE - 1;
 	struct btrfs_bio_ctrl bio_ctrl = { 0 };
 	int ret;
 
-	btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
 
 	ret = btrfs_do_readpage(page, NULL, &bio_ctrl, 0, NULL);
 	/*
@@ -1911,11 +1880,8 @@  static inline void contiguous_readpages(struct page *pages[], int nr_pages,
 					struct btrfs_bio_ctrl *bio_ctrl,
 					u64 *prev_em_start)
 {
-	struct btrfs_inode *inode = BTRFS_I(pages[0]->mapping->host);
 	int index;
 
-	btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
-
 	for (index = 0; index < nr_pages; index++) {
 		btrfs_do_readpage(pages[index], em_cached, bio_ctrl,
 				  REQ_RAHEAD, prev_em_start);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 473a0743270b..9266ee6c1a61 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -3806,15 +3806,26 @@  static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
 static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 {
 	ssize_t ret = 0;
+	struct inode *inode = file_inode(iocb->ki_filp);
+	loff_t len = iov_iter_count(to);
+	u64 start = round_down(iocb->ki_pos, PAGE_SIZE);
+	u64 end = round_up(iocb->ki_pos + len, PAGE_SIZE) - 1;
+	struct extent_state *cached = NULL;
 
 	if (iocb->ki_flags & IOCB_DIRECT) {
 		ret = btrfs_direct_read(iocb, to);
-		if (ret < 0 || !iov_iter_count(to) ||
-		    iocb->ki_pos >= i_size_read(file_inode(iocb->ki_filp)))
+		if (ret < 0 || !len ||
+		    iocb->ki_pos >= i_size_read(inode))
 			return ret;
 	}
 
-	return filemap_read(iocb, to, ret);
+	btrfs_lock_and_flush_ordered_range(BTRFS_I(inode), start,
+			end, &cached);
+
+	ret = filemap_read(iocb, to, ret);
+
+	unlock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached);
+	return ret;
 }
 
 const struct file_operations btrfs_file_operations = {