diff mbox series

[v3,1/3] mm, lru_gen: try to prefetch next page when scanning LRU

Message ID 20240123184552.59758-2-ryncsn@gmail.com (mailing list archive)
State New
Headers show
Series mm, lru_gen: batch update pages when aging | expand

Commit Message

Kairui Song Jan. 23, 2024, 6:45 p.m. UTC
From: Kairui Song <kasong@tencent.com>

Prefetching for the inactive/active LRU has existed for a long time; apply
the same optimization to MGLRU.

Test 1: Ramdisk fio ro test in a 4G memcg on a EPYC 7K62:
  fio -name=mglru --numjobs=16 --directory=/mnt --size=960m \
    --buffered=1 --ioengine=io_uring --iodepth=128 \
    --iodepth_batch_submit=32 --iodepth_batch_complete=32 \
    --rw=randread --random_distribution=zipf:0.5 --norandommap \
    --time_based --ramp_time=1m --runtime=6m --group_reporting

Before this patch:
bw (  MiB/s): min= 7758, max= 9239, per=100.00%, avg=8747.59, stdev=16.51, samples=11488
iops        : min=1986251, max=2365323, avg=2239380.87, stdev=4225.93, samples=11488

After this patch (+7.2%):
bw (  MiB/s): min= 8360, max= 9771, per=100.00%, avg=9381.31, stdev=15.67, samples=11488
iops        : min=2140296, max=2501385, avg=2401613.91, stdev=4010.41, samples=11488

Test 2: Ramdisk fio hybrid test for 30m in a 4G memcg on a EPYC 7K62 (3 times):
  fio --buffered=1 --numjobs=8 --size=960m --directory=/mnt \
    --time_based --ramp_time=1m --runtime=30m \
    --ioengine=io_uring --iodepth=128 --iodepth_batch_submit=32 \
    --iodepth_batch_complete=32 --norandommap \
    --name=mglru-ro --rw=randread --random_distribution=zipf:0.7 \
    --name=mglru-rw --rw=randrw --random_distribution=zipf:0.7

Before this patch:
 READ: 6622.0 MiB/s, Stdev: 22.090722
WRITE: 1256.3 MiB/s, Stdev: 5.249339

After this patch (+4.6%, +3.3%):
 READ: 6926.6 MiB/s, Stdev: 37.950260
WRITE: 1297.3 MiB/s, Stdev: 7.408704

Test 3: 30m of MySQL test in 6G memcg (12 times):
  echo 'set GLOBAL innodb_buffer_pool_size=16106127360;' | \
    mysql -u USER -h localhost --password=PASS

  sysbench /usr/share/sysbench/oltp_read_only.lua \
    --mysql-user=USER --mysql-password=PASS --mysql-db=DB \
    --tables=48 --table-size=2000000 --threads=16 --time=1800 run

Before this patch:
Avg: 134743.714545 qps. Stdev: 582.242189

After this patch (+0.2%):
Avg: 135005.779091 qps. Stdev: 295.299027

Test 4: Build linux kernel in 2G memcg with make -j48 with SSD swap
        (for memory stress, 18 times):

Before this patch:
Avg: 1456.768899 s. Stdev: 20.106973

After this patch (+0.0%):
Avg: 1455.659254 s. Stdev: 15.274481

Test 5: Memtier test in a 4G cgroup using brd as swap (18 times):
  memcached -u nobody -m 16384 -s /tmp/memcached.socket \
    -a 0766 -t 16 -B binary &
  memtier_benchmark -S /tmp/memcached.socket \
    -P memcache_binary -n allkeys \
    --key-minimum=1 --key-maximum=16000000 -d 1024 \
    --ratio=1:0 --key-pattern=P:P -c 1 -t 16 --pipeline 8 -x 3

Before this patch:
Avg: 50317.984000 Ops/sec. Stdev: 2568.965458

After this patch (-5.7%):
Avg: 47691.343500 Ops/sec. Stdev: 3925.772473

It seems prefetch is helpful in most cases, but the memtier test is
either hitting a case where prefetch causes more cache misses, or it's
just too noisy (high stdev).

Signed-off-by: Kairui Song <kasong@tencent.com>
---
 mm/vmscan.c | 30 ++++++++++++++++++++++++++----
 1 file changed, 26 insertions(+), 4 deletions(-)

Comments

Chris Li Jan. 25, 2024, 7:32 a.m. UTC | #1
On Tue, Jan 23, 2024 at 10:46 AM Kairui Song <ryncsn@gmail.com> wrote:
>
> From: Kairui Song <kasong@tencent.com>
>
> Prefetch for inactive/active LRU have been long exiting, apply the same
> optimization for MGLRU.
>
> Test 1: Ramdisk fio ro test in a 4G memcg on a EPYC 7K62:
>   fio -name=mglru --numjobs=16 --directory=/mnt --size=960m \
>     --buffered=1 --ioengine=io_uring --iodepth=128 \
>     --iodepth_batch_submit=32 --iodepth_batch_complete=32 \
>     --rw=randread --random_distribution=zipf:0.5 --norandommap \
>     --time_based --ramp_time=1m --runtime=6m --group_reporting
>
> Before this patch:
> bw (  MiB/s): min= 7758, max= 9239, per=100.00%, avg=8747.59, stdev=16.51, samples=11488
> iops        : min=1986251, max=2365323, avg=2239380.87, stdev=4225.93, samples=11488
>
> After this patch (+7.2%):
> bw (  MiB/s): min= 8360, max= 9771, per=100.00%, avg=9381.31, stdev=15.67, samples=11488
> iops        : min=2140296, max=2501385, avg=2401613.91, stdev=4010.41, samples=11488
>
> Test 2: Ramdisk fio hybrid test for 30m in a 4G memcg on a EPYC 7K62 (3 times):
>   fio --buffered=1 --numjobs=8 --size=960m --directory=/mnt \
>     --time_based --ramp_time=1m --runtime=30m \
>     --ioengine=io_uring --iodepth=128 --iodepth_batch_submit=32 \
>     --iodepth_batch_complete=32 --norandommap \
>     --name=mglru-ro --rw=randread --random_distribution=zipf:0.7 \
>     --name=mglru-rw --rw=randrw --random_distribution=zipf:0.7
>
> Before this patch:
>  READ: 6622.0 MiB/s. Stdev: 22.090722
> WRITE: 1256.3 MiB/s. Stdev: 5.249339
>
> After this patch (+4.6%, +3.3%):
>  READ: 6926.6 MiB/s, Stdev: 37.950260
> WRITE: 1297.3 MiB/s, Stdev: 7.408704
>
> Test 3: 30m of MySQL test in 6G memcg (12 times):
>   echo 'set GLOBAL innodb_buffer_pool_size=16106127360;' | \
>     mysql -u USER -h localhost --password=PASS
>
>   sysbench /usr/share/sysbench/oltp_read_only.lua \
>     --mysql-user=USER --mysql-password=PASS --mysql-db=DB \
>     --tables=48 --table-size=2000000 --threads=16 --time=1800 run
>
> Before this patch
> Avg: 134743.714545 qps. Stdev: 582.242189
>
> After this patch (+0.2%):
> Avg: 135005.779091 qps. Stdev: 295.299027
>
> Test 4: Build linux kernel in 2G memcg with make -j48 with SSD swap
>         (for memory stress, 18 times):
>
> Before this patch:
> Avg: 1456.768899 s. Stdev: 20.106973
>
> After this patch (+0.0%):
> Avg: 1455.659254 s. Stdev: 15.274481
>
> Test 5: Memtier test in a 4G cgroup using brd as swap (18 times):
>   memcached -u nobody -m 16384 -s /tmp/memcached.socket \
>     -a 0766 -t 16 -B binary &
>   memtier_benchmark -S /tmp/memcached.socket \
>     -P memcache_binary -n allkeys \
>     --key-minimum=1 --key-maximum=16000000 -d 1024 \
>     --ratio=1:0 --key-pattern=P:P -c 1 -t 16 --pipeline 8 -x 3
>
> Before this patch:
> Avg: 50317.984000 Ops/sec. Stdev: 2568.965458
>
> After this patch (-5.7%):
> Avg: 47691.343500 Ops/sec. Stdev: 3925.772473
>
> It seems prefetch is helpful in most cases, but the memtier test is
> either hitting a case where prefetch causes higher cache miss or it's
> just too noisy (high stdev).
>
> Signed-off-by: Kairui Song <kasong@tencent.com>
> ---
>  mm/vmscan.c | 30 ++++++++++++++++++++++++++----
>  1 file changed, 26 insertions(+), 4 deletions(-)
>
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 4f9c854ce6cc..03631cedb3ab 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -3681,15 +3681,26 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap)
>         /* prevent cold/hot inversion if force_scan is true */
>         for (zone = 0; zone < MAX_NR_ZONES; zone++) {
>                 struct list_head *head = &lrugen->folios[old_gen][type][zone];
> +               struct folio *prev = NULL;
>
> -               while (!list_empty(head)) {
> -                       struct folio *folio = lru_to_folio(head);
> +               if (!list_empty(head))
> +                       prev = lru_to_folio(head);
> +
> +               while (prev) {
> +                       struct folio *folio = prev;
>
>                         VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
>                         VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio);
>                         VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio);
>                         VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio);
>
> +                       if (unlikely(list_is_first(&folio->lru, head))) {
> +                               prev = NULL;
> +                       } else {
> +                               prev = lru_to_folio(&folio->lru);
> +                               prefetchw(&prev->flags);
> +                       }

This makes the code flow much harder to follow. Also, for architectures
that do not support prefetch, this will be a net loss.

Can you use prefetchw_prev_lru_folio() instead? It will make the code
much easier to follow. It also turns into a no-op when prefetch is not
supported.

Chris

> +
>                         new_gen = folio_inc_gen(lruvec, folio, false);
>                         list_move_tail(&folio->lru, &lrugen->folios[new_gen][type][zone]);
>
> @@ -4341,11 +4352,15 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
>         for (i = MAX_NR_ZONES; i > 0; i--) {
>                 LIST_HEAD(moved);
>                 int skipped_zone = 0;
> +               struct folio *prev = NULL;
>                 int zone = (sc->reclaim_idx + i) % MAX_NR_ZONES;
>                 struct list_head *head = &lrugen->folios[gen][type][zone];
>
> -               while (!list_empty(head)) {
> -                       struct folio *folio = lru_to_folio(head);
> +               if (!list_empty(head))
> +                       prev = lru_to_folio(head);
> +
> +               while (prev) {
> +                       struct folio *folio = prev;
>                         int delta = folio_nr_pages(folio);
>
>                         VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
> @@ -4355,6 +4370,13 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
>
>                         scanned += delta;
>
> +                       if (unlikely(list_is_first(&folio->lru, head))) {
> +                               prev = NULL;
> +                       } else {
> +                               prev = lru_to_folio(&folio->lru);
> +                               prefetchw(&prev->flags);
> +                       }
> +
>                         if (sort_folio(lruvec, folio, sc, tier))
>                                 sorted += delta;
>                         else if (isolate_folio(lruvec, folio, sc)) {
> --
> 2.43.0
>
>
Kairui Song Jan. 25, 2024, 5:51 p.m. UTC | #2
On Thu, Jan 25, 2024 at 3:33 PM Chris Li <chrisl@kernel.org> wrote:
>
> On Tue, Jan 23, 2024 at 10:46 AM Kairui Song <ryncsn@gmail.com> wrote:
> >
> > From: Kairui Song <kasong@tencent.com>
> >
> > Prefetch for inactive/active LRU have been long exiting, apply the same
> > optimization for MGLRU.
> >
> > Test 1: Ramdisk fio ro test in a 4G memcg on a EPYC 7K62:
> >   fio -name=mglru --numjobs=16 --directory=/mnt --size=960m \
> >     --buffered=1 --ioengine=io_uring --iodepth=128 \
> >     --iodepth_batch_submit=32 --iodepth_batch_complete=32 \
> >     --rw=randread --random_distribution=zipf:0.5 --norandommap \
> >     --time_based --ramp_time=1m --runtime=6m --group_reporting
> >
> > Before this patch:
> > bw (  MiB/s): min= 7758, max= 9239, per=100.00%, avg=8747.59, stdev=16.51, samples=11488
> > iops        : min=1986251, max=2365323, avg=2239380.87, stdev=4225.93, samples=11488
> >
> > After this patch (+7.2%):
> > bw (  MiB/s): min= 8360, max= 9771, per=100.00%, avg=9381.31, stdev=15.67, samples=11488
> > iops        : min=2140296, max=2501385, avg=2401613.91, stdev=4010.41, samples=11488
> >
> > Test 2: Ramdisk fio hybrid test for 30m in a 4G memcg on a EPYC 7K62 (3 times):
> >   fio --buffered=1 --numjobs=8 --size=960m --directory=/mnt \
> >     --time_based --ramp_time=1m --runtime=30m \
> >     --ioengine=io_uring --iodepth=128 --iodepth_batch_submit=32 \
> >     --iodepth_batch_complete=32 --norandommap \
> >     --name=mglru-ro --rw=randread --random_distribution=zipf:0.7 \
> >     --name=mglru-rw --rw=randrw --random_distribution=zipf:0.7
> >
> > Before this patch:
> >  READ: 6622.0 MiB/s. Stdev: 22.090722
> > WRITE: 1256.3 MiB/s. Stdev: 5.249339
> >
> > After this patch (+4.6%, +3.3%):
> >  READ: 6926.6 MiB/s, Stdev: 37.950260
> > WRITE: 1297.3 MiB/s, Stdev: 7.408704
> >
> > Test 3: 30m of MySQL test in 6G memcg (12 times):
> >   echo 'set GLOBAL innodb_buffer_pool_size=16106127360;' | \
> >     mysql -u USER -h localhost --password=PASS
> >
> >   sysbench /usr/share/sysbench/oltp_read_only.lua \
> >     --mysql-user=USER --mysql-password=PASS --mysql-db=DB \
> >     --tables=48 --table-size=2000000 --threads=16 --time=1800 run
> >
> > Before this patch
> > Avg: 134743.714545 qps. Stdev: 582.242189
> >
> > After this patch (+0.2%):
> > Avg: 135005.779091 qps. Stdev: 295.299027
> >
> > Test 4: Build linux kernel in 2G memcg with make -j48 with SSD swap
> >         (for memory stress, 18 times):
> >
> > Before this patch:
> > Avg: 1456.768899 s. Stdev: 20.106973
> >
> > After this patch (+0.0%):
> > Avg: 1455.659254 s. Stdev: 15.274481
> >
> > Test 5: Memtier test in a 4G cgroup using brd as swap (18 times):
> >   memcached -u nobody -m 16384 -s /tmp/memcached.socket \
> >     -a 0766 -t 16 -B binary &
> >   memtier_benchmark -S /tmp/memcached.socket \
> >     -P memcache_binary -n allkeys \
> >     --key-minimum=1 --key-maximum=16000000 -d 1024 \
> >     --ratio=1:0 --key-pattern=P:P -c 1 -t 16 --pipeline 8 -x 3
> >
> > Before this patch:
> > Avg: 50317.984000 Ops/sec. Stdev: 2568.965458
> >
> > After this patch (-5.7%):
> > Avg: 47691.343500 Ops/sec. Stdev: 3925.772473
> >
> > It seems prefetch is helpful in most cases, but the memtier test is
> > either hitting a case where prefetch causes higher cache miss or it's
> > just too noisy (high stdev).
> >
> > Signed-off-by: Kairui Song <kasong@tencent.com>
> > ---
> >  mm/vmscan.c | 30 ++++++++++++++++++++++++++----
> >  1 file changed, 26 insertions(+), 4 deletions(-)
> >
> > diff --git a/mm/vmscan.c b/mm/vmscan.c
> > index 4f9c854ce6cc..03631cedb3ab 100644
> > --- a/mm/vmscan.c
> > +++ b/mm/vmscan.c
> > @@ -3681,15 +3681,26 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap)
> >         /* prevent cold/hot inversion if force_scan is true */
> >         for (zone = 0; zone < MAX_NR_ZONES; zone++) {
> >                 struct list_head *head = &lrugen->folios[old_gen][type][zone];
> > +               struct folio *prev = NULL;
> >
> > -               while (!list_empty(head)) {
> > -                       struct folio *folio = lru_to_folio(head);
> > +               if (!list_empty(head))
> > +                       prev = lru_to_folio(head);
> > +
> > +               while (prev) {
> > +                       struct folio *folio = prev;
> >
> >                         VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
> >                         VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio);
> >                         VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio);
> >                         VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio);
> >
> > +                       if (unlikely(list_is_first(&folio->lru, head))) {
> > +                               prev = NULL;
> > +                       } else {
> > +                               prev = lru_to_folio(&folio->lru);
> > +                               prefetchw(&prev->flags);
> > +                       }
>
> This makes the code flow much harder to follow. Also for architecture
> that does not support prefetch, this will be a net loss.
>
> Can you use refetchw_prev_lru_folio() instead? It will make the code
> much easier to follow. It also turns into no-op when prefetch is not
> supported.
>
> Chris
>

Hi Chris,

Thanks for the suggestion.

Yes, that's doable. I made it this way because in the previous series (V1
& V2) I applied the bulk move patch first, which needed and introduced
the `prev` variable here, so the prefetch logic just used it.
For V3 I did a rebase and moved the prefetch commit to be the first
one, since it seems to be the most effective one, and just kept the
code style to avoid redundant changes between patches.

I can update this in V4 to make this individual patch better with your suggestion.
Chris Li Jan. 26, 2024, 12:56 a.m. UTC | #3
On Fri, Jan 26, 2024 at 01:51:44AM +0800, Kairui Song wrote:
> > >  mm/vmscan.c | 30 ++++++++++++++++++++++++++----
> > >  1 file changed, 26 insertions(+), 4 deletions(-)
> > >
> > > diff --git a/mm/vmscan.c b/mm/vmscan.c
> > > index 4f9c854ce6cc..03631cedb3ab 100644
> > > --- a/mm/vmscan.c
> > > +++ b/mm/vmscan.c
> > > @@ -3681,15 +3681,26 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap)
> > >         /* prevent cold/hot inversion if force_scan is true */
> > >         for (zone = 0; zone < MAX_NR_ZONES; zone++) {
> > >                 struct list_head *head = &lrugen->folios[old_gen][type][zone];
> > > +               struct folio *prev = NULL;
> > >
> > > -               while (!list_empty(head)) {
> > > -                       struct folio *folio = lru_to_folio(head);
> > > +               if (!list_empty(head))
> > > +                       prev = lru_to_folio(head);
> > > +
> > > +               while (prev) {
> > > +                       struct folio *folio = prev;
> > >
> > >                         VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
> > >                         VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio);
> > >                         VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio);
> > >                         VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio);
> > >
> > > +                       if (unlikely(list_is_first(&folio->lru, head))) {
> > > +                               prev = NULL;
> > > +                       } else {
> > > +                               prev = lru_to_folio(&folio->lru);
> > > +                               prefetchw(&prev->flags);
> > > +                       }
> >
> > This makes the code flow much harder to follow. Also for architecture
> > that does not support prefetch, this will be a net loss.
> >
> > Can you use refetchw_prev_lru_folio() instead? It will make the code
> > much easier to follow. It also turns into no-op when prefetch is not
> > supported.
> >
> > Chris
> >
> 
> Hi Chris,
> 
> Thanks for the suggestion.
> 
> Yes, that's doable, I made it this way because in previous series (V1
> & V2) I applied the bulk move patch first which needed and introduced
> the `prev` variable here, so the prefetch logic just used it.
> For V3 I did a rebase and moved the prefetch commit to be the first
> one, since it seems to be the most effective one, and just kept the

Maybe something like this? Totally not tested. Feel free to use it any way you want.

Chris

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 4f9c854ce6cc..2100e786ccc6 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3684,6 +3684,7 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap)
 
 		while (!list_empty(head)) {
 			struct folio *folio = lru_to_folio(head);
+			prefetchw_prev_lru_folio(folio, head, flags);
 
 			VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
 			VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio);
@@ -4346,7 +4347,10 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
 
 		while (!list_empty(head)) {
 			struct folio *folio = lru_to_folio(head);
-			int delta = folio_nr_pages(folio);
+			int delta;
+
+			prefetchw_prev_lru_folio(folio, head, flags);
+			delta = folio_nr_pages(folio);
 
 			VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
 			VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio);
Kairui Song Jan. 26, 2024, 10:31 a.m. UTC | #4
On Fri, Jan 26, 2024 at 8:56 AM Chris Li <chrisl@kernel.org> wrote:
> On Fri, Jan 26, 2024 at 01:51:44AM +0800, Kairui Song wrote:
> > > >  mm/vmscan.c | 30 ++++++++++++++++++++++++++----
> > > >  1 file changed, 26 insertions(+), 4 deletions(-)
> > > >
> > > > diff --git a/mm/vmscan.c b/mm/vmscan.c
> > > > index 4f9c854ce6cc..03631cedb3ab 100644
> > > > --- a/mm/vmscan.c
> > > > +++ b/mm/vmscan.c
> > > > @@ -3681,15 +3681,26 @@ static bool inc_min_seq(struct lruvec
*lruvec, int type, bool can_swap)
> > > >         /* prevent cold/hot inversion if force_scan is true */
> > > >         for (zone = 0; zone < MAX_NR_ZONES; zone++) {
> > > >                 struct list_head *head =
&lrugen->folios[old_gen][type][zone];
> > > > +               struct folio *prev = NULL;
> > > >
> > > > -               while (!list_empty(head)) {
> > > > -                       struct folio *folio = lru_to_folio(head);
> > > > +               if (!list_empty(head))
> > > > +                       prev = lru_to_folio(head);
> > > > +
> > > > +               while (prev) {
> > > > +                       struct folio *folio = prev;
> > > >
> > > >
 VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
> > > >
 VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio);
> > > >
 VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio);
> > > >                         VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio)
!= zone, folio);
> > > >
> > > > +                       if (unlikely(list_is_first(&folio->lru,
head))) {
> > > > +                               prev = NULL;
> > > > +                       } else {
> > > > +                               prev = lru_to_folio(&folio->lru);
> > > > +                               prefetchw(&prev->flags);
> > > > +                       }
> > >
> > > This makes the code flow much harder to follow. Also for architecture
> > > that does not support prefetch, this will be a net loss.
> > >
> > > Can you use refetchw_prev_lru_folio() instead? It will make the code
> > > much easier to follow. It also turns into no-op when prefetch is not
> > > supported.
> > >
> > > Chris
> > >
> >
> > Hi Chris,
> >
> > Thanks for the suggestion.
> >
> > Yes, that's doable, I made it this way because in previous series (V1
> > & V2) I applied the bulk move patch first which needed and introduced
> > the `prev` variable here, so the prefetch logic just used it.
> > For V3 I did a rebase and moved the prefetch commit to be the first
> > one, since it seems to be the most effective one, and just kept the
>
> Maybe something like this? Totally not tested. Feel free to use it any
way you want.
>
> Chris
>
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 4f9c854ce6cc..2100e786ccc6 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -3684,6 +3684,7 @@ static bool inc_min_seq(struct lruvec *lruvec, int
type, bool can_swap)
>
>                 while (!list_empty(head)) {
>                         struct folio *folio = lru_to_folio(head);
> +                       prefetchw_prev_lru_folio(folio, head, flags);
>
>
 VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
>                         VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio),
folio);
> @@ -4346,7 +4347,10 @@ static int scan_folios(struct lruvec *lruvec,
struct scan_control *sc,
>
>                 while (!list_empty(head)) {
>                         struct folio *folio = lru_to_folio(head);
> -                       int delta = folio_nr_pages(folio);
> +                       int delta;
> +
> +                       prefetchw_prev_lru_folio(folio, head, flags);
> +                       delta = folio_nr_pages(folio);
>
>
 VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
>                         VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio),
folio);
>

Thanks!

Actually, if the benefits from 2/3 and 3/3 are trivial compared to their
complexity and not appealing, then let's keep only the prefetch one, which
will be just a one-liner change with a good result.
Chris Li Jan. 26, 2024, 9:19 p.m. UTC | #5
On Fri, Jan 26, 2024 at 2:31 AM Kairui Song <ryncsn@gmail.com> wrote:
>

> > > >
> > > > This makes the code flow much harder to follow. Also for architecture
> > > > that does not support prefetch, this will be a net loss.
> > > >
> > > > Can you use refetchw_prev_lru_folio() instead? It will make the code
> > > > much easier to follow. It also turns into no-op when prefetch is not
> > > > supported.
> > > >
> > > > Chris
> > > >
> > >
> > > Hi Chris,
> > >
> > > Thanks for the suggestion.
> > >
> > > Yes, that's doable, I made it this way because in previous series (V1
> > > & V2) I applied the bulk move patch first which needed and introduced
> > > the `prev` variable here, so the prefetch logic just used it.
> > > For V3 I did a rebase and moved the prefetch commit to be the first
> > > one, since it seems to be the most effective one, and just kept the
> >
> > Maybe something like this? Totally not tested. Feel free to use it any way you want.
> >
> > Chris
> >
> > diff --git a/mm/vmscan.c b/mm/vmscan.c
> > index 4f9c854ce6cc..2100e786ccc6 100644
> > --- a/mm/vmscan.c
> > +++ b/mm/vmscan.c
> > @@ -3684,6 +3684,7 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap)
> >
> >                 while (!list_empty(head)) {
> >                         struct folio *folio = lru_to_folio(head);
> > +                       prefetchw_prev_lru_folio(folio, head, flags);
> >
> >                         VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
> >                         VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio);
> > @@ -4346,7 +4347,10 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
> >
> >                 while (!list_empty(head)) {
> >                         struct folio *folio = lru_to_folio(head);
> > -                       int delta = folio_nr_pages(folio);
> > +                       int delta;
> > +
> > +                       prefetchw_prev_lru_folio(folio, head, flags);
> > +                       delta = folio_nr_pages(folio);
> >
> >                         VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
> >                         VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio);
> >
>
> Thanks!
>
> Actually if benefits from 2/3 and 3/3 is trivial compared to the complexity and not appealing, then let's only keep the prefetch one, which will be just a one liner change with good result.

That is great. I did take a look at 2/3 and 3/3 and came to the same
conclusion regarding the complexity part.

If you resend the one-liner for 1/3, you can consider it as having my Ack.

Chris
diff mbox series

Patch

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 4f9c854ce6cc..03631cedb3ab 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3681,15 +3681,26 @@  static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap)
 	/* prevent cold/hot inversion if force_scan is true */
 	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
 		struct list_head *head = &lrugen->folios[old_gen][type][zone];
+		struct folio *prev = NULL;
 
-		while (!list_empty(head)) {
-			struct folio *folio = lru_to_folio(head);
+		if (!list_empty(head))
+			prev = lru_to_folio(head);
+
+		while (prev) {
+			struct folio *folio = prev;
 
 			VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
 			VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio);
 			VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio);
 			VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio);
 
+			if (unlikely(list_is_first(&folio->lru, head))) {
+				prev = NULL;
+			} else {
+				prev = lru_to_folio(&folio->lru);
+				prefetchw(&prev->flags);
+			}
+
 			new_gen = folio_inc_gen(lruvec, folio, false);
 			list_move_tail(&folio->lru, &lrugen->folios[new_gen][type][zone]);
 
@@ -4341,11 +4352,15 @@  static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
 	for (i = MAX_NR_ZONES; i > 0; i--) {
 		LIST_HEAD(moved);
 		int skipped_zone = 0;
+		struct folio *prev = NULL;
 		int zone = (sc->reclaim_idx + i) % MAX_NR_ZONES;
 		struct list_head *head = &lrugen->folios[gen][type][zone];
 
-		while (!list_empty(head)) {
-			struct folio *folio = lru_to_folio(head);
+		if (!list_empty(head))
+			prev = lru_to_folio(head);
+
+		while (prev) {
+			struct folio *folio = prev;
 			int delta = folio_nr_pages(folio);
 
 			VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
@@ -4355,6 +4370,13 @@  static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
 
 			scanned += delta;
 
+			if (unlikely(list_is_first(&folio->lru, head))) {
+				prev = NULL;
+			} else {
+				prev = lru_to_folio(&folio->lru);
+				prefetchw(&prev->flags);
+			}
+
 			if (sort_folio(lruvec, folio, sc, tier))
 				sorted += delta;
 			else if (isolate_folio(lruvec, folio, sc)) {