Message ID | 1555487246-15764-1-git-send-email-huangzhaoyang@gmail.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | [RFC] mm/workingset : judge file page activity via timestamp | expand |
add Johannes and answer his previous question. @Johannes Weiner Yes. I do agree with you about the original thought of sacrificing long distance access pages when huge memory demands arise. The problem is what is the criteria of the distance, which you can find from what I comment in the patch, that is, some pages have long refault_distance while having a very short access time in between. I think the latter one should be take into consideration or as part of the finnal decision of if the page should be active/inactive. On Wed, Apr 17, 2019 at 3:48 PM Zhaoyang Huang <huangzhaoyang@gmail.com> wrote: > > From: Zhaoyang Huang <zhaoyang.huang@unisoc.com> > > This patch introduce timestamp into workingset's entry and judge if the page > is active or inactive via active_file/refault_ratio instead of refault distance. > > The original thought is coming from the logs we got from trace_printk in this > patch, we can find about 1/5 of the file pages' refault are under the > scenario[1],which will be counted as inactive as they have a long refault distance > in between access. However, we can also know from the time information that the > page refault quickly as comparing to the average refault time which is calculated > by the number of active file and refault ratio. We want to save these kinds of > pages from evicted earlier as it used to be. The refault ratio is the value > which can reflect lru's average file access frequency and also can be deemed as a > prediction of future. > > The patch is tested on an android system and reduce 30% of page faults, while > 60% of the pages remain the original status as (refault_distance < active_file) > indicates. Pages status got from ftrace during the test can refer to [2]. 
> > [1] > system_server workingset_refault: WKST_ACT[0]:rft_dis 265976, act_file 34268 rft_ratio 3047 rft_time 0 avg_rft_time 11 refault 295592 eviction 29616 secs 97 pre_secs 97 > HwBinder:922 workingset_refault: WKST_ACT[0]:rft_dis 264478, act_file 35037 rft_ratio 3070 rft_time 2 avg_rft_time 11 refault 310078 eviction 45600 secs 101 pre_secs 99 > > [2] > WKST_ACT[0]: original--INACTIVE commit--ACTIVE > WKST_ACT[1]: original--ACTIVE commit--ACTIVE > WKST_INACT[0]: original--INACTIVE commit--INACTIVE > WKST_INACT[1]: original--ACTIVE commit--INACTIVE > > Signed-off-by: Zhaoyang Huang <huangzhaoyang@gmail.com> > --- > include/linux/mmzone.h | 1 + > mm/workingset.c | 120 +++++++++++++++++++++++++++++++++++++++++++++---- > 2 files changed, 112 insertions(+), 9 deletions(-) > > diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h > index 32699b2..6f30673 100644 > --- a/include/linux/mmzone.h > +++ b/include/linux/mmzone.h > @@ -240,6 +240,7 @@ struct lruvec { > atomic_long_t inactive_age; > /* Refaults at the time of last reclaim cycle */ > unsigned long refaults; > + atomic_long_t refaults_ratio; > #ifdef CONFIG_MEMCG > struct pglist_data *pgdat; > #endif > diff --git a/mm/workingset.c b/mm/workingset.c > index 40ee02c..66c177b 100644 > --- a/mm/workingset.c > +++ b/mm/workingset.c > @@ -160,6 +160,21 @@ > MEM_CGROUP_ID_SHIFT) > #define EVICTION_MASK (~0UL >> EVICTION_SHIFT) > > +#ifdef CONFIG_64BIT > +#define EVICTION_SECS_POS_SHIFT 20 > +#define EVICTION_SECS_SHRINK_SHIFT 4 > +#define EVICTION_SECS_POS_MASK ((1UL << EVICTION_SECS_POS_SHIFT) - 1) > +#else > +#ifndef CONFIG_MEMCG > +#define EVICTION_SECS_POS_SHIFT 12 > +#define EVICTION_SECS_SHRINK_SHIFT 4 > +#define EVICTION_SECS_POS_MASK ((1UL << EVICTION_SECS_POS_SHIFT) - 1) > +#else > +#define EVICTION_SECS_POS_SHIFT 0 > +#define EVICTION_SECS_SHRINK_SHIFT 0 > +#define NO_SECS_IN_WORKINGSET > +#endif > +#endif > /* > * Eviction timestamps need to be able to cover the full range of > * actionable refaults. 
However, bits are tight in the radix tree > @@ -169,10 +184,54 @@ > * evictions into coarser buckets by shaving off lower timestamp bits. > */ > static unsigned int bucket_order __read_mostly; > - > +#ifdef NO_SECS_IN_WORKINGSET > +static void pack_secs(unsigned long *peviction) { } > +static unsigned int unpack_secs(unsigned long entry) {return 0; } > +#else > +/* > + * Shrink the timestamp according to its value and store it together > + * with the shrink size in the entry. > + */ > +static void pack_secs(unsigned long *peviction) > +{ > + unsigned int secs; > + unsigned long eviction; > + int order; > + int secs_shrink_size; > + struct timespec ts; > + > + get_monotonic_boottime(&ts); > + secs = (unsigned int)ts.tv_sec ? (unsigned int)ts.tv_sec : 1; > + order = get_count_order(secs); > + secs_shrink_size = (order <= EVICTION_SECS_POS_SHIFT) > + ? 0 : (order - EVICTION_SECS_POS_SHIFT); > + > + eviction = *peviction; > + eviction = (eviction << EVICTION_SECS_POS_SHIFT) > + | ((secs >> secs_shrink_size) & EVICTION_SECS_POS_MASK); > + eviction = (eviction << EVICTION_SECS_SHRINK_SHIFT) | (secs_shrink_size & 0xf); > + *peviction = eviction; > +} > +/* > + * Unpack the second from the entry and restore the value according to the > + * shrink size. 
> + */ > +static unsigned int unpack_secs(unsigned long entry) > +{ > + unsigned int secs; > + int secs_shrink_size; > + > + secs_shrink_size = entry & ((1 << EVICTION_SECS_SHRINK_SHIFT) - 1); > + entry >>= EVICTION_SECS_SHRINK_SHIFT; > + secs = entry & EVICTION_SECS_POS_MASK; > + secs = secs << secs_shrink_size; > + return secs; > +} > +#endif > static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction) > { > eviction >>= bucket_order; > + pack_secs(&eviction); > eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid; > eviction = (eviction << NODES_SHIFT) | pgdat->node_id; > eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT); > @@ -181,20 +240,24 @@ static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction) > } > > static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, > - unsigned long *evictionp) > + unsigned long *evictionp, unsigned int *prev_secs) > { > unsigned long entry = (unsigned long)shadow; > int memcgid, nid; > + unsigned int secs; > > entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT; > nid = entry & ((1UL << NODES_SHIFT) - 1); > entry >>= NODES_SHIFT; > memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1); > entry >>= MEM_CGROUP_ID_SHIFT; > + secs = unpack_secs(entry); > + entry >>= (EVICTION_SECS_POS_SHIFT + EVICTION_SECS_SHRINK_SHIFT); > > *memcgidp = memcgid; > *pgdat = NODE_DATA(nid); > *evictionp = entry << bucket_order; > + *prev_secs = secs; > } > > /** > @@ -242,9 +305,22 @@ bool workingset_refault(void *shadow) > unsigned long refault; > struct pglist_data *pgdat; > int memcgid; > +#ifndef NO_SECS_IN_WORKINGSET > + unsigned long avg_refault_time; > + unsigned long refault_time; > + int tradition; > + unsigned int prev_secs; > + unsigned int secs; > + unsigned long refaults_ratio; > +#endif > + struct timespec ts; > + /* > + convert jiffies to second > + */ > + get_monotonic_boottime(&ts); > + secs = (unsigned int)ts.tv_sec ? 
(unsigned int)ts.tv_sec : 1; > > - unpack_shadow(shadow, &memcgid, &pgdat, &eviction); > - > + unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &prev_secs); > rcu_read_lock(); > /* > * Look up the memcg associated with the stored ID. It might > @@ -288,14 +364,37 @@ bool workingset_refault(void *shadow) > * list is not a problem. > */ > refault_distance = (refault - eviction) & EVICTION_MASK; > - > inc_lruvec_state(lruvec, WORKINGSET_REFAULT); > - > - if (refault_distance <= active_file) { > +#ifndef NO_SECS_IN_WORKINGSET > + refaults_ratio = (atomic_long_read(&lruvec->inactive_age) + 1) / secs; > + atomic_long_set(&lruvec->refaults_ratio, refaults_ratio); > + refault_time = secs - prev_secs; > + avg_refault_time = active_file / refaults_ratio; > + tradition = !!(refault_distance < active_file); > + if (refault_time <= avg_refault_time) { > +#else > + if (refault_distance < active_file) { > +#endif > inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE); > +#ifndef NO_SECS_IN_WORKINGSET > + trace_printk("WKST_ACT[%d]:rft_dis %ld, act_file %ld \ > + rft_ratio %ld rft_time %ld avg_rft_time %ld \ > + refault %ld eviction %ld secs %d pre_secs %d\n", > + tradition, refault_distance, active_file, > + refaults_ratio, refault_time, avg_refault_time, > + refault, eviction, secs, prev_secs); > +#endif > rcu_read_unlock(); > return true; > } > +#ifndef NO_SECS_IN_WORKINGSET > + trace_printk("WKST_INACT[%d]:rft_dis %ld, act_file %ld \ > + rft_ratio %ld rft_time %ld avg_rft_time %ld \ > + refault %ld eviction %ld secs %d pre_secs %d\n", > + tradition, refault_distance, active_file, > + refaults_ratio, refault_time, avg_refault_time, > + refault, eviction, secs, prev_secs); > +#endif > rcu_read_unlock(); > return false; > } > @@ -513,7 +612,9 @@ static int __init workingset_init(void) > unsigned int max_order; > int ret; > > - BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT); > + BUILD_BUG_ON(BITS_PER_LONG < (EVICTION_SHIFT > + + EVICTION_SECS_POS_SHIFT > + + EVICTION_SECS_SHRINK_SHIFT)); 
> /* > * Calculate the eviction bucket size to cover the longest > * actionable refault distance, which is currently half of > @@ -521,7 +622,8 @@ static int __init workingset_init(void) > * some more pages at runtime, so keep working with up to > * double the initial memory by using totalram_pages as-is. > */ > - timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT; > + timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT > + - EVICTION_SECS_POS_SHIFT - EVICTION_SECS_SHRINK_SHIFT; > max_order = fls_long(totalram_pages - 1); > if (max_order > timestamp_bits) > bucket_order = max_order - timestamp_bits; > -- > 1.9.1 >
Hi, I do not see http://lkml.kernel.org/r/1554348617-12897-1-git-send-email-huangzhaoyang@gmail.com discussion reaching a conlusion to change the current workingset implementation. Therefore is there any reason to post a new version of the patch? If yes it would be really great to see a short summary about how this version is different from the previous one and how all the review feedback has been addressed. On Wed 17-04-19 15:47:26, Zhaoyang Huang wrote: > From: Zhaoyang Huang <zhaoyang.huang@unisoc.com> > > This patch introduce timestamp into workingset's entry and judge if the page > is active or inactive via active_file/refault_ratio instead of refault distance. > > The original thought is coming from the logs we got from trace_printk in this > patch, we can find about 1/5 of the file pages' refault are under the > scenario[1],which will be counted as inactive as they have a long refault distance > in between access. However, we can also know from the time information that the > page refault quickly as comparing to the average refault time which is calculated > by the number of active file and refault ratio. We want to save these kinds of > pages from evicted earlier as it used to be. The refault ratio is the value > which can reflect lru's average file access frequency and also can be deemed as a > prediction of future. > > The patch is tested on an android system and reduce 30% of page faults, while > 60% of the pages remain the original status as (refault_distance < active_file) > indicates. Pages status got from ftrace during the test can refer to [2]. 
> > [1] > system_server workingset_refault: WKST_ACT[0]:rft_dis 265976, act_file 34268 rft_ratio 3047 rft_time 0 avg_rft_time 11 refault 295592 eviction 29616 secs 97 pre_secs 97 > HwBinder:922 workingset_refault: WKST_ACT[0]:rft_dis 264478, act_file 35037 rft_ratio 3070 rft_time 2 avg_rft_time 11 refault 310078 eviction 45600 secs 101 pre_secs 99 > > [2] > WKST_ACT[0]: original--INACTIVE commit--ACTIVE > WKST_ACT[1]: original--ACTIVE commit--ACTIVE > WKST_INACT[0]: original--INACTIVE commit--INACTIVE > WKST_INACT[1]: original--ACTIVE commit--INACTIVE > > Signed-off-by: Zhaoyang Huang <huangzhaoyang@gmail.com> > --- > include/linux/mmzone.h | 1 + > mm/workingset.c | 120 +++++++++++++++++++++++++++++++++++++++++++++---- > 2 files changed, 112 insertions(+), 9 deletions(-) > > diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h > index 32699b2..6f30673 100644 > --- a/include/linux/mmzone.h > +++ b/include/linux/mmzone.h > @@ -240,6 +240,7 @@ struct lruvec { > atomic_long_t inactive_age; > /* Refaults at the time of last reclaim cycle */ > unsigned long refaults; > + atomic_long_t refaults_ratio; > #ifdef CONFIG_MEMCG > struct pglist_data *pgdat; > #endif > diff --git a/mm/workingset.c b/mm/workingset.c > index 40ee02c..66c177b 100644 > --- a/mm/workingset.c > +++ b/mm/workingset.c > @@ -160,6 +160,21 @@ > MEM_CGROUP_ID_SHIFT) > #define EVICTION_MASK (~0UL >> EVICTION_SHIFT) > > +#ifdef CONFIG_64BIT > +#define EVICTION_SECS_POS_SHIFT 20 > +#define EVICTION_SECS_SHRINK_SHIFT 4 > +#define EVICTION_SECS_POS_MASK ((1UL << EVICTION_SECS_POS_SHIFT) - 1) > +#else > +#ifndef CONFIG_MEMCG > +#define EVICTION_SECS_POS_SHIFT 12 > +#define EVICTION_SECS_SHRINK_SHIFT 4 > +#define EVICTION_SECS_POS_MASK ((1UL << EVICTION_SECS_POS_SHIFT) - 1) > +#else > +#define EVICTION_SECS_POS_SHIFT 0 > +#define EVICTION_SECS_SHRINK_SHIFT 0 > +#define NO_SECS_IN_WORKINGSET > +#endif > +#endif > /* > * Eviction timestamps need to be able to cover the full range of > * actionable refaults. 
However, bits are tight in the radix tree > @@ -169,10 +184,54 @@ > * evictions into coarser buckets by shaving off lower timestamp bits. > */ > static unsigned int bucket_order __read_mostly; > - > +#ifdef NO_SECS_IN_WORKINGSET > +static void pack_secs(unsigned long *peviction) { } > +static unsigned int unpack_secs(unsigned long entry) {return 0; } > +#else > +/* > + * Shrink the timestamp according to its value and store it together > + * with the shrink size in the entry. > + */ > +static void pack_secs(unsigned long *peviction) > +{ > + unsigned int secs; > + unsigned long eviction; > + int order; > + int secs_shrink_size; > + struct timespec ts; > + > + get_monotonic_boottime(&ts); > + secs = (unsigned int)ts.tv_sec ? (unsigned int)ts.tv_sec : 1; > + order = get_count_order(secs); > + secs_shrink_size = (order <= EVICTION_SECS_POS_SHIFT) > + ? 0 : (order - EVICTION_SECS_POS_SHIFT); > + > + eviction = *peviction; > + eviction = (eviction << EVICTION_SECS_POS_SHIFT) > + | ((secs >> secs_shrink_size) & EVICTION_SECS_POS_MASK); > + eviction = (eviction << EVICTION_SECS_SHRINK_SHIFT) | (secs_shrink_size & 0xf); > + *peviction = eviction; > +} > +/* > + * Unpack the second from the entry and restore the value according to the > + * shrink size. 
> + */ > +static unsigned int unpack_secs(unsigned long entry) > +{ > + unsigned int secs; > + int secs_shrink_size; > + > + secs_shrink_size = entry & ((1 << EVICTION_SECS_SHRINK_SHIFT) - 1); > + entry >>= EVICTION_SECS_SHRINK_SHIFT; > + secs = entry & EVICTION_SECS_POS_MASK; > + secs = secs << secs_shrink_size; > + return secs; > +} > +#endif > static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction) > { > eviction >>= bucket_order; > + pack_secs(&eviction); > eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid; > eviction = (eviction << NODES_SHIFT) | pgdat->node_id; > eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT); > @@ -181,20 +240,24 @@ static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction) > } > > static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, > - unsigned long *evictionp) > + unsigned long *evictionp, unsigned int *prev_secs) > { > unsigned long entry = (unsigned long)shadow; > int memcgid, nid; > + unsigned int secs; > > entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT; > nid = entry & ((1UL << NODES_SHIFT) - 1); > entry >>= NODES_SHIFT; > memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1); > entry >>= MEM_CGROUP_ID_SHIFT; > + secs = unpack_secs(entry); > + entry >>= (EVICTION_SECS_POS_SHIFT + EVICTION_SECS_SHRINK_SHIFT); > > *memcgidp = memcgid; > *pgdat = NODE_DATA(nid); > *evictionp = entry << bucket_order; > + *prev_secs = secs; > } > > /** > @@ -242,9 +305,22 @@ bool workingset_refault(void *shadow) > unsigned long refault; > struct pglist_data *pgdat; > int memcgid; > +#ifndef NO_SECS_IN_WORKINGSET > + unsigned long avg_refault_time; > + unsigned long refault_time; > + int tradition; > + unsigned int prev_secs; > + unsigned int secs; > + unsigned long refaults_ratio; > +#endif > + struct timespec ts; > + /* > + convert jiffies to second > + */ > + get_monotonic_boottime(&ts); > + secs = (unsigned int)ts.tv_sec ? 
(unsigned int)ts.tv_sec : 1; > > - unpack_shadow(shadow, &memcgid, &pgdat, &eviction); > - > + unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &prev_secs); > rcu_read_lock(); > /* > * Look up the memcg associated with the stored ID. It might > @@ -288,14 +364,37 @@ bool workingset_refault(void *shadow) > * list is not a problem. > */ > refault_distance = (refault - eviction) & EVICTION_MASK; > - > inc_lruvec_state(lruvec, WORKINGSET_REFAULT); > - > - if (refault_distance <= active_file) { > +#ifndef NO_SECS_IN_WORKINGSET > + refaults_ratio = (atomic_long_read(&lruvec->inactive_age) + 1) / secs; > + atomic_long_set(&lruvec->refaults_ratio, refaults_ratio); > + refault_time = secs - prev_secs; > + avg_refault_time = active_file / refaults_ratio; > + tradition = !!(refault_distance < active_file); > + if (refault_time <= avg_refault_time) { > +#else > + if (refault_distance < active_file) { > +#endif > inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE); > +#ifndef NO_SECS_IN_WORKINGSET > + trace_printk("WKST_ACT[%d]:rft_dis %ld, act_file %ld \ > + rft_ratio %ld rft_time %ld avg_rft_time %ld \ > + refault %ld eviction %ld secs %d pre_secs %d\n", > + tradition, refault_distance, active_file, > + refaults_ratio, refault_time, avg_refault_time, > + refault, eviction, secs, prev_secs); > +#endif > rcu_read_unlock(); > return true; > } > +#ifndef NO_SECS_IN_WORKINGSET > + trace_printk("WKST_INACT[%d]:rft_dis %ld, act_file %ld \ > + rft_ratio %ld rft_time %ld avg_rft_time %ld \ > + refault %ld eviction %ld secs %d pre_secs %d\n", > + tradition, refault_distance, active_file, > + refaults_ratio, refault_time, avg_refault_time, > + refault, eviction, secs, prev_secs); > +#endif > rcu_read_unlock(); > return false; > } > @@ -513,7 +612,9 @@ static int __init workingset_init(void) > unsigned int max_order; > int ret; > > - BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT); > + BUILD_BUG_ON(BITS_PER_LONG < (EVICTION_SHIFT > + + EVICTION_SECS_POS_SHIFT > + + EVICTION_SECS_SHRINK_SHIFT)); 
> /* > * Calculate the eviction bucket size to cover the longest > * actionable refault distance, which is currently half of > @@ -521,7 +622,8 @@ static int __init workingset_init(void) > * some more pages at runtime, so keep working with up to > * double the initial memory by using totalram_pages as-is. > */ > - timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT; > + timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT > + - EVICTION_SECS_POS_SHIFT - EVICTION_SECS_SHRINK_SHIFT; > max_order = fls_long(totalram_pages - 1); > if (max_order > timestamp_bits) > bucket_order = max_order - timestamp_bits; > -- > 1.9.1
fix one mailbox and update for some information Comparing to http://lkml.kernel.org/r/1554348617-12897-1-git-send-email-huangzhaoyang@gmail.com, this commit fix the packing order error and add trace_printk for reference debug information. For johannes's comments, please find bellowing for my feedback. On Wed, Apr 17, 2019 at 3:59 PM Zhaoyang Huang <huangzhaoyang@gmail.com> wrote: > > add Johannes and answer his previous question. > > @Johannes Weiner > Yes. I do agree with you about the original thought of sacrificing > long distance access pages when huge memory demands arise. The problem > is what is the criteria of the distance, which you can find from what > I comment in the patch, that is, some pages have long refault_distance > while having a very short access time in between. I think the latter > one should be take into consideration or as part of the finnal > decision of if the page should be active/inactive. > > On Wed, Apr 17, 2019 at 3:48 PM Zhaoyang Huang <huangzhaoyang@gmail.com> wrote: > > > > From: Zhaoyang Huang <zhaoyang.huang@unisoc.com> > > > > This patch introduce timestamp into workingset's entry and judge if the page > > is active or inactive via active_file/refault_ratio instead of refault distance. > > > > The original thought is coming from the logs we got from trace_printk in this > > patch, we can find about 1/5 of the file pages' refault are under the > > scenario[1],which will be counted as inactive as they have a long refault distance > > in between access. However, we can also know from the time information that the > > page refault quickly as comparing to the average refault time which is calculated > > by the number of active file and refault ratio. We want to save these kinds of > > pages from evicted earlier as it used to be. The refault ratio is the value > > which can reflect lru's average file access frequency and also can be deemed as a > > prediction of future. 
> > > > The patch is tested on an android system and reduce 30% of page faults, while > > 60% of the pages remain the original status as (refault_distance < active_file) > > indicates. Pages status got from ftrace during the test can refer to [2]. > > > > [1] > > system_server workingset_refault: WKST_ACT[0]:rft_dis 265976, act_file 34268 rft_ratio 3047 rft_time 0 avg_rft_time 11 refault 295592 eviction 29616 secs 97 pre_secs 97 > > HwBinder:922 workingset_refault: WKST_ACT[0]:rft_dis 264478, act_file 35037 rft_ratio 3070 rft_time 2 avg_rft_time 11 refault 310078 eviction 45600 secs 101 pre_secs 99 > > > > [2] > > WKST_ACT[0]: original--INACTIVE commit--ACTIVE > > WKST_ACT[1]: original--ACTIVE commit--ACTIVE > > WKST_INACT[0]: original--INACTIVE commit--INACTIVE > > WKST_INACT[1]: original--ACTIVE commit--INACTIVE > > > > Signed-off-by: Zhaoyang Huang <huangzhaoyang@gmail.com> > > --- > > include/linux/mmzone.h | 1 + > > mm/workingset.c | 120 +++++++++++++++++++++++++++++++++++++++++++++---- > > 2 files changed, 112 insertions(+), 9 deletions(-) > > > > diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h > > index 32699b2..6f30673 100644 > > --- a/include/linux/mmzone.h > > +++ b/include/linux/mmzone.h > > @@ -240,6 +240,7 @@ struct lruvec { > > atomic_long_t inactive_age; > > /* Refaults at the time of last reclaim cycle */ > > unsigned long refaults; > > + atomic_long_t refaults_ratio; > > #ifdef CONFIG_MEMCG > > struct pglist_data *pgdat; > > #endif > > diff --git a/mm/workingset.c b/mm/workingset.c > > index 40ee02c..66c177b 100644 > > --- a/mm/workingset.c > > +++ b/mm/workingset.c > > @@ -160,6 +160,21 @@ > > MEM_CGROUP_ID_SHIFT) > > #define EVICTION_MASK (~0UL >> EVICTION_SHIFT) > > > > +#ifdef CONFIG_64BIT > > +#define EVICTION_SECS_POS_SHIFT 20 > > +#define EVICTION_SECS_SHRINK_SHIFT 4 > > +#define EVICTION_SECS_POS_MASK ((1UL << EVICTION_SECS_POS_SHIFT) - 1) > > +#else > > +#ifndef CONFIG_MEMCG > > +#define EVICTION_SECS_POS_SHIFT 12 > > +#define 
EVICTION_SECS_SHRINK_SHIFT 4 > > +#define EVICTION_SECS_POS_MASK ((1UL << EVICTION_SECS_POS_SHIFT) - 1) > > +#else > > +#define EVICTION_SECS_POS_SHIFT 0 > > +#define EVICTION_SECS_SHRINK_SHIFT 0 > > +#define NO_SECS_IN_WORKINGSET > > +#endif > > +#endif > > /* > > * Eviction timestamps need to be able to cover the full range of > > * actionable refaults. However, bits are tight in the radix tree > > @@ -169,10 +184,54 @@ > > * evictions into coarser buckets by shaving off lower timestamp bits. > > */ > > static unsigned int bucket_order __read_mostly; > > - > > +#ifdef NO_SECS_IN_WORKINGSET > > +static void pack_secs(unsigned long *peviction) { } > > +static unsigned int unpack_secs(unsigned long entry) {return 0; } > > +#else > > +/* > > + * Shrink the timestamp according to its value and store it together > > + * with the shrink size in the entry. > > + */ > > +static void pack_secs(unsigned long *peviction) > > +{ > > + unsigned int secs; > > + unsigned long eviction; > > + int order; > > + int secs_shrink_size; > > + struct timespec ts; > > + > > + get_monotonic_boottime(&ts); > > + secs = (unsigned int)ts.tv_sec ? (unsigned int)ts.tv_sec : 1; > > + order = get_count_order(secs); > > + secs_shrink_size = (order <= EVICTION_SECS_POS_SHIFT) > > + ? 0 : (order - EVICTION_SECS_POS_SHIFT); > > + > > + eviction = *peviction; > > + eviction = (eviction << EVICTION_SECS_POS_SHIFT) > > + | ((secs >> secs_shrink_size) & EVICTION_SECS_POS_MASK); > > + eviction = (eviction << EVICTION_SECS_SHRINK_SHIFT) | (secs_shrink_size & 0xf); > > + *peviction = eviction; > > +} > > +/* > > + * Unpack the second from the entry and restore the value according to the > > + * shrink size. 
> > + */ > > +static unsigned int unpack_secs(unsigned long entry) > > +{ > > + unsigned int secs; > > + int secs_shrink_size; > > + > > + secs_shrink_size = entry & ((1 << EVICTION_SECS_SHRINK_SHIFT) - 1); > > + entry >>= EVICTION_SECS_SHRINK_SHIFT; > > + secs = entry & EVICTION_SECS_POS_MASK; > > + secs = secs << secs_shrink_size; > > + return secs; > > +} > > +#endif > > static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction) > > { > > eviction >>= bucket_order; > > + pack_secs(&eviction); > > eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid; > > eviction = (eviction << NODES_SHIFT) | pgdat->node_id; > > eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT); > > @@ -181,20 +240,24 @@ static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction) > > } > > > > static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, > > - unsigned long *evictionp) > > + unsigned long *evictionp, unsigned int *prev_secs) > > { > > unsigned long entry = (unsigned long)shadow; > > int memcgid, nid; > > + unsigned int secs; > > > > entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT; > > nid = entry & ((1UL << NODES_SHIFT) - 1); > > entry >>= NODES_SHIFT; > > memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1); > > entry >>= MEM_CGROUP_ID_SHIFT; > > + secs = unpack_secs(entry); > > + entry >>= (EVICTION_SECS_POS_SHIFT + EVICTION_SECS_SHRINK_SHIFT); > > > > *memcgidp = memcgid; > > *pgdat = NODE_DATA(nid); > > *evictionp = entry << bucket_order; > > + *prev_secs = secs; > > } > > > > /** > > @@ -242,9 +305,22 @@ bool workingset_refault(void *shadow) > > unsigned long refault; > > struct pglist_data *pgdat; > > int memcgid; > > +#ifndef NO_SECS_IN_WORKINGSET > > + unsigned long avg_refault_time; > > + unsigned long refault_time; > > + int tradition; > > + unsigned int prev_secs; > > + unsigned int secs; > > + unsigned long refaults_ratio; > > +#endif > > + struct timespec ts; > > + /* > > + convert jiffies to second > > + */ > > + 
get_monotonic_boottime(&ts); > > + secs = (unsigned int)ts.tv_sec ? (unsigned int)ts.tv_sec : 1; > > > > - unpack_shadow(shadow, &memcgid, &pgdat, &eviction); > > - > > + unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &prev_secs); > > rcu_read_lock(); > > /* > > * Look up the memcg associated with the stored ID. It might > > @@ -288,14 +364,37 @@ bool workingset_refault(void *shadow) > > * list is not a problem. > > */ > > refault_distance = (refault - eviction) & EVICTION_MASK; > > - > > inc_lruvec_state(lruvec, WORKINGSET_REFAULT); > > - > > - if (refault_distance <= active_file) { > > +#ifndef NO_SECS_IN_WORKINGSET > > + refaults_ratio = (atomic_long_read(&lruvec->inactive_age) + 1) / secs; > > + atomic_long_set(&lruvec->refaults_ratio, refaults_ratio); > > + refault_time = secs - prev_secs; > > + avg_refault_time = active_file / refaults_ratio; > > + tradition = !!(refault_distance < active_file); > > + if (refault_time <= avg_refault_time) { > > +#else > > + if (refault_distance < active_file) { > > +#endif > > inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE); > > +#ifndef NO_SECS_IN_WORKINGSET > > + trace_printk("WKST_ACT[%d]:rft_dis %ld, act_file %ld \ > > + rft_ratio %ld rft_time %ld avg_rft_time %ld \ > > + refault %ld eviction %ld secs %d pre_secs %d\n", > > + tradition, refault_distance, active_file, > > + refaults_ratio, refault_time, avg_refault_time, > > + refault, eviction, secs, prev_secs); > > +#endif > > rcu_read_unlock(); > > return true; > > } > > +#ifndef NO_SECS_IN_WORKINGSET > > + trace_printk("WKST_INACT[%d]:rft_dis %ld, act_file %ld \ > > + rft_ratio %ld rft_time %ld avg_rft_time %ld \ > > + refault %ld eviction %ld secs %d pre_secs %d\n", > > + tradition, refault_distance, active_file, > > + refaults_ratio, refault_time, avg_refault_time, > > + refault, eviction, secs, prev_secs); > > +#endif > > rcu_read_unlock(); > > return false; > > } > > @@ -513,7 +612,9 @@ static int __init workingset_init(void) > > unsigned int max_order; > > int 
ret; > > > > - BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT); > > + BUILD_BUG_ON(BITS_PER_LONG < (EVICTION_SHIFT > > + + EVICTION_SECS_POS_SHIFT > > + + EVICTION_SECS_SHRINK_SHIFT)); > > /* > > * Calculate the eviction bucket size to cover the longest > > * actionable refault distance, which is currently half of > > @@ -521,7 +622,8 @@ static int __init workingset_init(void) > > * some more pages at runtime, so keep working with up to > > * double the initial memory by using totalram_pages as-is. > > */ > > - timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT; > > + timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT > > + - EVICTION_SECS_POS_SHIFT - EVICTION_SECS_SHRINK_SHIFT; > > max_order = fls_long(totalram_pages - 1); > > if (max_order > timestamp_bits) > > bucket_order = max_order - timestamp_bits; > > -- > > 1.9.1 > >
On Wed 17-04-19 18:55:15, Zhaoyang Huang wrote: > fix one mailbox and update for some information > > Comparing to http://lkml.kernel.org/r/1554348617-12897-1-git-send-email-huangzhaoyang@gmail.com, > this commit fix the packing order error and add trace_printk for > reference debug information. > > For johannes's comments, please find bellowing for my feedback. OK, this suggests there is no strong reason to post a new version of the patch then. Please do not fragment discussion and continue discussing in the original email thread until there is some conclusion reached. Thanks!
sorry for the confusion. What I mean is the basic idea doesn't change as replacing the refault criteria from refault_distance to timestamp. But the detailed implementation changed a lot, including fix bugs, update the way of packing the timestamp, 32bit/64bit differentiation etc. So it makes sense for starting a new context. On Wed, Apr 17, 2019 at 7:06 PM Michal Hocko <mhocko@kernel.org> wrote: > > On Wed 17-04-19 18:55:15, Zhaoyang Huang wrote: > > fix one mailbox and update for some information > > > > Comparing to http://lkml.kernel.org/r/1554348617-12897-1-git-send-email-huangzhaoyang@gmail.com, > > this commit fix the packing order error and add trace_printk for > > reference debug information. > > > > For johannes's comments, please find bellowing for my feedback. > > OK, this suggests there is no strong reason to poset a new version of > the patch then. Please do not fragment discussion and continue > discussing in the original email thread until there is some conclusion > reached. > > Thanks! > -- > Michal Hocko > SUSE Labs
On Wed 17-04-19 19:36:21, Zhaoyang Huang wrote: > sorry for the confusion. What I mean is the basic idea doesn't change > as replacing the refault criteria from refault_distance to timestamp. > But the detailed implementation changed a lot, including fix bugs, > update the way of packing the timestamp, 32bit/64bit differentiation > etc. So it makes sense for starting a new context. Not really. My take away from the previous discussion is that Johannes has questioned the timestamping approach itself. I wasn't following very closely so I might be wrong here but if that is really the case then it doesn't make much sense to improve the implementation if there is no consensus on the approach itself.
repost the feedback by under Johannes's comment When something like a higher-order allocation drops a large number of file pages, it's *intentional* that the pages that were evicted before them become less valuable and less likely to be activated on refault. There is a finite amount of in-memory LRU space and the pages that have been evicted the most recently have precedence because they have the highest proven access frequency. [HZY]: Yes. I do agree with you about the original thought of sacrificing long distance access pages when huge memory demands arise. The problem is what is the criteria of selecting the page, which you can find from what I comment in the patch, that is, some pages have long refault_distance while having a very short access time in between. Of course, when a large amount of the cache that was pushed out in between is not re-used again, and don't claim their space in memory, it would be great if we could then activate the older pages that *are* re-used again in their stead.But that would require us being able to look into the future. When an old page refaults, we don't know if a younger page is still going to refault with a shorter refault distance or not. If it won't, then we were right to activate it. If it will refault, then we put something on the active list whose reuse frequency is too low to be able to fit into memory, and we thrash the hottest pages in the system. [HZY]: We do NOT use the absolute timestamp when page refaulting to indicate young or old of the page and thus to decide the position of LRU. The criteria which i use is to comparing the "time duration of the page's out of cache" and "the active files shrinking time by dividing average refault ratio". I inherite the concept of deeming ACTIVE file as deficit of INACTIVE files, but use time to avoid the scenario as suggested in patch's [1]. 
As Matthew says, you are fairly randomly making refault activations more aggressive (especially with that timestamp unpacking bug), and while that expectedly boosts workload transition / startup, it comes at the cost of disrupting stable states because you can flood a very active in-ram workingset with completely cold cache pages simply because they refault uniformly wrt each other. [HZY]: I analyzed the log obtained from trace_printk; what we activate has a proven record of long refault distance but very short refault time. On Wed, Apr 17, 2019 at 7:46 PM Michal Hocko <mhocko@kernel.org> wrote: > > On Wed 17-04-19 19:36:21, Zhaoyang Huang wrote: > > sorry for the confusion. What I mean is the basic idea doesn't change > > as replacing the refault criteria from refault_distance to timestamp. > > But the detailed implementation changed a lot, including fix bugs, > > update the way of packing the timestamp, 32bit/64bit differentiation > > etc. So it makes sense for starting a new context. > > Not really. My take away from the previous discussion is that Johannes > has questioned the timestamping approach itself. I wasn't following very > closely so I might be wrong here but if that is really the case then it > doesn't make much sense to improve the implementation if there is no > consensus on the approach itself. > > -- > Michal Hocko > SUSE Labs
On Wed 17-04-19 20:26:22, Zhaoyang Huang wrote:
> repost the feedback by under Johannes's comment
Please follow up in the original email thread. Fragmenting the
discussion is exactly what I wanted to avoid...
On Wed, Apr 17, 2019 at 08:26:22PM +0800, Zhaoyang Huang wrote: [quoting Johannes here] > As Matthew says, you are fairly randomly making refault activations > more aggressive (especially with that timestamp unpacking bug), and > while that expectedly boosts workload transition / startup, it comes > at the cost of disrupting stable states because you can flood a very > active in-ram workingset with completely cold cache pages simply > because they refault uniformly wrt each other. > [HZY]: I analysis the log got from trace_printk, what we activate have > proven record of long refault distance but very short refault time. You haven't addressed my point, which is that you were only testing workloads for which your changed algorithm would improve the results. What you haven't done is shown how other workloads would be negatively affected. Once you do that, we can make a decision about whether to improve your workload by X% and penalise that other workload by Y%.
rebase the commit to latest mainline and update the code. @Matthew, with regarding to your comment, I would like to say the algorithm doesn't change at all. I do NOT judge the page's activity via an absolute time value, but still the refault distance. What I want to fix is the scenario which drop lots of file pages on this lru that leading to a big refault_distance(inactive_age) and inactivate the page. I haven't found regression of the commit yet. Could you please suggest me more test cases? Thank you! diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index fba7741..ca4ced6 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -242,6 +242,7 @@ struct lruvec { atomic_long_t inactive_age; /* Refaults at the time of last reclaim cycle */ unsigned long refaults; + atomic_long_t refaults_ratio; #ifdef CONFIG_MEMCG struct pglist_data *pgdat; #endif diff --git a/mm/workingset.c b/mm/workingset.c index 0bedf67..95683c1 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -171,6 +171,15 @@ 1 + NODES_SHIFT + MEM_CGROUP_ID_SHIFT) #define EVICTION_MASK (~0UL >> EVICTION_SHIFT) +#ifdef CONFIG_64BIT +#define EVICTION_SECS_POS_SHIFT 19 +#define EVICTION_SECS_SHRINK_SHIFT 4 +#define EVICTION_SECS_POS_MASK ((1UL << EVICTION_SECS_POS_SHIFT) - 1) +#else +#define EVICTION_SECS_POS_SHIFT 0 +#define EVICTION_SECS_SHRINK_SHIFT 0 +#define NO_SECS_IN_WORKINGSET +#endif /* * Eviction timestamps need to be able to cover the full range of * actionable refaults. However, bits are tight in the xarray @@ -180,12 +189,48 @@ * evictions into coarser buckets by shaving off lower timestamp bits. 
*/ static unsigned int bucket_order __read_mostly; - +#ifdef NO_SECS_IN_WORKINGSET +static void pack_secs(unsigned long *peviction) { } +static unsigned int unpack_secs(unsigned long entry) {return 0; } +#else +static void pack_secs(unsigned long *peviction) +{ + unsigned int secs; + unsigned long eviction; + int order; + int secs_shrink_size; + struct timespec64 ts; + + ktime_get_boottime_ts64(&ts); + secs = (unsigned int)ts.tv_sec ? (unsigned int)ts.tv_sec : 1; + order = get_count_order(secs); + secs_shrink_size = (order <= EVICTION_SECS_POS_SHIFT) + ? 0 : (order - EVICTION_SECS_POS_SHIFT); + + eviction = *peviction; + eviction = (eviction << EVICTION_SECS_POS_SHIFT) + | ((secs >> secs_shrink_size) & EVICTION_SECS_POS_MASK); + eviction = (eviction << EVICTION_SECS_SHRINK_SHIFT) | (secs_shrink_size & 0xf); + *peviction = eviction; +} +static unsigned int unpack_secs(unsigned long entry) +{ + unsigned int secs; + int secs_shrink_size; + + secs_shrink_size = entry & ((1 << EVICTION_SECS_SHRINK_SHIFT) - 1); + entry >>= EVICTION_SECS_SHRINK_SHIFT; + secs = entry & EVICTION_SECS_POS_MASK; + secs = secs << secs_shrink_size; + return secs; +} +#endif static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction, bool workingset) { eviction >>= bucket_order; eviction &= EVICTION_MASK; + pack_secs(&eviction); eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid; eviction = (eviction << NODES_SHIFT) | pgdat->node_id; eviction = (eviction << 1) | workingset; @@ -194,11 +239,12 @@ static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction, } static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, - unsigned long *evictionp, bool *workingsetp) + unsigned long *evictionp, bool *workingsetp, unsigned int *prev_secs) { unsigned long entry = xa_to_value(shadow); int memcgid, nid; bool workingset; + unsigned int secs; workingset = entry & 1; entry >>= 1; @@ -206,11 +252,14 @@ static void unpack_shadow(void *shadow, int 
*memcgidp, pg_data_t **pgdat, entry >>= NODES_SHIFT; memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1); entry >>= MEM_CGROUP_ID_SHIFT; + secs = unpack_secs(entry); + entry >>= (EVICTION_SECS_POS_SHIFT + EVICTION_SECS_SHRINK_SHIFT); *memcgidp = memcgid; *pgdat = NODE_DATA(nid); *evictionp = entry << bucket_order; *workingsetp = workingset; + *prev_secs = secs; } /** @@ -257,8 +306,22 @@ void workingset_refault(struct page *page, void *shadow) unsigned long refault; bool workingset; int memcgid; +#ifndef NO_SECS_IN_WORKINGSET + unsigned long avg_refault_time; + unsigned long refaults_ratio; + unsigned long refault_time; + int tradition; + unsigned int prev_secs; + unsigned int secs; +#endif + struct timespec64 ts; + /* + convert jiffies to second + */ + ktime_get_boottime_ts64(&ts); + secs = (unsigned int)ts.tv_sec ? (unsigned int)ts.tv_sec : 1; - unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset); + unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset, &prev_secs); rcu_read_lock(); /* @@ -303,23 +366,58 @@ void workingset_refault(struct page *page, void *shadow) refault_distance = (refault - eviction) & EVICTION_MASK; inc_lruvec_state(lruvec, WORKINGSET_REFAULT); - +#ifndef NO_SECS_IN_WORKINGSET + refaults_ratio = (atomic_long_read(&lruvec->inactive_age) + 1) / secs; + atomic_long_set(&lruvec->refaults_ratio, refaults_ratio); + refault_time = secs - prev_secs; + avg_refault_time = active_file / refaults_ratio; + tradition = !!(refault_distance < active_file); /* - * Compare the distance to the existing workingset size. We - * don't act on pages that couldn't stay resident even if all - * the memory was available to the page cache. + * What we are trying to solve here is + * 1. extremely fast refault as refault_time == 0. + * 2. quick file drop scenario, which has a big refault_distance but + * small refault_time comparing with the past refault ratio, which + * will be deemed as inactive in previous implementation. 
*/ - if (refault_distance > active_file) + if (refault_time && (((refault_time < avg_refault_time) + && (avg_refault_time < 2 * refault_time)) + || (refault_time >= avg_refault_time))) { + trace_printk("WKST_INACT[%d]:rft_dis %ld, act %ld\ + rft_ratio %ld rft_time %ld avg_rft_time %ld\ + refault %ld eviction %ld secs %d pre_secs %d page %p\n", + tradition, refault_distance, active_file, + refaults_ratio, refault_time, avg_refault_time, + refault, eviction, secs, prev_secs, page); goto out; + } + else { +#else + if (refault_distance < active_file) { +#endif - SetPageActive(page); - atomic_long_inc(&lruvec->inactive_age); - inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE); + /* + * Compare the distance to the existing workingset size. We + * don't act on pages that couldn't stay resident even if all + * the memory was available to the page cache. + */ - /* Page was active prior to eviction */ - if (workingset) { - SetPageWorkingset(page); - inc_lruvec_state(lruvec, WORKINGSET_RESTORE); + SetPageActive(page); + atomic_long_inc(&lruvec->inactive_age); + inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE); + + /* Page was active prior to eviction */ + if (workingset) { + SetPageWorkingset(page); + inc_lruvec_state(lruvec, WORKINGSET_RESTORE); + } +#ifndef NO_SECS_IN_WORKINGSET + trace_printk("WKST_ACT[%d]:rft_dis %ld, act %ld\ + rft_ratio %ld rft_time %ld avg_rft_time %ld\ + refault %ld eviction %ld secs %d pre_secs %d page %p\n", + tradition, refault_distance, active_file, + refaults_ratio, refault_time, avg_refault_time, + refault, eviction, secs, prev_secs, page); +#endif } out: rcu_read_unlock(); @@ -539,7 +637,9 @@ static int __init workingset_init(void) unsigned int max_order; int ret; - BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT); + BUILD_BUG_ON(BITS_PER_LONG < (EVICTION_SHIFT + + EVICTION_SECS_POS_SHIFT + + EVICTION_SECS_SHRINK_SHIFT)); /* * Calculate the eviction bucket size to cover the longest * actionable refault distance, which is currently half of @@ -547,7 +647,9 
@@ static int __init workingset_init(void) * some more pages at runtime, so keep working with up to * double the initial memory by using totalram_pages as-is. */ - timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT; + timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT + - EVICTION_SECS_POS_SHIFT - EVICTION_SECS_SHRINK_SHIFT; + max_order = fls_long(totalram_pages() - 1); if (max_order > timestamp_bits) bucket_order = max_order - timestamp_bits; On Wed, Apr 17, 2019 at 9:37 PM Matthew Wilcox <willy@infradead.org> wrote: > > On Wed, Apr 17, 2019 at 08:26:22PM +0800, Zhaoyang Huang wrote: > [quoting Johannes here] > > As Matthew says, you are fairly randomly making refault activations > > more aggressive (especially with that timestamp unpacking bug), and > > while that expectedly boosts workload transition / startup, it comes > > at the cost of disrupting stable states because you can flood a very > > active in-ram workingset with completely cold cache pages simply > > because they refault uniformly wrt each other. > > [HZY]: I analysis the log got from trace_printk, what we activate have > > proven record of long refault distance but very short refault time. > > You haven't addressed my point, which is that you were only testing > workloads for which your changed algorithm would improve the results. > What you haven't done is shown how other workloads would be negatively > affected. > > Once you do that, we can make a decision about whether to improve your > workload by X% and penalise that other workload by Y%.
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 32699b2..6f30673 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -240,6 +240,7 @@ struct lruvec { atomic_long_t inactive_age; /* Refaults at the time of last reclaim cycle */ unsigned long refaults; + atomic_long_t refaults_ratio; #ifdef CONFIG_MEMCG struct pglist_data *pgdat; #endif diff --git a/mm/workingset.c b/mm/workingset.c index 40ee02c..66c177b 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -160,6 +160,21 @@ MEM_CGROUP_ID_SHIFT) #define EVICTION_MASK (~0UL >> EVICTION_SHIFT) +#ifdef CONFIG_64BIT +#define EVICTION_SECS_POS_SHIFT 20 +#define EVICTION_SECS_SHRINK_SHIFT 4 +#define EVICTION_SECS_POS_MASK ((1UL << EVICTION_SECS_POS_SHIFT) - 1) +#else +#ifndef CONFIG_MEMCG +#define EVICTION_SECS_POS_SHIFT 12 +#define EVICTION_SECS_SHRINK_SHIFT 4 +#define EVICTION_SECS_POS_MASK ((1UL << EVICTION_SECS_POS_SHIFT) - 1) +#else +#define EVICTION_SECS_POS_SHIFT 0 +#define EVICTION_SECS_SHRINK_SHIFT 0 +#define NO_SECS_IN_WORKINGSET +#endif +#endif /* * Eviction timestamps need to be able to cover the full range of * actionable refaults. However, bits are tight in the radix tree @@ -169,10 +184,54 @@ * evictions into coarser buckets by shaving off lower timestamp bits. */ static unsigned int bucket_order __read_mostly; - +#ifdef NO_SECS_IN_WORKINGSET +static void pack_secs(unsigned long *peviction) { } +static unsigned int unpack_secs(unsigned long entry) {return 0; } +#else +/* + * Shrink the timestamp according to its value and store it together + * with the shrink size in the entry. + */ +static void pack_secs(unsigned long *peviction) +{ + unsigned int secs; + unsigned long eviction; + int order; + int secs_shrink_size; + struct timespec ts; + + get_monotonic_boottime(&ts); + secs = (unsigned int)ts.tv_sec ? (unsigned int)ts.tv_sec : 1; + order = get_count_order(secs); + secs_shrink_size = (order <= EVICTION_SECS_POS_SHIFT) + ? 
0 : (order - EVICTION_SECS_POS_SHIFT); + + eviction = *peviction; + eviction = (eviction << EVICTION_SECS_POS_SHIFT) + | ((secs >> secs_shrink_size) & EVICTION_SECS_POS_MASK); + eviction = (eviction << EVICTION_SECS_SHRINK_SHIFT) | (secs_shrink_size & 0xf); + *peviction = eviction; +} +/* + * Unpack the second from the entry and restore the value according to the + * shrink size. + */ +static unsigned int unpack_secs(unsigned long entry) +{ + unsigned int secs; + int secs_shrink_size; + + secs_shrink_size = entry & ((1 << EVICTION_SECS_SHRINK_SHIFT) - 1); + entry >>= EVICTION_SECS_SHRINK_SHIFT; + secs = entry & EVICTION_SECS_POS_MASK; + secs = secs << secs_shrink_size; + return secs; +} +#endif static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction) { eviction >>= bucket_order; + pack_secs(&eviction); eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid; eviction = (eviction << NODES_SHIFT) | pgdat->node_id; eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT); @@ -181,20 +240,24 @@ static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction) } static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, - unsigned long *evictionp) + unsigned long *evictionp, unsigned int *prev_secs) { unsigned long entry = (unsigned long)shadow; int memcgid, nid; + unsigned int secs; entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT; nid = entry & ((1UL << NODES_SHIFT) - 1); entry >>= NODES_SHIFT; memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1); entry >>= MEM_CGROUP_ID_SHIFT; + secs = unpack_secs(entry); + entry >>= (EVICTION_SECS_POS_SHIFT + EVICTION_SECS_SHRINK_SHIFT); *memcgidp = memcgid; *pgdat = NODE_DATA(nid); *evictionp = entry << bucket_order; + *prev_secs = secs; } /** @@ -242,9 +305,22 @@ bool workingset_refault(void *shadow) unsigned long refault; struct pglist_data *pgdat; int memcgid; +#ifndef NO_SECS_IN_WORKINGSET + unsigned long avg_refault_time; + unsigned long refault_time; + int tradition; + unsigned int 
prev_secs; + unsigned int secs; + unsigned long refaults_ratio; +#endif + struct timespec ts; + /* + convert jiffies to second + */ + get_monotonic_boottime(&ts); + secs = (unsigned int)ts.tv_sec ? (unsigned int)ts.tv_sec : 1; - unpack_shadow(shadow, &memcgid, &pgdat, &eviction); - + unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &prev_secs); rcu_read_lock(); /* * Look up the memcg associated with the stored ID. It might @@ -288,14 +364,37 @@ bool workingset_refault(void *shadow) * list is not a problem. */ refault_distance = (refault - eviction) & EVICTION_MASK; - inc_lruvec_state(lruvec, WORKINGSET_REFAULT); - - if (refault_distance <= active_file) { +#ifndef NO_SECS_IN_WORKINGSET + refaults_ratio = (atomic_long_read(&lruvec->inactive_age) + 1) / secs; + atomic_long_set(&lruvec->refaults_ratio, refaults_ratio); + refault_time = secs - prev_secs; + avg_refault_time = active_file / refaults_ratio; + tradition = !!(refault_distance < active_file); + if (refault_time <= avg_refault_time) { +#else + if (refault_distance < active_file) { +#endif inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE); +#ifndef NO_SECS_IN_WORKINGSET + trace_printk("WKST_ACT[%d]:rft_dis %ld, act_file %ld \ + rft_ratio %ld rft_time %ld avg_rft_time %ld \ + refault %ld eviction %ld secs %d pre_secs %d\n", + tradition, refault_distance, active_file, + refaults_ratio, refault_time, avg_refault_time, + refault, eviction, secs, prev_secs); +#endif rcu_read_unlock(); return true; } +#ifndef NO_SECS_IN_WORKINGSET + trace_printk("WKST_INACT[%d]:rft_dis %ld, act_file %ld \ + rft_ratio %ld rft_time %ld avg_rft_time %ld \ + refault %ld eviction %ld secs %d pre_secs %d\n", + tradition, refault_distance, active_file, + refaults_ratio, refault_time, avg_refault_time, + refault, eviction, secs, prev_secs); +#endif rcu_read_unlock(); return false; } @@ -513,7 +612,9 @@ static int __init workingset_init(void) unsigned int max_order; int ret; - BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT); + 
BUILD_BUG_ON(BITS_PER_LONG < (EVICTION_SHIFT + + EVICTION_SECS_POS_SHIFT + + EVICTION_SECS_SHRINK_SHIFT)); /* * Calculate the eviction bucket size to cover the longest * actionable refault distance, which is currently half of @@ -521,7 +622,8 @@ static int __init workingset_init(void) * some more pages at runtime, so keep working with up to * double the initial memory by using totalram_pages as-is. */ - timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT; + timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT + - EVICTION_SECS_POS_SHIFT - EVICTION_SECS_SHRINK_SHIFT; max_order = fls_long(totalram_pages - 1); if (max_order > timestamp_bits) bucket_order = max_order - timestamp_bits;