From patchwork Mon Mar 15 07:22:15 2010
X-Patchwork-Submitter: Balbir Singh
X-Patchwork-Id: 85892
Date: Mon, 15 Mar 2010 12:52:15 +0530
From: Balbir Singh
To: KVM development list
Cc: Rik van Riel, KAMEZAWA Hiroyuki, linux-mm@kvack.org, linux-kernel@vger.kernel.org
Subject: [PATCH][RFC/T/D] Unmapped page cache control - via boot parameter
Message-ID: <20100315072214.GA18054@balbir.in.ibm.com>
Reply-To: balbir@linux.vnet.ibm.com
User-Agent: Mutt/1.5.20 (2009-08-17)
X-Mailing-List: kvm@vger.kernel.org

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index ad5abcf..f0b245f 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -293,12 +293,12 @@ struct zone {
 	 */
 	unsigned long		lowmem_reserve[MAX_NR_ZONES];
 
+	unsigned long		min_unmapped_pages;
 #ifdef CONFIG_NUMA
 	int node;
 	/*
 	 * zone reclaim becomes active if more unmapped pages exist.
 	 */
-	unsigned long		min_unmapped_pages;
 	unsigned long		min_slab_pages;
 #endif
 	struct per_cpu_pageset __percpu *pageset;
diff --git a/include/linux/swap.h b/include/linux/swap.h
index c2a4295..d0c8176 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -254,10 +254,11 @@ extern unsigned long shrink_all_memory(unsigned long nr_pages);
 extern int vm_swappiness;
 extern int remove_mapping(struct address_space *mapping, struct page *page);
 extern long vm_total_pages;
+extern bool should_balance_unmapped_pages(struct zone *zone);
+extern int sysctl_min_unmapped_ratio;
 
 #ifdef CONFIG_NUMA
 extern int zone_reclaim_mode;
-extern int sysctl_min_unmapped_ratio;
 extern int sysctl_min_slab_ratio;
 extern int zone_reclaim(struct zone *, gfp_t, unsigned int);
 #else
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 416b056..1cc5c75 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1578,6 +1578,9 @@ zonelist_scan:
 			unsigned long mark;
 			int ret;
 
+			if (should_balance_unmapped_pages(zone))
+				wakeup_kswapd(zone, order);
+
 			mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
 			if (zone_watermark_ok(zone, order, mark,
 				    classzone_idx, alloc_flags))
@@ -3816,10 +3819,10 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
 
 		zone->spanned_pages = size;
 		zone->present_pages = realsize;
-#ifdef CONFIG_NUMA
-		zone->node = nid;
 		zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
 						/ 100;
+#ifdef CONFIG_NUMA
+		zone->node = nid;
 		zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
 #endif
 		zone->name = zone_names[j];
@@ -4727,7 +4730,6 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
 	return 0;
 }
 
-#ifdef CONFIG_NUMA
 int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
 	void __user *buffer, size_t *length, loff_t *ppos)
 {
@@ -4744,6 +4746,7 @@ int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
 	return 0;
 }
 
+#ifdef CONFIG_NUMA
 int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
 	void __user *buffer, size_t *length, loff_t *ppos)
 {
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 5cbf64d..46026e7 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -138,6 +138,18 @@ static DECLARE_RWSEM(shrinker_rwsem);
 #define scanning_global_lru(sc)	(1)
 #endif
 
+static int unmapped_page_control __read_mostly;
+
+static int __init unmapped_page_control_parm(char *str)
+{
+	unmapped_page_control = 1;
+	/*
+	 * XXX: Should we tweak swappiness here?
+	 */
+	return 1;
+}
+__setup("unmapped_page_control", unmapped_page_control_parm);
+
 static struct zone_reclaim_stat *get_reclaim_stat(struct zone *zone,
 						  struct scan_control *sc)
 {
@@ -1938,6 +1950,103 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)
 }
 
 /*
+ * Percentage of pages in a zone that must be unmapped for zone_reclaim to
+ * occur.
+ */
+int sysctl_min_unmapped_ratio = 1;
+/*
+ * Priority for ZONE_RECLAIM. This determines the fraction of pages
+ * of a node considered for each zone_reclaim. 4 scans 1/16th of
+ * a zone.
+ */
+#define ZONE_RECLAIM_PRIORITY 4
+
+
+#define RECLAIM_OFF 0
+#define RECLAIM_ZONE (1<<0)	/* Run shrink_inactive_list on the zone */
+#define RECLAIM_WRITE (1<<1)	/* Writeout pages during reclaim */
+#define RECLAIM_SWAP (1<<2)	/* Swap pages out during reclaim */
+
+static inline unsigned long zone_unmapped_file_pages(struct zone *zone)
+{
+	unsigned long file_mapped = zone_page_state(zone, NR_FILE_MAPPED);
+	unsigned long file_lru = zone_page_state(zone, NR_INACTIVE_FILE) +
+		zone_page_state(zone, NR_ACTIVE_FILE);
+
+	/*
+	 * It's possible for there to be more file mapped pages than
+	 * accounted for by the pages on the file LRU lists because
+	 * tmpfs pages accounted for as ANON can also be FILE_MAPPED
+	 */
+	return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0;
+}
+
+/*
+ * Helper function to reclaim unmapped pages, we might add something
+ * similar to this for slab cache as well. Currently this function
+ * is shared with __zone_reclaim()
+ */
+static inline void
+zone_reclaim_unmapped_pages(struct zone *zone, struct scan_control *sc,
+				unsigned long nr_pages)
+{
+	int priority;
+	/*
+	 * Free memory by calling shrink zone with increasing
+	 * priorities until we have enough memory freed.
+	 */
+	priority = ZONE_RECLAIM_PRIORITY;
+	do {
+		note_zone_scanning_priority(zone, priority);
+		shrink_zone(priority, zone, sc);
+		priority--;
+	} while (priority >= 0 && sc->nr_reclaimed < nr_pages);
+}
+
+/*
+ * Routine to balance unmapped pages, inspired from the code under
+ * CONFIG_NUMA that does unmapped page and slab page control by keeping
+ * min_unmapped_pages in the zone. We currently reclaim just unmapped
+ * pages, slab control will come in soon, at which point this routine
+ * should be called balance cached pages
+ */
+static unsigned long balance_unmapped_pages(int priority, struct zone *zone,
+						struct scan_control *sc)
+{
+	if (unmapped_page_control &&
+		(zone_unmapped_file_pages(zone) > zone->min_unmapped_pages)) {
+		struct scan_control nsc;
+		unsigned long nr_pages;
+
+		nsc = *sc;
+
+		nsc.swappiness = 0;
+		nsc.may_writepage = 0;
+		nsc.may_unmap = 0;
+		nsc.nr_reclaimed = 0;
+
+		nr_pages = zone_unmapped_file_pages(zone) -
+				zone->min_unmapped_pages;
+		/* Magically try to reclaim eighth the unmapped cache pages */
+		nr_pages >>= 3;
+
+		zone_reclaim_unmapped_pages(zone, &nsc, nr_pages);
+		return nsc.nr_reclaimed;
+	}
+	return 0;
+}
+
+#define UNMAPPED_PAGE_RATIO 16
+bool should_balance_unmapped_pages(struct zone *zone)
+{
+	if (unmapped_page_control &&
+		(zone_unmapped_file_pages(zone) >
+			UNMAPPED_PAGE_RATIO * zone->min_unmapped_pages))
+		return true;
+	return false;
+}
+
+/*
  * For kswapd, balance_pgdat() will work across all this node's zones until
  * they are all at high_wmark_pages(zone).
  *
@@ -2027,6 +2136,12 @@ loop_again:
 				shrink_active_list(SWAP_CLUSTER_MAX, zone,
							&sc, priority, 0);
 
+			/*
+			 * We do unmapped page balancing once here and once
+			 * below, so that we don't lose out
+			 */
+			balance_unmapped_pages(priority, zone, &sc);
+
 			if (!zone_watermark_ok(zone, order,
 					high_wmark_pages(zone), 0, 0)) {
 				end_zone = i;
@@ -2068,6 +2183,13 @@ loop_again:
 
 			nid = pgdat->node_id;
 			zid = zone_idx(zone);
+
+			/*
+			 * Balance unmapped pages upfront, this should be
+			 * really cheap
+			 */
+			balance_unmapped_pages(priority, zone, &sc);
+
 			/*
 			 * Call soft limit reclaim before calling shrink_zone.
 			 * For now we ignore the return value
@@ -2289,7 +2411,8 @@ void wakeup_kswapd(struct zone *zone, int order)
 		return;
 
 	pgdat = zone->zone_pgdat;
-	if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
+	if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0) &&
+		!should_balance_unmapped_pages(zone))
 		return;
 	if (pgdat->kswapd_max_order < order)
 		pgdat->kswapd_max_order = order;
@@ -2456,44 +2579,12 @@ module_init(kswapd_init)
  */
 int zone_reclaim_mode __read_mostly;
 
-#define RECLAIM_OFF 0
-#define RECLAIM_ZONE (1<<0)	/* Run shrink_inactive_list on the zone */
-#define RECLAIM_WRITE (1<<1)	/* Writeout pages during reclaim */
-#define RECLAIM_SWAP (1<<2)	/* Swap pages out during reclaim */
-
-/*
- * Priority for ZONE_RECLAIM. This determines the fraction of pages
- * of a node considered for each zone_reclaim. 4 scans 1/16th of
- * a zone.
- */
-#define ZONE_RECLAIM_PRIORITY 4
-
-/*
- * Percentage of pages in a zone that must be unmapped for zone_reclaim to
- * occur.
- */
-int sysctl_min_unmapped_ratio = 1;
-
 /*
  * If the number of slab pages in a zone grows beyond this percentage then
  * slab reclaim needs to occur.
  */
 int sysctl_min_slab_ratio = 5;
 
-static inline unsigned long zone_unmapped_file_pages(struct zone *zone)
-{
-	unsigned long file_mapped = zone_page_state(zone, NR_FILE_MAPPED);
-	unsigned long file_lru = zone_page_state(zone, NR_INACTIVE_FILE) +
-		zone_page_state(zone, NR_ACTIVE_FILE);
-
-	/*
-	 * It's possible for there to be more file mapped pages than
-	 * accounted for by the pages on the file LRU lists because
-	 * tmpfs pages accounted for as ANON can also be FILE_MAPPED
-	 */
-	return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0;
-}
-
 /* Work out how many page cache pages we can reclaim in this reclaim_mode */
 static long zone_pagecache_reclaimable(struct zone *zone)
 {
@@ -2531,7 +2622,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	const unsigned long nr_pages = 1 << order;
 	struct task_struct *p = current;
 	struct reclaim_state reclaim_state;
-	int priority;
 	struct scan_control sc = {
 		.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
 		.may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
@@ -2562,12 +2652,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 		 * Free memory by calling shrink zone with increasing
 		 * priorities until we have enough memory freed.
 		 */
-		priority = ZONE_RECLAIM_PRIORITY;
-		do {
-			note_zone_scanning_priority(zone, priority);
-			shrink_zone(priority, zone, &sc);
-			priority--;
-		} while (priority >= 0 && sc.nr_reclaimed < nr_pages);
+		zone_reclaim_unmapped_pages(zone, &sc, nr_pages);
 	}
 
 	slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
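
A quick note for anyone trying this out: the feature is off by default and is enabled by booting with unmapped_page_control on the kernel command line (the __setup handler above only sets a flag). The per-zone floor reuses sysctl_min_unmapped_ratio (default 1, i.e. 1% of zone pages), which the patch moves out from under CONFIG_NUMA. The standalone userspace sketch below is only an illustration of the arithmetic the patch applies; the zone size and the current unmapped page count in it are made-up example numbers, not values taken from the patch.

/*
 * Illustrative userspace sketch, not kernel code: compute the thresholds
 * the patch works with for a hypothetical zone.
 */
#include <stdio.h>

#define UNMAPPED_PAGE_RATIO	16	/* same constant as in the patch */

int main(void)
{
	unsigned long realsize = 262144;		/* hypothetical zone: 1GB of 4K pages */
	unsigned long min_unmapped_ratio = 1;		/* sysctl default from the patch */
	unsigned long unmapped_file_pages = 100000;	/* hypothetical current unmapped cache */
	unsigned long target = 0;

	/* mirrors free_area_init_core(): per-zone floor of unmapped page cache */
	unsigned long min_unmapped_pages = (realsize * min_unmapped_ratio) / 100;

	/* mirrors should_balance_unmapped_pages(): kswapd wakeup threshold */
	unsigned long wakeup_threshold = UNMAPPED_PAGE_RATIO * min_unmapped_pages;

	/* mirrors balance_unmapped_pages(): reclaim 1/8th of the excess above the floor */
	if (unmapped_file_pages > min_unmapped_pages)
		target = (unmapped_file_pages - min_unmapped_pages) >> 3;

	printf("min_unmapped_pages:       %lu\n", min_unmapped_pages);
	printf("kswapd wakeup above:      %lu unmapped file pages\n", wakeup_threshold);
	printf("reclaim target this pass: %lu pages\n", target);
	return 0;
}

With the defaults above, kswapd is only woken for unmapped page cache once unmapped file pages exceed 16% of the zone (UNMAPPED_PAGE_RATIO times the 1% floor), and each balancing pass then aims to reclaim an eighth of whatever sits above that floor.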