@@ -79,7 +79,7 @@ void kill_bdev(struct block_device *bdev)
{
struct address_space *mapping = bdev->bd_inode->i_mapping;
- if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
+ if (mapping_empty(mapping))
return;
invalidate_bh_lrus();
@@ -478,9 +478,11 @@ static void *grab_mapping_entry(struct xa_state *xas,
{
unsigned long index = xas->xa_index;
bool pmd_downgrade = false; /* splitting PMD entry into PTE entries? */
+ int populated;
void *entry;
retry:
+ populated = 0;
xas_lock_irq(xas);
entry = get_unlocked_entry(xas, order);
@@ -526,6 +528,8 @@ static void *grab_mapping_entry(struct xa_state *xas,
xas_store(xas, NULL); /* undo the PMD join */
dax_wake_entry(xas, entry, true);
mapping->nrexceptional--;
+ if (mapping_empty(mapping))
+ populated = -1;
entry = NULL;
xas_set(xas, index);
}
@@ -541,11 +545,17 @@ static void *grab_mapping_entry(struct xa_state *xas,
dax_lock_entry(xas, entry);
if (xas_error(xas))
goto out_unlock;
+ if (mapping_empty(mapping))
+ populated++;
mapping->nrexceptional++;
}
out_unlock:
xas_unlock_irq(xas);
+ if (populated == -1)
+ inode_pages_clear(mapping->host);
+ else if (populated == 1)
+ inode_pages_set(mapping->host);
if (xas_nomem(xas, mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM))
goto retry;
if (xas->xa_node == XA_ERROR(-ENOMEM))
@@ -631,6 +641,7 @@ static int __dax_invalidate_entry(struct address_space *mapping,
pgoff_t index, bool trunc)
{
XA_STATE(xas, &mapping->i_pages, index);
+ bool empty = false;
int ret = 0;
void *entry;
@@ -645,10 +656,13 @@ static int __dax_invalidate_entry(struct address_space *mapping,
dax_disassociate_entry(entry, mapping, trunc);
xas_store(&xas, NULL);
mapping->nrexceptional--;
+ empty = mapping_empty(mapping);
ret = 1;
out:
put_unlocked_entry(&xas, entry);
xas_unlock_irq(&xas);
+ if (empty)
+ inode_pages_clear(mapping->host);
return ret;
}
@@ -27,7 +27,7 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused)
* we need to reschedule to avoid softlockups.
*/
if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
- (inode->i_mapping->nrpages == 0 && !need_resched())) {
+ (mapping_empty(inode->i_mapping) && !need_resched())) {
spin_unlock(&inode->i_lock);
continue;
}
@@ -430,26 +430,101 @@ static void inode_lru_list_add(struct inode *inode)
inode->i_state |= I_REFERENCED;
}
+static void inode_lru_list_del(struct inode *inode)
+{
+ if (list_lru_del(&inode->i_sb->s_inode_lru, &inode->i_lru))
+ this_cpu_dec(nr_unused);
+}
+
/*
* Add inode to LRU if needed (inode is unused and clean).
*
* Needs inode->i_lock held.
*/
-void inode_add_lru(struct inode *inode)
+bool inode_add_lru(struct inode *inode)
{
- if (!(inode->i_state & (I_DIRTY_ALL | I_SYNC |
- I_FREEING | I_WILL_FREE)) &&
- !atomic_read(&inode->i_count) && inode->i_sb->s_flags & SB_ACTIVE)
- inode_lru_list_add(inode);
+ if (inode->i_state &
+ (I_DIRTY_ALL | I_SYNC | I_FREEING | I_WILL_FREE | I_PAGES))
+ return false;
+ if (atomic_read(&inode->i_count))
+ return false;
+ if (!(inode->i_sb->s_flags & SB_ACTIVE))
+ return false;
+ inode_lru_list_add(inode);
+ return true;
}
+/*
+ * Usually, inodes become reclaimable when they are no longer
+ * referenced and their page cache has been reclaimed. The following
+ * API allows the VM to communicate cache population state to the VFS.
+ *
+ * However, on CONFIG_HIGHMEM we can't wait for the page cache to go
+ * away: cache pages allocated in a large highmem zone could pin
+ * struct inode memory allocated in relatively small lowmem zones. So
+ * when CONFIG_HIGHMEM is enabled, we tie cache to the inode lifetime.
+ */
-static void inode_lru_list_del(struct inode *inode)
+#ifndef CONFIG_HIGHMEM
+/**
+ * inode_pages_set - mark the inode as holding page cache
+ * @inode: the inode whose first cache page was just added
+ *
+ * Tell the VFS that this inode has populated page cache and must not
+ * be reclaimed by the inode shrinker.
+ *
+ * The caller must hold the page lock of the just-added page: by
+ * pinning the page, the page cache cannot become depopulated, and we
+ * can safely set I_PAGES without a race check under the i_pages lock.
+ *
+ * This function acquires the i_lock.
+ */
+void inode_pages_set(struct inode *inode)
{
+ spin_lock(&inode->i_lock);
+ if (!(inode->i_state & I_PAGES)) {
+ inode->i_state |= I_PAGES;
+ if (!list_empty(&inode->i_lru)) {
+ count_vm_event(PGINODERESCUE);
+ inode_lru_list_del(inode);
+ }
+ }
+ spin_unlock(&inode->i_lock);
+}
- if (list_lru_del(&inode->i_sb->s_inode_lru, &inode->i_lru))
- this_cpu_dec(nr_unused);
+/**
+ * inode_pages_clear - mark the inode as not holding page cache
+ * @inode: the inode whose last cache page was just removed
+ *
+ * Tell the VFS that the inode no longer holds page cache and that its
+ * lifetime is to be handed over to the inode shrinker LRU.
+ *
+ * This function acquires the i_lock and the i_pages lock.
+ */
+void inode_pages_clear(struct inode *inode)
+{
+ struct address_space *mapping = &inode->i_data;
+ bool add_to_lru = false;
+ unsigned long flags;
+
+ spin_lock(&inode->i_lock);
+
+ xa_lock_irqsave(&mapping->i_pages, flags);
+ if ((inode->i_state & I_PAGES) && mapping_empty(mapping)) {
+ inode->i_state &= ~I_PAGES;
+ add_to_lru = true;
+ }
+ xa_unlock_irqrestore(&mapping->i_pages, flags);
+
+ if (add_to_lru) {
+ WARN_ON_ONCE(!list_empty(&inode->i_lru));
+ if (inode_add_lru(inode))
+ __count_vm_event(PGINODEDELAYED);
+ }
+
+ spin_unlock(&inode->i_lock);
}
+#endif /* CONFIG_HIGHMEM */
/**
* inode_sb_list_add - add inode to the superblock list of inodes
@@ -742,6 +817,8 @@ static enum lru_status inode_lru_isolate(struct list_head *item,
if (!spin_trylock(&inode->i_lock))
return LRU_SKIP;
+ WARN_ON_ONCE(inode->i_state & I_PAGES);
+
/*
* Referenced or dirty inodes are still in use. Give them another pass
* through the LRU as we canot reclaim them now.
@@ -761,7 +838,18 @@ static enum lru_status inode_lru_isolate(struct list_head *item,
return LRU_ROTATE;
}
- if (inode_has_buffers(inode) || inode->i_data.nrpages) {
+ /*
+ * Usually, populated inodes shouldn't be on the shrinker LRU,
+ * but they can be briefly visible when a new page is added to
+ * an inode that was already linked but inode_pages_set()
+ * hasn't run yet to move them off.
+ *
+ * The other exception is on HIGHMEM systems: highmem cache
+ * can pin lowmem struct inodes, and we might be in dire
+ * straits in the lower zones. Purge cache to free the inode.
+ */
+ if (inode_has_buffers(inode) || !mapping_empty(&inode->i_data)) {
+#ifdef CONFIG_HIGHMEM
__iget(inode);
spin_unlock(&inode->i_lock);
spin_unlock(lru_lock);
@@ -778,6 +866,12 @@ static enum lru_status inode_lru_isolate(struct list_head *item,
iput(inode);
spin_lock(lru_lock);
return LRU_RETRY;
+#else
+ list_lru_isolate(lru, &inode->i_lru);
+ spin_unlock(&inode->i_lock);
+ this_cpu_dec(nr_unused);
+ return LRU_REMOVED;
+#endif
}
WARN_ON(inode->i_state & I_NEW);
@@ -139,7 +139,7 @@ extern int vfs_open(const struct path *, struct file *);
* inode.c
*/
extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc);
-extern void inode_add_lru(struct inode *inode);
+extern bool inode_add_lru(struct inode *inode);
extern int dentry_needs_remove_privs(struct dentry *dentry);
/*
@@ -585,6 +585,11 @@ static inline void mapping_allow_writable(struct address_space *mapping)
atomic_inc(&mapping->i_mmap_writable);
}
+static inline bool mapping_empty(struct address_space *mapping)
+{
+ return mapping->nrpages + mapping->nrexceptional == 0;
+}
+
/*
* Use sequence counter to get consistent i_size on 32-bit processors.
*/
@@ -2150,6 +2155,9 @@ static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src,
*
* I_CREATING New object's inode in the middle of setting up.
*
+ * I_PAGES Inode is holding page cache that needs to get reclaimed
+ * first before the inode can go onto the shrinker LRU.
+ *
* Q: What is the difference between I_WILL_FREE and I_FREEING?
*/
#define I_DIRTY_SYNC (1 << 0)
@@ -2172,6 +2180,7 @@ static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src,
#define I_WB_SWITCH (1 << 13)
#define I_OVL_INUSE (1 << 14)
#define I_CREATING (1 << 15)
+#define I_PAGES (1 << 16)
#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC)
#define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES)
@@ -3097,6 +3106,14 @@ static inline void remove_inode_hash(struct inode *inode)
__remove_inode_hash(inode);
}
+#ifndef CONFIG_HIGHMEM
+extern void inode_pages_set(struct inode *inode);
+extern void inode_pages_clear(struct inode *inode);
+#else
+static inline void inode_pages_set(struct inode *inode) {}
+static inline void inode_pages_clear(struct inode *inode) {}
+#endif
+
extern void inode_sb_list_add(struct inode *inode);
#ifdef CONFIG_BLOCK
@@ -609,7 +609,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
pgoff_t index, gfp_t gfp_mask);
extern void delete_from_page_cache(struct page *page);
-extern void __delete_from_page_cache(struct page *page, void *shadow);
+extern bool __delete_from_page_cache(struct page *page, void *shadow);
int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask);
void delete_from_page_cache_batch(struct address_space *mapping,
struct pagevec *pvec);
@@ -38,7 +38,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
#ifdef CONFIG_NUMA
PGSCAN_ZONE_RECLAIM_FAILED,
#endif
- PGINODESTEAL, SLABS_SCANNED, KSWAPD_INODESTEAL,
+ SLABS_SCANNED,
+ PGINODESTEAL, KSWAPD_INODESTEAL, PGINODERESCUE, PGINODEDELAYED,
KSWAPD_LOW_WMARK_HIT_QUICKLY, KSWAPD_HIGH_WMARK_HIT_QUICKLY,
PAGEOUTRUN, PGROTATED,
DROP_PAGECACHE, DROP_SLAB,
@@ -116,8 +116,8 @@
* ->tasklist_lock (memory_failure, collect_procs_ao)
*/
-static void page_cache_delete(struct address_space *mapping,
- struct page *page, void *shadow)
+static bool __must_check page_cache_delete(struct address_space *mapping,
+ struct page *page, void *shadow)
{
XA_STATE(xas, &mapping->i_pages, page->index);
unsigned int nr = 1;
@@ -151,6 +151,8 @@ static void page_cache_delete(struct address_space *mapping,
smp_wmb();
}
mapping->nrpages -= nr;
+
+ return mapping_empty(mapping);
}
static void unaccount_page_cache_page(struct address_space *mapping,
@@ -227,15 +229,18 @@ static void unaccount_page_cache_page(struct address_space *mapping,
* Delete a page from the page cache and free it. Caller has to make
* sure the page is locked and that nobody else uses it - or that usage
* is safe. The caller must hold the i_pages lock.
+ *
+ * If this returns true, the caller must call inode_pages_clear()
+ * after dropping the i_pages lock.
*/
-void __delete_from_page_cache(struct page *page, void *shadow)
+bool __must_check __delete_from_page_cache(struct page *page, void *shadow)
{
struct address_space *mapping = page->mapping;
trace_mm_filemap_delete_from_page_cache(page);
unaccount_page_cache_page(mapping, page);
- page_cache_delete(mapping, page, shadow);
+ return page_cache_delete(mapping, page, shadow);
}
static void page_cache_free_page(struct address_space *mapping,
@@ -267,12 +272,16 @@ void delete_from_page_cache(struct page *page)
{
struct address_space *mapping = page_mapping(page);
unsigned long flags;
+ bool empty;
BUG_ON(!PageLocked(page));
xa_lock_irqsave(&mapping->i_pages, flags);
- __delete_from_page_cache(page, NULL);
+ empty = __delete_from_page_cache(page, NULL);
xa_unlock_irqrestore(&mapping->i_pages, flags);
+ if (empty)
+ inode_pages_clear(mapping->host);
+
page_cache_free_page(mapping, page);
}
EXPORT_SYMBOL(delete_from_page_cache);
@@ -291,8 +300,8 @@ EXPORT_SYMBOL(delete_from_page_cache);
*
* The function expects the i_pages lock to be held.
*/
-static void page_cache_delete_batch(struct address_space *mapping,
- struct pagevec *pvec)
+static bool __must_check page_cache_delete_batch(struct address_space *mapping,
+ struct pagevec *pvec)
{
XA_STATE(xas, &mapping->i_pages, pvec->pages[0]->index);
int total_pages = 0;
@@ -337,12 +346,15 @@ static void page_cache_delete_batch(struct address_space *mapping,
total_pages++;
}
mapping->nrpages -= total_pages;
+
+ return mapping_empty(mapping);
}
void delete_from_page_cache_batch(struct address_space *mapping,
struct pagevec *pvec)
{
int i;
+ bool empty;
unsigned long flags;
if (!pagevec_count(pvec))
@@ -354,9 +366,12 @@ void delete_from_page_cache_batch(struct address_space *mapping,
unaccount_page_cache_page(mapping, pvec->pages[i]);
}
- page_cache_delete_batch(mapping, pvec);
+ empty = page_cache_delete_batch(mapping, pvec);
xa_unlock_irqrestore(&mapping->i_pages, flags);
+ if (empty)
+ inode_pages_clear(mapping->host);
+
for (i = 0; i < pagevec_count(pvec); i++)
page_cache_free_page(mapping, pvec->pages[i]);
}
@@ -831,9 +846,10 @@ static int __add_to_page_cache_locked(struct page *page,
void **shadowp)
{
XA_STATE(xas, &mapping->i_pages, offset);
+ int error;
int huge = PageHuge(page);
struct mem_cgroup *memcg;
- int error;
+ bool populated = false;
void *old;
VM_BUG_ON_PAGE(!PageLocked(page), page);
@@ -860,6 +876,7 @@ static int __add_to_page_cache_locked(struct page *page,
if (xas_error(&xas))
goto unlock;
+ populated = mapping_empty(mapping);
if (xa_is_value(old)) {
mapping->nrexceptional--;
if (shadowp)
@@ -880,6 +897,10 @@ static int __add_to_page_cache_locked(struct page *page,
if (!huge)
mem_cgroup_commit_charge(page, memcg, false, false);
trace_mm_filemap_add_to_page_cache(page);
+
+ if (populated)
+ inode_pages_set(mapping->host);
+
return 0;
error:
page->mapping = NULL;
@@ -2535,7 +2535,8 @@ static void __split_huge_page(struct page *page, struct list_head *list,
/* Some pages can be beyond i_size: drop them from page cache */
if (head[i].index >= end) {
ClearPageDirty(head + i);
- __delete_from_page_cache(head + i, NULL);
+ /* We know we're not removing the last page */
+ (void)__delete_from_page_cache(head + i, NULL);
if (IS_ENABLED(CONFIG_SHMEM) && PageSwapBacked(head))
shmem_uncharge(head->mapping->host, 1);
put_page(head + i);
@@ -31,24 +31,31 @@
* itself locked. These unlocked entries need verification under the tree
* lock.
*/
-static inline void __clear_shadow_entry(struct address_space *mapping,
- pgoff_t index, void *entry)
+static bool __must_check __clear_shadow_entry(struct address_space *mapping,
+ pgoff_t index, void *entry)
{
XA_STATE(xas, &mapping->i_pages, index);
xas_set_update(&xas, workingset_update_node);
if (xas_load(&xas) != entry)
- return;
+ return 0;
xas_store(&xas, NULL);
mapping->nrexceptional--;
+
+ return mapping_empty(mapping);
}
static void clear_shadow_entry(struct address_space *mapping, pgoff_t index,
void *entry)
{
+ bool empty;
+
xa_lock_irq(&mapping->i_pages);
- __clear_shadow_entry(mapping, index, entry);
+ empty = __clear_shadow_entry(mapping, index, entry);
xa_unlock_irq(&mapping->i_pages);
+
+ if (empty)
+ inode_pages_clear(mapping->host);
}
/*
@@ -61,7 +68,7 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping,
pgoff_t end)
{
int i, j;
- bool dax, lock;
+ bool dax, lock, empty = false;
/* Handled by shmem itself */
if (shmem_mapping(mapping))
@@ -96,11 +103,16 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping,
continue;
}
- __clear_shadow_entry(mapping, index, page);
+ if (__clear_shadow_entry(mapping, index, page))
+ empty = true;
}
if (lock)
xa_unlock_irq(&mapping->i_pages);
+
+ if (empty)
+ inode_pages_clear(mapping->host);
+
pvec->nr = j;
}
@@ -300,7 +312,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
pgoff_t index;
int i;
- if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
+ if (mapping_empty(mapping))
goto out;
/* Offsets within partial pages */
@@ -636,6 +648,7 @@ static int
invalidate_complete_page2(struct address_space *mapping, struct page *page)
{
unsigned long flags;
+ bool empty;
if (page->mapping != mapping)
return 0;
@@ -648,9 +661,12 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
goto failed;
BUG_ON(page_has_private(page));
- __delete_from_page_cache(page, NULL);
+ empty = __delete_from_page_cache(page, NULL);
xa_unlock_irqrestore(&mapping->i_pages, flags);
+ if (empty)
+ inode_pages_clear(mapping->host);
+
if (mapping->a_ops->freepage)
mapping->a_ops->freepage(page);
@@ -692,7 +708,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
int ret2 = 0;
int did_range_unmap = 0;
- if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
+ if (mapping_empty(mapping))
goto out;
pagevec_init(&pvec);
@@ -901,6 +901,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
} else {
void (*freepage)(struct page *);
void *shadow = NULL;
+ int empty;
freepage = mapping->a_ops->freepage;
/*
@@ -922,9 +923,12 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
if (reclaimed && page_is_file_cache(page) &&
!mapping_exiting(mapping) && !dax_mapping(mapping))
shadow = workingset_eviction(page, target_memcg);
- __delete_from_page_cache(page, shadow);
+ empty = __delete_from_page_cache(page, shadow);
xa_unlock_irqrestore(&mapping->i_pages, flags);
+ if (empty)
+ inode_pages_clear(mapping->host);
+
if (freepage != NULL)
freepage(page);
}
@@ -1203,9 +1203,11 @@ const char * const vmstat_text[] = {
#ifdef CONFIG_NUMA
"zone_reclaim_failed",
#endif
- "pginodesteal",
"slabs_scanned",
+ "pginodesteal",
"kswapd_inodesteal",
+ "pginoderescue",
+ "pginodedelayed",
"kswapd_low_wmark_hit_quickly",
"kswapd_high_wmark_hit_quickly",
"pageoutrun",
@@ -491,6 +491,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
struct xa_node *node = container_of(item, struct xa_node, private_list);
XA_STATE(xas, node->array, 0);
struct address_space *mapping;
+ bool empty = false;
int ret;
/*
@@ -529,6 +530,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
if (WARN_ON_ONCE(node->count != node->nr_values))
goto out_invalid;
mapping->nrexceptional -= node->nr_values;
+ empty = mapping_empty(mapping);
xas.xa_node = xa_parent_locked(&mapping->i_pages, node);
xas.xa_offset = node->offset;
xas.xa_shift = node->shift + XA_CHUNK_SHIFT;
@@ -542,6 +544,8 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
out_invalid:
xa_unlock_irq(&mapping->i_pages);
+ if (empty)
+ inode_pages_clear(mapping->host);
ret = LRU_REMOVED_RETRY;
out:
cond_resched();