@@ -34,6 +34,7 @@
#include <linux/security.h>
#include <linux/cpuset.h>
#include <linux/hugetlb.h>
+#include <linux/memblock.h>
#include <linux/memcontrol.h>
#include <linux/cleancache.h>
#include <linux/shmem_fs.h>
@@ -969,19 +970,36 @@ EXPORT_SYMBOL(__page_cache_alloc);
* at a cost of "thundering herd" phenomena during rare hash
* collisions.
*/
-#define PAGE_WAIT_TABLE_BITS 8
-#define PAGE_WAIT_TABLE_SIZE (1 << PAGE_WAIT_TABLE_BITS)
+#define PAGE_WAIT_TABLE_SIZE (1 << page_wait_table_bits)
+#if CONFIG_BASE_SMALL
+static const unsigned int page_wait_table_bits = 4;
static wait_queue_head_t page_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned;
+#else
+static unsigned int page_wait_table_bits __ro_after_init;
+static wait_queue_head_t *page_wait_table __ro_after_init;
+#endif
static wait_queue_head_t *page_waitqueue(struct page *page)
{
- return &page_wait_table[hash_ptr(page, PAGE_WAIT_TABLE_BITS)];
+ return &page_wait_table[hash_ptr(page, page_wait_table_bits)];
}
void __init pagecache_init(void)
{
int i;
+ if (!CONFIG_BASE_SMALL) {
+ page_wait_table = alloc_large_system_hash("Page waitqueue hash",
+ sizeof(wait_queue_head_t),
+ 0,
+ 21,
+ 0,
+ &page_wait_table_bits,
+ NULL,
+ 0,
+ 0);
+ }
+
for (i = 0; i < PAGE_WAIT_TABLE_SIZE; i++)
init_waitqueue_head(&page_wait_table[i]);
The page waitqueue hash is a bit small (256 entries) on very big systems. A
16 socket 1536 thread POWER9 system was found to encounter hash collisions
and excessive time in waitqueue locking at times. This was intermittent and
hard to reproduce with the setup we had (very little real IO capacity). The
thought is some important pages happened to collide in the hash, slowing
down page locking, causing the problem to snowball.

A small test case was made where threads would write and fsync different
pages, generating just a small amount of contention across many pages.

Increasing page waitqueue hash size to 262144 entries increased throughput
by 182% while also reducing standard deviation 3x. perf before the increase:

  36.23%  [k] _raw_spin_lock_irqsave                -      -
              |
              |--34.60%--wake_up_page_bit
              |          0
              |          iomap_write_end.isra.38
              |          iomap_write_actor
              |          iomap_apply
              |          iomap_file_buffered_write
              |          xfs_file_buffered_aio_write
              |          new_sync_write

  17.93%  [k] native_queued_spin_lock_slowpath      -      -
              |
              |--16.74%--_raw_spin_lock_irqsave
              |          |
              |           --16.44%--wake_up_page_bit
              |                     iomap_write_end.isra.38
              |                     iomap_write_actor
              |                     iomap_apply
              |                     iomap_file_buffered_write
              |                     xfs_file_buffered_aio_write

This patch uses alloc_large_system_hash to allocate a bigger system hash
that scales somewhat with memory size. This hash could be made per-node,
which should help reduce remote accesses on well localised workloads, but
that adds some complexity with hotplug, so until we get a less artificial
workload to test with, let's keep it simple.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 mm/filemap.c | 24 +++++++++++++++++++++---
 1 file changed, 21 insertions(+), 3 deletions(-)