From patchwork Wed Mar 29 05:31:43 2023
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Luis Chamberlain <mcgrof@kernel.org>
X-Patchwork-Id: 13191902
Return-Path: <linux-modules-owner@vger.kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
	aws-us-west-2-korg-lkml-1.web.codeaurora.org
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
	by smtp.lore.kernel.org (Postfix) with ESMTP id 09F93C6FD18
	for <linux-modules@archiver.kernel.org>;
 Wed, 29 Mar 2023 05:32:09 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
        id S229770AbjC2FcH (ORCPT
        <rfc822;linux-modules@archiver.kernel.org>);
        Wed, 29 Mar 2023 01:32:07 -0400
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:57228 "EHLO
        lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
        with ESMTP id S229753AbjC2FcF (ORCPT
        <rfc822;linux-modules@vger.kernel.org>);
        Wed, 29 Mar 2023 01:32:05 -0400
Received: from bombadil.infradead.org (bombadil.infradead.org
 [IPv6:2607:7c80:54:3::133])
        by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 3FC6D3C27;
        Tue, 28 Mar 2023 22:31:54 -0700 (PDT)
DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed;
        d=infradead.org; s=bombadil.20210309;
 h=Sender:Content-Transfer-Encoding:
        MIME-Version:References:In-Reply-To:Message-Id:Date:Subject:Cc:To:From:
        Reply-To:Content-Type:Content-ID:Content-Description;
        bh=gd8HbSo5xrQsR5kA4HVA4gKzlE/8w/S5arnYzQjZrxg=;
 b=xGWh2JgKRYwpppiRzsWetlki5t
        SVeBNZzs7bh2gAzAdY3c4dPLIqKQ56oSDKzinEazY6EhcXBffYAE1BcrA4RD1I8UB8UxwhlyXHxmK
        E/JKeSfdBahNY/fWI+b38Kf0WHEmp5qRM+J5c2Tc9/0JMZB29rDgeZCoKqc2+pse21WP1cvPGgv2j
        +TS3Aw0BoqVz0XL2rSKX9QyduwIqa0AUcLUQCZUPeNvFM4L5JQcEMNtqv0yXiYcXImRcc4tRvL5c5
        GbPvsKhY/FQ4y+Bc0G6I1sdi/F3VcHfN6n+zJlt617XOG53+obPbnyIdgO6d3lukOl04AKooj4asy
        zW1gZagw==;
Received: from mcgrof by bombadil.infradead.org with local (Exim 4.96 #2 (Red
 Hat Linux))
        id 1phOPW-00GgRS-1v;
        Wed, 29 Mar 2023 05:31:50 +0000
From: Luis Chamberlain <mcgrof@kernel.org>
To: david@redhat.com, patches@lists.linux.dev,
        linux-modules@vger.kernel.org, linux-mm@kvack.org,
        linux-kernel@vger.kernel.org, pmladek@suse.com,
        petr.pavlu@suse.com, prarit@redhat.com,
        torvalds@linux-foundation.org, gregkh@linuxfoundation.org,
        rafael@kernel.org
Cc: christophe.leroy@csgroup.eu, tglx@linutronix.de,
        peterz@infradead.org, song@kernel.org, rppt@kernel.org,
        willy@infradead.org, vbabka@suse.cz, mhocko@suse.com,
        dave.hansen@linux.intel.com, mcgrof@kernel.org
Subject: [PATCH 1/7] module: move finished_loading()
Date: Tue, 28 Mar 2023 22:31:43 -0700
Message-Id: <20230329053149.3976378-2-mcgrof@kernel.org>
X-Mailer: git-send-email 2.38.1
In-Reply-To: <20230329053149.3976378-1-mcgrof@kernel.org>
References: <20230329053149.3976378-1-mcgrof@kernel.org>
MIME-Version: 1.0
Sender: Luis Chamberlain <mcgrof@infradead.org>
Precedence: bulk
List-ID: <linux-modules.vger.kernel.org>

This has no functional change, just moves a routine earlier
as we'll make use of it next.

Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 kernel/module/main.c | 42 +++++++++++++++++++++---------------------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/kernel/module/main.c b/kernel/module/main.c
index 5cc21083af04..af58e63e5daf 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -2442,27 +2442,6 @@ static int post_relocation(struct module *mod, const struct load_info *info)
 	return module_finalize(info->hdr, info->sechdrs, mod);
 }
 
-/* Is this module of this name done loading?  No locks held. */
-static bool finished_loading(const char *name)
-{
-	struct module *mod;
-	bool ret;
-
-	/*
-	 * The module_mutex should not be a heavily contended lock;
-	 * if we get the occasional sleep here, we'll go an extra iteration
-	 * in the wait_event_interruptible(), which is harmless.
-	 */
-	sched_annotate_sleep();
-	mutex_lock(&module_mutex);
-	mod = find_module_all(name, strlen(name), true);
-	ret = !mod || mod->state == MODULE_STATE_LIVE
-		|| mod->state == MODULE_STATE_GOING;
-	mutex_unlock(&module_mutex);
-
-	return ret;
-}
-
 /* Call module constructors. */
 static void do_mod_ctors(struct module *mod)
 {
@@ -2626,6 +2605,27 @@ static int may_init_module(void)
 	return 0;
 }
 
+/* Is this module of this name done loading?  No locks held. */
+static bool finished_loading(const char *name)
+{
+	struct module *mod;
+	bool ret;
+
+	/*
+	 * The module_mutex should not be a heavily contended lock;
+	 * if we get the occasional sleep here, we'll go an extra iteration
+	 * in the wait_event_interruptible(), which is harmless.
+	 */
+	sched_annotate_sleep();
+	mutex_lock(&module_mutex);
+	mod = find_module_all(name, strlen(name), true);
+	ret = !mod || mod->state == MODULE_STATE_LIVE
+		|| mod->state == MODULE_STATE_GOING;
+	mutex_unlock(&module_mutex);
+
+	return ret;
+}
+
 /*
  * We try to place it in the list now to make sure it's unique before
  * we dedicate too many resources.  In particular, temporary percpu

From patchwork Wed Mar 29 05:31:44 2023
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Luis Chamberlain <mcgrof@kernel.org>
X-Patchwork-Id: 13191904
Return-Path: <linux-modules-owner@vger.kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
	aws-us-west-2-korg-lkml-1.web.codeaurora.org
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
	by smtp.lore.kernel.org (Postfix) with ESMTP id 191ACC74A5B
	for <linux-modules@archiver.kernel.org>;
 Wed, 29 Mar 2023 05:32:11 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
        id S229750AbjC2FcJ (ORCPT
        <rfc822;linux-modules@archiver.kernel.org>);
        Wed, 29 Mar 2023 01:32:09 -0400
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:57690 "EHLO
        lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
        with ESMTP id S229759AbjC2FcF (ORCPT
        <rfc822;linux-modules@vger.kernel.org>);
        Wed, 29 Mar 2023 01:32:05 -0400
Received: from bombadil.infradead.org (bombadil.infradead.org
 [IPv6:2607:7c80:54:3::133])
        by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 5BEBB3C2B;
        Tue, 28 Mar 2023 22:31:54 -0700 (PDT)
DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed;
        d=infradead.org; s=bombadil.20210309;
 h=Sender:Content-Transfer-Encoding:
        MIME-Version:References:In-Reply-To:Message-Id:Date:Subject:Cc:To:From:
        Reply-To:Content-Type:Content-ID:Content-Description;
        bh=C7tEOVKbmd31kJKkSCr8L6PeUYpxvjj7kEhb9lxj2lU=;
 b=Hq7MpAhblc40y8k1Y4Pqo/enTW
        SslWm7fO+exvvf3HZRxh+bgxhZOS3P1IYnAydKJ5ZTarr8GhRDhxNEnxrSNR8qTIPYRQkgU3YNnx+
        +hKEQpyCrK+p5tnt4YfYVc53F88QARD2q7d7/0iy49zIQ53fg9qMpgODyjglcWGSsnuydP/xVL3Pf
        FSVm+HQx1wkfASdfb4mN3vTxsgy2DNRu5unP+d8hu1QfAr3UlqH/JXn0cVkRXiLoYO4YS6swQky9l
        OqFeYbHv1S5N2op7xH0OmscjuK1xIZLjsk8AFnOHnf/yFLpseFdNdJMXGHqpmE5FhbNXRa3cLgGxL
        JBMH5C3A==;
Received: from mcgrof by bombadil.infradead.org with local (Exim 4.96 #2 (Red
 Hat Linux))
        id 1phOPW-00GgRW-23;
        Wed, 29 Mar 2023 05:31:50 +0000
From: Luis Chamberlain <mcgrof@kernel.org>
To: david@redhat.com, patches@lists.linux.dev,
        linux-modules@vger.kernel.org, linux-mm@kvack.org,
        linux-kernel@vger.kernel.org, pmladek@suse.com,
        petr.pavlu@suse.com, prarit@redhat.com,
        torvalds@linux-foundation.org, gregkh@linuxfoundation.org,
        rafael@kernel.org
Cc: christophe.leroy@csgroup.eu, tglx@linutronix.de,
        peterz@infradead.org, song@kernel.org, rppt@kernel.org,
        willy@infradead.org, vbabka@suse.cz, mhocko@suse.com,
        dave.hansen@linux.intel.com, mcgrof@kernel.org
Subject: [PATCH 2/7] module: extract patient module check into helper
Date: Tue, 28 Mar 2023 22:31:44 -0700
Message-Id: <20230329053149.3976378-3-mcgrof@kernel.org>
X-Mailer: git-send-email 2.38.1
In-Reply-To: <20230329053149.3976378-1-mcgrof@kernel.org>
References: <20230329053149.3976378-1-mcgrof@kernel.org>
MIME-Version: 1.0
Sender: Luis Chamberlain <mcgrof@infradead.org>
Precedence: bulk
List-ID: <linux-modules.vger.kernel.org>

The patient module check inside add_unformed_module() is large
enough as we need it. It is a bit hard to read too, so just
move it to a helper and do the inverse checks first to help
shift the code and make it easier to read. The new helper then
is module_patient_check_exists().

Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 kernel/module/main.c | 71 +++++++++++++++++++++++++-------------------
 1 file changed, 40 insertions(+), 31 deletions(-)

diff --git a/kernel/module/main.c b/kernel/module/main.c
index af58e63e5daf..77c2e7a60f2e 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -2626,6 +2626,43 @@ static bool finished_loading(const char *name)
 	return ret;
 }
 
+/* Must be called with module_mutex held */
+static int module_patient_check_exists(const char *name)
+{
+	struct module *old;
+	int err = 0;
+
+	old = find_module_all(name, strlen(name), true);
+	if (old == NULL)
+		return 0;
+
+	if (old->state == MODULE_STATE_COMING
+	    || old->state == MODULE_STATE_UNFORMED) {
+		/* Wait in case it fails to load. */
+		mutex_unlock(&module_mutex);
+		err = wait_event_interruptible(module_wq,
+				       finished_loading(name));
+		if (err)
+			return err;
+
+		/* The module might have gone in the meantime. */
+		mutex_lock(&module_mutex);
+		old = find_module_all(name, strlen(name), true);
+	}
+
+	/*
+	 * We are here only when the same module was being loaded. Do
+	 * not try to load it again right now. It prevents long delays
+	 * caused by serialized module load failures. It might happen
+	 * when more devices of the same type trigger load of
+	 * a particular module.
+	 */
+	if (old && old->state == MODULE_STATE_LIVE)
+		return -EEXIST;
+	else
+		return -EBUSY;
+}
+
 /*
  * We try to place it in the list now to make sure it's unique before
  * we dedicate too many resources.  In particular, temporary percpu
@@ -2634,41 +2671,14 @@ static bool finished_loading(const char *name)
 static int add_unformed_module(struct module *mod)
 {
 	int err;
-	struct module *old;
 
 	mod->state = MODULE_STATE_UNFORMED;
 
 	mutex_lock(&module_mutex);
-	old = find_module_all(mod->name, strlen(mod->name), true);
-	if (old != NULL) {
-		if (old->state == MODULE_STATE_COMING
-		    || old->state == MODULE_STATE_UNFORMED) {
-			/* Wait in case it fails to load. */
-			mutex_unlock(&module_mutex);
-			err = wait_event_interruptible(module_wq,
-					       finished_loading(mod->name));
-			if (err)
-				goto out_unlocked;
-
-			/* The module might have gone in the meantime. */
-			mutex_lock(&module_mutex);
-			old = find_module_all(mod->name, strlen(mod->name),
-					      true);
-		}
-
-		/*
-		 * We are here only when the same module was being loaded. Do
-		 * not try to load it again right now. It prevents long delays
-		 * caused by serialized module load failures. It might happen
-		 * when more devices of the same type trigger load of
-		 * a particular module.
-		 */
-		if (old && old->state == MODULE_STATE_LIVE)
-			err = -EEXIST;
-		else
-			err = -EBUSY;
+	err = module_patient_check_exists(mod->name);
+	if (err)
 		goto out;
-	}
+
 	mod_update_bounds(mod);
 	list_add_rcu(&mod->list, &modules);
 	mod_tree_insert(mod);
@@ -2676,7 +2686,6 @@ static int add_unformed_module(struct module *mod)
 
 out:
 	mutex_unlock(&module_mutex);
-out_unlocked:
 	return err;
 }
 

From patchwork Wed Mar 29 05:31:45 2023
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Luis Chamberlain <mcgrof@kernel.org>
X-Patchwork-Id: 13191907
Return-Path: <linux-modules-owner@vger.kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
	aws-us-west-2-korg-lkml-1.web.codeaurora.org
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
	by smtp.lore.kernel.org (Postfix) with ESMTP id 8B521C74A5B
	for <linux-modules@archiver.kernel.org>;
 Wed, 29 Mar 2023 05:32:13 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
        id S229792AbjC2FcM (ORCPT
        <rfc822;linux-modules@archiver.kernel.org>);
        Wed, 29 Mar 2023 01:32:12 -0400
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:57262 "EHLO
        lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
        with ESMTP id S229752AbjC2FcH (ORCPT
        <rfc822;linux-modules@vger.kernel.org>);
        Wed, 29 Mar 2023 01:32:07 -0400
Received: from bombadil.infradead.org (bombadil.infradead.org
 [IPv6:2607:7c80:54:3::133])
        by lindbergh.monkeyblade.net (Postfix) with ESMTPS id CA1143C33;
        Tue, 28 Mar 2023 22:31:54 -0700 (PDT)
DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed;
        d=infradead.org; s=bombadil.20210309;
 h=Sender:Content-Transfer-Encoding:
        MIME-Version:References:In-Reply-To:Message-Id:Date:Subject:Cc:To:From:
        Reply-To:Content-Type:Content-ID:Content-Description;
        bh=RbshZVKDekPDOboeHnEAcgErH9krvuwQN7TysZlnIZw=;
 b=GyXZwQdzvgM7yhc8r+C5Vhai/9
        IOpEs0a2tDWNEdfprqiEIcZ2E8v1vexNsaoG9ZN2e1ZBToe1wdV9OPP21g4oLpqbMgEuo3fyX/8r2
        z8ZG617RPED8eMv9RVJgKv6yxZsjZikUTsRsoepiIWpblLpufupifKeeB2RZS9hCVfFLS5Qi/zaPs
        wbXqYiFQNVQ9NpGD3zNYkWbEk7J5CwMo3VeJ2PMxGIbDq6flmPN4cv/e7EzAP13A/ztHYle/wUwUP
        pdzO68cb4rR6+2D/EtIoNOIRkfU1dIvlJhCsfV4cOL31V/agcdygbpFFjQy9YclM9/4eqvm8S4zQH
        /GKQHogQ==;
Received: from mcgrof by bombadil.infradead.org with local (Exim 4.96 #2 (Red
 Hat Linux))
        id 1phOPW-00GgRY-2D;
        Wed, 29 Mar 2023 05:31:50 +0000
From: Luis Chamberlain <mcgrof@kernel.org>
To: david@redhat.com, patches@lists.linux.dev,
        linux-modules@vger.kernel.org, linux-mm@kvack.org,
        linux-kernel@vger.kernel.org, pmladek@suse.com,
        petr.pavlu@suse.com, prarit@redhat.com,
        torvalds@linux-foundation.org, gregkh@linuxfoundation.org,
        rafael@kernel.org
Cc: christophe.leroy@csgroup.eu, tglx@linutronix.de,
        peterz@infradead.org, song@kernel.org, rppt@kernel.org,
        willy@infradead.org, vbabka@suse.cz, mhocko@suse.com,
        dave.hansen@linux.intel.com, mcgrof@kernel.org
Subject: [PATCH 3/7] module: avoid allocation if module is already present and
 ready
Date: Tue, 28 Mar 2023 22:31:45 -0700
Message-Id: <20230329053149.3976378-4-mcgrof@kernel.org>
X-Mailer: git-send-email 2.38.1
In-Reply-To: <20230329053149.3976378-1-mcgrof@kernel.org>
References: <20230329053149.3976378-1-mcgrof@kernel.org>
MIME-Version: 1.0
Sender: Luis Chamberlain <mcgrof@infradead.org>
Precedence: bulk
List-ID: <linux-modules.vger.kernel.org>

load_module() will allocate a struct module before even checking
if the module is already loaded. This can create unecessary memory
pressure since we can easily just check if the module is already
present early with the copy of the module information from userspace
after we've validated it a bit.

This can only be an issue if a system is getting hammered with userspace
loading modules. Note that there are two ways to load modules, one is
kernel moduile auto-loading (request_module() calls in-kernel) and the
other is modprobe calls from userspace. The auto-loading is in-kernel, that
pings back to userspace to just call modprobe. We already have a way to
restrict the amount of concurrent kernel auto-loads in a given time, however
that does not stop a system from issuing tons of system calls to load a
module and for the races to exist. Userspace itself *is* supposed to check
if a module is present before loading it. But we're observing situations
where tons of the same module are in effect being loaded. Although some of
these are acknolwedged as in-kernel bugs such as the ACPI frequency
modules, issues for which we already have fixes merged or are working
towards, but we can also help a bit more in the modules side to avoid
those dramatic situations. All that is just memory being allocated to then
be thrown away.

To avoid memory pressure for such stupid cases put a stop gap for them.
We now check for the module being present *before* allocation, and then
right after we are going to add it to the system.

On a 8vcpu 8 GiB RAM system using kdevops and testing against selftests
kmod.sh -t 0008 I see a saving in the *highest* side of memory
consumption of up to ~ 84 MiB with the Linux kernel selftests kmod
test 0008. With the new stress-ng module test I see a 145 MiB difference
in max memory consumption with 100 ops. The stress-ng module ops tests can be
pretty pathalogical -- it is not realistic, however it was used to
finally successfully reproduce issues which are only reported to happen on
system with over 400 CPUs [0] by just usign 100 ops on a 8vcpu 8 GiB RAM
system.

This can be observed and visualized below. The time it takes to run the
test is also not affected.

The kmod tests 0008:

The gnuplot is set to a range from 400000 KiB (390 Mib) - 580000 (566 Mib)
given the tests peak around that range.

cat kmod.plot
set term dumb
set output fileout
set yrange [400000:580000]
plot filein with linespoints title "Memory usage (KiB)"

Before:
root@kmod ~ # /data/linux-next/tools/testing/selftests/kmod/kmod.sh -t 0008
root@kmod ~ # free -k -s 1 -c 40 | grep Mem | awk '{print $3}' > log-0008-before.txt ^C
root@kmod ~ # sort -n -r log-0008-before.txt | head -1
528732

So ~516.33 MiB

After:

root@kmod ~ # /data/linux-next/tools/testing/selftests/kmod/kmod.sh -t 0008
root@kmod ~ # free -k -s 1 -c 40 | grep Mem | awk '{print $3}' > log-0008-after.txt ^C

root@kmod ~ # sort -n -r log-0008-after.txt | head -1
442516

So ~432.14 MiB

That's about 84 ~MiB in savings in the worst case. The graphs:

root@kmod ~ # gnuplot -e "filein='log-0008-before.txt'; fileout='graph-0008-before.txt'" kmod.plot
root@kmod ~ # gnuplot -e "filein='log-0008-after.txt';  fileout='graph-0008-after.txt'"  kmod.plot

root@kmod ~ # cat graph-0008-before.txt

  580000 +-----------------------------------------------------------------+
         |       +        +       +       +       +        +       +       |
  560000 |-+                                    Memory usage (KiB) ***A***-|
         |                                                                 |
  540000 |-+                                                             +-|
         |                                                                 |
         |        *A     *AA*AA*A*AA          *A*AA    A*A*A *AA*A*AA*A  A |
  520000 |-+A*A*AA  *AA*A           *A*AA*A*AA     *A*A     A          *A+-|
         |*A                                                               |
  500000 |-+                                                             +-|
         |                                                                 |
  480000 |-+                                                             +-|
         |                                                                 |
  460000 |-+                                                             +-|
         |                                                                 |
         |                                                                 |
  440000 |-+                                                             +-|
         |                                                                 |
  420000 |-+                                                             +-|
         |       +        +       +       +       +        +       +       |
  400000 +-----------------------------------------------------------------+
         0       5        10      15      20      25       30      35      40

root@kmod ~ # cat graph-0008-after.txt

  580000 +-----------------------------------------------------------------+
         |       +        +       +       +       +        +       +       |
  560000 |-+                                    Memory usage (KiB) ***A***-|
         |                                                                 |
  540000 |-+                                                             +-|
         |                                                                 |
         |                                                                 |
  520000 |-+                                                             +-|
         |                                                                 |
  500000 |-+                                                             +-|
         |                                                                 |
  480000 |-+                                                             +-|
         |                                                                 |
  460000 |-+                                                             +-|
         |                                                                 |
         |          *A              *A*A                                   |
  440000 |-+A*A*AA*A  A       A*A*AA    A*A*AA*A*AA*A*AA*A*AA*AA*A*AA*A*AA-|
         |*A           *A*AA*A                                             |
  420000 |-+                                                             +-|
         |       +        +       +       +       +        +       +       |
  400000 +-----------------------------------------------------------------+
         0       5        10      15      20      25       30      35      40

The stress-ng module tests:

This is used to run the test to try to reproduce the vmap issues
reported by David:

  echo 0 > /proc/sys/vm/oom_dump_tasks
  ./stress-ng --module 100 --module-name xfs

Prior to this commit:
root@kmod ~ # free -k -s 1 -c 40 | grep Mem | awk '{print $3}' > baseline-stress-ng.txt
root@kmod ~ # sort -n -r baseline-stress-ng.txt | head -1
5046456

After this commit:
root@kmod ~ # free -k -s 1 -c 40 | grep Mem | awk '{print $3}' > after-stress-ng.txt
root@kmod ~ # sort -n -r after-stress-ng.txt | head -1
4896972

5046456 - 4896972
149484
149484/1024
145.98046875000000000000

So this commit using stress-ng reveals saving about 145 MiB in memory
using 100 ops from stress-ng which reproduced the vmap issue reported.

cat kmod.plot
set term dumb
set output fileout
set yrange [4700000:5070000]
plot filein with linespoints title "Memory usage (KiB)"

root@kmod ~ # gnuplot -e "filein='baseline-stress-ng.txt'; fileout='graph-stress-ng-before.txt'"  kmod-simple-stress-ng.plot
root@kmod ~ # gnuplot -e "filein='after-stress-ng.txt'; fileout='graph-stress-ng-after.txt'"  kmod-simple-stress-ng.plot

root@kmod ~ # cat graph-stress-ng-before.txt

           +---------------------------------------------------------------+
  5.05e+06 |-+     + A     +       +       +       +       +       +     +-|
           |         *                          Memory usage (KiB) ***A*** |
           |         *                             A                       |
     5e+06 |-+      **                            **                     +-|
           |        **                            * *    A                 |
  4.95e+06 |-+      * *                          A  *   A*               +-|
           |        * *      A       A           *  *  *  *             A  |
           |       *  *     * *     * *        *A   *  *  *      A      *  |
   4.9e+06 |-+     *  *     * A*A   * A*AA*A  A      *A    **A   **A*A  *+-|
           |       A  A*A  A    *  A       *  *      A     A *  A    * **  |
           |      *      **      **         * *              *  *    * * * |
  4.85e+06 |-+   A       A       A          **               *  *     ** *-|
           |     *                           *               * *      ** * |
           |     *                           A               * *      *  * |
   4.8e+06 |-+   *                                           * *      A  A-|
           |     *                                           * *           |
  4.75e+06 |-+  *                                            * *         +-|
           |    *                                            **            |
           |    *  +       +       +       +       +       + **    +       |
   4.7e+06 +---------------------------------------------------------------+
           0       5       10      15      20      25      30      35      40

root@kmod ~ # cat graph-stress-ng-after.txt

           +---------------------------------------------------------------+
  5.05e+06 |-+     +       +       +       +       +       +       +     +-|
           |                                    Memory usage (KiB) ***A*** |
           |                                                               |
     5e+06 |-+                                                           +-|
           |                                                               |
  4.95e+06 |-+                                                           +-|
           |                                                               |
           |                                                               |
   4.9e+06 |-+                                      *AA                  +-|
           |  A*AA*A*A  A  A*AA*AA*A*AA*A  A  A  A*A   *AA*A*A  A  A*AA*AA |
           |  *      * **  *            *  *  ** *            ***  *       |
  4.85e+06 |-+*       ***  *            * * * ***             A *  *     +-|
           |  *       A *  *             ** * * A               *  *       |
           |  *         *  *             *  **                  *  *       |
   4.8e+06 |-+*         *  *             A   *                  *  *     +-|
           | *          * *                  A                  * *        |
  4.75e+06 |-*          * *                                     * *      +-|
           | *          * *                                     * *        |
           | *     +    * *+       +       +       +       +    * *+       |
   4.7e+06 +---------------------------------------------------------------+
           0       5       10      15      20      25      30      35      40

[0] https://lkml.kernel.org/r/20221013180518.217405-1-david@redhat.com

Reported-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 kernel/module/main.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/kernel/module/main.c b/kernel/module/main.c
index 77c2e7a60f2e..145e15f19576 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -2785,7 +2785,11 @@ static int early_mod_check(struct load_info *info, int flags)
 	if (err)
 		return err;
 
-	return 0;
+	mutex_lock(&module_mutex);
+	err = module_patient_check_exists(info->mod->name);
+	mutex_unlock(&module_mutex);
+
+	return err;
 }
 
 /*

From patchwork Wed Mar 29 05:31:46 2023
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Luis Chamberlain <mcgrof@kernel.org>
X-Patchwork-Id: 13191905
Return-Path: <linux-modules-owner@vger.kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
	aws-us-west-2-korg-lkml-1.web.codeaurora.org
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
	by smtp.lore.kernel.org (Postfix) with ESMTP id D1076C77B6C
	for <linux-modules@archiver.kernel.org>;
 Wed, 29 Mar 2023 05:32:11 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
        id S229781AbjC2FcK (ORCPT
        <rfc822;linux-modules@archiver.kernel.org>);
        Wed, 29 Mar 2023 01:32:10 -0400
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:57698 "EHLO
        lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
        with ESMTP id S229765AbjC2FcF (ORCPT
        <rfc822;linux-modules@vger.kernel.org>);
        Wed, 29 Mar 2023 01:32:05 -0400
Received: from bombadil.infradead.org (bombadil.infradead.org
 [IPv6:2607:7c80:54:3::133])
        by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 657F13C32;
        Tue, 28 Mar 2023 22:31:54 -0700 (PDT)
DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed;
        d=infradead.org; s=bombadil.20210309;
 h=Sender:Content-Transfer-Encoding:
        MIME-Version:References:In-Reply-To:Message-Id:Date:Subject:Cc:To:From:
        Reply-To:Content-Type:Content-ID:Content-Description;
        bh=guyHag9qCqT02ycAP5gFYMiS5Z8MMs8Ps7B6B4e3yQk=;
 b=0s+ZsQtqvZrSA+dzaZX4im6LhE
        WyYiRJxXstb/bqMR3zp8lDLo3bPTmJaJEWSTCh06Aez4lV2UP5frUnAfsl7KhQGMydKgbO1lw7zhb
        5jDrZkRjvFJfeHZVno0lB+v2bfAxpKG/fGbbO0mlzuPuljHB9FQjYJ/jVkpBdrh4YEiBJCnK3TUmn
        hiaTcVuownYisCxprFvcXdMGJps9Idj/5unt/K9XrpcpkHwN+cyMrQDW7fOwdPR91ENj/EqR2BYlX
        ZDIkrXjOcSqgk2+GqwuSjX7aetm38G9LIM3wHa+55pCtzqIALL7J0jkg4T34ZcZ7/7O5zQ66cDSM2
        QzeATbHg==;
Received: from mcgrof by bombadil.infradead.org with local (Exim 4.96 #2 (Red
 Hat Linux))
        id 1phOPW-00GgRb-2M;
        Wed, 29 Mar 2023 05:31:50 +0000
From: Luis Chamberlain <mcgrof@kernel.org>
To: david@redhat.com, patches@lists.linux.dev,
        linux-modules@vger.kernel.org, linux-mm@kvack.org,
        linux-kernel@vger.kernel.org, pmladek@suse.com,
        petr.pavlu@suse.com, prarit@redhat.com,
        torvalds@linux-foundation.org, gregkh@linuxfoundation.org,
        rafael@kernel.org
Cc: christophe.leroy@csgroup.eu, tglx@linutronix.de,
        peterz@infradead.org, song@kernel.org, rppt@kernel.org,
        willy@infradead.org, vbabka@suse.cz, mhocko@suse.com,
        dave.hansen@linux.intel.com, mcgrof@kernel.org
Subject: [PATCH 4/7] sempahore: add a helper for a concurrency limiter
Date: Tue, 28 Mar 2023 22:31:46 -0700
Message-Id: <20230329053149.3976378-5-mcgrof@kernel.org>
X-Mailer: git-send-email 2.38.1
In-Reply-To: <20230329053149.3976378-1-mcgrof@kernel.org>
References: <20230329053149.3976378-1-mcgrof@kernel.org>
MIME-Version: 1.0
Sender: Luis Chamberlain <mcgrof@infradead.org>
Precedence: bulk
List-ID: <linux-modules.vger.kernel.org>

While I looked at re-using the old kernel/kmod.c (now kernel/module/kmod.c)
concurrency delimiter methodology for another place in the kernel Linus
noted that this could be simply replaced with a sempahore [0].

So add that so we we don't re-invent the wheel and make it obvious to use.

[0] https://lore.kernel.org/all/CAHk-=whkj6=wyi201JXkw9iT_eTUTsSx+Yb9d4OgmZFjDJA18g@mail.gmail.com/

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/semaphore.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/include/linux/semaphore.h b/include/linux/semaphore.h
index 6694d0019a68..2ecdffdb9814 100644
--- a/include/linux/semaphore.h
+++ b/include/linux/semaphore.h
@@ -28,6 +28,9 @@ struct semaphore {
 #define DEFINE_SEMAPHORE(name)	\
 	struct semaphore name = __SEMAPHORE_INITIALIZER(name, 1)
 
+#define CONCURRENCY_LIMITER(name, n) \
+	struct semaphore name = __SEMAPHORE_INITIALIZER(name, n)
+
 static inline void sema_init(struct semaphore *sem, int val)
 {
 	static struct lock_class_key __key;

From patchwork Wed Mar 29 05:31:47 2023
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Luis Chamberlain <mcgrof@kernel.org>
X-Patchwork-Id: 13191906
Return-Path: <linux-modules-owner@vger.kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
	aws-us-west-2-korg-lkml-1.web.codeaurora.org
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
	by smtp.lore.kernel.org (Postfix) with ESMTP id B332AC6FD18
	for <linux-modules@archiver.kernel.org>;
 Wed, 29 Mar 2023 05:32:12 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
        id S229784AbjC2FcL (ORCPT
        <rfc822;linux-modules@archiver.kernel.org>);
        Wed, 29 Mar 2023 01:32:11 -0400
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:57256 "EHLO
        lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
        with ESMTP id S229666AbjC2FcG (ORCPT
        <rfc822;linux-modules@vger.kernel.org>);
        Wed, 29 Mar 2023 01:32:06 -0400
Received: from bombadil.infradead.org (bombadil.infradead.org
 [IPv6:2607:7c80:54:3::133])
        by lindbergh.monkeyblade.net (Postfix) with ESMTPS id B97C53C04;
        Tue, 28 Mar 2023 22:31:54 -0700 (PDT)
DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed;
        d=infradead.org; s=bombadil.20210309;
 h=Sender:Content-Transfer-Encoding:
        MIME-Version:References:In-Reply-To:Message-Id:Date:Subject:Cc:To:From:
        Reply-To:Content-Type:Content-ID:Content-Description;
        bh=4WklghmhRoXLW11BvhMQXmcUtv91pDbajk9xVWUSoKw=;
 b=Rc1W4XBsmzSM4fsephNuxA5+Hc
        hN1F7QwRigICSNV8c1+dRaLlmHSl15upfOraFBq7OwpG+Irgc3a7wongSfIqAiZZfSOn9Vsx1o0J4
        LQXqgi70HEADX3OtaFpqZPnDiPLnoNMo1bW9+0Uh/P8BBqyznnMRKUmfZrU49Pb7DA5POilNBpkbE
        OhzYrkOlMapvraFlDXnKzTlsgNgPpRBElg4Ini3Te0CJpdITvxeakGC1tf+zXJvpBOWnRn+z0YUHg
        w9LEXdbu+FKrCjDM1jpzTRuCtqNiEhzDAzbDQB+ZOFhsRGAege8eJEtU1q3Ewz1uUm3tlCAcO86kL
        /ZFLa/6Q==;
Received: from mcgrof by bombadil.infradead.org with local (Exim 4.96 #2 (Red
 Hat Linux))
        id 1phOPW-00GgRm-2Z;
        Wed, 29 Mar 2023 05:31:50 +0000
From: Luis Chamberlain <mcgrof@kernel.org>
To: david@redhat.com, patches@lists.linux.dev,
        linux-modules@vger.kernel.org, linux-mm@kvack.org,
        linux-kernel@vger.kernel.org, pmladek@suse.com,
        petr.pavlu@suse.com, prarit@redhat.com,
        torvalds@linux-foundation.org, gregkh@linuxfoundation.org,
        rafael@kernel.org
Cc: christophe.leroy@csgroup.eu, tglx@linutronix.de,
        peterz@infradead.org, song@kernel.org, rppt@kernel.org,
        willy@infradead.org, vbabka@suse.cz, mhocko@suse.com,
        dave.hansen@linux.intel.com, mcgrof@kernel.org
Subject: [PATCH 5/7] modules/kmod: replace implementation with a sempahore
Date: Tue, 28 Mar 2023 22:31:47 -0700
Message-Id: <20230329053149.3976378-6-mcgrof@kernel.org>
X-Mailer: git-send-email 2.38.1
In-Reply-To: <20230329053149.3976378-1-mcgrof@kernel.org>
References: <20230329053149.3976378-1-mcgrof@kernel.org>
MIME-Version: 1.0
Sender: Luis Chamberlain <mcgrof@infradead.org>
Precedence: bulk
List-ID: <linux-modules.vger.kernel.org>

Simplfy the concurrency delimiter we user for kmod with the semaphore.
I had used the kmod strategy to try to implement a similar concurrency
delimiter for the kernel_read*() calls from the finit_module() path
so to reduce vmalloc() memory pressure. That effort didn't provid yet
conclusive results, but one thing that did became clear is we can use
the suggested alternative solution with semaphores which Linus hinted
at instead of using the atomic / wait strategy.

I've stress tested this with kmod test 0008:

time /data/linux-next/tools/testing/selftests/kmod/kmod.sh -t 0008

And I get only a *slight* delay. That delay however is small, a few
seconds for a full test loop run that runs 150 times, for about ~30-40
seconds. The small delay is worth the simplfication IMHO.

Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 kernel/module/kmod.c | 26 +++++++-------------------
 1 file changed, 7 insertions(+), 19 deletions(-)

diff --git a/kernel/module/kmod.c b/kernel/module/kmod.c
index b717134ebe17..fd7c461387f8 100644
--- a/kernel/module/kmod.c
+++ b/kernel/module/kmod.c
@@ -40,8 +40,7 @@
  * effect. Systems like these are very unlikely if modules are enabled.
  */
 #define MAX_KMOD_CONCURRENT 50
-static atomic_t kmod_concurrent_max = ATOMIC_INIT(MAX_KMOD_CONCURRENT);
-static DECLARE_WAIT_QUEUE_HEAD(kmod_wq);
+static CONCURRENCY_LIMITER(kmod_concurrent_max, MAX_KMOD_CONCURRENT);
 
 /*
  * This is a restriction on having *all* MAX_KMOD_CONCURRENT threads
@@ -148,29 +147,18 @@ int __request_module(bool wait, const char *fmt, ...)
 	if (ret)
 		return ret;
 
-	if (atomic_dec_if_positive(&kmod_concurrent_max) < 0) {
-		pr_warn_ratelimited("request_module: kmod_concurrent_max (%u) close to 0 (max_modprobes: %u), for module %s, throttling...",
-				    atomic_read(&kmod_concurrent_max),
-				    MAX_KMOD_CONCURRENT, module_name);
-		ret = wait_event_killable_timeout(kmod_wq,
-						  atomic_dec_if_positive(&kmod_concurrent_max) >= 0,
-						  MAX_KMOD_ALL_BUSY_TIMEOUT * HZ);
-		if (!ret) {
-			pr_warn_ratelimited("request_module: modprobe %s cannot be processed, kmod busy with %d threads for more than %d seconds now",
-					    module_name, MAX_KMOD_CONCURRENT, MAX_KMOD_ALL_BUSY_TIMEOUT);
-			return -ETIME;
-		} else if (ret == -ERESTARTSYS) {
-			pr_warn_ratelimited("request_module: sigkill sent for modprobe %s, giving up", module_name);
-			return ret;
-		}
+	ret = down_timeout(&kmod_concurrent_max, MAX_KMOD_ALL_BUSY_TIMEOUT);
+	if (ret) {
+		pr_warn_ratelimited("request_module: modprobe %s cannot be processed, kmod busy with %d threads for more than %d seconds now",
+				    module_name, MAX_KMOD_CONCURRENT, MAX_KMOD_ALL_BUSY_TIMEOUT);
+		return ret;
 	}
 
 	trace_module_request(module_name, wait, _RET_IP_);
 
 	ret = call_modprobe(module_name, wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC);
 
-	atomic_inc(&kmod_concurrent_max);
-	wake_up(&kmod_wq);
+	up(&kmod_concurrent_max);
 
 	return ret;
 }

From patchwork Wed Mar 29 05:31:48 2023
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Luis Chamberlain <mcgrof@kernel.org>
X-Patchwork-Id: 13191903
Return-Path: <linux-modules-owner@vger.kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
	aws-us-west-2-korg-lkml-1.web.codeaurora.org
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
	by smtp.lore.kernel.org (Postfix) with ESMTP id DE1BEC77B61
	for <linux-modules@archiver.kernel.org>;
 Wed, 29 Mar 2023 05:32:09 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
        id S229779AbjC2FcI (ORCPT
        <rfc822;linux-modules@archiver.kernel.org>);
        Wed, 29 Mar 2023 01:32:08 -0400
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:57230 "EHLO
        lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
        with ESMTP id S229755AbjC2FcF (ORCPT
        <rfc822;linux-modules@vger.kernel.org>);
        Wed, 29 Mar 2023 01:32:05 -0400
Received: from bombadil.infradead.org (bombadil.infradead.org
 [IPv6:2607:7c80:54:3::133])
        by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 536603C29;
        Tue, 28 Mar 2023 22:31:54 -0700 (PDT)
DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed;
        d=infradead.org; s=bombadil.20210309;
 h=Sender:Content-Transfer-Encoding:
        MIME-Version:References:In-Reply-To:Message-Id:Date:Subject:Cc:To:From:
        Reply-To:Content-Type:Content-ID:Content-Description;
        bh=9aCwO6lydcYrtyg5yPLXnIQyE+LoOte6FX6HU5bmimM=;
 b=wRqNpi2g15Ano4d7S1D1Se8pFe
        rx5M2O1Oylzh0O3wIDwg5BZf5hYJHEZ4wPOlQT1a/cw9WGV4lbuzzH3WstC9eh95IXhDd/nqi7S+S
        SFQFG/EjhJzTx7CJXXzo8zx4qEijzU2qxBpNccJBe709o2dplGwgg3D94qIg+svDQ1qh/h/Qv4HuW
        VxbOuAJqab03K+uJZLBZZzHPknX+5nXLKzGD7VI9droyfDnQ8ZfZ5YS/sPGgUYNx6lCil+4rNyXfd
        5PlNA67XBZ4kedjsm+pbkJ7idlMJ0AYK1mNpQyGKJNfsQhIm3llHEAfEAMvgImFfmCkuNtHVzGrJh
        NWs6awpQ==;
Received: from mcgrof by bombadil.infradead.org with local (Exim 4.96 #2 (Red
 Hat Linux))
        id 1phOPW-00GgRq-2i;
        Wed, 29 Mar 2023 05:31:50 +0000
From: Luis Chamberlain <mcgrof@kernel.org>
To: david@redhat.com, patches@lists.linux.dev,
        linux-modules@vger.kernel.org, linux-mm@kvack.org,
        linux-kernel@vger.kernel.org, pmladek@suse.com,
        petr.pavlu@suse.com, prarit@redhat.com,
        torvalds@linux-foundation.org, gregkh@linuxfoundation.org,
        rafael@kernel.org
Cc: christophe.leroy@csgroup.eu, tglx@linutronix.de,
        peterz@infradead.org, song@kernel.org, rppt@kernel.org,
        willy@infradead.org, vbabka@suse.cz, mhocko@suse.com,
        dave.hansen@linux.intel.com, mcgrof@kernel.org
Subject: [PATCH 6/7] debugfs: add debugfs_create_atomic64_t for atomic64_t
Date: Tue, 28 Mar 2023 22:31:48 -0700
Message-Id: <20230329053149.3976378-7-mcgrof@kernel.org>
X-Mailer: git-send-email 2.38.1
In-Reply-To: <20230329053149.3976378-1-mcgrof@kernel.org>
References: <20230329053149.3976378-1-mcgrof@kernel.org>
MIME-Version: 1.0
Sender: Luis Chamberlain <mcgrof@infradead.org>
Precedence: bulk
List-ID: <linux-modules.vger.kernel.org>

Sometimes you want to add debugfs entries for atomic counters which
can be pretty large using atomic64_t. Add support for these.

Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/debugfs/file.c       | 36 ++++++++++++++++++++++++++++++++++++
 include/linux/debugfs.h |  2 ++
 2 files changed, 38 insertions(+)

diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 1f971c880dde..76d923503861 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -780,6 +780,42 @@ void debugfs_create_atomic_t(const char *name, umode_t mode,
 }
 EXPORT_SYMBOL_GPL(debugfs_create_atomic_t);
 
+static int debugfs_atomic64_t_set(void *data, u64 val)
+{
+	atomic64_set((atomic64_t *)data, val);
+	return 0;
+}
+static int debugfs_atomic64_t_get(void *data, u64 *val)
+{
+	*val = atomic64_read((atomic64_t *)data);
+	return 0;
+}
+DEFINE_DEBUGFS_ATTRIBUTE_SIGNED(fops_atomic64_t, debugfs_atomic64_t_get,
+			debugfs_atomic64_t_set, "%lld\n");
+DEFINE_DEBUGFS_ATTRIBUTE_SIGNED(fops_atomic64_t_ro, debugfs_atomic64_t_get, NULL,
+			"%lld\n");
+DEFINE_DEBUGFS_ATTRIBUTE_SIGNED(fops_atomic64_t_wo, NULL, debugfs_atomic64_t_set,
+			"%lld\n");
+
+/**
+ * debugfs_create_atomic64_t - create a debugfs file that is used to read and
+ * write an atomic64_t value
+ * @name: a pointer to a string containing the name of the file to create.
+ * @mode: the permission that the file should have
+ * @parent: a pointer to the parent dentry for this file.  This should be a
+ *          directory dentry if set.  If this parameter is %NULL, then the
+ *          file will be created in the root of the debugfs filesystem.
+ * @value: a pointer to the variable that the file should read to and write
+ *         from.
+ */
+void debugfs_create_atomic64_t(const char *name, umode_t mode,
+			     struct dentry *parent, atomic64_t *value)
+{
+	debugfs_create_mode_unsafe(name, mode, parent, value, &fops_atomic64_t,
+				   &fops_atomic64_t_ro, &fops_atomic64_t_wo);
+}
+EXPORT_SYMBOL_GPL(debugfs_create_atomic64_t);
+
 ssize_t debugfs_read_file_bool(struct file *file, char __user *user_buf,
 			       size_t count, loff_t *ppos)
 {
diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h
index ea2d919fd9c7..f5cc613a545e 100644
--- a/include/linux/debugfs.h
+++ b/include/linux/debugfs.h
@@ -136,6 +136,8 @@ void debugfs_create_size_t(const char *name, umode_t mode,
 			   struct dentry *parent, size_t *value);
 void debugfs_create_atomic_t(const char *name, umode_t mode,
 			     struct dentry *parent, atomic_t *value);
+void debugfs_create_atomic64_t(const char *name, umode_t mode,
+			       struct dentry *parent, atomic64_t *value);
 void debugfs_create_bool(const char *name, umode_t mode, struct dentry *parent,
 			 bool *value);
 void debugfs_create_str(const char *name, umode_t mode,

From patchwork Wed Mar 29 05:31:49 2023
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Luis Chamberlain <mcgrof@kernel.org>
X-Patchwork-Id: 13191908
Return-Path: <linux-modules-owner@vger.kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
	aws-us-west-2-korg-lkml-1.web.codeaurora.org
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
	by smtp.lore.kernel.org (Postfix) with ESMTP id 8DF2DC761AF
	for <linux-modules@archiver.kernel.org>;
 Wed, 29 Mar 2023 05:32:17 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
        id S229817AbjC2FcQ (ORCPT
        <rfc822;linux-modules@archiver.kernel.org>);
        Wed, 29 Mar 2023 01:32:16 -0400
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:57262 "EHLO
        lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
        with ESMTP id S229776AbjC2FcJ (ORCPT
        <rfc822;linux-modules@vger.kernel.org>);
        Wed, 29 Mar 2023 01:32:09 -0400
Received: from bombadil.infradead.org (bombadil.infradead.org
 [IPv6:2607:7c80:54:3::133])
        by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 25DA13C39;
        Tue, 28 Mar 2023 22:31:55 -0700 (PDT)
DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed;
        d=infradead.org; s=bombadil.20210309;
 h=Sender:Content-Transfer-Encoding:
        MIME-Version:References:In-Reply-To:Message-Id:Date:Subject:Cc:To:From:
        Reply-To:Content-Type:Content-ID:Content-Description;
        bh=2bVkSUsIADWswZ5bVimfZdo+Dy3JEjup3DneXTttdjU=;
 b=XhxXDw48uuZL/uQqb/68ZIqJmD
        rmpQ5+Q9WG115GOyc2PP14uow1Fu9Fyl+xWkZjxKZPYGV93xilgKKaBYoUt+OfXEwj8QKkWJYdBAP
        IlOSqIb2pSUIbY05lI7yLe9mQz5sYUPfe2ZaCkc0mtkmJ5EgcE6Wl/9UhBxe1X1SjI0HtQj4Nsklu
        G0XJOAjov0Mty6kJx7mKk8qTyGDWjwDWIjOJYGmew3nmK1fFOcBMhqcc3k6rUwweq/cTtXlDD6n0v
        29oZ94XSTgSoPHrpY41wLms5xSkIGpjfT/z1uB11c2CAUA2biuHqIOL5jHz7+HStSb9kGiUkN6CoT
        qrdIB1TQ==;
Received: from mcgrof by bombadil.infradead.org with local (Exim 4.96 #2 (Red
 Hat Linux))
        id 1phOPW-00GgRu-2q;
        Wed, 29 Mar 2023 05:31:50 +0000
From: Luis Chamberlain <mcgrof@kernel.org>
To: david@redhat.com, patches@lists.linux.dev,
        linux-modules@vger.kernel.org, linux-mm@kvack.org,
        linux-kernel@vger.kernel.org, pmladek@suse.com,
        petr.pavlu@suse.com, prarit@redhat.com,
        torvalds@linux-foundation.org, gregkh@linuxfoundation.org,
        rafael@kernel.org
Cc: christophe.leroy@csgroup.eu, tglx@linutronix.de,
        peterz@infradead.org, song@kernel.org, rppt@kernel.org,
        willy@infradead.org, vbabka@suse.cz, mhocko@suse.com,
        dave.hansen@linux.intel.com, mcgrof@kernel.org
Subject: [PATCH 7/7] module: add debug stats to help identify memory pressure
Date: Tue, 28 Mar 2023 22:31:49 -0700
Message-Id: <20230329053149.3976378-8-mcgrof@kernel.org>
X-Mailer: git-send-email 2.38.1
In-Reply-To: <20230329053149.3976378-1-mcgrof@kernel.org>
References: <20230329053149.3976378-1-mcgrof@kernel.org>
MIME-Version: 1.0
Sender: Luis Chamberlain <mcgrof@infradead.org>
Precedence: bulk
List-ID: <linux-modules.vger.kernel.org>

Loading modules with finit_module() can end up using vmalloc(), vmap()
and vmalloc() again, for a total of up to 3 separate allocations in the
worse case for a single module. We always kernel_read*() the module,
that's a vmalloc(). Then vmap() is used for the module decompression,
and if so the last read buffer is freed as we use the now decompressed
module buffer to stuff data into our copy module. The last one is
specific to architectures but pretty much that's generally a series
of vmalloc() for different ELF sections...

Evaluation with new stress-ng module support [1] with just 100 ops
us proving that you can end up using GiBs of data easily even if
we are trying to be very careful not to load modules which are already
loaded. 100 ops seems to resemble the sort of pressure a system with
about 400 CPUs can create on modules. Although those issues for so
many concurrent loads per CPU is silly and are being fixed, we lack
proper tooling to help diagnose easily what happened, when it happened
and what likely are the culprits -- userspace or kernel module
autoloading.

Provide an initial set of stats for debugfs which let us easily scrape
post-boot information about failed loads. This sort of information can
be used on production worklaods to try to optimize *avoiding* redundant
memory pressure using finit_module().

Screen shot:

root@kmod ~ # cat /sys/kernel/debug/modules/stats
           Modules loaded       67
        Total module size       11464704
      Total mod text size       4194304
       Failed kread bytes       890064
        Failed kmod bytes       890064
      Invalid kread bytes       890064
 Invalid decompress bytes       0
        Invalid mod bytes       890064
         Average mod size       171115
    Average mod text size       62602
Failed modules:
                kvm_intel
                      kvm
                irqbypass
         crct10dif_pclmul
      ghash_clmulni_intel
             sha512_ssse3
           sha512_generic
              aesni_intel
              crypto_simd
                   cryptd
                    evdev
                serio_raw
               virtio_pci
                     nvme
                nvme_core
    virtio_pci_legacy_dev
                   t10_pi
           crc64_rocksoft
    virtio_pci_modern_dev
             crc32_pclmul
                   virtio
             crc32c_intel
              virtio_ring
                    crc64

[0] https://github.com/ColinIanKing/stress-ng.git
[1] echo 0 > /proc/sys/vm/oom_dump_tasks
    ./stress-ng --module 100 --module-name xfs

Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 kernel/module/Kconfig    |  32 +++++++
 kernel/module/Makefile   |   4 +
 kernel/module/debug.c    |  16 ++++
 kernel/module/internal.h |  35 +++++++
 kernel/module/main.c     |  45 ++++++++-
 kernel/module/stats.c    | 200 +++++++++++++++++++++++++++++++++++++++
 kernel/module/tracking.c |   7 +-
 7 files changed, 331 insertions(+), 8 deletions(-)
 create mode 100644 kernel/module/debug.c
 create mode 100644 kernel/module/stats.c

diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig
index 424b3bc58f3f..fbf7b92cb3d0 100644
--- a/kernel/module/Kconfig
+++ b/kernel/module/Kconfig
@@ -22,6 +22,12 @@ menuconfig MODULES
 
 if MODULES
 
+config MODULE_DEBUG
+	bool "Module debugging"
+	default n
+	help
+	  Allows to help debug module functionality.
+
 config MODULE_FORCE_LOAD
 	bool "Forced module loading"
 	default n
@@ -48,6 +54,8 @@ config MODULE_FORCE_UNLOAD
 	  rmmod).  This is mainly for kernel developers and desperate users.
 	  If unsure, say N.
 
+if MODULE_DEBUG
+
 config MODULE_UNLOAD_TAINT_TRACKING
 	bool "Tainted module unload tracking"
 	depends on MODULE_UNLOAD
@@ -59,6 +67,30 @@ config MODULE_UNLOAD_TAINT_TRACKING
 	  page (see bad_page()), the aforementioned details are also
 	  shown. If unsure, say N.
 
+config MODULE_STATS
+	bool "Module statistics"
+	depends on DEBUG_FS
+	default n
+	help
+	  This option allows you to maintain a record of module statistics.
+	  For example each all modules size, average size, text size, and
+	  failed modules and the size for each of those. For failed
+	  modules we keep track of module which failed due to either the
+	  existing module taking too long to load or that module already
+	  was loaded.
+
+	  You should enable this if you are debugging production loads
+	  and want to see if userspace or the kernel is doing stupid things
+	  with loading modules when it shouldn't or if you want to help
+	  optimize userspace / kernel space module autoloading schemes.
+	  You might want to do this because failed modules tend to use
+	  use up significan amount of memory, and so you'd be doing everyone
+	  a favor in avoiding these failure proactively.
+
+	  If unsure, say N.
+
+endif # MODULE_DEBUG
+
 config MODVERSIONS
 	bool "Module versioning support"
 	help
diff --git a/kernel/module/Makefile b/kernel/module/Makefile
index 5b1d26b53b8d..fe97047f3807 100644
--- a/kernel/module/Makefile
+++ b/kernel/module/Makefile
@@ -20,4 +20,8 @@ obj-$(CONFIG_PROC_FS) += procfs.o
 obj-$(CONFIG_SYSFS) += sysfs.o
 obj-$(CONFIG_KGDB_KDB) += kdb.o
 obj-$(CONFIG_MODVERSIONS) += version.o
+
+# Link order matters here, keep debug.o first.
+obj-$(CONFIG_MODULE_DEBUG) += debug.o
+obj-$(CONFIG_MODULE_STATS) += stats.o
 obj-$(CONFIG_MODULE_UNLOAD_TAINT_TRACKING) += tracking.o
diff --git a/kernel/module/debug.c b/kernel/module/debug.c
new file mode 100644
index 000000000000..ef580d70b751
--- /dev/null
+++ b/kernel/module/debug.c
@@ -0,0 +1,16 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2023 Luis Chamberlain <mcgrof@kernel.org>
+ */
+
+#include <linux/debugfs.h>
+#include "internal.h"
+
+struct dentry *mod_debugfs_root;
+
+static int module_debugfs_init(void)
+{
+	mod_debugfs_root = debugfs_create_dir("modules", NULL);
+	return 0;
+}
+module_init(module_debugfs_init);
diff --git a/kernel/module/internal.h b/kernel/module/internal.h
index 6ae29bb8836f..a645cb3fafc7 100644
--- a/kernel/module/internal.h
+++ b/kernel/module/internal.h
@@ -143,6 +143,41 @@ static inline bool set_livepatch_module(struct module *mod)
 #endif
 }
 
+#ifdef CONFIG_MODULE_STATS
+
+#define mod_stat_add64(count, var) atomic64_add(count, var)
+#define mod_stat_inc(name) atomic_inc(name)
+
+extern atomic64_t total_mod_size;
+extern atomic64_t total_text_size;
+extern atomic64_t invalid_kread_bytes;
+extern atomic64_t invalid_decompress_bytes;
+extern atomic64_t invalid_mod_becoming_bytes;
+extern atomic64_t invalid_mod_bytes;
+
+extern atomic_t modcount;
+extern atomic_t failed_kreads;
+extern atomic_t failed_decompress;
+extern atomic_t failed_load_modules;
+struct mod_fail_load {
+	struct list_head list;
+	char name[MODULE_NAME_LEN];
+	atomic_t count;
+};
+
+int try_add_failed_module(const char *name);
+
+#else
+
+#define mod_stat_inc64(name)
+#define mod_stat_inc(name) atomic_inc(name)
+
+static inline int try_add_failed_module(const char *name)
+{
+	return 0;
+}
+#endif /* CONFIG_MODULE_STATS */
+
 #ifdef CONFIG_MODULE_UNLOAD_TAINT_TRACKING
 struct mod_unload_taint {
 	struct list_head list;
diff --git a/kernel/module/main.c b/kernel/module/main.c
index 145e15f19576..8b851042b7f9 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -2495,6 +2495,16 @@ static noinline int do_init_module(struct module *mod)
 {
 	int ret = 0;
 	struct mod_initfree *freeinit;
+	unsigned int text_size = 0, total_size = 0;
+
+	for_each_mod_mem_type(type) {
+		const struct module_memory *mod_mem = &mod->mem[type];
+		if (mod_mem->size) {
+			total_size += mod_mem->size;
+			if (type == MOD_TEXT || type == MOD_INIT_TEXT)
+				text_size += mod->mem[type].size;
+		}
+	}
 
 	freeinit = kmalloc(sizeof(*freeinit), GFP_KERNEL);
 	if (!freeinit) {
@@ -2556,6 +2566,7 @@ static noinline int do_init_module(struct module *mod)
 		mod->mem[type].base = NULL;
 		mod->mem[type].size = 0;
 	}
+
 #ifdef CONFIG_DEBUG_INFO_BTF_MODULES
 	/* .BTF is not SHF_ALLOC and will get removed, so sanitize pointer */
 	mod->btf_data = NULL;
@@ -2579,6 +2590,11 @@ static noinline int do_init_module(struct module *mod)
 	mutex_unlock(&module_mutex);
 	wake_up_all(&module_wq);
 
+	mod_stat_add64(text_size, &total_text_size);
+	mod_stat_add64(total_size, &total_mod_size);
+
+	mod_stat_inc(&modcount);
+
 	return 0;
 
 fail_free_freeinit:
@@ -2594,6 +2610,7 @@ static noinline int do_init_module(struct module *mod)
 	ftrace_release_mod(mod);
 	free_module(mod);
 	wake_up_all(&module_wq);
+
 	return ret;
 }
 
@@ -2650,6 +2667,9 @@ static int module_patient_check_exists(const char *name)
 		old = find_module_all(name, strlen(name), true);
 	}
 
+	if (try_add_failed_module(name))
+		pr_warn("Could not add fail-tracking for module: %s\n", name);
+
 	/*
 	 * We are here only when the same module was being loaded. Do
 	 * not try to load it again right now. It prevents long delays
@@ -2800,6 +2820,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
 		       int flags)
 {
 	struct module *mod;
+	bool module_allocated = false;
 	long err = 0;
 	char *after_dashes;
 
@@ -2839,6 +2860,8 @@ static int load_module(struct load_info *info, const char __user *uargs,
 		goto free_copy;
 	}
 
+	module_allocated = true;
+
 	audit_log_kern_module(mod->name);
 
 	/* Reserve our place in the list. */
@@ -2983,6 +3006,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
 	synchronize_rcu();
 	mutex_unlock(&module_mutex);
  free_module:
+	mod_stat_add64(info->len, &invalid_mod_bytes);
 	/* Free lock-classes; relies on the preceding sync_rcu() */
 	for_class_mod_mem_type(type, core_data) {
 		lockdep_free_key_range(mod->mem[type].base,
@@ -2991,6 +3015,13 @@ static int load_module(struct load_info *info, const char __user *uargs,
 
 	module_deallocate(mod, info);
  free_copy:
+	/*
+	 * The info->len is always set. We distinguish between
+	 * failures once the proper module was allocated and
+	 * before that.
+	 */
+	if (!module_allocated)
+		mod_stat_add64(info->len, &invalid_mod_becoming_bytes);
 	free_copy(info, flags);
 	return err;
 }
@@ -3009,8 +3040,10 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
 	       umod, len, uargs);
 
 	err = copy_module_from_user(umod, len, &info);
-	if (err)
+	if (err) {
+		mod_stat_inc(&failed_kreads);
 		return err;
+	}
 
 	return load_module(&info, uargs, 0);
 }
@@ -3035,14 +3068,20 @@ SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags)
 
 	len = kernel_read_file_from_fd(fd, 0, &buf, INT_MAX, NULL,
 				       READING_MODULE);
-	if (len < 0)
+	if (len < 0) {
+		mod_stat_inc(&failed_kreads);
+		mod_stat_add64(len, &invalid_kread_bytes);
 		return len;
+	}
 
 	if (flags & MODULE_INIT_COMPRESSED_FILE) {
 		err = module_decompress(&info, buf, len);
 		vfree(buf); /* compressed data is no longer needed */
-		if (err)
+		if (err) {
+			mod_stat_inc(&failed_decompress);
+			mod_stat_add64(len, &invalid_decompress_bytes);
 			return err;
+		}
 	} else {
 		info.hdr = buf;
 		info.len = len;
diff --git a/kernel/module/stats.c b/kernel/module/stats.c
new file mode 100644
index 000000000000..bbf0ddb8b589
--- /dev/null
+++ b/kernel/module/stats.c
@@ -0,0 +1,200 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Let's keep keep tabs on a few important module stats, useful
+ * for debugging production loads and interactions between userspace
+ * and kernelspace for loading modules.
+ *
+ * Copyright (C) 2023 Luis Chamberlain <mcgrof@kernel.org>
+ */
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/printk.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/debugfs.h>
+#include <linux/rculist.h>
+
+#include "internal.h"
+
+extern struct dentry *mod_debugfs_root;
+
+/*
+ * Tracks modules which failed to be loaded as they were being processed.
+ * These require modulesed consumed vmalloc space for all finit_module()
+ * calls as kernel_read*() is used. Then if compression is used vmap()
+ * is used to allocate space for the decompressed version of what userspace
+ * has on the filesystem, we vfree() the compressed data which kerne_read*()
+ * fetched for us. Finally, a final module is allocated as well which we
+ * use to keep around, and that *can* use vmalloc() too.
+ *
+ * In the worst case, when module compression is used then we use the vmap
+ * space three times.
+ *
+ * We really should strive to get this list to be empty. This not being empty
+ * is a reflection of us needing to do more work to ensure either the kernel
+ * or usersapce does not do unnecessary calls to load modules which it should
+ * know are already loaded or on its way to be loaded.
+ */
+static LIST_HEAD(failed_modules);
+
+/* Total bytes used by all modules we've dealt with on this system */
+atomic64_t total_mod_size;
+
+/* Total .text section sizes we've dealt with on this system */
+atomic64_t total_text_size;
+
+/* Failures happen on the initial kernel_read_*() call. They use vmalloc() */
+atomic64_t invalid_kread_bytes;
+
+/* Failures happen on the module decompression path, these use use vmap(). */
+atomic64_t invalid_decompress_bytes;
+
+/*
+ * The invalid_mod_becoming_bytes only keeps tabs of failures in between kread
+ * success and right before we allocate the module to process it. These
+ * can be failures due to:
+ *
+ *  o module_sig_check() - module signature checks
+ *  o elf_validity_cache_copy() - ELF does not add up
+ *  o early_mod_check():
+ *  	- blacklist
+ *  	- failed to rewrite section headers
+ *  	- verion magic
+ *  	- live patch requirements didn't check out
+ *  	- the module was detected as being already present, this
+ *  	  first check avoids a new vmalloc for the full size of
+ *  	  the module.
+ */
+atomic64_t invalid_mod_becoming_bytes;
+
+/*
+ * These are failures after we did all the sanity checks of the
+ * module userspace passed to us. This can still fail if we detect
+ * the module is loaded, we do this check after we allocate space
+ * for the module.
+ */
+atomic64_t invalid_mod_bytes;
+
+/* How many modules we've loaded in our kernel life time */
+atomic_t modcount;
+
+/* How many modules failed due to failed kernel_read*() */
+atomic_t failed_kreads;
+
+/* How many failed decompression attempts we've had */
+atomic_t failed_decompress;
+
+/* How many modules failed once we've allocated space for our module */
+atomic_t failed_load_modules;
+
+int try_add_failed_module(const char *name)
+{
+	struct mod_fail_load *mod_fail;
+
+	list_for_each_entry_rcu(mod_fail, &failed_modules, list,
+				lockdep_is_held(&module_mutex)) {
+                if (!strcmp(mod_fail->name, name)) {
+                        atomic_inc(&mod_fail->count);
+                        goto out;
+                }
+        }
+
+	mod_fail = kmalloc(sizeof(*mod_fail), GFP_KERNEL);
+	if (!mod_fail)
+		return -ENOMEM;
+	strscpy(mod_fail->name, name, MODULE_NAME_LEN);
+        atomic_inc(&mod_fail->count);
+        list_add_rcu(&mod_fail->list, &failed_modules);
+out:
+	return 0;
+}
+
+static ssize_t read_file_mod_stats(struct file *file, char __user *user_buf,
+				   size_t count, loff_t *ppos)
+{
+	struct mod_fail_load *mod_fail;
+	unsigned int len;
+	const unsigned int size = 1024;
+	char *buf;
+	u32 live_mod_count, fkreads, floads;
+	u64 total_size, text_size, ikread_bytes, idecomp_bytes, imod_bytes;
+
+	live_mod_count = atomic_read(&modcount);
+	fkreads = atomic_read(&failed_kreads);
+	floads = atomic_read(&failed_load_modules);
+
+	total_size = atomic64_read(&total_mod_size);
+	text_size = atomic64_read(&total_text_size);
+	ikread_bytes = atomic64_read(&invalid_mod_bytes);
+	idecomp_bytes = atomic64_read(&invalid_decompress_bytes);
+	imod_bytes = atomic64_read(&invalid_mod_bytes);
+
+	buf = kzalloc(size, GFP_KERNEL);
+	if (buf == NULL)
+		return -ENOMEM;
+
+	len += scnprintf(buf + len, size - len, "%25s\t%u\n", "Modules loaded", live_mod_count);
+	len += scnprintf(buf + len, size - len, "%25s\t%llu\n", "Total module size", total_size);
+	len += scnprintf(buf + len, size - len, "%25s\t%llu\n", "Total mod text size", text_size);
+
+	/*
+	 * Failed kmod bytes do not contain any failed kreads bytes as those
+	 * failures would happen earlier on kread. Failed kread bytes are wasted
+	 * vmalloc() space allocations and are indicative of invalid modules.
+	 */
+	len += scnprintf(buf + len, size - len, "%25s\t%llu\n", "Failed kread bytes", ikread_bytes);
+
+	/*
+	 * Failed kmod bytes are modules which for whatever reason fail to load
+	 * on the load_module() effort. They are good signs the kernel or userspace
+	 * is doing something stupid or that could be improved.
+	 */
+	len += scnprintf(buf + len, size - len, "%25s\t%llu\n", "Failed kmod bytes", imod_bytes);
+
+	len += scnprintf(buf + len, size - len, "%25s\t%llu\n", "Invalid kread bytes", ikread_bytes);
+	len += scnprintf(buf + len, size - len, "%25s\t%llu\n", "Invalid decompress bytes", idecomp_bytes);
+	len += scnprintf(buf + len, size - len, "%25s\t%llu\n", "Invalid mod bytes", imod_bytes);
+
+	if (live_mod_count && total_size) {
+		len += scnprintf(buf + len, size - len, "%25s\t%llu\n", "Average mod size",
+				 DIV_ROUND_UP(total_size, live_mod_count));
+	}
+
+	if (live_mod_count && text_size) {
+		len += scnprintf(buf + len, size - len, "%25s\t%llu\n", "Average mod text size",
+				 DIV_ROUND_UP(text_size, live_mod_count));
+	}
+
+	if (list_empty(&failed_modules))
+		goto out;
+
+	len += scnprintf(buf + len, size - len, "Failed modules:\n");
+	list_for_each_entry_rcu(mod_fail, &failed_modules, list)
+		len += scnprintf(buf + len, size - len, "%25s\n", mod_fail->name);
+out:
+        return simple_read_from_buffer(user_buf, count, ppos, buf, len);
+}
+
+static const struct file_operations fops_mod_stats = {
+	.read = read_file_mod_stats,
+	.open = simple_open,
+	.owner = THIS_MODULE,
+	.llseek = default_llseek,
+};
+
+static int __init module_stats_init(void)
+{
+	debugfs_create_atomic64_t("total_mod_size", 0400, mod_debugfs_root, &total_mod_size);
+	debugfs_create_atomic64_t("total_text_size", 0400, mod_debugfs_root, &total_text_size);
+	debugfs_create_atomic64_t("invalid_kread_bytes", 0400, mod_debugfs_root, &invalid_kread_bytes);
+	debugfs_create_atomic64_t("invalid_decompress_bytes", 0400, mod_debugfs_root, &invalid_decompress_bytes);
+	debugfs_create_atomic64_t("invalid_mod_bytes", 0400, mod_debugfs_root, &invalid_mod_bytes);
+	debugfs_create_atomic_t("modcount", 0400, mod_debugfs_root, &modcount);
+	debugfs_create_atomic_t("failed_kreads", 0400, mod_debugfs_root, &failed_kreads);
+	debugfs_create_atomic_t("failed_load_modules", 0400, mod_debugfs_root, &failed_load_modules);
+	debugfs_create_file("stats", 0400, mod_debugfs_root, mod_debugfs_root, &fops_mod_stats);
+
+	return 0;
+}
+module_init(module_stats_init);
diff --git a/kernel/module/tracking.c b/kernel/module/tracking.c
index 26d812e07615..cbeb9330db9b 100644
--- a/kernel/module/tracking.c
+++ b/kernel/module/tracking.c
@@ -15,6 +15,7 @@
 #include "internal.h"
 
 static LIST_HEAD(unloaded_tainted_modules);
+extern struct dentry *mod_debugfs_root
 
 int try_add_tainted_module(struct module *mod)
 {
@@ -120,12 +121,8 @@ static const struct file_operations unloaded_tainted_modules_fops = {
 
 static int __init unloaded_tainted_modules_init(void)
 {
-	struct dentry *dir;
-
-	dir = debugfs_create_dir("modules", NULL);
-	debugfs_create_file("unloaded_tainted", 0444, dir, NULL,
+	debugfs_create_file("unloaded_tainted", 0444, mod_debugfs_root, NULL,
 			    &unloaded_tainted_modules_fops);
-
 	return 0;
 }
 module_init(unloaded_tainted_modules_init);