
[18/19] mm/mmap: Charge locked memory to pins cgroup

Message ID fa58b745a597a32b434b0d92d55cf0b97317cfec.1675669136.git-series.apopple@nvidia.com (mailing list archive)
State New
Series mm: Introduce a cgroup to limit the amount of locked and pinned memory

Commit Message

Alistair Popple Feb. 6, 2023, 7:47 a.m. UTC
account_locked_vm() is used to account memory to mm->locked_vm. This
adds accounting to the pins cgroup, as locked memory behaves similarly
to pinned memory and should be accounted against the same global limit
if set.

This means memory must now be unaccounted correctly, as the cgroup
typically outlives both the mm and the task. It is assumed that
callers of account_locked_vm() only do accounting against the current
task. Callers that need to do accounting against remote tasks should
use account_pinned_vm() and associated struct vm_account to hold
references to the cgroup.

Signed-off-by: Alistair Popple <apopple@nvidia.com>
Cc: linux-mm@kvack.org
Cc: linux-kernel@vger.kernel.org
---
 mm/util.c | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)
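
For context, the intended division of labour might look as follows
from a caller's point of view. This is a minimal sketch only:
account_pinned_vm() and struct vm_account come from earlier patches in
this series, and the init/release helper names and exact signatures
below are assumed for illustration, not taken from this patch.

	/* Current-task accounting: the pins cgroup reference is
	 * looked up and dropped inside the call itself. */
	mmap_write_lock(mm);
	ret = account_locked_vm(mm, npages);
	mmap_write_unlock(mm);

	/* Remote-task accounting: the vm_account holds a reference
	 * to the cgroup so the eventual uncharge hits the cgroup
	 * that was charged, even after the task or mm is gone. */
	struct vm_account acct;

	vm_account_init(&acct, task, NULL, 0);
	ret = account_pinned_vm(&acct, npages);
	...
	unaccount_pinned_vm(&acct, npages);
	vm_account_release(&acct);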

Comments

Yosry Ahmed Feb. 6, 2023, 9:12 p.m. UTC | #1
On Sun, Feb 5, 2023 at 11:50 PM Alistair Popple <apopple@nvidia.com> wrote:
>
> account_locked_vm() is used to account memory to mm->locked_vm. This
> adds accounting to the pins cgroup, as locked memory behaves similarly
> to pinned memory and should be accounted against the same global limit
> if set.
>
> This means memory must now be unaccounted correctly, as the cgroup
> typically outlives both the mm and the task. It is assumed that
> callers of account_locked_vm() only do accounting against the current
> task. Callers that need to do accounting against remote tasks should
> use account_pinned_vm() and associated struct vm_account to hold
> references to the cgroup.
>
> Signed-off-by: Alistair Popple <apopple@nvidia.com>
> Cc: linux-mm@kvack.org
> Cc: linux-kernel@vger.kernel.org
> ---
>  mm/util.c | 20 +++++++++++++++++++-
>  1 file changed, 19 insertions(+), 1 deletion(-)
>
> diff --git a/mm/util.c b/mm/util.c
> index 1ca0dfe..755bada 100644
> --- a/mm/util.c
> +++ b/mm/util.c
> @@ -589,15 +589,21 @@ int __account_locked_vm(struct mm_struct *mm, unsigned long pages,
>                         struct task_struct *task, bool bypass_rlim)
>  {
>         unsigned long locked_vm, limit;
> +       struct pins_cgroup *pins_cg = get_pins_cg(task);

Here we get one ref on the pins cgroup for the entire locked region
that may contain multiple pages, right? During unlock, we drop the
ref. Is it possible that we lock a region (acquiring one ref), and
then unlock it in chunks (dropping multiple refs)?

If this is possible, we may have a problem here. We may need to
acquire one ref per pinned page (not sure if this can overflow). We
may also want to defer the refcount handling to the pins cgroup
controller code, similar to charge_memcg(): a single function that
tries to charge and acquires any necessary refs, with a counterpart
for uncharging.

WDYT?
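
For illustration, a consolidated charge-side helper along these lines
might look like the sketch below. The function name pins_charge() is
hypothetical, and it assumes struct pins_cgroup embeds a struct
cgroup_subsys_state as the other controllers do:

	/*
	 * Hypothetical sketch, loosely modelled on charge_memcg():
	 * charge and take refs in one place, so that a later partial
	 * uncharge can drop one ref per page.
	 */
	int pins_charge(struct task_struct *task, unsigned long pages)
	{
		struct pins_cgroup *pins_cg = get_pins_cg(task);

		if (!pins_cg)
			return 0;

		if (!pins_try_charge(pins_cg, pages)) {
			put_pins_cg(pins_cg);
			return -ENOMEM;
		}

		/* One css ref per charged page (assumes pins_cg->css). */
		css_get_many(&pins_cg->css, pages);

		/* Drop the lookup ref; the per-page refs remain. */
		put_pins_cg(pins_cg);
		return 0;
	}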

>         int ret = 0;
>
>         mmap_assert_write_locked(mm);
>
> +       if (pins_cg && !pins_try_charge(pins_cg, pages))
> +               return -ENOMEM;
> +
>         locked_vm = mm->locked_vm;
>         if (!bypass_rlim) {
>                 limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> -               if (locked_vm + pages > limit)
> +               if (locked_vm + pages > limit) {
> +                       pins_uncharge(pins_cg, pages);
>                         ret = -ENOMEM;
> +               }
>         }
>
>         if (!ret)
> @@ -607,6 +613,8 @@ int __account_locked_vm(struct mm_struct *mm, unsigned long pages,
>                  (void *)_RET_IP_, pages << PAGE_SHIFT, locked_vm << PAGE_SHIFT,
>                 task_rlimit(task, RLIMIT_MEMLOCK), ret ? " - exceeded" : "");
>
> +       if (pins_cg)
> +               put_pins_cg(pins_cg);
>         return ret;
>  }
>  EXPORT_SYMBOL_GPL(__account_locked_vm);
> @@ -622,8 +630,18 @@ void __unaccount_locked_vm(struct mm_struct *mm, unsigned long pages)
>  {
>         unsigned long locked_vm = mm->locked_vm;
>
> +       /*
> +        * TODO: Convert book3s vio to use pinned vm to ensure
> +        * unaccounting happens to the correct cgroup.
> +        */
> +       struct pins_cgroup *pins_cg = get_pins_cg(current);
> +
>         mmap_assert_write_locked(mm);
>         WARN_ON_ONCE(pages > locked_vm);
> +       if (pins_cg) {
> +               pins_uncharge(pins_cg, pages);
> +               put_pins_cg(pins_cg);
> +       }
>         mm->locked_vm = locked_vm - pages;
>  }
>  EXPORT_SYMBOL_GPL(__unaccount_locked_vm);
> --
> git-series 0.9.1
>

Patch

diff --git a/mm/util.c b/mm/util.c
index 1ca0dfe..755bada 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -589,15 +589,21 @@  int __account_locked_vm(struct mm_struct *mm, unsigned long pages,
 			struct task_struct *task, bool bypass_rlim)
 {
 	unsigned long locked_vm, limit;
+	struct pins_cgroup *pins_cg = get_pins_cg(task);
 	int ret = 0;
 
 	mmap_assert_write_locked(mm);
 
+	if (pins_cg && !pins_try_charge(pins_cg, pages))
+		return -ENOMEM;
+
 	locked_vm = mm->locked_vm;
 	if (!bypass_rlim) {
 		limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-		if (locked_vm + pages > limit)
+		if (locked_vm + pages > limit) {
+			pins_uncharge(pins_cg, pages);
 			ret = -ENOMEM;
+		}
 	}
 
 	if (!ret)
@@ -607,6 +613,8 @@  int __account_locked_vm(struct mm_struct *mm, unsigned long pages,
 		 (void *)_RET_IP_, pages << PAGE_SHIFT, locked_vm << PAGE_SHIFT,
 		task_rlimit(task, RLIMIT_MEMLOCK), ret ? " - exceeded" : "");
 
+	if (pins_cg)
+		put_pins_cg(pins_cg);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(__account_locked_vm);
@@ -622,8 +630,18 @@  void __unaccount_locked_vm(struct mm_struct *mm, unsigned long pages)
 {
 	unsigned long locked_vm = mm->locked_vm;
 
+	/*
+	 * TODO: Convert book3s vio to use pinned vm to ensure
+	 * unaccounting happens to the correct cgroup.
+	 */
+	struct pins_cgroup *pins_cg = get_pins_cg(current);
+
 	mmap_assert_write_locked(mm);
 	WARN_ON_ONCE(pages > locked_vm);
+	if (pins_cg) {
+		pins_uncharge(pins_cg, pages);
+		put_pins_cg(pins_cg);
+	}
 	mm->locked_vm = locked_vm - pages;
 }
 EXPORT_SYMBOL_GPL(__unaccount_locked_vm);
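
As an aside on the TODO above: with get_pins_cg(current) at unaccount
time, the charge and the uncharge are not guaranteed to hit the same
cgroup. A sketch of the mismatch (hypothetical scenario, not taken
from the series):

	/* Task T runs in cgroup A and locks memory: A is charged. */
	mmap_write_lock(mm);
	__account_locked_vm(mm, pages, current, false);
	mmap_write_unlock(mm);

	/* T is migrated from cgroup A to cgroup B. */

	/*
	 * T unlocks: get_pins_cg(current) now resolves to B, so B is
	 * uncharged while A's charge is leaked. Holding the cgroup in
	 * a struct vm_account, as account_pinned_vm() does, avoids
	 * this.
	 */
	mmap_write_lock(mm);
	__unaccount_locked_vm(mm, pages);
	mmap_write_unlock(mm);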