diff mbox series

[RFC,4/5] mm/ksm, proc: introduce remote merge

Message ID 20190516094234.9116-5-oleksandr@redhat.com (mailing list archive)
State New, archived
Headers show
Series mm/ksm, proc: introduce remote madvise | expand

Commit Message

Oleksandr Natalenko May 16, 2019, 9:42 a.m. UTC
Use previously introduced remote madvise knob to mark task's
anonymous memory as mergeable.

To force merging task's VMAs, "merge" hint is used:

   # echo merge > /proc/<pid>/madvise

Force unmerging is done similarly:

   # echo unmerge > /proc/<pid>/madvise

To achieve this, previously introduced ksm_madvise_*() helpers
are used.

Signed-off-by: Oleksandr Natalenko <oleksandr@redhat.com>
---
 fs/proc/base.c | 52 +++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 51 insertions(+), 1 deletion(-)

Comments

Jann Horn May 16, 2019, 10 a.m. UTC | #1
On Thu, May 16, 2019 at 11:43 AM Oleksandr Natalenko
<oleksandr@redhat.com> wrote:
> Use previously introduced remote madvise knob to mark task's
> anonymous memory as mergeable.
>
> To force merging task's VMAs, "merge" hint is used:
>
>    # echo merge > /proc/<pid>/madvise
>
> Force unmerging is done similarly:
>
>    # echo unmerge > /proc/<pid>/madvise
>
> To achieve this, previously introduced ksm_madvise_*() helpers
> are used.

Why does this not require PTRACE_MODE_ATTACH_FSCREDS to the target
process? Enabling KSM on another process is hazardous because it
significantly increases the attack surface for side channels.

(Note that if you change this to require PTRACE_MODE_ATTACH_FSCREDS,
you'll want to use mm_access() in the ->open handler and drop the mm
in ->release. mm_access() from a ->write handler is not permitted.)

[...]
> @@ -2960,15 +2962,63 @@ static int proc_stack_depth(struct seq_file *m, struct pid_namespace *ns,
>  static ssize_t madvise_write(struct file *file, const char __user *buf,
>                 size_t count, loff_t *ppos)
>  {
> +       /* For now, only KSM hints are implemented */
> +#ifdef CONFIG_KSM
> +       char buffer[PROC_NUMBUF];
> +       int behaviour;
>         struct task_struct *task;
> +       struct mm_struct *mm;
> +       int err = 0;
> +       struct vm_area_struct *vma;
> +
> +       memset(buffer, 0, sizeof(buffer));
> +       if (count > sizeof(buffer) - 1)
> +               count = sizeof(buffer) - 1;
> +       if (copy_from_user(buffer, buf, count))
> +               return -EFAULT;
> +
> +       if (!memcmp("merge", buffer, min(sizeof("merge")-1, count)))

This means that you also match on something like "mergeblah". Just use strcmp().

> +               behaviour = MADV_MERGEABLE;
> +       else if (!memcmp("unmerge", buffer, min(sizeof("unmerge")-1, count)))
> +               behaviour = MADV_UNMERGEABLE;
> +       else
> +               return -EINVAL;
>
>         task = get_proc_task(file_inode(file));
>         if (!task)
>                 return -ESRCH;
>
> +       mm = get_task_mm(task);
> +       if (!mm) {
> +               err = -EINVAL;
> +               goto out_put_task_struct;
> +       }
> +
> +       down_write(&mm->mmap_sem);

Should a check for mmget_still_valid(mm) be inserted here? See commit
04f5866e41fb70690e28397487d8bd8eea7d712a.

> +       switch (behaviour) {
> +       case MADV_MERGEABLE:
> +       case MADV_UNMERGEABLE:

This switch isn't actually necessary at this point, right?

> +               vma = mm->mmap;
> +               while (vma) {
> +                       if (behaviour == MADV_MERGEABLE)
> +                               ksm_madvise_merge(vma->vm_mm, vma, &vma->vm_flags);
> +                       else
> +                               ksm_madvise_unmerge(vma, vma->vm_start, vma->vm_end, &vma->vm_flags);
> +                       vma = vma->vm_next;
> +               }
> +               break;
> +       }
> +       up_write(&mm->mmap_sem);
> +
> +       mmput(mm);
> +
> +out_put_task_struct:
>         put_task_struct(task);
>
> -       return count;
> +       return err ? err : count;
> +#else
> +       return -EINVAL;
> +#endif /* CONFIG_KSM */
>  }
Oleksandr Natalenko May 16, 2019, 2:20 p.m. UTC | #2
Hi.

On Thu, May 16, 2019 at 12:00:24PM +0200, Jann Horn wrote:
> On Thu, May 16, 2019 at 11:43 AM Oleksandr Natalenko
> <oleksandr@redhat.com> wrote:
> > Use previously introduced remote madvise knob to mark task's
> > anonymous memory as mergeable.
> >
> > To force merging task's VMAs, "merge" hint is used:
> >
> >    # echo merge > /proc/<pid>/madvise
> >
> > Force unmerging is done similarly:
> >
> >    # echo unmerge > /proc/<pid>/madvise
> >
> > To achieve this, previously introduced ksm_madvise_*() helpers
> > are used.
> 
> Why does this not require PTRACE_MODE_ATTACH_FSCREDS to the target
> process? Enabling KSM on another process is hazardous because it
> significantly increases the attack surface for side channels.
> 
> (Note that if you change this to require PTRACE_MODE_ATTACH_FSCREDS,
> you'll want to use mm_access() in the ->open handler and drop the mm
> in ->release. mm_access() from a ->write handler is not permitted.)

Sounds reasonable. So, something similar to what mem_open() & friends do
now:

static int madvise_open(...)
...
	struct task_struct *task = get_proc_task(inode);
...
	if (task) {
		mm = mm_access(task, PTRACE_MODE_ATTACH_FSCREDS);
		put_task_struct(task);
		if (!IS_ERR_OR_NULL(mm)) {
			mmgrab(mm);
			mmput(mm);
...

Then:

static ssize_t madvise_write(...)
...
	if (!mmget_not_zero(mm))
		goto out;

	down_write(&mm->mmap_sem);
	if (!mmget_still_valid(mm))
		goto skip_mm;
...
skip_mm:
	up_write(&mm->mmap_sem);

	mmput(mm);
out:
	return ...;

And, finally:

static int madvise_release(...)
...
		mmdrop(mm);
...

Right?

> [...]
> > @@ -2960,15 +2962,63 @@ static int proc_stack_depth(struct seq_file *m, struct pid_namespace *ns,
> >  static ssize_t madvise_write(struct file *file, const char __user *buf,
> >                 size_t count, loff_t *ppos)
> >  {
> > +       /* For now, only KSM hints are implemented */
> > +#ifdef CONFIG_KSM
> > +       char buffer[PROC_NUMBUF];
> > +       int behaviour;
> >         struct task_struct *task;
> > +       struct mm_struct *mm;
> > +       int err = 0;
> > +       struct vm_area_struct *vma;
> > +
> > +       memset(buffer, 0, sizeof(buffer));
> > +       if (count > sizeof(buffer) - 1)
> > +               count = sizeof(buffer) - 1;
> > +       if (copy_from_user(buffer, buf, count))
> > +               return -EFAULT;
> > +
> > +       if (!memcmp("merge", buffer, min(sizeof("merge")-1, count)))
> 
> This means that you also match on something like "mergeblah". Just use strcmp().

I agree. Just to make it more interesting I must say that

   /sys/kernel/mm/transparent_hugepage/enabled

uses memcmp in the very same way, and thus echoing "alwaysssss" or
"madviseeee" works perfectly there, and it was like that from the very
beginning, it seems. Should we fix it, or it became (zomg) a public API?

> > +               behaviour = MADV_MERGEABLE;
> > +       else if (!memcmp("unmerge", buffer, min(sizeof("unmerge")-1, count)))
> > +               behaviour = MADV_UNMERGEABLE;
> > +       else
> > +               return -EINVAL;
> >
> >         task = get_proc_task(file_inode(file));
> >         if (!task)
> >                 return -ESRCH;
> >
> > +       mm = get_task_mm(task);
> > +       if (!mm) {
> > +               err = -EINVAL;
> > +               goto out_put_task_struct;
> > +       }
> > +
> > +       down_write(&mm->mmap_sem);
> 
> Should a check for mmget_still_valid(mm) be inserted here? See commit
> 04f5866e41fb70690e28397487d8bd8eea7d712a.

Yeah, it seems so :/. Thanks for the pointer. I've put it into the
madvise_write snippet above.

> > +       switch (behaviour) {
> > +       case MADV_MERGEABLE:
> > +       case MADV_UNMERGEABLE:
> 
> This switch isn't actually necessary at this point, right?

Yup, but it is there to highlight a possibility of adding other, non-KSM
options. So, let it be, and I'll just re-arrange CONFIG_KSM ifdef
instead.

Thank you.

> > +               vma = mm->mmap;
> > +               while (vma) {
> > +                       if (behaviour == MADV_MERGEABLE)
> > +                               ksm_madvise_merge(vma->vm_mm, vma, &vma->vm_flags);
> > +                       else
> > +                               ksm_madvise_unmerge(vma, vma->vm_start, vma->vm_end, &vma->vm_flags);
> > +                       vma = vma->vm_next;
> > +               }
> > +               break;
> > +       }
> > +       up_write(&mm->mmap_sem);
> > +
> > +       mmput(mm);
> > +
> > +out_put_task_struct:
> >         put_task_struct(task);
> >
> > -       return count;
> > +       return err ? err : count;
> > +#else
> > +       return -EINVAL;
> > +#endif /* CONFIG_KSM */
> >  }
Oleksandr Natalenko May 16, 2019, 2:43 p.m. UTC | #3
On Thu, May 16, 2019 at 04:20:13PM +0200, Oleksandr Natalenko wrote:
> > [...]
> > > @@ -2960,15 +2962,63 @@ static int proc_stack_depth(struct seq_file *m, struct pid_namespace *ns,
> > >  static ssize_t madvise_write(struct file *file, const char __user *buf,
> > >                 size_t count, loff_t *ppos)
> > >  {
> > > +       /* For now, only KSM hints are implemented */
> > > +#ifdef CONFIG_KSM
> > > +       char buffer[PROC_NUMBUF];
> > > +       int behaviour;
> > >         struct task_struct *task;
> > > +       struct mm_struct *mm;
> > > +       int err = 0;
> > > +       struct vm_area_struct *vma;
> > > +
> > > +       memset(buffer, 0, sizeof(buffer));
> > > +       if (count > sizeof(buffer) - 1)
> > > +               count = sizeof(buffer) - 1;
> > > +       if (copy_from_user(buffer, buf, count))
> > > +               return -EFAULT;
> > > +
> > > +       if (!memcmp("merge", buffer, min(sizeof("merge")-1, count)))
> > 
> > This means that you also match on something like "mergeblah". Just use strcmp().
> 
> I agree. Just to make it more interesting I must say that
> 
>    /sys/kernel/mm/transparent_hugepage/enabled
> 
> uses memcmp in the very same way, and thus echoing "alwaysssss" or
> "madviseeee" works perfectly there, and it was like that from the very
> beginning, it seems. Should we fix it, or it became (zomg) a public API?

Actually, maybe, the reason for using memcmp is to handle "echo"
properly: by default it puts a newline character at the end, so if we use
just strcmp, echo should be called with -n, otherwise strcmp won't match
the string.

Huh?

> [...]
Jann Horn May 16, 2019, 4:06 p.m. UTC | #4
On Thu, May 16, 2019 at 4:20 PM Oleksandr Natalenko
<oleksandr@redhat.com> wrote:
> On Thu, May 16, 2019 at 12:00:24PM +0200, Jann Horn wrote:
> > On Thu, May 16, 2019 at 11:43 AM Oleksandr Natalenko
> > <oleksandr@redhat.com> wrote:
> > > Use previously introduced remote madvise knob to mark task's
> > > anonymous memory as mergeable.
> > >
> > > To force merging task's VMAs, "merge" hint is used:
> > >
> > >    # echo merge > /proc/<pid>/madvise
> > >
> > > Force unmerging is done similarly:
> > >
> > >    # echo unmerge > /proc/<pid>/madvise
> > >
> > > To achieve this, previously introduced ksm_madvise_*() helpers
> > > are used.
> >
> > Why does this not require PTRACE_MODE_ATTACH_FSCREDS to the target
> > process? Enabling KSM on another process is hazardous because it
> > significantly increases the attack surface for side channels.
> >
> > (Note that if you change this to require PTRACE_MODE_ATTACH_FSCREDS,
> > you'll want to use mm_access() in the ->open handler and drop the mm
> > in ->release. mm_access() from a ->write handler is not permitted.)
>
> Sounds reasonable. So, something similar to what mem_open() & friends do
> now:
>
> static int madvise_open(...)
> ...
>         struct task_struct *task = get_proc_task(inode);
> ...
>         if (task) {
>                 mm = mm_access(task, PTRACE_MODE_ATTACH_FSCREDS);
>                 put_task_struct(task);
>                 if (!IS_ERR_OR_NULL(mm)) {
>                         mmgrab(mm);
>                         mmput(mm);
> ...
>
> Then:
>
> static ssize_t madvise_write(...)
> ...
>         if (!mmget_not_zero(mm))
>                 goto out;
>
>         down_write(&mm->mmap_sem);
>         if (!mmget_still_valid(mm))
>                 goto skip_mm;
> ...
> skip_mm:
>         up_write(&mm->mmap_sem);
>
>         mmput(mm);
> out:
>         return ...;
>
> And, finally:
>
> static int madvise_release(...)
> ...
>                 mmdrop(mm);
> ...
>
> Right?

Yeah, that looks reasonable.
Jann Horn May 16, 2019, 4:09 p.m. UTC | #5
On Thu, May 16, 2019 at 4:43 PM Oleksandr Natalenko
<oleksandr@redhat.com> wrote:
> On Thu, May 16, 2019 at 04:20:13PM +0200, Oleksandr Natalenko wrote:
> > > [...]
> > > > @@ -2960,15 +2962,63 @@ static int proc_stack_depth(struct seq_file *m, struct pid_namespace *ns,
> > > >  static ssize_t madvise_write(struct file *file, const char __user *buf,
> > > >                 size_t count, loff_t *ppos)
> > > >  {
> > > > +       /* For now, only KSM hints are implemented */
> > > > +#ifdef CONFIG_KSM
> > > > +       char buffer[PROC_NUMBUF];
> > > > +       int behaviour;
> > > >         struct task_struct *task;
> > > > +       struct mm_struct *mm;
> > > > +       int err = 0;
> > > > +       struct vm_area_struct *vma;
> > > > +
> > > > +       memset(buffer, 0, sizeof(buffer));
> > > > +       if (count > sizeof(buffer) - 1)
> > > > +               count = sizeof(buffer) - 1;
> > > > +       if (copy_from_user(buffer, buf, count))
> > > > +               return -EFAULT;
> > > > +
> > > > +       if (!memcmp("merge", buffer, min(sizeof("merge")-1, count)))
> > >
> > > This means that you also match on something like "mergeblah". Just use strcmp().
> >
> > I agree. Just to make it more interesting I must say that
> >
> >    /sys/kernel/mm/transparent_hugepage/enabled
> >
> > uses memcmp in the very same way, and thus echoing "alwaysssss" or
> > "madviseeee" works perfectly there, and it was like that from the very
> > beginning, it seems. Should we fix it, or it became (zomg) a public API?
>
> Actually, maybe, the reason for using memcmp is to handle "echo"
> properly: by default it puts a newline character at the end, so if we use
> just strcmp, echo should be called with -n, otherwise strcmp won't match
> the string.
>
> Huh?

Ah, yes, other code like e.g. proc_setgroups_write() uses strncmp()
and then has an extra check to make sure everything trailing is
whitespace.
Aaron Tomlin May 16, 2019, 4:29 p.m. UTC | #6
On Thu 2019-05-16 12:00 +0200, Jann Horn wrote:
> On Thu, May 16, 2019 at 11:43 AM Oleksandr Natalenko
> <oleksandr@redhat.com> wrote:
[ ... ]
> > +       }
> > +
> > +       down_write(&mm->mmap_sem);
> 
> Should a check for mmget_still_valid(mm) be inserted here? See commit
> 04f5866e41fb70690e28397487d8bd8eea7d712a.

Yes - I'd say this is required here.

Thanks,
diff mbox series

Patch

diff --git a/fs/proc/base.c b/fs/proc/base.c
index f69532d6b74f..6677580080ed 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -94,6 +94,8 @@ 
 #include <linux/sched/debug.h>
 #include <linux/sched/stat.h>
 #include <linux/posix-timers.h>
+#include <linux/mman.h>
+#include <linux/ksm.h>
 #include <trace/events/oom.h>
 #include "internal.h"
 #include "fd.h"
@@ -2960,15 +2962,63 @@  static int proc_stack_depth(struct seq_file *m, struct pid_namespace *ns,
 static ssize_t madvise_write(struct file *file, const char __user *buf,
 		size_t count, loff_t *ppos)
 {
+	/* For now, only KSM hints are implemented */
+#ifdef CONFIG_KSM
+	char buffer[PROC_NUMBUF];
+	int behaviour;
 	struct task_struct *task;
+	struct mm_struct *mm;
+	int err = 0;
+	struct vm_area_struct *vma;
+
+	memset(buffer, 0, sizeof(buffer));
+	if (count > sizeof(buffer) - 1)
+		count = sizeof(buffer) - 1;
+	if (copy_from_user(buffer, buf, count))
+		return -EFAULT;
+
+	if (!memcmp("merge", buffer, min(sizeof("merge")-1, count)))
+		behaviour = MADV_MERGEABLE;
+	else if (!memcmp("unmerge", buffer, min(sizeof("unmerge")-1, count)))
+		behaviour = MADV_UNMERGEABLE;
+	else
+		return -EINVAL;
 
 	task = get_proc_task(file_inode(file));
 	if (!task)
 		return -ESRCH;
 
+	mm = get_task_mm(task);
+	if (!mm) {
+		err = -EINVAL;
+		goto out_put_task_struct;
+	}
+
+	down_write(&mm->mmap_sem);
+	switch (behaviour) {
+	case MADV_MERGEABLE:
+	case MADV_UNMERGEABLE:
+		vma = mm->mmap;
+		while (vma) {
+			if (behaviour == MADV_MERGEABLE)
+				ksm_madvise_merge(vma->vm_mm, vma, &vma->vm_flags);
+			else
+				ksm_madvise_unmerge(vma, vma->vm_start, vma->vm_end, &vma->vm_flags);
+			vma = vma->vm_next;
+		}
+		break;
+	}
+	up_write(&mm->mmap_sem);
+
+	mmput(mm);
+
+out_put_task_struct:
 	put_task_struct(task);
 
-	return count;
+	return err ? err : count;
+#else
+	return -EINVAL;
+#endif /* CONFIG_KSM */
 }
 
 static const struct file_operations proc_madvise_operations = {