diff mbox series

[v2] proc: Allow pid_revalidate() during LOOKUP_RCU

Message ID 20201204000212.773032-1-stephen.s.brennan@oracle.com (mailing list archive)
State Superseded
Headers show
Series [v2] proc: Allow pid_revalidate() during LOOKUP_RCU | expand

Commit Message

Stephen Brennan Dec. 4, 2020, 12:02 a.m. UTC
The pid_revalidate() function requires dropping from RCU into REF lookup
mode. When many threads are resolving paths within /proc in parallel,
this can result in heavy spinlock contention as each thread tries to
grab a reference to the /proc dentry lock (and drop it shortly
thereafter).

Allow the pid_revalidate() function to execute under LOOKUP_RCU. When
updates must be made to the inode due to the owning task performing
setuid(), drop out of RCU and into REF mode. Remove the call to
security_task_to_inode(), since we can rely on the call from
proc_pid_make_inode().

Signed-off-by: Stephen Brennan <stephen.s.brennan@oracle.com>
---
I'd like to use this patch as an RFC on this approach for reducing spinlock
contention during many parallel path lookups in the /proc filesystem. The
contention can be triggered by, for example, running ~100 parallel instances of
"TZ=/etc/localtime ps -fe >/dev/null" on a 100CPU machine. The %sys utilization
in such a case reaches around 90%, and profiles show two code paths with high
utilization:

    walk_component
      lookup_fast
        unlazy_child
          legitimize_root
            __legitimize_path
              lockref_get_not_dead

    terminate_walk
      dput
        dput

By applying this patch, %sys utilization falls to around 60% under the same
workload. Although this particular workload is a bit contrived, we have seen
some monitoring scripts which produced high %sys time due to this contention.

Changes from v2:
- Remove get_pid_task_rcu_user() and get_proc_task_rcu(), since they were
  unnecessary.
- Remove the call to security_task_to_inode().

 fs/proc/base.c | 47 +++++++++++++++++++++++++++++++++--------------
 1 file changed, 33 insertions(+), 14 deletions(-)

Comments

Matthew Wilcox Dec. 12, 2020, 8:55 p.m. UTC | #1
On Thu, Dec 03, 2020 at 04:02:12PM -0800, Stephen Brennan wrote:
> -void pid_update_inode(struct task_struct *task, struct inode *inode)
> +static int do_pid_update_inode(struct task_struct *task, struct inode *inode,
> +			       unsigned int flags)

I'm really nitpicking here, but this function only _updates_ the inode
if flags says it should.  So I was thinking something like this
(compile tested only).

I'd really appreocate feedback from someone like Casey or Stephen on
what they need for their security modules.

diff --git a/fs/proc/base.c b/fs/proc/base.c
index b362523a9829..771f330bfce7 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1968,6 +1968,25 @@ void pid_update_inode(struct task_struct *task, struct inode *inode)
 	security_task_to_inode(task, inode);
 }
 
+/* See if we can avoid the above call.  Assumes RCU lock held */
+static bool inode_needs_pid_update(struct task_struct *task,
+		const struct inode *inode)
+{
+	kuid_t uid;
+	kgid_t gid;
+
+	if (inode->i_mode & (S_ISUID | S_ISGID))
+		return true;
+	task_dump_owner(task, inode->i_mode, &uid, &gid);
+	if (!uid_eq(uid, inode->i_uid) || !gid_eq(gid, inode->i_gid))
+		return true;
+	/*
+	 * XXX: Do we need to call the security system here to see if
+	 * there's a pending update?
+	 */
+	return false;
+}
+
 /*
  * Rewrite the inode's ownerships here because the owning task may have
  * performed a setuid(), etc.
@@ -1978,8 +1997,15 @@ static int pid_revalidate(struct dentry *dentry, unsigned int flags)
 	struct inode *inode;
 	struct task_struct *task;
 
-	if (flags & LOOKUP_RCU)
+	if (flags & LOOKUP_RCU) {
+		inode = d_inode_rcu(dentry);
+		task = pid_task(proc_pid(inode), PIDTYPE_PID);
+		if (!task)
+			return 0;
+		if (!inode_needs_pid_update(task, inode))
+			return 1;
 		return -ECHILD;
+	}
 
 	inode = d_inode(dentry);
 	task = get_proc_task(inode);
Eric W. Biederman Dec. 13, 2020, 2:22 p.m. UTC | #2
Matthew Wilcox <willy@infradead.org> writes:

> On Thu, Dec 03, 2020 at 04:02:12PM -0800, Stephen Brennan wrote:
>> -void pid_update_inode(struct task_struct *task, struct inode *inode)
>> +static int do_pid_update_inode(struct task_struct *task, struct inode *inode,
>> +			       unsigned int flags)
>
> I'm really nitpicking here, but this function only _updates_ the inode
> if flags says it should.  So I was thinking something like this
> (compile tested only).
>
> I'd really appreocate feedback from someone like Casey or Stephen on
> what they need for their security modules.

Just so we don't have security module questions confusing things
can we please make this a 2 patch series?  With the first
patch removing security_task_to_inode?

The justification for the removal is that all security_task_to_inode
appears to care about is the file type bits in inode->i_mode.  Something
that never changes.  Having this in a separate patch would make that
logical change easier to verify.

Eric

>
> diff --git a/fs/proc/base.c b/fs/proc/base.c
> index b362523a9829..771f330bfce7 100644
> --- a/fs/proc/base.c
> +++ b/fs/proc/base.c
> @@ -1968,6 +1968,25 @@ void pid_update_inode(struct task_struct *task, struct inode *inode)
>  	security_task_to_inode(task, inode);
>  }
>  
> +/* See if we can avoid the above call.  Assumes RCU lock held */
> +static bool inode_needs_pid_update(struct task_struct *task,
> +		const struct inode *inode)
> +{
> +	kuid_t uid;
> +	kgid_t gid;
> +
> +	if (inode->i_mode & (S_ISUID | S_ISGID))
> +		return true;
> +	task_dump_owner(task, inode->i_mode, &uid, &gid);
> +	if (!uid_eq(uid, inode->i_uid) || !gid_eq(gid, inode->i_gid))
> +		return true;
> +	/*
> +	 * XXX: Do we need to call the security system here to see if
> +	 * there's a pending update?
> +	 */
> +	return false;
> +}
> +
>  /*
>   * Rewrite the inode's ownerships here because the owning task may have
>   * performed a setuid(), etc.
> @@ -1978,8 +1997,15 @@ static int pid_revalidate(struct dentry *dentry, unsigned int flags)
>  	struct inode *inode;
>  	struct task_struct *task;
>  
> -	if (flags & LOOKUP_RCU)
> +	if (flags & LOOKUP_RCU) {
> +		inode = d_inode_rcu(dentry);
> +		task = pid_task(proc_pid(inode), PIDTYPE_PID);
> +		if (!task)
> +			return 0;
> +		if (!inode_needs_pid_update(task, inode))
> +			return 1;
>  		return -ECHILD;
> +	}
>  
>  	inode = d_inode(dentry);
>  	task = get_proc_task(inode);
Eric W. Biederman Dec. 13, 2020, 2:30 p.m. UTC | #3
Stephen Brennan <stephen.s.brennan@oracle.com> writes:

> The pid_revalidate() function requires dropping from RCU into REF lookup
> mode. When many threads are resolving paths within /proc in parallel,
> this can result in heavy spinlock contention as each thread tries to
> grab a reference to the /proc dentry lock (and drop it shortly
> thereafter).

I am feeling dense at the moment.  Which lock specifically are you
referring to?  The only locks I can thinking of are sleeping locks,
not spinlocks.

> diff --git a/fs/proc/base.c b/fs/proc/base.c
> index ebea9501afb8..833d55a59e20 100644
> --- a/fs/proc/base.c
> +++ b/fs/proc/base.c
> @@ -1830,19 +1846,22 @@ static int pid_revalidate(struct dentry *dentry, unsigned int flags)
>  {
>  	struct inode *inode;
>  	struct task_struct *task;
> +	int rv = 0;
>  
> -	if (flags & LOOKUP_RCU)
> -		return -ECHILD;
> -
> -	inode = d_inode(dentry);
> -	task = get_proc_task(inode);
> -
> -	if (task) {
> -		pid_update_inode(task, inode);
> -		put_task_struct(task);
> -		return 1;
> +	if (flags & LOOKUP_RCU) {

Why do we need to test flags here at all?
Why can't the code simply take an rcu_read_lock unconditionally and just
pass flags into do_pid_update_inode?


> +		inode = d_inode_rcu(dentry);
> +		task = pid_task(proc_pid(inode), PIDTYPE_PID);
> +		if (task)
> +			rv = do_pid_update_inode(task, inode, flags);
> +	} else {
> +		inode = d_inode(dentry);
> +		task = get_proc_task(inode);
> +		if (task) {
> +			rv = do_pid_update_inode(task, inode, flags);
> +			put_task_struct(task);
> +		}

>  	}
> -	return 0;
> +	return rv;
>  }
>  
>  static inline bool proc_inode_is_dead(struct inode *inode)

Eric
Matthew Wilcox Dec. 13, 2020, 4:29 p.m. UTC | #4
On Sun, Dec 13, 2020 at 08:22:32AM -0600, Eric W. Biederman wrote:
> Matthew Wilcox <willy@infradead.org> writes:
> 
> > On Thu, Dec 03, 2020 at 04:02:12PM -0800, Stephen Brennan wrote:
> >> -void pid_update_inode(struct task_struct *task, struct inode *inode)
> >> +static int do_pid_update_inode(struct task_struct *task, struct inode *inode,
> >> +			       unsigned int flags)
> >
> > I'm really nitpicking here, but this function only _updates_ the inode
> > if flags says it should.  So I was thinking something like this
> > (compile tested only).
> >
> > I'd really appreocate feedback from someone like Casey or Stephen on
> > what they need for their security modules.
> 
> Just so we don't have security module questions confusing things
> can we please make this a 2 patch series?  With the first
> patch removing security_task_to_inode?
> 
> The justification for the removal is that all security_task_to_inode
> appears to care about is the file type bits in inode->i_mode.  Something
> that never changes.  Having this in a separate patch would make that
> logical change easier to verify.

I don't think that's right, which is why I keep asking Stephen & Casey
for their thoughts.  For example,

 * Sets the smack pointer in the inode security blob
 */
static void smack_task_to_inode(struct task_struct *p, struct inode *inode)
{
        struct inode_smack *isp = smack_inode(inode);
        struct smack_known *skp = smk_of_task_struct(p);

        isp->smk_inode = skp;
        isp->smk_flags |= SMK_INODE_INSTANT;
}

That seems to do rather more than checking the file type bits.
Matthew Wilcox Dec. 13, 2020, 4:32 p.m. UTC | #5
On Sun, Dec 13, 2020 at 08:30:40AM -0600, Eric W. Biederman wrote:
> Stephen Brennan <stephen.s.brennan@oracle.com> writes:
> 
> > The pid_revalidate() function requires dropping from RCU into REF lookup
> > mode. When many threads are resolving paths within /proc in parallel,
> > this can result in heavy spinlock contention as each thread tries to
> > grab a reference to the /proc dentry lock (and drop it shortly
> > thereafter).
> 
> I am feeling dense at the moment.  Which lock specifically are you
> referring to?  The only locks I can thinking of are sleeping locks,
> not spinlocks.

Stephen may have a better answer than this, but our mutex implementation
spins if the owner is still running, so he may have misspoken slightly.
He's testing on a giant system with hundreds of CPUs, so a mutex is
going to behave like a spinlock for him.

> Why do we need to test flags here at all?
> Why can't the code simply take an rcu_read_lock unconditionally and just
> pass flags into do_pid_update_inode?

Hah!  I was thinking about that possibility this morning, and I was
going to ask you that question.
Paul Moore Dec. 13, 2020, 11 p.m. UTC | #6
On Sun, Dec 13, 2020 at 11:30 AM Matthew Wilcox <willy@infradead.org> wrote:
> On Sun, Dec 13, 2020 at 08:22:32AM -0600, Eric W. Biederman wrote:
> > Matthew Wilcox <willy@infradead.org> writes:
> >
> > > On Thu, Dec 03, 2020 at 04:02:12PM -0800, Stephen Brennan wrote:
> > >> -void pid_update_inode(struct task_struct *task, struct inode *inode)
> > >> +static int do_pid_update_inode(struct task_struct *task, struct inode *inode,
> > >> +                         unsigned int flags)
> > >
> > > I'm really nitpicking here, but this function only _updates_ the inode
> > > if flags says it should.  So I was thinking something like this
> > > (compile tested only).
> > >
> > > I'd really appreocate feedback from someone like Casey or Stephen on
> > > what they need for their security modules.
> >
> > Just so we don't have security module questions confusing things
> > can we please make this a 2 patch series?  With the first
> > patch removing security_task_to_inode?
> >
> > The justification for the removal is that all security_task_to_inode
> > appears to care about is the file type bits in inode->i_mode.  Something
> > that never changes.  Having this in a separate patch would make that
> > logical change easier to verify.
>
> I don't think that's right, which is why I keep asking Stephen & Casey
> for their thoughts.

The SELinux security_task_to_inode() implementation only cares about
inode->i_mode S_IFMT bits from the inode so that we can set the object
class correctly.  The inode's SELinux label is taken from the
associated task.

Casey would need to comment on Smack's needs.

> For example,
>
>  * Sets the smack pointer in the inode security blob
>  */
> static void smack_task_to_inode(struct task_struct *p, struct inode *inode)
> {
>         struct inode_smack *isp = smack_inode(inode);
>         struct smack_known *skp = smk_of_task_struct(p);
>
>         isp->smk_inode = skp;
>         isp->smk_flags |= SMK_INODE_INSTANT;
> }
>
> That seems to do rather more than checking the file type bits.
Stephen Brennan Dec. 14, 2020, 5:19 p.m. UTC | #7
ebiederm@xmission.com (Eric W. Biederman) writes:

> Stephen Brennan <stephen.s.brennan@oracle.com> writes:
>
>> The pid_revalidate() function requires dropping from RCU into REF lookup
>> mode. When many threads are resolving paths within /proc in parallel,
>> this can result in heavy spinlock contention as each thread tries to
>> grab a reference to the /proc dentry lock (and drop it shortly
>> thereafter).
>
> I am feeling dense at the moment.  Which lock specifically are you
> referring to?  The only locks I can thinking of are sleeping locks,
> not spinlocks.

The lock in question is the d_lockref field (aliased as d_lock) of
struct dentry. It is contended in this code path while processing the
"/proc" dentry, switching from RCU to REF mode.

    walk_component()
      lookup_fast()
        d_revalidate()
          pid_revalidate() // returns -ECHILD
        unlazy_child()
          lockref_get_not_dead(&nd->path.dentry->d_lockref)

>
>> diff --git a/fs/proc/base.c b/fs/proc/base.c
>> index ebea9501afb8..833d55a59e20 100644
>> --- a/fs/proc/base.c
>> +++ b/fs/proc/base.c
>> @@ -1830,19 +1846,22 @@ static int pid_revalidate(struct dentry *dentry, unsigned int flags)
>>  {
>>  	struct inode *inode;
>>  	struct task_struct *task;
>> +	int rv = 0;
>>  
>> -	if (flags & LOOKUP_RCU)
>> -		return -ECHILD;
>> -
>> -	inode = d_inode(dentry);
>> -	task = get_proc_task(inode);
>> -
>> -	if (task) {
>> -		pid_update_inode(task, inode);
>> -		put_task_struct(task);
>> -		return 1;
>> +	if (flags & LOOKUP_RCU) {
>
> Why do we need to test flags here at all?
> Why can't the code simply take an rcu_read_lock unconditionally and just
> pass flags into do_pid_update_inode?
>

I don't have any good reason. If it is safe to update the inode without
holding a reference to the task struct (or holding any other lock) then
I can consolidate the whole conditional.

>
>> +		inode = d_inode_rcu(dentry);
>> +		task = pid_task(proc_pid(inode), PIDTYPE_PID);
>> +		if (task)
>> +			rv = do_pid_update_inode(task, inode, flags);
>> +	} else {
>> +		inode = d_inode(dentry);
>> +		task = get_proc_task(inode);
>> +		if (task) {
>> +			rv = do_pid_update_inode(task, inode, flags);
>> +			put_task_struct(task);
>> +		}
>
>>  	}
>> -	return 0;
>> +	return rv;
>>  }
>>  
>>  static inline bool proc_inode_is_dead(struct inode *inode)
>
> Eric
Stephen Brennan Dec. 14, 2020, 6:15 p.m. UTC | #8
ebiederm@xmission.com (Eric W. Biederman) writes:

> Matthew Wilcox <willy@infradead.org> writes:
>
>> On Thu, Dec 03, 2020 at 04:02:12PM -0800, Stephen Brennan wrote:
>>> -void pid_update_inode(struct task_struct *task, struct inode *inode)
>>> +static int do_pid_update_inode(struct task_struct *task, struct inode *inode,
>>> +			       unsigned int flags)
>>
>> I'm really nitpicking here, but this function only _updates_ the inode
>> if flags says it should.  So I was thinking something like this
>> (compile tested only).
>>
>> I'd really appreocate feedback from someone like Casey or Stephen on
>> what they need for their security modules.
>
> Just so we don't have security module questions confusing things
> can we please make this a 2 patch series?  With the first
> patch removing security_task_to_inode?
>
> The justification for the removal is that all security_task_to_inode
> appears to care about is the file type bits in inode->i_mode.  Something
> that never changes.  Having this in a separate patch would make that
> logical change easier to verify.
>

I'll gladly split that out in v3 so we can continue the discussion
there.

I'll also include some changes with Matthew's suggestion of
inode_needs_pid_update(). This in combination with your suggestion to do
fewer flag checks in pid_revalidate() should cleanup the code a fair bit.

Stephen

> Eric
>
>>
>> diff --git a/fs/proc/base.c b/fs/proc/base.c
>> index b362523a9829..771f330bfce7 100644
>> --- a/fs/proc/base.c
>> +++ b/fs/proc/base.c
>> @@ -1968,6 +1968,25 @@ void pid_update_inode(struct task_struct *task, struct inode *inode)
>>  	security_task_to_inode(task, inode);
>>  }
>>  
>> +/* See if we can avoid the above call.  Assumes RCU lock held */
>> +static bool inode_needs_pid_update(struct task_struct *task,
>> +		const struct inode *inode)
>> +{
>> +	kuid_t uid;
>> +	kgid_t gid;
>> +
>> +	if (inode->i_mode & (S_ISUID | S_ISGID))
>> +		return true;
>> +	task_dump_owner(task, inode->i_mode, &uid, &gid);
>> +	if (!uid_eq(uid, inode->i_uid) || !gid_eq(gid, inode->i_gid))
>> +		return true;
>> +	/*
>> +	 * XXX: Do we need to call the security system here to see if
>> +	 * there's a pending update?
>> +	 */
>> +	return false;
>> +}
>> +
>>  /*
>>   * Rewrite the inode's ownerships here because the owning task may have
>>   * performed a setuid(), etc.
>> @@ -1978,8 +1997,15 @@ static int pid_revalidate(struct dentry *dentry, unsigned int flags)
>>  	struct inode *inode;
>>  	struct task_struct *task;
>>  
>> -	if (flags & LOOKUP_RCU)
>> +	if (flags & LOOKUP_RCU) {
>> +		inode = d_inode_rcu(dentry);
>> +		task = pid_task(proc_pid(inode), PIDTYPE_PID);
>> +		if (!task)
>> +			return 0;
>> +		if (!inode_needs_pid_update(task, inode))
>> +			return 1;
>>  		return -ECHILD;
>> +	}
>>  
>>  	inode = d_inode(dentry);
>>  	task = get_proc_task(inode);
Casey Schaufler Dec. 14, 2020, 6:45 p.m. UTC | #9
On 12/13/2020 8:29 AM, Matthew Wilcox wrote:
> On Sun, Dec 13, 2020 at 08:22:32AM -0600, Eric W. Biederman wrote:
>> Matthew Wilcox <willy@infradead.org> writes:
>>
>>> On Thu, Dec 03, 2020 at 04:02:12PM -0800, Stephen Brennan wrote:
>>>> -void pid_update_inode(struct task_struct *task, struct inode *inode)
>>>> +static int do_pid_update_inode(struct task_struct *task, struct inode *inode,
>>>> +			       unsigned int flags)
>>> I'm really nitpicking here, but this function only _updates_ the inode
>>> if flags says it should.  So I was thinking something like this
>>> (compile tested only).
>>>
>>> I'd really appreocate feedback from someone like Casey or Stephen on
>>> what they need for their security modules.
>> Just so we don't have security module questions confusing things
>> can we please make this a 2 patch series?  With the first
>> patch removing security_task_to_inode?
>>
>> The justification for the removal is that all security_task_to_inode
>> appears to care about is the file type bits in inode->i_mode.  Something
>> that never changes.  Having this in a separate patch would make that
>> logical change easier to verify.
> I don't think that's right, which is why I keep asking Stephen & Casey
> for their thoughts.  For example,
>
>  * Sets the smack pointer in the inode security blob
>  */
> static void smack_task_to_inode(struct task_struct *p, struct inode *inode)
> {
>         struct inode_smack *isp = smack_inode(inode);
>         struct smack_known *skp = smk_of_task_struct(p);
>
>         isp->smk_inode = skp;
>         isp->smk_flags |= SMK_INODE_INSTANT;
> }
>
> That seems to do rather more than checking the file type bits.

I'm going to have to bring myself up to speed on the
discussion before I say anything dumb. I'm supposed to
be Not! Working! today. I will get on it as permitted.
Casey Schaufler Dec. 15, 2020, 6:09 p.m. UTC | #10
On 12/13/2020 3:00 PM, Paul Moore wrote:
> On Sun, Dec 13, 2020 at 11:30 AM Matthew Wilcox <willy@infradead.org> wrote:
>> On Sun, Dec 13, 2020 at 08:22:32AM -0600, Eric W. Biederman wrote:
>>> Matthew Wilcox <willy@infradead.org> writes:
>>>
>>>> On Thu, Dec 03, 2020 at 04:02:12PM -0800, Stephen Brennan wrote:
>>>>> -void pid_update_inode(struct task_struct *task, struct inode *inode)
>>>>> +static int do_pid_update_inode(struct task_struct *task, struct inode *inode,
>>>>> +                         unsigned int flags)
>>>> I'm really nitpicking here, but this function only _updates_ the inode
>>>> if flags says it should.  So I was thinking something like this
>>>> (compile tested only).
>>>>
>>>> I'd really appreocate feedback from someone like Casey or Stephen on
>>>> what they need for their security modules.
>>> Just so we don't have security module questions confusing things
>>> can we please make this a 2 patch series?  With the first
>>> patch removing security_task_to_inode?
>>>
>>> The justification for the removal is that all security_task_to_inode
>>> appears to care about is the file type bits in inode->i_mode.  Something
>>> that never changes.  Having this in a separate patch would make that
>>> logical change easier to verify.
>> I don't think that's right, which is why I keep asking Stephen & Casey
>> for their thoughts.
> The SELinux security_task_to_inode() implementation only cares about
> inode->i_mode S_IFMT bits from the inode so that we can set the object
> class correctly.  The inode's SELinux label is taken from the
> associated task.
>
> Casey would need to comment on Smack's needs.

SELinux uses different "class"es on subjects and objects.
Smack does not differentiate, so knows the label it wants
the inode to have when smack_task_to_inode() is called,
and sets it accordingly. Nothing is allocated in the process,
and the new value is coming from the Smack master label list.
It isn't going to go away. It appears that this is the point
of the hook. Am I missing something?

>
>> For example,
>>
>>  * Sets the smack pointer in the inode security blob
>>  */
>> static void smack_task_to_inode(struct task_struct *p, struct inode *inode)
>> {
>>         struct inode_smack *isp = smack_inode(inode);
>>         struct smack_known *skp = smk_of_task_struct(p);
>>
>>         isp->smk_inode = skp;
>>         isp->smk_flags |= SMK_INODE_INSTANT;
>> }
>>
>> That seems to do rather more than checking the file type bits.
Eric W. Biederman Dec. 15, 2020, 9:45 p.m. UTC | #11
Stephen Brennan <stephen.s.brennan@oracle.com> writes:

> ebiederm@xmission.com (Eric W. Biederman) writes:
>
>> Stephen Brennan <stephen.s.brennan@oracle.com> writes:
>>
>>> The pid_revalidate() function requires dropping from RCU into REF lookup
>>> mode. When many threads are resolving paths within /proc in parallel,
>>> this can result in heavy spinlock contention as each thread tries to
>>> grab a reference to the /proc dentry lock (and drop it shortly
>>> thereafter).
>>
>> I am feeling dense at the moment.  Which lock specifically are you
>> referring to?  The only locks I can thinking of are sleeping locks,
>> not spinlocks.
>
> The lock in question is the d_lockref field (aliased as d_lock) of
> struct dentry. It is contended in this code path while processing the
> "/proc" dentry, switching from RCU to REF mode.
>
>     walk_component()
>       lookup_fast()
>         d_revalidate()
>           pid_revalidate() // returns -ECHILD
>         unlazy_child()
>           lockref_get_not_dead(&nd->path.dentry->d_lockref)
>
>>
>>> diff --git a/fs/proc/base.c b/fs/proc/base.c
>>> index ebea9501afb8..833d55a59e20 100644
>>> --- a/fs/proc/base.c
>>> +++ b/fs/proc/base.c
>>> @@ -1830,19 +1846,22 @@ static int pid_revalidate(struct dentry *dentry, unsigned int flags)
>>>  {
>>>  	struct inode *inode;
>>>  	struct task_struct *task;
>>> +	int rv = 0;
>>>  
>>> -	if (flags & LOOKUP_RCU)
>>> -		return -ECHILD;
>>> -
>>> -	inode = d_inode(dentry);
>>> -	task = get_proc_task(inode);
>>> -
>>> -	if (task) {
>>> -		pid_update_inode(task, inode);
>>> -		put_task_struct(task);
>>> -		return 1;
>>> +	if (flags & LOOKUP_RCU) {
>>
>> Why do we need to test flags here at all?
>> Why can't the code simply take an rcu_read_lock unconditionally and just
>> pass flags into do_pid_update_inode?
>>
>
> I don't have any good reason. If it is safe to update the inode without
> holding a reference to the task struct (or holding any other lock) then
> I can consolidate the whole conditional.


I am not certain if there is anything that makes it safe to change uid
and gid on the inode during lookup.  The current code might be buggy in
that respect.

However it is absoltely safe to read from the task_struct with just the
rcu_read_lock.  Which means it is only do_pid_update_inode that needs
locking to update the inode.

>>> +		inode = d_inode_rcu(dentry);
>>> +		task = pid_task(proc_pid(inode), PIDTYPE_PID);
>>> +		if (task)
>>> +			rv = do_pid_update_inode(task, inode, flags);
>>> +	} else {
>>> +		inode = d_inode(dentry);
>>> +		task = get_proc_task(inode);
>>> +		if (task) {
>>> +			rv = do_pid_update_inode(task, inode, flags);
>>> +			put_task_struct(task);
>>> +		}
>>
>>>  	}
>>> -	return 0;
>>> +	return rv;
>>>  }
>>>  
>>>  static inline bool proc_inode_is_dead(struct inode *inode)
>>
>> Eric

Eric
Eric W. Biederman Dec. 15, 2020, 10:04 p.m. UTC | #12
Casey Schaufler <casey@schaufler-ca.com> writes:

> On 12/13/2020 3:00 PM, Paul Moore wrote:
>> On Sun, Dec 13, 2020 at 11:30 AM Matthew Wilcox <willy@infradead.org> wrote:
>>> On Sun, Dec 13, 2020 at 08:22:32AM -0600, Eric W. Biederman wrote:
>>>> Matthew Wilcox <willy@infradead.org> writes:
>>>>
>>>>> On Thu, Dec 03, 2020 at 04:02:12PM -0800, Stephen Brennan wrote:
>>>>>> -void pid_update_inode(struct task_struct *task, struct inode *inode)
>>>>>> +static int do_pid_update_inode(struct task_struct *task, struct inode *inode,
>>>>>> +                         unsigned int flags)
>>>>> I'm really nitpicking here, but this function only _updates_ the inode
>>>>> if flags says it should.  So I was thinking something like this
>>>>> (compile tested only).
>>>>>
>>>>> I'd really appreocate feedback from someone like Casey or Stephen on
>>>>> what they need for their security modules.
>>>> Just so we don't have security module questions confusing things
>>>> can we please make this a 2 patch series?  With the first
>>>> patch removing security_task_to_inode?
>>>>
>>>> The justification for the removal is that all security_task_to_inode
>>>> appears to care about is the file type bits in inode->i_mode.  Something
>>>> that never changes.  Having this in a separate patch would make that
>>>> logical change easier to verify.
>>> I don't think that's right, which is why I keep asking Stephen & Casey
>>> for their thoughts.
>> The SELinux security_task_to_inode() implementation only cares about
>> inode->i_mode S_IFMT bits from the inode so that we can set the object
>> class correctly.  The inode's SELinux label is taken from the
>> associated task.
>>
>> Casey would need to comment on Smack's needs.
>
> SELinux uses different "class"es on subjects and objects.
> Smack does not differentiate, so knows the label it wants
> the inode to have when smack_task_to_inode() is called,
> and sets it accordingly. Nothing is allocated in the process,
> and the new value is coming from the Smack master label list.
> It isn't going to go away. It appears that this is the point
> of the hook. Am I missing something?

security_task_to_inode (strangely named as this is proc specific) is
currently called both when the inode is initialized in proc and when
pid_revalidate is called and the uid and gid of the proc inode
are updated to match the traced task.

I am suggesting that the call of security_task_to_inode in
pid_revalidate be removed as neither of the two implementations of this
security hook smack nor selinux care of the uid or gid changes.

Removal of the security check will allow proc to be accessed in rcu look
mode.  AKA give proc go faster stripes.

The two implementations are:

static void selinux_task_to_inode(struct task_struct *p,
				  struct inode *inode)
{
	struct inode_security_struct *isec = selinux_inode(inode);
	u32 sid = task_sid(p);

	spin_lock(&isec->lock);
	isec->sclass = inode_mode_to_security_class(inode->i_mode);
	isec->sid = sid;
	isec->initialized = LABEL_INITIALIZED;
	spin_unlock(&isec->lock);
}


static void smack_task_to_inode(struct task_struct *p, struct inode *inode)
{
	struct inode_smack *isp = smack_inode(inode);
	struct smack_known *skp = smk_of_task_struct(p);

	isp->smk_inode = skp;
	isp->smk_flags |= SMK_INODE_INSTANT;
}

I see two questions gating the safe removal of the call of
security_task_to_inode from pid_revalidate.

1) Does any of this code care about uids or gids.
   It appears the answer is no from a quick inspection of the code.

2) Does smack_task_to_inode need to be called after exec?
   - Exec especially suid exec changes the the cred on a task.
   - Execing of a non-leader thread changes the thread_pid of a task
     so that it is the pid of the entire thread group.

   If either of those are significant perhaps we can limit calling
   security_task_to_inode if task->self_exec_id is different.

   I haven't yet take the time to trace through and see if
   task_sid(p) or smk_of_task_struct(p) could change based on
   the security hooks called during exec.  Or how bad the races are if
   such a change can happen.

Does that clarify the question that is being asked?

Eric
Casey Schaufler Dec. 15, 2020, 10:53 p.m. UTC | #13
On 12/15/2020 2:04 PM, Eric W. Biederman wrote:
> Casey Schaufler <casey@schaufler-ca.com> writes:
>
>> On 12/13/2020 3:00 PM, Paul Moore wrote:
>>> On Sun, Dec 13, 2020 at 11:30 AM Matthew Wilcox <willy@infradead.org> wrote:
>>>> On Sun, Dec 13, 2020 at 08:22:32AM -0600, Eric W. Biederman wrote:
>>>>> Matthew Wilcox <willy@infradead.org> writes:
>>>>>
>>>>>> On Thu, Dec 03, 2020 at 04:02:12PM -0800, Stephen Brennan wrote:
>>>>>>> -void pid_update_inode(struct task_struct *task, struct inode *inode)
>>>>>>> +static int do_pid_update_inode(struct task_struct *task, struct inode *inode,
>>>>>>> +                         unsigned int flags)
>>>>>> I'm really nitpicking here, but this function only _updates_ the inode
>>>>>> if flags says it should.  So I was thinking something like this
>>>>>> (compile tested only).
>>>>>>
>>>>>> I'd really appreocate feedback from someone like Casey or Stephen on
>>>>>> what they need for their security modules.
>>>>> Just so we don't have security module questions confusing things
>>>>> can we please make this a 2 patch series?  With the first
>>>>> patch removing security_task_to_inode?
>>>>>
>>>>> The justification for the removal is that all security_task_to_inode
>>>>> appears to care about is the file type bits in inode->i_mode.  Something
>>>>> that never changes.  Having this in a separate patch would make that
>>>>> logical change easier to verify.
>>>> I don't think that's right, which is why I keep asking Stephen & Casey
>>>> for their thoughts.
>>> The SELinux security_task_to_inode() implementation only cares about
>>> inode->i_mode S_IFMT bits from the inode so that we can set the object
>>> class correctly.  The inode's SELinux label is taken from the
>>> associated task.
>>>
>>> Casey would need to comment on Smack's needs.
>> SELinux uses different "class"es on subjects and objects.
>> Smack does not differentiate, so knows the label it wants
>> the inode to have when smack_task_to_inode() is called,
>> and sets it accordingly. Nothing is allocated in the process,
>> and the new value is coming from the Smack master label list.
>> It isn't going to go away. It appears that this is the point
>> of the hook. Am I missing something?
> security_task_to_inode (strangely named as this is proc specific) is
> currently called both when the inode is initialized in proc and when
> pid_revalidate is called and the uid and gid of the proc inode
> are updated to match the traced task.
>
> I am suggesting that the call of security_task_to_inode in
> pid_revalidate be removed as neither of the two implementations of this
> security hook smack nor selinux care of the uid or gid changes.

If you're sure that the only case where pid_revalidate() would matter
is for the uid/gid cases that would be OK.

>
> Removal of the security check will allow proc to be accessed in rcu look
> mode.  AKA give proc go faster stripes.
>
> The two implementations are:
>
> static void selinux_task_to_inode(struct task_struct *p,
> 				  struct inode *inode)
> {
> 	struct inode_security_struct *isec = selinux_inode(inode);
> 	u32 sid = task_sid(p);
>
> 	spin_lock(&isec->lock);
> 	isec->sclass = inode_mode_to_security_class(inode->i_mode);
> 	isec->sid = sid;
> 	isec->initialized = LABEL_INITIALIZED;
> 	spin_unlock(&isec->lock);
> }
>
>
> static void smack_task_to_inode(struct task_struct *p, struct inode *inode)
> {
> 	struct inode_smack *isp = smack_inode(inode);
> 	struct smack_known *skp = smk_of_task_struct(p);
>
> 	isp->smk_inode = skp;
> 	isp->smk_flags |= SMK_INODE_INSTANT;
> }
>
> I see two questions gating the safe removal of the call of
> security_task_to_inode from pid_revalidate.
>
> 1) Does any of this code care about uids or gids.
>    It appears the answer is no from a quick inspection of the code.

It looks that way.

>
> 2) Does smack_task_to_inode need to be called after exec?
>    - Exec especially suid exec changes the the cred on a task.
>    - Execing of a non-leader thread changes the thread_pid of a task
>      so that it is the pid of the entire thread group.

I think so. If SMACK64EXEC is set on a binary the label will
be changed on exec. The /proc inode Smack label would need to
be changed.

>
>    If either of those are significant perhaps we can limit calling
>    security_task_to_inode if task->self_exec_id is different.
>
>    I haven't yet take the time to trace through and see if
>    task_sid(p) or smk_of_task_struct(p) could change based on
>    the security hooks called during exec.  Or how bad the races are if
>    such a change can happen.
>
> Does that clarify the question that is being asked?
>
> Eric
Stephen Brennan Dec. 16, 2020, 1:05 a.m. UTC | #14
Casey Schaufler <casey@schaufler-ca.com> writes:

> On 12/15/2020 2:04 PM, Eric W. Biederman wrote:
>> Casey Schaufler <casey@schaufler-ca.com> writes:
>>
>>> On 12/13/2020 3:00 PM, Paul Moore wrote:
>>>> On Sun, Dec 13, 2020 at 11:30 AM Matthew Wilcox <willy@infradead.org> wrote:
>>>>> On Sun, Dec 13, 2020 at 08:22:32AM -0600, Eric W. Biederman wrote:
>>>>>> Matthew Wilcox <willy@infradead.org> writes:
>>>>>>
>>>>>>> On Thu, Dec 03, 2020 at 04:02:12PM -0800, Stephen Brennan wrote:
>>>>>>>> -void pid_update_inode(struct task_struct *task, struct inode *inode)
>>>>>>>> +static int do_pid_update_inode(struct task_struct *task, struct inode *inode,
>>>>>>>> +                         unsigned int flags)
>>>>>>> I'm really nitpicking here, but this function only _updates_ the inode
>>>>>>> if flags says it should.  So I was thinking something like this
>>>>>>> (compile tested only).
>>>>>>>
>>>>>>> I'd really appreocate feedback from someone like Casey or Stephen on
>>>>>>> what they need for their security modules.
>>>>>> Just so we don't have security module questions confusing things
>>>>>> can we please make this a 2 patch series?  With the first
>>>>>> patch removing security_task_to_inode?
>>>>>>
>>>>>> The justification for the removal is that all security_task_to_inode
>>>>>> appears to care about is the file type bits in inode->i_mode.  Something
>>>>>> that never changes.  Having this in a separate patch would make that
>>>>>> logical change easier to verify.
>>>>> I don't think that's right, which is why I keep asking Stephen & Casey
>>>>> for their thoughts.
>>>> The SELinux security_task_to_inode() implementation only cares about
>>>> inode->i_mode S_IFMT bits from the inode so that we can set the object
>>>> class correctly.  The inode's SELinux label is taken from the
>>>> associated task.
>>>>
>>>> Casey would need to comment on Smack's needs.
>>> SELinux uses different "class"es on subjects and objects.
>>> Smack does not differentiate, so knows the label it wants
>>> the inode to have when smack_task_to_inode() is called,
>>> and sets it accordingly. Nothing is allocated in the process,
>>> and the new value is coming from the Smack master label list.
>>> It isn't going to go away. It appears that this is the point
>>> of the hook. Am I missing something?
>> security_task_to_inode (strangely named as this is proc specific) is
>> currently called both when the inode is initialized in proc and when
>> pid_revalidate is called and the uid and gid of the proc inode
>> are updated to match the traced task.
>>
>> I am suggesting that the call of security_task_to_inode in
>> pid_revalidate be removed as neither of the two implementations of this
>> security hook smack nor selinux care of the uid or gid changes.
>
> If you're sure that the only case where pid_revalidate() would matter
> is for the uid/gid cases that would be OK.
>
>>
>> Removal of the security check will allow proc to be accessed in rcu look
>> mode.  AKA give proc go faster stripes.
>>
>> The two implementations are:
>>
>> static void selinux_task_to_inode(struct task_struct *p,
>> 				  struct inode *inode)
>> {
>> 	struct inode_security_struct *isec = selinux_inode(inode);
>> 	u32 sid = task_sid(p);
>>
>> 	spin_lock(&isec->lock);
>> 	isec->sclass = inode_mode_to_security_class(inode->i_mode);
>> 	isec->sid = sid;
>> 	isec->initialized = LABEL_INITIALIZED;
>> 	spin_unlock(&isec->lock);
>> }
>>
>>
>> static void smack_task_to_inode(struct task_struct *p, struct inode *inode)
>> {
>> 	struct inode_smack *isp = smack_inode(inode);
>> 	struct smack_known *skp = smk_of_task_struct(p);
>>
>> 	isp->smk_inode = skp;
>> 	isp->smk_flags |= SMK_INODE_INSTANT;
>> }
>>
>> I see two questions gating the safe removal of the call of
>> security_task_to_inode from pid_revalidate.
>>
>> 1) Does any of this code care about uids or gids.
>>    It appears the answer is no from a quick inspection of the code.
>
> It looks that way.
>
>>
>> 2) Does smack_task_to_inode need to be called after exec?
>>    - Exec especially suid exec changes the the cred on a task.
>>    - Execing of a non-leader thread changes the thread_pid of a task
>>      so that it is the pid of the entire thread group.
>
> I think so. If SMACK64EXEC is set on a binary the label will
> be changed on exec. The /proc inode Smack label would need to
> be changed.
>
>>
>>    If either of those are significant perhaps we can limit calling
>>    security_task_to_inode if task->self_exec_id is different.

Given these answers then, it seems like a proper implementation would
leave the security_task_to_inode() call in pid_update_inode(). Then,
pid_revalidate() would drop out of RCU mode whenever some function like
this (drawing on Matthew's idea above) returns true:

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 449204e9f749..02805076c42b 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1820,6 +1820,26 @@ void pid_update_inode(struct task_struct *task, struct inode *inode)
 	inode->i_mode &= ~(S_ISUID | S_ISGID);
 }
 
+/* See if we can avoid the above call. Assumes RCU lock held */
+static bool pid_inode_needs_update(struct task_struct *task, struct inode *inode)
+{
+	kuid_t uid;
+	kgid_t gid;
+	u32 exec_id, last_exec_id;
+
+	if (inode->i_mode & (S_ISUID | S_ISGID))
+		return true;
+	task_dump_owner(task, inode->i_mode, &uid, &gid);
+	if (!uid_eq(uid, inode->i_uid) || !gid_eq(gid, inode->i_gid))
+		return true;
+
+	last_exec_id = /* find this stored somewhere? */;
+	task_lock(task);
+	exec_id = task->self_exec_id;
+	task_unlock(task);
+	return exec_id != last_exec_id;
+}
+
 /*
  * Rewrite the inode's ownerships here because the owning task may have
  * performed a setuid(), etc.


Does this make sense?

Stephen

>>
>>    I haven't yet take the time to trace through and see if
>>    task_sid(p) or smk_of_task_struct(p) could change based on
>>    the security hooks called during exec.  Or how bad the races are if
>>    such a change can happen.
>>
>> Does that clarify the question that is being asked?
>>
>> Eric
diff mbox series

Patch

diff --git a/fs/proc/base.c b/fs/proc/base.c
index ebea9501afb8..833d55a59e20 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1813,12 +1813,28 @@  int pid_getattr(const struct path *path, struct kstat *stat,
 /*
  * Set <pid>/... inode ownership (can change due to setuid(), etc.)
  */
-void pid_update_inode(struct task_struct *task, struct inode *inode)
+static int do_pid_update_inode(struct task_struct *task, struct inode *inode,
+			       unsigned int flags)
 {
-	task_dump_owner(task, inode->i_mode, &inode->i_uid, &inode->i_gid);
+	kuid_t uid;
+	kgid_t gid;
+
+	task_dump_owner(task, inode->i_mode, &uid, &gid);
+	if (uid_eq(uid, inode->i_uid) && gid_eq(gid, inode->i_gid) &&
+			!(inode->i_mode & (S_ISUID | S_ISGID)))
+		return 1;
+	if (flags & LOOKUP_RCU)
+		return -ECHILD;
 
+	inode->i_uid = uid;
+	inode->i_gid = gid;
 	inode->i_mode &= ~(S_ISUID | S_ISGID);
-	security_task_to_inode(task, inode);
+	return 1;
+}
+
+void pid_update_inode(struct task_struct *task, struct inode *inode)
+{
+	do_pid_update_inode(task, inode, 0);
 }
 
 /*
@@ -1830,19 +1846,22 @@  static int pid_revalidate(struct dentry *dentry, unsigned int flags)
 {
 	struct inode *inode;
 	struct task_struct *task;
+	int rv = 0;
 
-	if (flags & LOOKUP_RCU)
-		return -ECHILD;
-
-	inode = d_inode(dentry);
-	task = get_proc_task(inode);
-
-	if (task) {
-		pid_update_inode(task, inode);
-		put_task_struct(task);
-		return 1;
+	if (flags & LOOKUP_RCU) {
+		inode = d_inode_rcu(dentry);
+		task = pid_task(proc_pid(inode), PIDTYPE_PID);
+		if (task)
+			rv = do_pid_update_inode(task, inode, flags);
+	} else {
+		inode = d_inode(dentry);
+		task = get_proc_task(inode);
+		if (task) {
+			rv = do_pid_update_inode(task, inode, flags);
+			put_task_struct(task);
+		}
 	}
-	return 0;
+	return rv;
 }
 
 static inline bool proc_inode_is_dead(struct inode *inode)