diff mbox series

[v2,4/7] tracefs: dentry lookup crapectomy

Message ID 20240131185512.799813912@goodmis.org (mailing list archive)
State Superseded
Headers show
Series eventfs: Rewrite to simplify the code (aka: crapectomy) | expand

Commit Message

Steven Rostedt Jan. 31, 2024, 6:49 p.m. UTC
From: Linus Torvalds <torvalds@linux-foundation.org>

The dentry lookup for eventfs files was very broken, and had lots of
signs of the old situation where the filesystem names were all created
statically in the dentry tree, rather than being looked up dynamically
based on the eventfs data structures.

You could see it in the naming - how it claimed to "create" dentries
rather than just look up the dentries that were given it.

You could see it in various nonsensical and very incorrect operations,
like using "simple_lookup()" on the dentries that were passed in, which
only results in those dentries becoming negative dentries.  Which meant
that any other lookup would possibly return ENOENT if it saw that
negative dentry before the data was then later filled in.

You could see it in the immense amount of nonsensical code that didn't
actually just do lookups.

Link: https://lore.kernel.org/linux-trace-kernel/202401291043.e62e89dc-oliver.sang@intel.com/

Cc: stable@vger.kernel.org
Fixes: c1504e510238 ("eventfs: Implement eventfs dir creation functions")
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
Changes since v1: https://lore.kernel.org/linux-trace-kernel/20240130190355.11486-3-torvalds@linux-foundation.org

- Fixed the lookup case of not found dentry, to return an error.
  This was added in a later patch when it should have been in this one.

- Removed the calls to eventfs_{start,end,failed}_creating()

 fs/tracefs/event_inode.c | 285 ++++++++-------------------------------
 fs/tracefs/inode.c       |  69 ----------
 fs/tracefs/internal.h    |   3 -
 3 files changed, 58 insertions(+), 299 deletions(-)

Comments

Al Viro Feb. 1, 2024, 12:27 a.m. UTC | #1
On Wed, Jan 31, 2024 at 01:49:22PM -0500, Steven Rostedt wrote:

> @@ -329,32 +320,29 @@ static struct dentry *create_file(const char *name, umode_t mode,
>  
>  	ti = get_tracefs(inode);
>  	ti->flags |= TRACEFS_EVENT_INODE;
> -	d_instantiate(dentry, inode);
> +
> +	d_add(dentry, inode);
>  	fsnotify_create(dentry->d_parent->d_inode, dentry);

Seriously?  stat(2), have it go away from dcache on memory pressure,
lather, rinse, repeat...  Won't *snotify get confused by the stream
of creations of the same thing, with not a removal in sight?

> -	return eventfs_end_creating(dentry);
> +	return dentry;
>  };
>  
>  /**
> - * create_dir - create a dir in the tracefs filesystem
> + * lookup_dir_entry - look up a dir in the tracefs filesystem
> + * @dentry: the directory to look up
>   * @ei: the eventfs_inode that represents the directory to create
> - * @parent: parent dentry for this file.
>   *
> - * This function will create a dentry for a directory represented by
> + * This function will look up a dentry for a directory represented by
>   * a eventfs_inode.
>   */
> -static struct dentry *create_dir(struct eventfs_inode *ei, struct dentry *parent)
> +static struct dentry *lookup_dir_entry(struct dentry *dentry,
> +	struct eventfs_inode *pei, struct eventfs_inode *ei)
>  {
>  	struct tracefs_inode *ti;
> -	struct dentry *dentry;
>  	struct inode *inode;
>  
> -	dentry = eventfs_start_creating(ei->name, parent);
> -	if (IS_ERR(dentry))
> -		return dentry;
> -
>  	inode = tracefs_get_inode(dentry->d_sb);
>  	if (unlikely(!inode))
> -		return eventfs_failed_creating(dentry);
> +		return ERR_PTR(-ENOMEM);
>  
>  	/* If the user updated the directory's attributes, use them */
>  	update_inode_attr(dentry, inode, &ei->attr,
> @@ -371,11 +359,14 @@ static struct dentry *create_dir(struct eventfs_inode *ei, struct dentry *parent
>  	/* Only directories have ti->private set to an ei, not files */
>  	ti->private = ei;
>  
> +	dentry->d_fsdata = ei;
> +        ei->dentry = dentry;	// Remove me!
> +
>  	inc_nlink(inode);
> -	d_instantiate(dentry, inode);
> +	d_add(dentry, inode);
>  	inc_nlink(dentry->d_parent->d_inode);

What will happen when that thing gets evicted from dcache,
gets looked up again, and again, and...?

>  	fsnotify_mkdir(dentry->d_parent->d_inode, dentry);

Same re snotify confusion...

> -	return eventfs_end_creating(dentry);
> +	return dentry;
>  }
>  
>  static void free_ei(struct eventfs_inode *ei)
> @@ -425,7 +416,7 @@ void eventfs_set_ei_status_free(struct tracefs_inode *ti, struct dentry *dentry)
>  }
>  
>  /**
> - * create_file_dentry - create a dentry for a file of an eventfs_inode
> + * lookup_file_dentry - create a dentry for a file of an eventfs_inode
>   * @ei: the eventfs_inode that the file will be created under
>   * @idx: the index into the d_children[] of the @ei
>   * @parent: The parent dentry of the created file.
> @@ -438,157 +429,21 @@ void eventfs_set_ei_status_free(struct tracefs_inode *ti, struct dentry *dentry)
>   * address located at @e_dentry.
>   */
>  static struct dentry *
> -create_file_dentry(struct eventfs_inode *ei, int idx,
> -		   struct dentry *parent, const char *name, umode_t mode, void *data,
> +lookup_file_dentry(struct dentry *dentry,
> +		   struct eventfs_inode *ei, int idx,
> +		   umode_t mode, void *data,
>  		   const struct file_operations *fops)
>  {
>  	struct eventfs_attr *attr = NULL;
>  	struct dentry **e_dentry = &ei->d_children[idx];
> -	struct dentry *dentry;
> -
> -	WARN_ON_ONCE(!inode_is_locked(parent->d_inode));
>  
> -	mutex_lock(&eventfs_mutex);
> -	if (ei->is_freed) {
> -		mutex_unlock(&eventfs_mutex);
> -		return NULL;
> -	}
> -	/* If the e_dentry already has a dentry, use it */
> -	if (*e_dentry) {
> -		dget(*e_dentry);
> -		mutex_unlock(&eventfs_mutex);
> -		return *e_dentry;
> -	}
> -
> -	/* ei->entry_attrs are protected by SRCU */
>  	if (ei->entry_attrs)
>  		attr = &ei->entry_attrs[idx];
>  
> -	mutex_unlock(&eventfs_mutex);
> -
> -	dentry = create_file(name, mode, attr, parent, data, fops);
> -
> -	mutex_lock(&eventfs_mutex);
> -
> -	if (IS_ERR_OR_NULL(dentry)) {
> -		/*
> -		 * When the mutex was released, something else could have
> -		 * created the dentry for this e_dentry. In which case
> -		 * use that one.
> -		 *
> -		 * If ei->is_freed is set, the e_dentry is currently on its
> -		 * way to being freed, don't return it. If e_dentry is NULL
> -		 * it means it was already freed.
> -		 */
> -		if (ei->is_freed) {
> -			dentry = NULL;
> -		} else {
> -			dentry = *e_dentry;
> -			dget(dentry);
> -		}
> -		mutex_unlock(&eventfs_mutex);
> -		return dentry;
> -	}
> -
> -	if (!*e_dentry && !ei->is_freed) {
> -		*e_dentry = dentry;
> -		dentry->d_fsdata = ei;
> -	} else {
> -		/*
> -		 * Should never happen unless we get here due to being freed.
> -		 * Otherwise it means two dentries exist with the same name.
> -		 */
> -		WARN_ON_ONCE(!ei->is_freed);
> -		dentry = NULL;
> -	}
> -	mutex_unlock(&eventfs_mutex);
> -
> -	return dentry;
> -}
> -
> -/**
> - * eventfs_post_create_dir - post create dir routine
> - * @ei: eventfs_inode of recently created dir
> - *
> - * Map the meta-data of files within an eventfs dir to their parent dentry
> - */
> -static void eventfs_post_create_dir(struct eventfs_inode *ei)
> -{
> -	struct eventfs_inode *ei_child;
> -
> -	lockdep_assert_held(&eventfs_mutex);
> -
> -	/* srcu lock already held */
> -	/* fill parent-child relation */
> -	list_for_each_entry_srcu(ei_child, &ei->children, list,
> -				 srcu_read_lock_held(&eventfs_srcu)) {
> -		ei_child->d_parent = ei->dentry;
> -	}
> -}
> -
> -/**
> - * create_dir_dentry - Create a directory dentry for the eventfs_inode
> - * @pei: The eventfs_inode parent of ei.
> - * @ei: The eventfs_inode to create the directory for
> - * @parent: The dentry of the parent of this directory
> - *
> - * This creates and attaches a directory dentry to the eventfs_inode @ei.
> - */
> -static struct dentry *
> -create_dir_dentry(struct eventfs_inode *pei, struct eventfs_inode *ei,
> -		  struct dentry *parent)
> -{
> -	struct dentry *dentry = NULL;
> -
> -	WARN_ON_ONCE(!inode_is_locked(parent->d_inode));
> -
> -	mutex_lock(&eventfs_mutex);
> -	if (pei->is_freed || ei->is_freed) {
> -		mutex_unlock(&eventfs_mutex);
> -		return NULL;
> -	}
> -	if (ei->dentry) {
> -		/* If the eventfs_inode already has a dentry, use it */
> -		dentry = ei->dentry;
> -		dget(dentry);
> -		mutex_unlock(&eventfs_mutex);
> -		return dentry;
> -	}
> -	mutex_unlock(&eventfs_mutex);
> +	dentry->d_fsdata = ei;		// NOTE: ei of _parent_
> +	lookup_file(dentry, mode, attr, data, fops);
>  
> -	dentry = create_dir(ei, parent);
> -
> -	mutex_lock(&eventfs_mutex);
> -
> -	if (IS_ERR_OR_NULL(dentry) && !ei->is_freed) {
> -		/*
> -		 * When the mutex was released, something else could have
> -		 * created the dentry for this e_dentry. In which case
> -		 * use that one.
> -		 *
> -		 * If ei->is_freed is set, the e_dentry is currently on its
> -		 * way to being freed.
> -		 */
> -		dentry = ei->dentry;
> -		if (dentry)
> -			dget(dentry);
> -		mutex_unlock(&eventfs_mutex);
> -		return dentry;
> -	}
> -
> -	if (!ei->dentry && !ei->is_freed) {
> -		ei->dentry = dentry;
> -		eventfs_post_create_dir(ei);
> -		dentry->d_fsdata = ei;
> -	} else {
> -		/*
> -		 * Should never happen unless we get here due to being freed.
> -		 * Otherwise it means two dentries exist with the same name.
> -		 */
> -		WARN_ON_ONCE(!ei->is_freed);
> -		dentry = NULL;
> -	}
> -	mutex_unlock(&eventfs_mutex);
> +	*e_dentry = dentry;	// Remove me
>  
>  	return dentry;
>  }
> @@ -607,79 +462,55 @@ static struct dentry *eventfs_root_lookup(struct inode *dir,
>  					  struct dentry *dentry,
>  					  unsigned int flags)
>  {
> -	const struct file_operations *fops;
> -	const struct eventfs_entry *entry;
>  	struct eventfs_inode *ei_child;
>  	struct tracefs_inode *ti;
>  	struct eventfs_inode *ei;
> -	struct dentry *ei_dentry = NULL;
> -	struct dentry *ret = NULL;
> -	struct dentry *d;
>  	const char *name = dentry->d_name.name;
> -	umode_t mode;
> -	void *data;
> -	int idx;
> -	int i;
> -	int r;
> +	struct dentry *result = NULL;
>  
>  	ti = get_tracefs(dir);
>  	if (!(ti->flags & TRACEFS_EVENT_INODE))

	Can that ever happen?  I mean, why set ->i_op to something that
has this for ->lookup() on a directory without TRACEFS_EVENT_INODE in
its inode?  It's not as if you ever removed that flag...

> -		return NULL;
> -
> -	/* Grab srcu to prevent the ei from going away */
> -	idx = srcu_read_lock(&eventfs_srcu);
> +		return ERR_PTR(-EIO);
>  
> -	/*
> -	 * Grab the eventfs_mutex to consistent value from ti->private.
> -	 * This s
> -	 */
>  	mutex_lock(&eventfs_mutex);
> -	ei = READ_ONCE(ti->private);
> -	if (ei && !ei->is_freed)
> -		ei_dentry = READ_ONCE(ei->dentry);
> -	mutex_unlock(&eventfs_mutex);
> -
> -	if (!ei || !ei_dentry)
> -		goto out;
>  
> -	data = ei->data;
> +	ei = ti->private;
> +	if (!ei || ei->is_freed)
> +		goto enoent;
>  
> -	list_for_each_entry_srcu(ei_child, &ei->children, list,
> -				 srcu_read_lock_held(&eventfs_srcu)) {
> +	list_for_each_entry(ei_child, &ei->children, list) {
>  		if (strcmp(ei_child->name, name) != 0)
>  			continue;
> -		ret = simple_lookup(dir, dentry, flags);
> -		if (IS_ERR(ret))
> -			goto out;
> -		d = create_dir_dentry(ei, ei_child, ei_dentry);
> -		dput(d);
> +		if (ei_child->is_freed)
> +			goto enoent;

Out of curiosity - can that happen now?  You've got exclusion with
eventfs_remove_rec(), so you shouldn't be able to catch the moment
between setting ->is_freed and removal from the list...

> +		lookup_dir_entry(dentry, ei, ei_child);
>  		goto out;
>  	}
>  
> -	for (i = 0; i < ei->nr_entries; i++) {
> -		entry = &ei->entries[i];
> -		if (strcmp(name, entry->name) == 0) {
> -			void *cdata = data;
> -			mutex_lock(&eventfs_mutex);
> -			/* If ei->is_freed, then the event itself may be too */
> -			if (!ei->is_freed)
> -				r = entry->callback(name, &mode, &cdata, &fops);
> -			else
> -				r = -1;
> -			mutex_unlock(&eventfs_mutex);
> -			if (r <= 0)
> -				continue;
> -			ret = simple_lookup(dir, dentry, flags);
> -			if (IS_ERR(ret))
> -				goto out;
> -			d = create_file_dentry(ei, i, ei_dentry, name, mode, cdata, fops);
> -			dput(d);
> -			break;
> -		}
> +	for (int i = 0; i < ei->nr_entries; i++) {
> +		void *data;
> +		umode_t mode;
> +		const struct file_operations *fops;
> +		const struct eventfs_entry *entry = &ei->entries[i];
> +
> +		if (strcmp(name, entry->name) != 0)
> +			continue;
> +
> +		data = ei->data;
> +		if (entry->callback(name, &mode, &data, &fops) <= 0)
> +			goto enoent;
> +
> +		lookup_file_dentry(dentry, ei, i, mode, data, fops);
> +		goto out;
>  	}
> +
> + enoent:
> +	/* Don't cache negative lookups, just return an error */
> +	result = ERR_PTR(-ENOENT);

Huh?  Just return NULL and be done with that - you'll get an
unhashed negative dentry and let the caller turn that into
-ENOENT...

>   out:
> -	srcu_read_unlock(&eventfs_srcu, idx);
> -	return ret;
> +	mutex_unlock(&eventfs_mutex);
> +	return result;
>  }
>  
>  /*
Steven Rostedt Feb. 1, 2024, 2:26 a.m. UTC | #2
On Thu, 1 Feb 2024 00:27:19 +0000
Al Viro <viro@zeniv.linux.org.uk> wrote:

> On Wed, Jan 31, 2024 at 01:49:22PM -0500, Steven Rostedt wrote:
> 
> > @@ -329,32 +320,29 @@ static struct dentry *create_file(const char *name, umode_t mode,
> >  
> >  	ti = get_tracefs(inode);
> >  	ti->flags |= TRACEFS_EVENT_INODE;
> > -	d_instantiate(dentry, inode);
> > +
> > +	d_add(dentry, inode);
> >  	fsnotify_create(dentry->d_parent->d_inode, dentry);  
> 
> Seriously?  stat(2), have it go away from dcache on memory pressure,
> lather, rinse, repeat...  Won't *snotify get confused by the stream
> of creations of the same thing, with not a removal in sight?
> 

That looks to be cut and paste from the old create in tracefs. I don't know
of a real use case for that. I think we could possibly delete it without
anyone noticing.


> > -	return eventfs_end_creating(dentry);
> > +	return dentry;
> >  };
> >  


> > @@ -371,11 +359,14 @@ static struct dentry *create_dir(struct eventfs_inode *ei, struct dentry *parent
> >  	/* Only directories have ti->private set to an ei, not files */
> >  	ti->private = ei;
> >  
> > +	dentry->d_fsdata = ei;
> > +        ei->dentry = dentry;	// Remove me!
> > +
> >  	inc_nlink(inode);
> > -	d_instantiate(dentry, inode);
> > +	d_add(dentry, inode);
> >  	inc_nlink(dentry->d_parent->d_inode);  
> 
> What will happen when that thing gets evicted from dcache,
> gets looked up again, and again, and...?
> 
> >  	fsnotify_mkdir(dentry->d_parent->d_inode, dentry);  
> 
> Same re snotify confusion...

Yeah, again, I think it's useless. Doing that is more useless than taring
the tracefs directory ;-)

> 
> > -	return eventfs_end_creating(dentry);
> > +	return dentry;
> >  }
> >  
> >  static void free_ei(struct eventfs_inode *ei)
> > @@ -425,7 +416,7 @@ void eventfs_set_ei_status_free(struct tracefs_inode *ti, struct dentry *dentry)
> >  }
> >  


> > @@ -607,79 +462,55 @@ static struct dentry *eventfs_root_lookup(struct inode *dir,
> >  					  struct dentry *dentry,
> >  					  unsigned int flags)
> >  {
> > -	const struct file_operations *fops;
> > -	const struct eventfs_entry *entry;
> >  	struct eventfs_inode *ei_child;
> >  	struct tracefs_inode *ti;
> >  	struct eventfs_inode *ei;
> > -	struct dentry *ei_dentry = NULL;
> > -	struct dentry *ret = NULL;
> > -	struct dentry *d;
> >  	const char *name = dentry->d_name.name;
> > -	umode_t mode;
> > -	void *data;
> > -	int idx;
> > -	int i;
> > -	int r;
> > +	struct dentry *result = NULL;
> >  
> >  	ti = get_tracefs(dir);
> >  	if (!(ti->flags & TRACEFS_EVENT_INODE))  
> 
> 	Can that ever happen?  I mean, why set ->i_op to something that
> has this for ->lookup() on a directory without TRACEFS_EVENT_INODE in
> its inode?  It's not as if you ever removed that flag...

That's been there mostly as paranoia. Should probably be switched to:

	if (WARN_ON_ONCE(!(ti->flags & TRACEFS_EVENT_INODE)))


> 
> > -		return NULL;
> > -
> > -	/* Grab srcu to prevent the ei from going away */
> > -	idx = srcu_read_lock(&eventfs_srcu);
> > +		return ERR_PTR(-EIO);
> >  
> > -	/*
> > -	 * Grab the eventfs_mutex to consistent value from ti->private.
> > -	 * This s
> > -	 */
> >  	mutex_lock(&eventfs_mutex);
> > -	ei = READ_ONCE(ti->private);
> > -	if (ei && !ei->is_freed)
> > -		ei_dentry = READ_ONCE(ei->dentry);
> > -	mutex_unlock(&eventfs_mutex);
> > -
> > -	if (!ei || !ei_dentry)
> > -		goto out;
> >  
> > -	data = ei->data;
> > +	ei = ti->private;
> > +	if (!ei || ei->is_freed)
> > +		goto enoent;
> >  
> > -	list_for_each_entry_srcu(ei_child, &ei->children, list,
> > -				 srcu_read_lock_held(&eventfs_srcu)) {
> > +	list_for_each_entry(ei_child, &ei->children, list) {
> >  		if (strcmp(ei_child->name, name) != 0)
> >  			continue;
> > -		ret = simple_lookup(dir, dentry, flags);
> > -		if (IS_ERR(ret))
> > -			goto out;
> > -		d = create_dir_dentry(ei, ei_child, ei_dentry);
> > -		dput(d);
> > +		if (ei_child->is_freed)
> > +			goto enoent;  
> 
> Out of curiosity - can that happen now?  You've got exclusion with
> eventfs_remove_rec(), so you shouldn't be able to catch the moment
> between setting ->is_freed and removal from the list...

Yeah, that's from when we just used SRCU. If anything, it too should just
add a WARN_ON_ONCE() to it.

> 
> > +		lookup_dir_entry(dentry, ei, ei_child);
> >  		goto out;
> >  	}
> >  
> > -	for (i = 0; i < ei->nr_entries; i++) {
> > -		entry = &ei->entries[i];
> > -		if (strcmp(name, entry->name) == 0) {
> > -			void *cdata = data;
> > -			mutex_lock(&eventfs_mutex);
> > -			/* If ei->is_freed, then the event itself may be too */
> > -			if (!ei->is_freed)
> > -				r = entry->callback(name, &mode, &cdata, &fops);
> > -			else
> > -				r = -1;
> > -			mutex_unlock(&eventfs_mutex);
> > -			if (r <= 0)
> > -				continue;
> > -			ret = simple_lookup(dir, dentry, flags);
> > -			if (IS_ERR(ret))
> > -				goto out;
> > -			d = create_file_dentry(ei, i, ei_dentry, name, mode, cdata, fops);
> > -			dput(d);
> > -			break;
> > -		}
> > +	for (int i = 0; i < ei->nr_entries; i++) {
> > +		void *data;
> > +		umode_t mode;
> > +		const struct file_operations *fops;
> > +		const struct eventfs_entry *entry = &ei->entries[i];
> > +
> > +		if (strcmp(name, entry->name) != 0)
> > +			continue;
> > +
> > +		data = ei->data;
> > +		if (entry->callback(name, &mode, &data, &fops) <= 0)
> > +			goto enoent;
> > +
> > +		lookup_file_dentry(dentry, ei, i, mode, data, fops);
> > +		goto out;
> >  	}
> > +
> > + enoent:
> > +	/* Don't cache negative lookups, just return an error */
> > +	result = ERR_PTR(-ENOENT);  
> 
> Huh?  Just return NULL and be done with that - you'll get an
> unhashed negative dentry and let the caller turn that into
> -ENOENT...

We had a problem here with just returning NULL. It leaves the negative
dentry around and doesn't get refreshed.

I did this:

 # cd /sys/kernel/tracing
 # ls events/kprobes/sched/
ls: cannot access 'events/kprobes/sched/': No such file or directory
 # echo 'p:sched schedule' >> kprobe_events
 # ls events/kprobes/sched/
ls: cannot access 'events/kprobes/sched/': No such file or directory

When it should have been:

 # ls events/kprobes/sched/
enable  filter  format  hist  hist_debug  id  inject  trigger

Leaving the negative dentry there will have it fail when the directory
exists the next time.

-- Steve



> 
> >   out:
> > -	srcu_read_unlock(&eventfs_srcu, idx);
> > -	return ret;
> > +	mutex_unlock(&eventfs_mutex);
> > +	return result;
> >  }
> >  
> >  /*
Al Viro Feb. 1, 2024, 3:02 a.m. UTC | #3
On Wed, Jan 31, 2024 at 09:26:42PM -0500, Steven Rostedt wrote:

> > Huh?  Just return NULL and be done with that - you'll get an
> > unhashed negative dentry and let the caller turn that into
> > -ENOENT...
> 
> We had a problem here with just returning NULL. It leaves the negative
> dentry around and doesn't get refreshed.

Why would that dentry stick around?  And how would anyone find
it, anyway, when it's not hashed?

> I did this:
> 
>  # cd /sys/kernel/tracing
>  # ls events/kprobes/sched/
> ls: cannot access 'events/kprobes/sched/': No such file or directory
>  # echo 'p:sched schedule' >> kprobe_events
>  # ls events/kprobes/sched/
> ls: cannot access 'events/kprobes/sched/': No such file or directory
> 
> When it should have been:
> 
>  # ls events/kprobes/sched/
> enable  filter  format  hist  hist_debug  id  inject  trigger
> 
> Leaving the negative dentry there will have it fail when the directory
> exists the next time.

Then you have something very deeply fucked up.  NULL or ERR_PTR(-ENOENT)
from ->lookup() in the last component of open() would do exactly the
same thing: dput() whatever had been passed to ->lookup() and fail
open(2) with -ENOENT.
Steven Rostedt Feb. 1, 2024, 3:21 a.m. UTC | #4
On Thu, 1 Feb 2024 03:02:05 +0000
Al Viro <viro@zeniv.linux.org.uk> wrote:

> > We had a problem here with just returning NULL. It leaves the negative
> > dentry around and doesn't get refreshed.  
> 
> Why would that dentry stick around?  And how would anyone find
> it, anyway, when it's not hashed?

We (Linus and I) got it wrong. It originally had:

	d_add(dentry, NULL);
	[..]
	return NULL;

and it caused the:


  # ls events/kprobes/sched/
ls: cannot access 'events/kprobes/sched/': No such file or directory

  # echo 'p:sched schedule' >> /sys/kernel/tracing/kprobe_events 
  # ls events/kprobes/sched/
ls: cannot access 'events/kprobes/sched/': No such file or directory

I just changed the code to simply return NULL, and it had no issues:

  # ls events/kprobes/sched/
ls: cannot access 'events/kprobes/sched/': No such file or directory

  # echo 'p:sched schedule' >> /sys/kernel/tracing/kprobe_events 
  # ls events/kprobes/sched/
enable  filter  format  hist  hist_debug  id  inject  trigger

But then I added the: d_add(dentry, NULL); that we originally had, and then
it caused the issue again.

So it wasn't the returning NULL that was causing a problem, it was calling
the d_add(dentry, NULL); that was.

I'll update the patch.

-- Steve
Steven Rostedt Feb. 1, 2024, 4:18 a.m. UTC | #5
On Wed, 31 Jan 2024 22:21:27 -0500
Steven Rostedt <rostedt@goodmis.org> wrote:

> We (Linus and I) got it wrong. It originally had:
> 
> 	d_add(dentry, NULL);
> 	[..]
> 	return NULL;

OK, so I changed that function to this:

static struct dentry *eventfs_root_lookup(struct inode *dir,
					  struct dentry *dentry,
					  unsigned int flags)
{
	struct eventfs_inode *ei_child;
	struct tracefs_inode *ti;
	struct eventfs_inode *ei;
	const char *name = dentry->d_name.name;

	ti = get_tracefs(dir);
	if (!(ti->flags & TRACEFS_EVENT_INODE))
		return ERR_PTR(-EIO);

	mutex_lock(&eventfs_mutex);

	ei = ti->private;
	if (!ei || ei->is_freed)
		goto out;

	list_for_each_entry(ei_child, &ei->children, list) {
		if (strcmp(ei_child->name, name) != 0)
			continue;
		if (ei_child->is_freed)
			goto out;
		lookup_dir_entry(dentry, ei, ei_child);
		goto out;
	}

	for (int i = 0; i < ei->nr_entries; i++) {
		void *data;
		umode_t mode;
		const struct file_operations *fops;
		const struct eventfs_entry *entry = &ei->entries[i];

		if (strcmp(name, entry->name) != 0)
			continue;

		data = ei->data;
		if (entry->callback(name, &mode, &data, &fops) <= 0)
			goto out;

		lookup_file_dentry(dentry, ei, i, mode, data, fops);
		goto out;
	}
 out:
	mutex_unlock(&eventfs_mutex);
	return NULL;
}

And it passes the make kprobe test. I'll send out a v3 of this patch, and
remove the inc_nlink(dentry->d_parent->d_inode) and the fsnotify as
separate patches as that code was there before Linus touched it.

Thanks,

-- Steve
diff mbox series

Patch

diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c
index e9819d719d2a..4878f4d578be 100644
--- a/fs/tracefs/event_inode.c
+++ b/fs/tracefs/event_inode.c
@@ -230,7 +230,6 @@  static struct eventfs_inode *eventfs_find_events(struct dentry *dentry)
 {
 	struct eventfs_inode *ei;
 
-	mutex_lock(&eventfs_mutex);
 	do {
 		// The parent is stable because we do not do renames
 		dentry = dentry->d_parent;
@@ -247,7 +246,6 @@  static struct eventfs_inode *eventfs_find_events(struct dentry *dentry)
 		}
 		// Walk upwards until you find the events inode
 	} while (!ei->is_events);
-	mutex_unlock(&eventfs_mutex);
 
 	update_top_events_attr(ei, dentry->d_sb);
 
@@ -280,11 +278,10 @@  static void update_inode_attr(struct dentry *dentry, struct inode *inode,
 }
 
 /**
- * create_file - create a file in the tracefs filesystem
- * @name: the name of the file to create.
+ * lookup_file - look up a file in the tracefs filesystem
+ * @dentry: the dentry to look up
  * @mode: the permission that the file should have.
  * @attr: saved attributes changed by user
- * @parent: parent dentry for this file.
  * @data: something that the caller will want to get to later on.
  * @fop: struct file_operations that should be used for this file.
  *
@@ -292,13 +289,13 @@  static void update_inode_attr(struct dentry *dentry, struct inode *inode,
  * directory. The inode.i_private pointer will point to @data in the open()
  * call.
  */
-static struct dentry *create_file(const char *name, umode_t mode,
+static struct dentry *lookup_file(struct dentry *dentry,
+				  umode_t mode,
 				  struct eventfs_attr *attr,
-				  struct dentry *parent, void *data,
+				  void *data,
 				  const struct file_operations *fop)
 {
 	struct tracefs_inode *ti;
-	struct dentry *dentry;
 	struct inode *inode;
 
 	if (!(mode & S_IFMT))
@@ -307,15 +304,9 @@  static struct dentry *create_file(const char *name, umode_t mode,
 	if (WARN_ON_ONCE(!S_ISREG(mode)))
 		return NULL;
 
-	WARN_ON_ONCE(!parent);
-	dentry = eventfs_start_creating(name, parent);
-
-	if (IS_ERR(dentry))
-		return dentry;
-
 	inode = tracefs_get_inode(dentry->d_sb);
 	if (unlikely(!inode))
-		return eventfs_failed_creating(dentry);
+		return ERR_PTR(-ENOMEM);
 
 	/* If the user updated the directory's attributes, use them */
 	update_inode_attr(dentry, inode, attr, mode);
@@ -329,32 +320,29 @@  static struct dentry *create_file(const char *name, umode_t mode,
 
 	ti = get_tracefs(inode);
 	ti->flags |= TRACEFS_EVENT_INODE;
-	d_instantiate(dentry, inode);
+
+	d_add(dentry, inode);
 	fsnotify_create(dentry->d_parent->d_inode, dentry);
-	return eventfs_end_creating(dentry);
+	return dentry;
 };
 
 /**
- * create_dir - create a dir in the tracefs filesystem
+ * lookup_dir_entry - look up a dir in the tracefs filesystem
+ * @dentry: the directory to look up
  * @ei: the eventfs_inode that represents the directory to create
- * @parent: parent dentry for this file.
  *
- * This function will create a dentry for a directory represented by
+ * This function will look up a dentry for a directory represented by
  * a eventfs_inode.
  */
-static struct dentry *create_dir(struct eventfs_inode *ei, struct dentry *parent)
+static struct dentry *lookup_dir_entry(struct dentry *dentry,
+	struct eventfs_inode *pei, struct eventfs_inode *ei)
 {
 	struct tracefs_inode *ti;
-	struct dentry *dentry;
 	struct inode *inode;
 
-	dentry = eventfs_start_creating(ei->name, parent);
-	if (IS_ERR(dentry))
-		return dentry;
-
 	inode = tracefs_get_inode(dentry->d_sb);
 	if (unlikely(!inode))
-		return eventfs_failed_creating(dentry);
+		return ERR_PTR(-ENOMEM);
 
 	/* If the user updated the directory's attributes, use them */
 	update_inode_attr(dentry, inode, &ei->attr,
@@ -371,11 +359,14 @@  static struct dentry *create_dir(struct eventfs_inode *ei, struct dentry *parent
 	/* Only directories have ti->private set to an ei, not files */
 	ti->private = ei;
 
+	dentry->d_fsdata = ei;
+        ei->dentry = dentry;	// Remove me!
+
 	inc_nlink(inode);
-	d_instantiate(dentry, inode);
+	d_add(dentry, inode);
 	inc_nlink(dentry->d_parent->d_inode);
 	fsnotify_mkdir(dentry->d_parent->d_inode, dentry);
-	return eventfs_end_creating(dentry);
+	return dentry;
 }
 
 static void free_ei(struct eventfs_inode *ei)
@@ -425,7 +416,7 @@  void eventfs_set_ei_status_free(struct tracefs_inode *ti, struct dentry *dentry)
 }
 
 /**
- * create_file_dentry - create a dentry for a file of an eventfs_inode
+ * lookup_file_dentry - create a dentry for a file of an eventfs_inode
  * @ei: the eventfs_inode that the file will be created under
  * @idx: the index into the d_children[] of the @ei
  * @parent: The parent dentry of the created file.
@@ -438,157 +429,21 @@  void eventfs_set_ei_status_free(struct tracefs_inode *ti, struct dentry *dentry)
  * address located at @e_dentry.
  */
 static struct dentry *
-create_file_dentry(struct eventfs_inode *ei, int idx,
-		   struct dentry *parent, const char *name, umode_t mode, void *data,
+lookup_file_dentry(struct dentry *dentry,
+		   struct eventfs_inode *ei, int idx,
+		   umode_t mode, void *data,
 		   const struct file_operations *fops)
 {
 	struct eventfs_attr *attr = NULL;
 	struct dentry **e_dentry = &ei->d_children[idx];
-	struct dentry *dentry;
-
-	WARN_ON_ONCE(!inode_is_locked(parent->d_inode));
 
-	mutex_lock(&eventfs_mutex);
-	if (ei->is_freed) {
-		mutex_unlock(&eventfs_mutex);
-		return NULL;
-	}
-	/* If the e_dentry already has a dentry, use it */
-	if (*e_dentry) {
-		dget(*e_dentry);
-		mutex_unlock(&eventfs_mutex);
-		return *e_dentry;
-	}
-
-	/* ei->entry_attrs are protected by SRCU */
 	if (ei->entry_attrs)
 		attr = &ei->entry_attrs[idx];
 
-	mutex_unlock(&eventfs_mutex);
-
-	dentry = create_file(name, mode, attr, parent, data, fops);
-
-	mutex_lock(&eventfs_mutex);
-
-	if (IS_ERR_OR_NULL(dentry)) {
-		/*
-		 * When the mutex was released, something else could have
-		 * created the dentry for this e_dentry. In which case
-		 * use that one.
-		 *
-		 * If ei->is_freed is set, the e_dentry is currently on its
-		 * way to being freed, don't return it. If e_dentry is NULL
-		 * it means it was already freed.
-		 */
-		if (ei->is_freed) {
-			dentry = NULL;
-		} else {
-			dentry = *e_dentry;
-			dget(dentry);
-		}
-		mutex_unlock(&eventfs_mutex);
-		return dentry;
-	}
-
-	if (!*e_dentry && !ei->is_freed) {
-		*e_dentry = dentry;
-		dentry->d_fsdata = ei;
-	} else {
-		/*
-		 * Should never happen unless we get here due to being freed.
-		 * Otherwise it means two dentries exist with the same name.
-		 */
-		WARN_ON_ONCE(!ei->is_freed);
-		dentry = NULL;
-	}
-	mutex_unlock(&eventfs_mutex);
-
-	return dentry;
-}
-
-/**
- * eventfs_post_create_dir - post create dir routine
- * @ei: eventfs_inode of recently created dir
- *
- * Map the meta-data of files within an eventfs dir to their parent dentry
- */
-static void eventfs_post_create_dir(struct eventfs_inode *ei)
-{
-	struct eventfs_inode *ei_child;
-
-	lockdep_assert_held(&eventfs_mutex);
-
-	/* srcu lock already held */
-	/* fill parent-child relation */
-	list_for_each_entry_srcu(ei_child, &ei->children, list,
-				 srcu_read_lock_held(&eventfs_srcu)) {
-		ei_child->d_parent = ei->dentry;
-	}
-}
-
-/**
- * create_dir_dentry - Create a directory dentry for the eventfs_inode
- * @pei: The eventfs_inode parent of ei.
- * @ei: The eventfs_inode to create the directory for
- * @parent: The dentry of the parent of this directory
- *
- * This creates and attaches a directory dentry to the eventfs_inode @ei.
- */
-static struct dentry *
-create_dir_dentry(struct eventfs_inode *pei, struct eventfs_inode *ei,
-		  struct dentry *parent)
-{
-	struct dentry *dentry = NULL;
-
-	WARN_ON_ONCE(!inode_is_locked(parent->d_inode));
-
-	mutex_lock(&eventfs_mutex);
-	if (pei->is_freed || ei->is_freed) {
-		mutex_unlock(&eventfs_mutex);
-		return NULL;
-	}
-	if (ei->dentry) {
-		/* If the eventfs_inode already has a dentry, use it */
-		dentry = ei->dentry;
-		dget(dentry);
-		mutex_unlock(&eventfs_mutex);
-		return dentry;
-	}
-	mutex_unlock(&eventfs_mutex);
+	dentry->d_fsdata = ei;		// NOTE: ei of _parent_
+	lookup_file(dentry, mode, attr, data, fops);
 
-	dentry = create_dir(ei, parent);
-
-	mutex_lock(&eventfs_mutex);
-
-	if (IS_ERR_OR_NULL(dentry) && !ei->is_freed) {
-		/*
-		 * When the mutex was released, something else could have
-		 * created the dentry for this e_dentry. In which case
-		 * use that one.
-		 *
-		 * If ei->is_freed is set, the e_dentry is currently on its
-		 * way to being freed.
-		 */
-		dentry = ei->dentry;
-		if (dentry)
-			dget(dentry);
-		mutex_unlock(&eventfs_mutex);
-		return dentry;
-	}
-
-	if (!ei->dentry && !ei->is_freed) {
-		ei->dentry = dentry;
-		eventfs_post_create_dir(ei);
-		dentry->d_fsdata = ei;
-	} else {
-		/*
-		 * Should never happen unless we get here due to being freed.
-		 * Otherwise it means two dentries exist with the same name.
-		 */
-		WARN_ON_ONCE(!ei->is_freed);
-		dentry = NULL;
-	}
-	mutex_unlock(&eventfs_mutex);
+	*e_dentry = dentry;	// Remove me
 
 	return dentry;
 }
@@ -607,79 +462,55 @@  static struct dentry *eventfs_root_lookup(struct inode *dir,
 					  struct dentry *dentry,
 					  unsigned int flags)
 {
-	const struct file_operations *fops;
-	const struct eventfs_entry *entry;
 	struct eventfs_inode *ei_child;
 	struct tracefs_inode *ti;
 	struct eventfs_inode *ei;
-	struct dentry *ei_dentry = NULL;
-	struct dentry *ret = NULL;
-	struct dentry *d;
 	const char *name = dentry->d_name.name;
-	umode_t mode;
-	void *data;
-	int idx;
-	int i;
-	int r;
+	struct dentry *result = NULL;
 
 	ti = get_tracefs(dir);
 	if (!(ti->flags & TRACEFS_EVENT_INODE))
-		return NULL;
-
-	/* Grab srcu to prevent the ei from going away */
-	idx = srcu_read_lock(&eventfs_srcu);
+		return ERR_PTR(-EIO);
 
-	/*
-	 * Grab the eventfs_mutex to consistent value from ti->private.
-	 * This s
-	 */
 	mutex_lock(&eventfs_mutex);
-	ei = READ_ONCE(ti->private);
-	if (ei && !ei->is_freed)
-		ei_dentry = READ_ONCE(ei->dentry);
-	mutex_unlock(&eventfs_mutex);
-
-	if (!ei || !ei_dentry)
-		goto out;
 
-	data = ei->data;
+	ei = ti->private;
+	if (!ei || ei->is_freed)
+		goto enoent;
 
-	list_for_each_entry_srcu(ei_child, &ei->children, list,
-				 srcu_read_lock_held(&eventfs_srcu)) {
+	list_for_each_entry(ei_child, &ei->children, list) {
 		if (strcmp(ei_child->name, name) != 0)
 			continue;
-		ret = simple_lookup(dir, dentry, flags);
-		if (IS_ERR(ret))
-			goto out;
-		d = create_dir_dentry(ei, ei_child, ei_dentry);
-		dput(d);
+		if (ei_child->is_freed)
+			goto enoent;
+		lookup_dir_entry(dentry, ei, ei_child);
 		goto out;
 	}
 
-	for (i = 0; i < ei->nr_entries; i++) {
-		entry = &ei->entries[i];
-		if (strcmp(name, entry->name) == 0) {
-			void *cdata = data;
-			mutex_lock(&eventfs_mutex);
-			/* If ei->is_freed, then the event itself may be too */
-			if (!ei->is_freed)
-				r = entry->callback(name, &mode, &cdata, &fops);
-			else
-				r = -1;
-			mutex_unlock(&eventfs_mutex);
-			if (r <= 0)
-				continue;
-			ret = simple_lookup(dir, dentry, flags);
-			if (IS_ERR(ret))
-				goto out;
-			d = create_file_dentry(ei, i, ei_dentry, name, mode, cdata, fops);
-			dput(d);
-			break;
-		}
+	for (int i = 0; i < ei->nr_entries; i++) {
+		void *data;
+		umode_t mode;
+		const struct file_operations *fops;
+		const struct eventfs_entry *entry = &ei->entries[i];
+
+		if (strcmp(name, entry->name) != 0)
+			continue;
+
+		data = ei->data;
+		if (entry->callback(name, &mode, &data, &fops) <= 0)
+			goto enoent;
+
+		lookup_file_dentry(dentry, ei, i, mode, data, fops);
+		goto out;
 	}
+
+ enoent:
+	/* Don't cache negative lookups, just return an error */
+	result = ERR_PTR(-ENOENT);
+
  out:
-	srcu_read_unlock(&eventfs_srcu, idx);
-	return ret;
+	mutex_unlock(&eventfs_mutex);
+	return result;
 }
 
 /*
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index 888e42087847..5c84460feeeb 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -495,75 +495,6 @@  struct dentry *tracefs_end_creating(struct dentry *dentry)
 	return dentry;
 }
 
-/**
- * eventfs_start_creating - start the process of creating a dentry
- * @name: Name of the file created for the dentry
- * @parent: The parent dentry where this dentry will be created
- *
- * This is a simple helper function for the dynamically created eventfs
- * files. When the directory of the eventfs files are accessed, their
- * dentries are created on the fly. This function is used to start that
- * process.
- */
-struct dentry *eventfs_start_creating(const char *name, struct dentry *parent)
-{
-	struct dentry *dentry;
-	int error;
-
-	/* Must always have a parent. */
-	if (WARN_ON_ONCE(!parent))
-		return ERR_PTR(-EINVAL);
-
-	error = simple_pin_fs(&trace_fs_type, &tracefs_mount,
-			      &tracefs_mount_count);
-	if (error)
-		return ERR_PTR(error);
-
-	if (unlikely(IS_DEADDIR(parent->d_inode)))
-		dentry = ERR_PTR(-ENOENT);
-	else
-		dentry = lookup_one_len(name, parent, strlen(name));
-
-	if (!IS_ERR(dentry) && dentry->d_inode) {
-		dput(dentry);
-		dentry = ERR_PTR(-EEXIST);
-	}
-
-	if (IS_ERR(dentry))
-		simple_release_fs(&tracefs_mount, &tracefs_mount_count);
-
-	return dentry;
-}
-
-/**
- * eventfs_failed_creating - clean up a failed eventfs dentry creation
- * @dentry: The dentry to clean up
- *
- * If after calling eventfs_start_creating(), a failure is detected, the
- * resources created by eventfs_start_creating() needs to be cleaned up. In
- * that case, this function should be called to perform that clean up.
- */
-struct dentry *eventfs_failed_creating(struct dentry *dentry)
-{
-	dput(dentry);
-	simple_release_fs(&tracefs_mount, &tracefs_mount_count);
-	return NULL;
-}
-
-/**
- * eventfs_end_creating - Finish the process of creating a eventfs dentry
- * @dentry: The dentry that has successfully been created.
- *
- * This function is currently just a place holder to match
- * eventfs_start_creating(). In case any synchronization needs to be added,
- * this function will be used to implement that without having to modify
- * the callers of eventfs_start_creating().
- */
-struct dentry *eventfs_end_creating(struct dentry *dentry)
-{
-	return dentry;
-}
-
 /* Find the inode that this will use for default */
 static struct inode *instance_inode(struct dentry *parent, struct inode *inode)
 {
diff --git a/fs/tracefs/internal.h b/fs/tracefs/internal.h
index 7d84349ade87..09037e2c173d 100644
--- a/fs/tracefs/internal.h
+++ b/fs/tracefs/internal.h
@@ -80,9 +80,6 @@  struct dentry *tracefs_start_creating(const char *name, struct dentry *parent);
 struct dentry *tracefs_end_creating(struct dentry *dentry);
 struct dentry *tracefs_failed_creating(struct dentry *dentry);
 struct inode *tracefs_get_inode(struct super_block *sb);
-struct dentry *eventfs_start_creating(const char *name, struct dentry *parent);
-struct dentry *eventfs_failed_creating(struct dentry *dentry);
-struct dentry *eventfs_end_creating(struct dentry *dentry);
 void eventfs_set_ei_status_free(struct tracefs_inode *ti, struct dentry *dentry);
 
 #endif /* _TRACEFS_INTERNAL_H */