
bcachefs: pending_rcu_items

Message ID xharqajk2o7xkpdk7mpd3jzj2suztubbqfms5xva367pcli5ny@6gjjjdbjo7om (mailing list archive)
State New, archived
Series bcachefs: pending_rcu_items

Commit Message

Kent Overstreet June 14, 2024, 8:28 p.m. UTC
This should incorporate everything we've covered so far; the one thing I
still have to look at is making it work as a kvfree_rcu() backend.

I decided not to do the "store the callback in the radix tree"
optimization for generic use - it made the code fairly ugly, and just
replacing the linked list with a radix tree should already be a
significant improvement.

Thoughts?

-- >8 --

Generic data structure for explicitly tracking pending RCU items,
allowing items to be dequeued (i.e. allocate from items pending
freeing).

- Works with conventional RCU and SRCU; pass in NULL for the srcu_struct
  for regular RCU

- Tracks pending items in a radix tree; falls back to linked list if
  radix tree allocation fails

todo: add support for being a kvfree_rcu() backend

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>

Comments

Paul E. McKenney June 18, 2024, 12:15 a.m. UTC | #1
On Fri, Jun 14, 2024 at 04:28:14PM -0400, Kent Overstreet wrote:
> This should incorporate everything we've covered so far; the one thing I
> still have to look at is making it work as a kvfree_rcu() backend.
> 
> I decided not to do the "store the callback in the radix tree"
> optimization for generic use - it made the code fairly ugly, and just
> replacing the linked list with a radix tree should already be a
> significant improvement.
> 
> Thoughts?

Please see below for a few questions and comments.

I do have some catching up to do, as I am having some difficulty imagining
the benefit of a radix tree in this situation.  But might as well dig in!

> -- >8 --
> 
> Generic data structure for explicitly tracking pending RCU items,
> allowing items to be dequeued (i.e. allocate from items pending
> freeing).
> 
> - Works with conventional RCU and SRCU; pass in NULL for the srcu_struct
>   for regular RCU

If it ever becomes helpful to make this work for any of the RCU Tasks
flavors, the rcutorture_type enum from kernel/rcu/rcu.h might be of help.
Until then, NULL works just fine.

> - Tracks pending items in a radix tree; falls back to linked list if
>   radix tree allocation fails

I am still having trouble seeing what a radix tree brings to the table
in this case, but again, might as well dig in.

> todo: add support for being a kvfree_rcu() backend
> 
> Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
> 
> diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile
> index 0ab533a2b03b..c8394ec6f22f 100644
> --- a/fs/bcachefs/Makefile
> +++ b/fs/bcachefs/Makefile
> @@ -68,6 +68,7 @@ bcachefs-y		:=	\
>  	opts.o			\
>  	printbuf.o		\
>  	quota.o			\
> +	rcu_pending.o		\
>  	rebalance.o		\
>  	recovery.o		\
>  	recovery_passes.o	\
> diff --git a/fs/bcachefs/rcu_pending.c b/fs/bcachefs/rcu_pending.c
> new file mode 100644
> index 000000000000..9b2f4da94425
> --- /dev/null
> +++ b/fs/bcachefs/rcu_pending.c
> @@ -0,0 +1,278 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +#include <linux/generic-radix-tree.h>
> +#include <linux/percpu.h>
> +#include <linux/srcu.h>
> +#include "rcu_pending.h"
> +
> +static inline unsigned long __get_state_synchronize_rcu(struct srcu_struct *ssp)
> +{
> +	return ssp
> +		? get_state_synchronize_srcu(ssp)
> +		: get_state_synchronize_rcu();
> +}
> +
> +static inline unsigned long __start_poll_synchronize_rcu(struct srcu_struct *ssp)
> +{
> +	return ssp
> +		? start_poll_synchronize_srcu(ssp)
> +		: start_poll_synchronize_rcu();
> +}

I don't get why you 

> +static inline void __rcu_barrier(struct srcu_struct *ssp)
> +{
> +	return ssp
> +		? srcu_barrier(ssp)
> +		: rcu_barrier();
> +}

You will likely need something like these:

static inline void __get_completed_synchronize_rcu(struct srcu_struct *ssp)
{
	return ssp
		? get_completed_synchronize_srcu(ssp)
		: get_completed_synchronize_rcu();
}

static inline void __same_state_synchronize_rcu(struct srcu_struct *ssp)
{
	return ssp
		? same_state_synchronize_srcu(ssp)
		: same_state_synchronize_rcu();
}

The point of these is to get around the earlier issue where three cookies
appeared to be valid due to time delays.  But perhaps you found some
other way around that.

Anyway, the RCU versions have been around for quite some time, but the
SRCU versions were added to -rcu quite recently:

5f51520ec4fc ("srcu: Fill out polled grace-period APIs")

> +struct pending_rcu_items_seq {
> +	GENRADIX(struct rcu_head *)	objs;
> +	size_t				nr;
> +	struct rcu_head			*list;

A pointer to an rcu_head structure?  Interesting...

OK, I see it.  This is the overflow list in case memory cannot be
allocated.

> +	unsigned long			seq;

OK, this does appear like each group of objs gets its own return value from
either __start_poll_synchronize_rcu() or __get_state_synchronize_rcu().  If
so, much improved!

> +};
> +
> +struct pending_rcu_items_pcpu {
> +	struct pending_rcu_items	*parent;
> +	spinlock_t			lock;
> +	bool				rcu_armed;
> +	struct pending_rcu_items_seq	objs[2];

This can be:
	struct pending_rcu_items_seq	objs[NUM_ACTIVE_SRCU_POLL_OLDSTATE];

This is also recent in -rcu:

36dcb89814f7 ("srcu: Add NUM_ACTIVE_SRCU_POLL_OLDSTATE")

> +	struct rcu_head			rcu;

The purpose of this rcu_head is as before, to flag good times to invoke
poll_state_synchronize_rcu()?

> +	struct work_struct		work;
> +};
> +
> +static bool get_finished_items(struct pending_rcu_items *pending,
> +			       struct pending_rcu_items_pcpu *p,
> +			       struct pending_rcu_items_seq *out)
> +{
> +	for (struct pending_rcu_items_seq *objs = p->objs;
> +	     objs < p->objs + ARRAY_SIZE(p->objs); objs++)
> +		if ((objs->nr || objs->list) &&
> +		    poll_state_synchronize_srcu(pending->srcu, objs->seq)) {
> +			*out = (struct pending_rcu_items_seq) {
> +				/*
> +				 * the genradix can only be modified with atomic instructions,
> +				 * since we allocate new nodes without
> +				 * pending_rcu_items_pcpu.lock
> +				 */
> +				.objs.tree.root	= xchg(&objs->objs.tree.root, NULL),

OK, so this captures the objects that are now ready to invoke.  Very good.

I need to check what the caller does with it, of course.  And one caller
appears to invoke it from a workqueue handler, the other from unknown
context.  But you said earlier that all were from process context,
so that should be fine.  Having the submitters do processing does have
nice overload-handling characteristics, though it might be more prone
to deadlock.  (As in you cannot acquire a mutex in the callback that is
held across a call to the bch2_pending_rcu_item_enqueue() function.)

Both callers re-enable interrupts before doing the processing, which
is another good improvement.

> +				.nr		= objs->nr,
> +				.list		= objs->list,

And here you capture the overflow list.

What provides synchronization for this function?  It looks like it can
be invoked from multiple contexts.  Ah, the pending_rcu_items_pcpu
structure's ->lock, which is dropped after invoking this but before
processing the objects.

> +			};
> +			objs->nr	= 0;
> +			objs->list	= NULL;
> +			return true;
> +		}

If both elements are ready to process, it looks like you only process
the first one that you come to.  Why not look at both and return both
if both are ready?

A linked list of pages of pointers would make this possible by simply
concatenating the lists, right?

Or am I missing something that forces the second element of the array
to be checked?

> +	return false;
> +}
> +
> +static void process_finished_items(struct pending_rcu_items *pending,
> +				   struct pending_rcu_items_seq *objs)
> +{
> +	for (size_t i = 0; i < objs->nr; i++)
> +		pending->process(pending, *genradix_ptr(&objs->objs, i));

I might be missing something, but it looks like a linked list of pages of
pointers would do better here, especially if there are a lot of callbacks.
So what am I missing?

Also, I am not seeing a ->pending() field in the pending_rcu_items_seq
structure.  Again, what am I missing?

> +	genradix_free(&objs->objs);
> +
> +	while (objs->list) {
> +		struct rcu_head *obj = objs->list;
> +		objs->list = obj->next;
> +		pending->process(pending, obj);

And here we process the overflow list.

> +	}
> +}
> +
> +static void pending_rcu_items_rcu_cb(struct rcu_head *rcu)
> +{
> +	struct pending_rcu_items_pcpu *p =
> +		container_of(rcu, struct pending_rcu_items_pcpu, rcu);
> +
> +	schedule_work(&p->work);

OK, we fire off the workqueue after the RCU callback is invoked, which
should be a good time to invoke poll_state_synchronize_srcu().

You could get rid of this function by using queue_rcu_work() instead
of call_rcu().  Except that you would instead need queue_srcu_work().
Never mind!

> +}
> +
> +static bool __pending_rcu_items_has_pending(struct pending_rcu_items_pcpu *p)
> +{
> +	for (struct pending_rcu_items_seq *objs = p->objs;
> +	     objs < p->objs + ARRAY_SIZE(p->objs); objs++)
> +		if (objs->nr || objs->list)
> +			return true;
> +	return false;

And this is always invoked with the ->lock held, so good synchronization.

> +}
> +
> +static void pending_rcu_items_work(struct work_struct *work)
> +{
> +	struct pending_rcu_items_pcpu *p =
> +		container_of(work, struct pending_rcu_items_pcpu, work);
> +	struct pending_rcu_items *pending = p->parent;
> +again:
> +	spin_lock_irq(&p->lock);
> +	struct pending_rcu_items_seq finished;
> +	while (get_finished_items(pending, p, &finished)) {
> +		spin_unlock_irq(&p->lock);

OK, processing this with interrupts enabled is a great improvement!

Also doing it in workqueue context is a good thing.  I don't yet have
an opinion about how this code handles overload conditions.

> +		process_finished_items(pending, &finished);
> +		goto again;
> +	}
> +
> +	BUG_ON(!p->rcu_armed);
> +	p->rcu_armed = __pending_rcu_items_has_pending(p);
> +	if (p->rcu_armed)
> +		call_srcu(pending->srcu, &p->rcu, pending_rcu_items_rcu_cb);
> +	spin_unlock_irq(&p->lock);
> +}
> +
> +void bch2_pending_rcu_item_enqueue(struct pending_rcu_items *pending, struct rcu_head *obj)

As noted earlier, I am assuming that this is invoked from task context
with preemption enabled.

It might be worth a comment stating that the ->process() function cannot
acquire any mutex held across a call to bch2_pending_rcu_item_enqueue().

> +{
> +	struct pending_rcu_items_pcpu *p = raw_cpu_ptr(pending->p);
> +	bool alloc_failed = false;
> +	unsigned long flags;
> +retry:
> +	spin_lock_irqsave(&p->lock, flags);
> +
> +	struct pending_rcu_items_seq finished;
> +	while (get_finished_items(pending, p, &finished)) {
> +		spin_unlock_irqrestore(&p->lock, flags);
> +		process_finished_items(pending, &finished);
> +		goto retry;

This loop could be avoided by passing obj in to get_finished_items(),
and having that function enqueue obj just after emptying things.  Yes,
an infinite loop requires an improbably repeated sequence of preemptions
and migrations, but why not eliminate the possibility?

You might well like the division of processing, but it is not free here.

> +	}
> +
> +	struct pending_rcu_items_seq *objs;
> +
> +	unsigned long seq = __get_state_synchronize_rcu(pending->srcu);

Invoking __get_state_synchronize_rcu() before the call to get_finished_items()
could save you some latency.

> +	for (objs = p->objs; objs < p->objs + ARRAY_SIZE(p->objs); objs++)
> +		if (objs->nr && objs->seq == seq)

OK, the purpose of same_state_synchronize_srcu() is to make that
equality comparison work no matter what debugging information we might
need to put in the cookie returned from get_state_synchronize_srcu().
Or, for that matter, poll_state_synchronize_srcu().

So please use same_state_synchronize_srcu(objs->seq,seq) here.
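Or, keeping the NULL-means-plain-RCU convention of your other wrappers, an
untested helper along these lines (not in the patch):

static inline bool __same_state_synchronize_rcu(struct srcu_struct *ssp,
						unsigned long a, unsigned long b)
{
	return ssp
		? same_state_synchronize_srcu(a, b)
		: same_state_synchronize_rcu(a, b);
}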

> +			goto add;

And you can do this part before calling get_finished_items().

> +	seq = __start_poll_synchronize_rcu(pending->srcu);
> +	for (objs = p->objs; objs < p->objs + ARRAY_SIZE(p->objs); objs++)
> +		if (!objs->nr) {

And this is why you don't need get_completed_synchronize_srcu().  The fact
that you can easily check for there being no objects means that you can
easily know when it is OK to reuse this element.

The get_completed_synchronize_srcu() might be useful in caching
situations, where it would mark an object that might need to be rexposed
to readers for reuse, but that can otherwise be immediately freed.
Not the situation here, so you don't need it.

> +			objs->seq = seq;
> +			goto add;
> +		}
> +
> +	BUG();

Can this BUG() trigger?  If we get here, we have had interrupts disabled
across the call to get_finished_items(), and we can only have two
unexpired cookies.

But we still have the time-based issue where get_finished_items()
sees both as being unexpired, one of them expires, we call
__start_poll_synchronize_rcu(), and get a third sequence number.
Then this BUG() triggers.

I wasn't kidding.  That call to __start_poll_synchronize_rcu() absolutely
must precede that call to get_finished_items().  ;-)

> +	struct rcu_head **entry;
> +add:
> +	entry = genradix_ptr_alloc(&objs->objs, objs->nr, GFP_ATOMIC|__GFP_NOWARN);
> +	if (likely(entry)) {
> +		*entry = obj;
> +		objs->nr++;
> +	} else if (likely(!alloc_failed)) {
> +		spin_unlock_irqrestore(&p->lock, flags);
> +		alloc_failed = !genradix_ptr_alloc(&objs->objs, objs->nr, GFP_KERNEL);

Just so you know, the corresponding part of kfree_rcu() was the subject
of much churn as the mm system evolved.  But you can only go through this
retry loop twice, so I am OK with it.

But if you really want to keep this structure, you also really want to call
__start_poll_synchronize_rcu() before the retry: label.

If nothing else, you can check the cookie on allocation failure, and if it
has expired, just immediately free the object.
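For example, something like this untested fragment against the code quoted
above (the __poll_state_synchronize_rcu() wrapper is assumed here, analogous
to your other wrappers; it is not in the patch):

	} else if (__poll_state_synchronize_rcu(pending->srcu, seq)) {
		/*
		 * Wrapper assumed here, not in the patch.  The grace period
		 * already expired, so just process the object right away.
		 */
		spin_unlock_irqrestore(&p->lock, flags);
		pending->process(pending, obj);
		return;
	} else {
		obj->next = objs->list;
		objs->list = obj;
	}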

> +		goto retry;
> +	} else {
> +		obj->next = objs->list;
> +		objs->list = obj;

OK, and here is the OOM fallback processing.

> +	}
> +
> +	if (!p->rcu_armed) {
> +		call_srcu(pending->srcu, &p->rcu, pending_rcu_items_rcu_cb);
> +		p->rcu_armed = true;

In the other order, please!

Yes, you have interrupts disabled here, but you are not in an SRCU
read-side critical section, and a hypervisor isn't necessarily going to
know or care about your interrupt-disabled state.  So the way that the
code is written, you really can have the SRCU callback invoked before
setting ->rcu_armed to true, which might not be a good thing.

Especially given that it won't happen very often.  Except on large
fleets.  ;-)
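That is:

	if (!p->rcu_armed) {
		p->rcu_armed = true;
		call_srcu(pending->srcu, &p->rcu, pending_rcu_items_rcu_cb);
	}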

> +	}
> +	spin_unlock_irqrestore(&p->lock, flags);
> +}
> +
> +static struct rcu_head *pending_rcu_item_pcpu_dequeue(struct pending_rcu_items_pcpu *p)
> +{
> +	struct rcu_head *ret = NULL;
> +
> +	spin_lock_irq(&p->lock);
> +	unsigned idx = p->objs[1].seq > p->objs[0].seq;
> +
> +	for (unsigned i = 0; i < 2; i++, idx ^= 1) {
> +		struct pending_rcu_items_seq *objs = p->objs + idx;
> +
> +		if (objs->nr) {
> +			ret = *genradix_ptr(&objs->objs, --objs->nr);
> +			break;
> +		}
> +
> +		if (objs->list) {
> +			ret = objs->list;
> +			objs->list = ret->next;
> +			break;
> +		}
> +	}
> +	spin_unlock_irq(&p->lock);
> +
> +	return ret;
> +}
> +
> +struct rcu_head *bch2_pending_rcu_item_dequeue(struct pending_rcu_items *pending)
> +{
> +	return pending_rcu_item_pcpu_dequeue(raw_cpu_ptr(pending->p));
> +}
> +
> +struct rcu_head *bch2_pending_rcu_item_dequeue_from_all(struct pending_rcu_items *pending)
> +{
> +	struct rcu_head *ret = NULL;
> +	int cpu;
> +	for_each_possible_cpu(cpu) {
> +		ret = pending_rcu_item_pcpu_dequeue(per_cpu_ptr(pending->p, cpu));
> +		if (ret)
> +			break;
> +	}
> +	return ret;
> +}

I assume that the point of these guys is to recycle objects that are
waiting for their RCU grace period, similar to SLAB_TYPESAFE_BY_RCU.
Readers would of course need the same sort of checks as used by readers
traversing memory from SLAB_TYPESAFE_BY_RCU slabs.

If my assumption is correct, handing out the oldest items minimizes the
amount of retrying these readers need to do.
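For reference, the usual reader-side revalidation looks something like the
following untested sketch; every name in it (lookup_object(), ->ref, ->key,
put_object()) is made up for illustration:

	rcu_read_lock();
	obj = lookup_object(key);
	if (obj && !refcount_inc_not_zero(&obj->ref))
		obj = NULL;			/* freed or being recycled */
	else if (obj && obj->key != key) {
		put_object(obj);		/* recycled for another identity */
		obj = NULL;
	}
	rcu_read_unlock();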

> +static bool pending_rcu_items_has_pending(struct pending_rcu_items *pending)
> +{
> +	int cpu;
> +	for_each_possible_cpu(cpu) {
> +		struct pending_rcu_items_pcpu *p = per_cpu_ptr(pending->p, cpu);
> +		spin_lock_irq(&p->lock);
> +		if (__pending_rcu_items_has_pending(p)) {
> +			spin_unlock_irq(&p->lock);
> +			return true;
> +		}
> +		spin_unlock_irq(&p->lock);
> +	}
> +
> +	return false;
> +}
> +
> +void bch2_pending_rcu_items_exit(struct pending_rcu_items *pending)
> +{
> +	if (!pending->p)
> +		return;
> +
> +	while (pending_rcu_items_has_pending(pending))
> +		__rcu_barrier(pending->srcu);

OK, as long as bch2_pending_rcu_item_enqueue() cannot be invoked anymore,
this loop will only do a few iterations.

The rest looks fine.  (Famous last words...)

> +	int cpu;
> +	for_each_possible_cpu(cpu) {
> +		struct pending_rcu_items_pcpu *p = per_cpu_ptr(pending->p, cpu);
> +
> +		flush_work(&p->work);
> +
> +		WARN_ON(p->objs[0].nr);
> +		WARN_ON(p->objs[1].nr);
> +		WARN_ON(p->objs[0].list);
> +		WARN_ON(p->objs[1].list);
> +
> +		genradix_free(&p->objs[0].objs);
> +		genradix_free(&p->objs[1].objs);
> +	}
> +	free_percpu(pending->p);
> +}
> +
> +int bch2_pending_rcu_items_init(struct pending_rcu_items *pending,
> +				struct srcu_struct *srcu,
> +				pending_rcu_item_process_fn process)
> +{
> +	pending->p = alloc_percpu(struct pending_rcu_items_pcpu);
> +	if (!pending->p)
> +		return -ENOMEM;
> +
> +	int cpu;
> +	for_each_possible_cpu(cpu) {
> +		struct pending_rcu_items_pcpu *p = per_cpu_ptr(pending->p, cpu);
> +		p->parent	= pending;
> +		spin_lock_init(&p->lock);
> +		INIT_WORK(&p->work, pending_rcu_items_work);
> +	}
> +
> +	pending->srcu = srcu;
> +	pending->process = process;
> +
> +	return 0;
> +}
> diff --git a/fs/bcachefs/rcu_pending.h b/fs/bcachefs/rcu_pending.h
> new file mode 100644
> index 000000000000..e45ede074443
> --- /dev/null
> +++ b/fs/bcachefs/rcu_pending.h
> @@ -0,0 +1,25 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _BCACHEFS_RCU_PENDING_H
> +#define _BCACHEFS_RCU_PENDING_H
> +
> +struct pending_rcu_items;
> +typedef void (*pending_rcu_item_process_fn)(struct pending_rcu_items *, struct rcu_head *);
> +
> +struct pending_rcu_items_pcpu;
> +
> +struct pending_rcu_items {
> +	struct pending_rcu_items_pcpu __percpu *p;
> +	struct srcu_struct		*srcu;
> +	pending_rcu_item_process_fn	process;
> +};
> +
> +void bch2_pending_rcu_item_enqueue(struct pending_rcu_items *pending, struct rcu_head *obj);
> +struct rcu_head *bch2_pending_rcu_item_dequeue(struct pending_rcu_items *pending);
> +struct rcu_head *bch2_pending_rcu_item_dequeue_from_all(struct pending_rcu_items *pending);
> +
> +void bch2_pending_rcu_items_exit(struct pending_rcu_items *pending);
> +int bch2_pending_rcu_items_init(struct pending_rcu_items *pending,
> +				struct srcu_struct *srcu,
> +				pending_rcu_item_process_fn process);
> +
> +#endif /* _BCACHEFS_RCU_PENDING_H */
Kent Overstreet June 18, 2024, 2:24 a.m. UTC | #2
On Mon, Jun 17, 2024 at 05:15:29PM -0700, Paul E. McKenney wrote:
> If it ever becomes helpful to make this work for any of the RCU Tasks
> flavors, the rcutorture_type enum from kernel/rcu/rcu.h might be of help.
> Until then, NULL works just fine.

Certainly

> > - Tracks pending items in a radix tree; falls back to linked list if
> >   radix tree allocation fails
> 
> I am still having trouble seeing what a radix tree brings to the table
> in this case, but again, might as well dig in.

It's simpler, and (marginally) more efficient than list-of-pages.

My new implementation of kvfree_call_rcu() on top of this data structure
is now looking faster than the existing version - and I'm using smaller
nodes (512 bytes instead of full pages).

> > +static inline unsigned long __start_poll_synchronize_rcu(struct srcu_struct *ssp)
> > +{
> > +	return ssp
> > +		? start_poll_synchronize_srcu(ssp)
> > +		: start_poll_synchronize_rcu();
> > +}
> 
> I don't get why you 
> 
> > +static inline void __rcu_barrier(struct srcu_struct *ssp)
> > +{
> > +	return ssp
> > +		? srcu_barrier(ssp)
> > +		: rcu_barrier();
> > +}
> 
> You will likely need something like these:
> 
> static inline void __get_completed_synchronize_rcu(struct srcu_struct *ssp)
> {
> 	return ssp
> 		? get_completed_synchronize_srcu(ssp)
> 		: get_completed_synchronize_rcu();
> }
> 
> static inline void __same_state_synchronize_rcu(struct srcu_struct *ssp)
> {
> 	return ssp
> 		? same_state_synchronize_srcu(ssp)
> 		: same_state_synchronize_rcu();
> }
> 
> The point of these is to get around the earlier issue where three cookies
> appeared to be valid due to time delays.  But perhaps you found some
> other way around that.

Yes, that comes from a race between poll_state_synchronize() and
start_poll_synchronize(), easily handled with a loop.

To be honest, I'm still not getting your intended use of the other
helpers; just comparing sequence numbers with
ULONG_CMP_GE/LT seems simplest to my brain.
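Those being the wraparound-safe comparisons from include/linux/rcupdate.h,
roughly:

#define ULONG_CMP_GE(a, b)	(ULONG_MAX / 2 >= (a) - (b))
#define ULONG_CMP_LT(a, b)	(ULONG_MAX / 2 < (a) - (b))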

> OK, this does appear like each group of objs gets its own return value from
> either __start_poll_synchronize_rcu() or __get_state_synchronize_rcu().  If
> so, much improved!

To be honest I still prefer the old way.

This scheme requires multiple loops over the array, which leads to the
kvfree_call_rcu() based on this being quite branchy; the generated code
could be a fair bit slimmer with the old scheme.

> The purpose of this rcu_head is as before, to flag good times to invoke
> poll_state_synchronize_rcu()?

Correct

> I need to check what the caller does with it, of course.  And one caller
> appears to invoke it from a workqueue handler, the other from unknown
> context.  But you said earlier that all were from process context,
> so that should be fine.  Having the submitters do processing does have
> nice overload-handling characteristics, though it might be more prone
> to deadlock.  (As in you cannot acquire a mutex in the callback that is
> held across a call to the bch2_pending_rcu_item_enqueue() function.)

That's true for my use in bcachefs, but thanks for pointing that out,
that does look like a real problem for using this for a
kvfree_call_rcu() replacement.

kvfree_call_rcu() can't necessarily process completions in the caller,
since it can be called from interrupt context; that means unbounded
grace periods with uncompleted callbacks build up unbounded work.

And, with these data structures that would be really painful.

Which leads me to wonder, how conceivable would it be to put a limit on
the number of grace periods with uncompleted callbacks? Use of that
interface would have to be limited to trusted code (because it could
stall the system!), but I wonder if it might have positive effects on
overload behaviour.

Hmm.

> What provides synchronization for this function?  It looks like it can
> be invoked from multiple contexts.  Ah, the pending_rcu_items_pcpu
> structure's ->lock, which is dropped after invoking this but before
> processing the objects.

Yes, and in my latest version that's stricter about avoiding preemption,
the lock is only necessary for dequeue_from_all() - all other options
are careful to only work on the CPU local version now.

> If both elements are ready to process, it looks like you only process
> the first one that you come to.  Why not look at both and return both
> if both are ready?
> 
> A linked list of pages of pointers would make this possible by simply
> concatenating the lists, right?
> 
> Or am I missing something that forces the second element of the array
> to be checked?

The elements of the array aren't ordered now (per your scheme). But
perhaps they still could be.

> I might be missing something, but it looks like a linked list of pages of
> pointers would do better here, especially if there are a lot of callbacks.
> So what am I missing?

I'll be improving the genradix iterator code to avoid the radix tree
walk except when crossing nodes.

I _much_ prefer using a standard data structure instead of hand rolling,
and this code is both cleaner and more compact - and faster.

> Also, I am not seeing a ->pending() field in the pending_rcu_items_seq
> structure.  Again, what am I missing?

Not necessary, since it's pending iff it has items - except you indicate
a callback, so perhaps you're thinking of something else?

> OK, processing this with interrupts enabled is a great improvement!
> 
> Also doing it in workqueue context is a good thing.  I don't yet have
> an opinion about how this code handles overload conditions.

> > +void bch2_pending_rcu_item_enqueue(struct pending_rcu_items *pending, struct rcu_head *obj)
> 
> As noted earlier, I am assuming that this is invoked from task context
> with preemption enabled.

In my usage, yes. I would prefer to not require that.

It's a bit unfortunate that we still don't have a way of checking
(outside of debug code) "are we in sleepable process context?" - that'll
have to be passed in.

> It might be worth a comment stating that the ->process() function cannot
> acquire any mutex held across a call to bch2_pending_rcu_item_enqueue().

Yes, and a note about it being called from enqueue()

> > +	struct pending_rcu_items_seq finished;
> > +	while (get_finished_items(pending, p, &finished)) {
> > +		spin_unlock_irqrestore(&p->lock, flags);
> > +		process_finished_items(pending, &finished);
> > +		goto retry;
> 
> This loop could be avoided by passing obj in to get_finished_items(),
> and having that function enqueue obj just after emptying things.  Yes,
> an infinite loop requires an improbably repeated sequence of preemptions
> and migrations, but why not eliminate the possibility?

I've reworked this code somewhat to inline fastpaths and move slowpath
to separate functions, have a look at the latest when I post it.

> > +	struct pending_rcu_items_seq *objs;
> > +
> > +	unsigned long seq = __get_state_synchronize_rcu(pending->srcu);
> 
> Invoking __get_state_synchronize_rcu() before the call to get_finished_items()
> could save you some latency.

??

> > +	for (objs = p->objs; objs < p->objs + ARRAY_SIZE(p->objs); objs++)
> > +		if (objs->nr && objs->seq == seq)
> 
> OK, the purpose of same_state_synchronize_srcu() is to make that
> equality comparison work no matter what debugging information we might
> need to put in the cookie returned from get_state_synchronize_srcu().
> Or, for that matter, poll_state_synchronize_srcu().
> 
> So please use same_state_synchronize_srcu(objs->seq,seq) here.

Done

> I wasn't kidding.  That call to __start_poll_synchronize_rcu() absolutely
> must precede that call to get_finished_items().  ;-)

Except start_poll_synchronize_rcu() is higher overhead than
get_state_synchronize_rcu() - the idea is to avoid calling it when we
know it's not necessary because we already have items for that sequence
number.

But I think there must be a better way to do this, and we shouldn't be
calling start_poll() multiple times when we're looping, yeah.

> > +	struct rcu_head **entry;
> > +add:
> > +	entry = genradix_ptr_alloc(&objs->objs, objs->nr, GFP_ATOMIC|__GFP_NOWARN);
> > +	if (likely(entry)) {
> > +		*entry = obj;
> > +		objs->nr++;
> > +	} else if (likely(!alloc_failed)) {
> > +		spin_unlock_irqrestore(&p->lock, flags);
> > +		alloc_failed = !genradix_ptr_alloc(&objs->objs, objs->nr, GFP_KERNEL);
> 
> Just so you know, the corresponding part of kfree_rcu() was the subject
> of much churn as the mm system evolved.  But you can only go through this
> retry loop twice, so I am OK with it.
> 
> But if you really want to keep this structure, you also really want to call
> __start_poll_synchronize_rcu() before the retry: label.
> 
> If nothing else, you can check the cookie on allocation failure, and if it
> has expired, just immediately free the object.

Yeah, that's a good thought.

> 
> > +	if (!p->rcu_armed) {
> > +		call_srcu(pending->srcu, &p->rcu, pending_rcu_items_rcu_cb);
> > +		p->rcu_armed = true;
> 
> In the other order, please!
> 
> Yes, you have interrupts disabled here, but you are not in an SRCU
> read-side critical section, and a hypervisor isn't necessarily going to
> know or care about your interrupt-disabled state.  So the way that the
> code is written, you really can have the SRCU callback invoked before
> setting ->rcu_armed to true, which might not be a good thing.
> 
> Especially given that it won't happen very often.  Except on large
> fleets.  ;-)

Noted :)

> > +struct rcu_head *bch2_pending_rcu_item_dequeue_from_all(struct pending_rcu_items *pending)
> > +{
> > +	struct rcu_head *ret = NULL;
> > +	int cpu;
> > +	for_each_possible_cpu(cpu) {
> > +		ret = pending_rcu_item_pcpu_dequeue(per_cpu_ptr(pending->p, cpu));
> > +		if (ret)
> > +			break;
> > +	}
> > +	return ret;
> > +}
> 
> I assume that the point of these guys is to recycle objects that are
> waiting for their RCU grace period, similar to SLAB_TYPESAFE_BY_RCU.
> Readers would of course need the same sort of checks as used by readers
> traversing memory from SLAB_TYPESAFE_BY_RCU slabs.
> 
> If my assumption is correct, handing out the oldest items minimizes the
> amount of retrying these readers need to do.

I hand out the newest to minimize stranding - if we're trying to return
memory to the rest of the system, we don't want to reuse the objects
we're about to be able to free.
Paul E. McKenney June 18, 2024, 3:57 a.m. UTC | #3
On Mon, Jun 17, 2024 at 10:24:54PM -0400, Kent Overstreet wrote:
> On Mon, Jun 17, 2024 at 05:15:29PM -0700, Paul E. McKenney wrote:
> > If it ever becomes helpful to make this work for any of the RCU Tasks
> > flavors, the rcutorture_type enum from kernel/rcu/rcu.h might be of help.
> > Until then, NULL works just fine.
> 
> Certainly
> 
> > > - Tracks pending items in a radix tree; falls back to linked list if
> > >   radix tree allocation fails
> > 
> > I am still having trouble seeing what a radix tree brings to the table
> > in this case, but again, might as well dig in.
> 
> It's simpler, and (marginally) more efficient than list-of-pages.
> 
> My new implementation of kvfree_call_rcu() on top of this data structure
> is now looking faster than the existing version - and I'm using smaller
> nodes (512 bytes instead of full pages).

OK.  We will need to look carefully at the performance comparisons.
Lots of places for overhead to hide in both schemes.  ;-)

> > > +static inline unsigned long __start_poll_synchronize_rcu(struct srcu_struct *ssp)
> > > +{
> > > +	return ssp
> > > +		? start_poll_synchronize_srcu(ssp)
> > > +		: start_poll_synchronize_rcu();
> > > +}
> > 
> > I don't get why you 
> > 
> > > +static inline void __rcu_barrier(struct srcu_struct *ssp)
> > > +{
> > > +	return ssp
> > > +		? srcu_barrier(ssp)
> > > +		: rcu_barrier();
> > > +}
> > 
> > You will likely need something like these:
> > 
> > static inline void __get_completed_synchronize_rcu(struct srcu_struct *ssp)
> > {
> > 	return ssp
> > 		? get_completed_synchronize_srcu(ssp)
> > 		: get_completed_synchronize_rcu();
> > }
> > 
> > static inline void __same_state_synchronize_rcu(struct srcu_struct *ssp)
> > {
> > 	return ssp
> > 		? same_state_synchronize_srcu(ssp)
> > 		: same_state_synchronize_rcu();
> > }
> > 
> > The point of these is to get around the earlier issue where three cookies
> > appeared to be valid due to time delays.  But perhaps you found some
> > other way around that.
> 
> Yes, that comes from a race between poll_state_synchronize() and
> start_poll_synchronize(), easily handled with a loop.

Plus call_srcu() can also get its oar in.  But more later.

> To be honest, I'm still not getting your intended use of the other
> helpers; just comparing sequence numbers with
> ULONG_CMP_GE/LT seems simplest to my brain.

From your reply later on, I believe that we are OK.  I will see with
your next version.  ;-)

> > OK, this does appear like each group of objs gets its own return value from
> > either __start_poll_synchronize_rcu() or __get_state_synchronize_rcu().  If
> > so, much improved!
> 
> To be honest I still prefer the old way.
> 
> This scheme requires multiple loops over the array, which leads to the
> kvfree_call_rcu() based on this being quite branchy; the generated code
> could be a fair bit slimmer with the old scheme.

The time that this matters is when someone is invoking this in a tight
loop, in which case the branch predictor will do quite well, given that
grace periods take some time to complete.

> > The purpose of this rcu_head is as before, to flag good times to invoke
> > poll_state_synchronize_rcu()?
> 
> Correct
> 
> > I need to check what the caller does with it, of course.  And one caller
> > appears to invoke it from a workqueue handler, the other from unknown
> > context.  But you said earlier that all were from process context,
> > so that should be fine.  Having the submitters do processing does have
> > nice overload-handling characteristics, though it might be more prone
> > to deadlock.  (As in you cannot acquire a mutex in the callback that is
> > held across a call to the bch2_pending_rcu_item_enqueue() function.)
> 
> That's true for my use in bcachefs, but thanks for pointing that out,
> that does look like a real problem for using this for a
> kvfree_call_rcu() replacement.
> 
> kvfree_call_rcu() can't necessarily process completions in the caller,
> since it can be called from interrupt context; that means unbounded
> grace periods with uncompleted callbacks build up unbounded work.
> 
> And, with these data structures that would be really painful.

Agreed, if you can keep your object queueing in preempt-enabled task
context, life is much easier.  But yes, kfree_rcu() doesn't get that
luxury, nor does call_rcu().

> Which leads me to wonder, how conceivable would it be to put a limit on
> the number of grace periods with uncompleted callbacks? Use of that
> interface would have to be limited to trusted code (because it could
> stall the system!), but I wonder if it might have positive effects on
> overload behaviour.
> 
> Hmm.

For a sufficiently constrained system (and your bcachefs use case might
qualify for all I know), this might work very well.  The problem is that
"trusted" would need to include "not running on a hypervisor" and "not
running with firmware having longer-than-average SMIs".  :-(

> > What provides synchronization for this function?  It looks like it can
> > be invoked from multiple contexts.  Ah, the pending_rcu_items_pcpu
> > structure's ->lock, which is dropped after invoking this but before
> > processing the objects.
> 
> Yes, and in my latest version that's stricter about avoiding preemption,
> the lock is only necessary for dequeue_from_all() - all other options
> are careful to only work on the CPU local version now.

Using migrate_disable() or similar?

> > If both elements are ready to process, it looks like you only process
> > the first one that you come to.  Why not look at both and return both
> > if both are ready?
> > 
> > A linked list of pages of pointers would make this possible by simply
> > concatenating the lists, right?
> > 
> > Or am I missing something that forces the second element of the array
> > to be checked?
> 
> The elements of the array aren't ordered now (per your scheme). But
> perhaps they still could be.

Well, they are, not in terms of the ->seq value, but in terms of the
index.

Suppose that there has been a huge pile of objects queued up over the past
while, so that both ->objs[] elements are quite full.  Suppose that,
just by chance, the ->obj[0] ->seq is the first to have its grace
period complete.  Suppose further that just at that point in time, the
object-queuing rate suddenly decreases.  So much so, that at least one
full grace period elapses between queuing of individual objects.

Ah, the loop in pending_rcu_items_work() should take care of things
here, maybe.

> > I might be missing something, but it looks like a linked list of pages of
> > pointers would do better here, especially if there are a lot of callbacks.
> > So what am I missing?
> 
> I'll be improving the genradix iterator code to avoid the radix tree
> walk except when crossing nodes.
> 
> I _much_ prefer using a standard data structure instead of hand rolling,
> and this code is both cleaner and more compact - and faster.

Perhaps there are things we could do to make the pages of pointers better.
Or maybe we should adopt radix tree, though it still seems like a very
strange fit.  Some careful performance analysis will need to drive this
choice.  Avoiding ever forgetting the "status quo" option.

> > Also, I am not seeing a ->pending() field in the pending_rcu_items_seq
> > structure.  Again, what am I missing?
> 
> Not necessary, since it's pending iff it has items - except you indicate
> a callback, so perhas you're thinking of something else?

My typo, ->process(), not ->pending().  Apologies for my confusion!

> > OK, processing this with interrupts enabled is a great improvement!
> > 
> > Also doing it in workqueue context is a good thing.  I don't yet have
> > an opinion about how this code handles overload conditions.
> 
> > > +void bch2_pending_rcu_item_enqueue(struct pending_rcu_items *pending, struct rcu_head *obj)
> > 
> > As noted earlier, I am assuming that this is invoked from task context
> > with preemption enabled.
> 
> In my usage, yes. I would prefer to not require that.
> 
> It's a bit unfortunate that we still don't have a way of checking
> (outside of debug code) "are we in sleepable process context?" - that'll
> have to be passed in.

That might arrive with lazy preemption.  It would untangle quite a few
knots in many parts of RCU, that is for sure.

> > It might be worth a comment stating that the ->process() function cannot
> > acquire any mutex held across a call to bch2_pending_rcu_item_enqueue().
> 
> Yes, and a note about it being called from enqueue()

Very good!

> > > +	struct pending_rcu_items_seq finished;
> > > +	while (get_finished_items(pending, p, &finished)) {
> > > +		spin_unlock_irqrestore(&p->lock, flags);
> > > +		process_finished_items(pending, &finished);
> > > +		goto retry;
> > 
> > This loop could be avoided by passing obj in to get_finished_items(),
> > and having that function enqueue obj just after emptying things.  Yes,
> > an infinite loop requires an improbably repeated sequence of preemptions
> > and migrations, but why not eliminate the possibility?
> 
> I've reworked this code somewhat to inline fastpaths and move slowpath
> to separate functions, have a look at the latest when I post it.

Looking forward to it.

> > > +	struct pending_rcu_items_seq *objs;
> > > +
> > > +	unsigned long seq = __get_state_synchronize_rcu(pending->srcu);
> > 
> > Invoking __get_state_synchronize_rcu() before the call to get_finished_items()
> > could save you some latency.
> 
> ??

It is always the same object, and it has the same history being
reader-visible.  So get the cookie as early as possible and keep that
cookie.  Just because you took time out to process some objects does
not mean you need a newer cookie, after all.

My advice is as follows:

1.	Pick up the cookie, whether from get_synchronize_srcu() or
	poll_state_synchronize_rcu().

2.	If that cookie matches one of the ->objs[]->seq, add it to
	that ->objs[].  Null the pointer or whatever to record that
	it is already taken care of.  Or just take an early exit.

	If under heavy load, this will be the common-case code path.

3.	Only then try processing the ->objs[] elements.

4.	Yes, then you need to queue the thing, but you can make
	#2 above be an inline function to avoid duplicating code.
	And you can check to see if the cookie already expired,
	in which case you can just process it immediately.

	You only need to retry in case of memory-allocation failure.

Does that make sense?
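In other words, something like this untested sketch, reusing the names from
your patch (pick_bucket() is made up, the step-1 cookie comes from
__get_state_synchronize_rcu() beforehand, and the caller holds p->lock):

static struct pending_rcu_items_seq *
pick_bucket(struct pending_rcu_items_pcpu *p, unsigned long seq)
{
	struct pending_rcu_items_seq *objs;

	/* Step 2: under load, the new cookie usually matches a live bucket. */
	for (objs = p->objs; objs < p->objs + ARRAY_SIZE(p->objs); objs++)
		if (objs->nr && same_state_synchronize_srcu(objs->seq, seq))
			return objs;

	/* Step 3 goes here: hand any finished buckets off for processing. */

	/* Step 4: claim an empty bucket; retry only on allocation failure. */
	for (objs = p->objs; objs < p->objs + ARRAY_SIZE(p->objs); objs++)
		if (!objs->nr) {
			objs->seq = seq;
			return objs;
		}

	return NULL;
}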

> > > +	for (objs = p->objs; objs < p->objs + ARRAY_SIZE(p->objs); objs++)
> > > +		if (objs->nr && objs->seq == seq)
> > 
> > OK, the purpose of same_state_synchronize_srcu() is to make that
> > equality comparison work no matter what debugging information we might
> > need to put in the cookie returned from get_state_synchronize_srcu().
> > Or, for that matter, poll_state_synchronize_srcu().
> > 
> > So please use same_state_synchronize_srcu(objs->seq,seq) here.
> 
> Done

Thank you!

> > I wasn't kidding.  That call to __start_poll_synchronize_rcu() absolutely
> > must precede that call to get_finished_items().  ;-)
> 
> Except start_poll_synchronize_rcu() is higher overhead than
> get_state_synchronize_rcu() - the idea is to avoid calling it when we
> know it's not necessary because we already have items for that sequence
> number.

You can probably instead call get_state_synchronize_srcu() to begin
with, and let the later call_srcu() start the grace period if needed.
The only exception I can imagine to this right off-hand is under heavy
load, where you might need to start the grace period sooner rather than
later.  But that is easy -- if you have too many objects piled up, call
poll_state_synchronize_srcu() instead of get_state_synchronize_srcu().

And there are ways of avoiding even that get_state_synchronize_srcu()
in the common case, but at the cost of additional complexity.  So later,
when and if.
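For example (untested, with the threshold purely for illustration, and using
your __start_poll wrapper to actually kick the grace period off in the
heavily-loaded case):

	/* Unlocked heuristic: how much is already queued on this CPU? */
	size_t nr_pending = p->objs[0].nr + p->objs[1].nr;

	unsigned long seq = nr_pending < 128
		? __get_state_synchronize_rcu(pending->srcu)
		: __start_poll_synchronize_rcu(pending->srcu);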

> But I think there must be a better way to do this, and we shouldn't be
> calling start_poll() multiple times when we're looping, yeah.

Whew!!!  ;-)

> > > +	struct rcu_head **entry;
> > > +add:
> > > +	entry = genradix_ptr_alloc(&objs->objs, objs->nr, GFP_ATOMIC|__GFP_NOWARN);
> > > +	if (likely(entry)) {
> > > +		*entry = obj;
> > > +		objs->nr++;
> > > +	} else if (likely(!alloc_failed)) {
> > > +		spin_unlock_irqrestore(&p->lock, flags);
> > > +		alloc_failed = !genradix_ptr_alloc(&objs->objs, objs->nr, GFP_KERNEL);
> > 
> > Just so you know, the corresponding part of kfree_rcu() was the subject
> > of much churn as the mm system evolved.  But you can only go through this
> > retry loop twice, so I am OK with it.
> > 
> > But if you really want to keep this structure, you also really want to call
> > __start_poll_synchronize_rcu() before the retry: label.
> > 
> > If nothing else, you can check the cookie on allocation failure, and if it
> > has expired, just immediately free the object.
> 
> Yeah, that's a good thought.
> 
> > > +	if (!p->rcu_armed) {
> > > +		call_srcu(pending->srcu, &p->rcu, pending_rcu_items_rcu_cb);
> > > +		p->rcu_armed = true;
> > 
> > In the other order, please!
> > 
> > Yes, you have interrupts disabled here, but you are not in an SRCU
> > read-side critical section, and a hypervisor isn't necessarily going to
> > know or care about your interrupt-disabled state.  So the way that the
> > code is written, you really can have the SRCU callback invoked before
> > setting ->rcu_armed to true, which might not be a good thing.
> > 
> > Especially given that it won't happen very often.  Except on large
> > fleets.  ;-)
> 
> Noted :)
> 
> > > +struct rcu_head *bch2_pending_rcu_item_dequeue_from_all(struct pending_rcu_items *pending)
> > > +{
> > > +	struct rcu_head *ret = NULL;
> > > +	int cpu;
> > > +	for_each_possible_cpu(cpu) {
> > > +		ret = pending_rcu_item_pcpu_dequeue(per_cpu_ptr(pending->p, cpu));
> > > +		if (ret)
> > > +			break;
> > > +	}
> > > +	return ret;
> > > +}
> > 
> > I assume that the point of these guys is to recycle objects that are
> > waiting for their RCU grace period, similar to SLAB_TYPESAFE_BY_RCU.
> > Readers would of course need the same sort of checks as used by readers
> > traversing memory from SLAB_TYPESAFE_BY_RCU slabs.
> > 
> > If my assumption is correct, handing out the oldest items minimizes the
> > amount of retrying these readers need to do.
> 
> I hand out the newest to minimize stranding - if we're trying to return
> memory to the rest of the system, we don't want to reuse the objects
> we're about to be able to free.

Fair enough, and I agree that it is not a trivial tradeoff.

							Thanx, Paul
Kent Overstreet June 18, 2024, 1:45 p.m. UTC | #4
On Mon, Jun 17, 2024 at 08:57:37PM -0700, Paul E. McKenney wrote:
> On Mon, Jun 17, 2024 at 10:24:54PM -0400, Kent Overstreet wrote:
> > On Mon, Jun 17, 2024 at 05:15:29PM -0700, Paul E. McKenney wrote:
> > > If it ever becomes helpful to make this work for any of the RCU Tasks
> > > flavors, the rcutorture_type enum from kernel/rcu/rcu.h might be of help.
> > > Until then, NULL works just fine.
> > 
> > Certainly
> > 
> > > > - Tracks pending items in a radix tree; falls back to linked list if
> > > >   radix tree allocation fails
> > > 
> > > I am still having trouble seeing what a radix tree brings to the table
> > > in this case, but again, might as well dig in.
> > 
> > It's simpler, and (marginally) more efficient than list-of-pages.
> > 
> > My new implementation of kvfree_call_rcu() on top of this data structure
> > is now looking faster than the existing version - and I'm using smaller
> > nodes (512 bytes instead of full pages).
> 
> OK.  We will need to look carefully at the performance comparisons.
> Lots of places for overhead to hide in both schemes.  ;-)

If you want to take a look at where I'm going now or possibly benchmark
it yourself - https://evilpiepirate.org/git/bcachefs.git/log/?h=rcu_pending

messy, wip: I still have yet to document anything, and I've ripped out
the shrinker (along with the entire prior kvfree_call_rcu()
implementation), and haven't redone that yet - and this version does
have the issue that we're processing pending frees in kvfree_call_rcu()
context.

But it does work and pass tests...

> > To be honest, I'm still not getting your intended use of the other
> > helpers; just comparing sequence numbers with
> > ULONG_CMP_GE/LT seems simplest to my brain.
> 
> From your reply later on, I believe that we are OK.  I will see with
> your next version.  ;-)

I only just last night realized that there's really just a single
sequence number internally; exposing that simplifies things
_considerably_, many weird corner-case races go away when we're only
grabbing the current RCU sequence number once.

> The time that this matters is when someone is invoking this in a tight
> loop, in which case the branch predictor will do quite well, given that
> grace periods take some time to complete.

Code size matters - and more than that, we're moving towards RCU-ifying
a bigger and bigger fraction of total allocations in the system (e.g.
recent talk about rcu-freeing all page cache pages!).

So we really should be aiming to have this path be just as optimized as
the main slub paths.

Also, code size: rcu_pending.o, stripped, is just 5kb. I've got the main
__rcu_pending_enqueue() fastpath down to ~600 bytes.

> For a sufficiently constrained system (and your bcachefs use case might
> qualify for all I know), this might work very well.  The problem is that
> "trusted" would need to include "not running on a hypervisor" and "not
> running with firmware having longer-than-average SMIs".  :-(

But in those cases RCU is also going to be blocked from ending existing
grace periods, and since we have a hard limit on the number of
outstanding grace periods that means we can't start new ones either.

This feels like something we can solve with some finesse, not a real
blocker.

Now - where to shove the processing is a real question. Ideally process
context, but can't (in general) be in the calling context.

Actually, process context feels wrong for this (at least kvfree_rcu()
completions), given that the completions need to run with essentially
realtime priority - that's really a job for softirq context, so maybe we
just want to ditch the workqueues.

But running completions in call_rcu() context (not existing call_rcu(),
since we need to annotate paths where this is ok, but the moral
equivalent) does seem to be a lovely tool for avoiding use of softirq
context while ensuring they still get done :)

> > Yes, and in my latest version that's stricter about avoiding preemption,
> > the lock is only necessary for dequeue_from_all() - all other options
> > are careful to only work on the CPU local version now.
> 
> Using migrate_disable() or similar?

Nope, just local_irq_save(). Gotta disable preemption anyways :)

> Suppose that there has been a huge pile of objects queued up over the past
> while, so that both ->objs[] elements are quite full.  Suppose that,
> just by chance, the ->obj[0] ->seq is the first to have its grace
> period complete.  Suppose further that just at that point in time, the
> object-queuing rate suddenly decreases.  So much so, that at least one
> full grace period elapses between queuing of individual objects.

Not sure what you mean by full? This seems like a rather vague thought
experiment :)

Anyways, I've now got things ordered so objs[0] is always newer, objs[1]
(if nonempty) completes first, and we can pick the one we need with
simple array indexing.

Very nice improvement on code size (balances out all the inlining I've
added).

> Perhaps there are things we could do to make the pages of pointers better.
> Or maybe we should adopt radix tree, though it still seems like a very
> strange fit.  Some careful performance analysis will need to drive this
> choice.  Avoiding ever forgetting the "status quo" option.

To my eyes, the kernel tendency to always reach for linked lists first
seems strange :)

I just don't use linked lists that much in my own code.

> > > Also, I am not seeing a ->pending() field in the pending_rcu_items_seq
> > > structure.  Again, what am I missing?
> > 
> > Not necessary, since it's pending iff it has items - except you indicate
> > a callback, so perhaps you're thinking of something else?
> 
> My typo, ->process(), not ->pending().  Apologies for my confusion!

It's in the parent, non-percpu struct :)

> > It's a bit unfortunate that we still don't have a way of checking
> > (outside of debug code) "are we in sleepable process context?" - that'll
> > have to be passed in.
> 
> That might arrive with lazy preemption.  It would untangle quite a few
> knots in many parts of RCU, that is for sure.

yeah, trouble is it would make spinlocks/local_irq_save() a good deal
more expensive :/

> My advice is as follows:
> 
> 1.	Pick up the cookie, whether from get_synchronize_srcu() or
> 	poll_state_synchronize_rcu().

Ok, so it sounds like we're thinking along the same lines then. That's
the approach I came up with last night as well.

> 
> 2.	If that cookie matches one of the ->objs[]->seq, add it to
> 	that ->objs[].  Null the pointer or whatever to record that
> 	it is already taken care of.  Or just take an early exit.
> 
> 	If under heavy load, this will be the common-case code path.
> 
> 3.	Only then try processing the ->objs[] elements.
> 
> 4.	Yes, then you need to queue the thing, but you can make
> 	#2 above be an inline function to avoid duplicating code.
> 	And you can check to see if the cookie already expired,
> 	in which case you can just process it immediately.
> 
> 	You only need to retry in case of memory-allocation failure.
> 
> Does that make sense?

That's roughly what the code is doing now (as of last night), with the
exception that it's still always checking for elements to process first.

Will have to make that configurable for kvfree_call_rcu()/call_rcu().

> > > > +	for (objs = p->objs; objs < p->objs + ARRAY_SIZE(p->objs); objs++)
> > > > +		if (objs->nr && objs->seq == seq)
> > > 
> > > OK, the purpose of same_state_synchronize_srcu() is to make that
> > > equality comparison work no matter what debugging information we might
> > > need to put in the cookie returned from get_state_synchronize_srcu().
> > > Or, for that matter, poll_state_synchronize_srcu().
> > > 
> > > So please use same_state_synchronize_srcu(objs->seq,seq) here.
> > 
> > Done
> 
> Thank you!

heh, and then deleted, because I switched to grabbing the raw counter
(and doing the masking myself).

But I'm now masking off the debug bits first thing, so - tell me what
you think :)

> You can probably instead call get_state_synchronize_srcu() to begin
> with, and let the later call_srcu() start the grace period if needed.
> The only exception I can imagine to this right off-hand is under heavy
> load, where you might need to start the grace period sooner rather than
> later.  But that is easy -- if you have too many objects piled up, call
> poll_state_synchronize_srcu() instead of get_state_synchronize_srcu().

I think there's another issue, where we may have items for two different
grace periods when we need to arm the callback - and then we want it to
fire when the _older_ grace period completes.

Do you have a call_rcu() for a specific grace period seq? :)
Paul E. McKenney June 20, 2024, 7:58 p.m. UTC | #5
On Tue, Jun 18, 2024 at 09:45:33AM -0400, Kent Overstreet wrote:
> On Mon, Jun 17, 2024 at 08:57:37PM -0700, Paul E. McKenney wrote:
> > On Mon, Jun 17, 2024 at 10:24:54PM -0400, Kent Overstreet wrote:
> > > On Mon, Jun 17, 2024 at 05:15:29PM -0700, Paul E. McKenney wrote:
> > > > If it ever becomes helpful to make this work for any of the RCU Tasks
> > > > flavors, the rcutorture_type enum from kernel/rcu/rcu.h might be of help.
> > > > Until then, NULL works just fine.
> > > 
> > > Certainly
> > > 
> > > > > - Tracks pending items in a radix tree; falls back to linked list if
> > > > >   radix tree allocation fails
> > > > 
> > > > I am still having trouble seeing what a radix tree brings to the table
> > > > in this case, but again, might as well dig in.
> > > 
> > > It's simpler, and (marginally) more efficient than list-of-pages.
> > > 
> > > My new implementation of kvfree_call_rcu() on top of this data structure
> > > is now looking faster than the existing version - and I'm using smaller
> > > nodes (512 bytes instead of full pages).
> > 
> > OK.  We will need to look carefully at the performance comparisons.
> > Lots of places for overhead to hide in both schemes.  ;-)
> 
> If you want to take a look at where I'm going now or possibly benchmark
> it yourself - https://evilpiepirate.org/git/bcachefs.git/log/?h=rcu_pending

I do see the swap of the __call_rcu() and the p->rcu_armed assignment,
so that is good.

I am unlikely to get to benchmarking during the next week or two.

I rebased the SRCU polled-API commits on top of v6.10-rc1 and am looking
to send them into the v6.11 merge window:

	395e73bd8d35f srcu: Add NUM_ACTIVE_SRCU_POLL_OLDSTATE
	d7b0615cb8d24 srcu: Update cleanup_srcu_struct() comment
	e206f33e2c077 srcu: Fill out polled grace-period APIs

Or if you would prefer the email versions:

https://lore.kernel.org/all/8ec14282-a768-49eb-b870-c2ee51e91f8b@paulmck-laptop/

This provides the NUM_ACTIVE_SRCU_POLL_OLDSTATE number and the
same_state_synchronize_srcu() function.  Currently, your code is assuming
that NUM_ACTIVE_RCU_POLL_OLDSTATE and NUM_ACTIVE_SRCU_POLL_OLDSTATE are
the same, and as noted earlier, there could be changes to SRCU expedited
grace periods that would change that current lucky coincidence.
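One way to keep that from biting later (untested, and the macro name is made
up):

#define RCU_PENDING_NR_SEQS						\
	(NUM_ACTIVE_RCU_POLL_OLDSTATE > NUM_ACTIVE_SRCU_POLL_OLDSTATE	\
	 ? NUM_ACTIVE_RCU_POLL_OLDSTATE : NUM_ACTIVE_SRCU_POLL_OLDSTATE)

	struct pending_rcu_items_seq	objs[RCU_PENDING_NR_SEQS];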

> messy, wip: I still have yet to document anything, and I've ripped out
> the shrinker (along with the entire prior kvfree_call_rcu()
> implementation), and haven't redone that yet - and this version does
> have the issue that we're processing pending frees in kvfree_call_rcu()
> context.

One thing is that although processing pending frees in caller context
does have the locking restriction, it also has the benefit of pushing
overhead from the deferred-free context back to the caller context.
This has system-stability benefits by reducing the ability of the
calling context to overload the context doing the actual freeing.

Tradeoffs, tradeoffs...

> But it does work and pass tests...

Very good!

> > > To be honest, I'm still not getting your intended use of the other
> > > helpers; just comparing sequence numbers with
> > > ULONG_CMP_GE/LT seems simplest to my brain.
> > 
> > From your reply later on, I believe that we are OK.  I will see with
> > your next version.  ;-)
> 
> I only just last night realized that there's really just a single
> sequence number internally; exposing that simplifies things
> _considerably_, many weird corner-case races go away when we're only
> grabbing the current RCU sequence number once.

OK, I will bite...  Why can't you get that same effect by calling
get_state_synchronize_srcu() at that same place?

And that sequence number is not guaranteed to continue acting like a
simple sequence number, again should expedited SRCU grace periods need
optimizing.

> > The time that this matters is when someone is invoking this in a tight
> > loop, in which case the branch predictor will do quite well, given that
> > grace periods take some time to complete.
> 
> Code size matters - and more than that, we're moving towards RCU-ifying
> a bigger and bigger fraction of total allocations in the system (e.g.
> recent talk about rcu-freeing all page cache pages!).
> 
> So we really should be aiming to have this path be just as optimized as
> the main slub paths.

???

The main slub paths are used on fastpaths featuring frees and allocations
in quick sequence.  In contrast, this should not be a fastpath; instead,
your fastpath is the code paths doing srcu_read_lock(), correct?

> Also, code size: rcu_pending.o, stripped, is just 5kb. I've got the main
> __rcu_pending_enqueue() fastpath down to ~600 bytes.

Very nice, but this is not a fastpath.  The fast paths are the read-side
primitives, srcu_read_lock() and friends.

> > For a sufficiently constrained system (and your bcachefs use case might
> > qualify for all I know), this might work very well.  The problem is that
> > "trusted" would need to include "not running on a hypervisor" and "not
> > running with firmware having longer-than-average SMIs."  :-(
> 
> But in those cases RCU is also going to be blocked from ending existing
> grace periods, and since we have a hard limit on the number of
> outstanding grace periods that means we can't start new ones either.

For RCU, mostly agreed.  For SRCU, vCPU preemption usually does not
block grace periods.

> This feels like something we can solve with some finesse, not a real
> blocker.
> 
> Now - where to shove the processing is a real question. Ideally process
> context, but can't (in general) be in the calling context.

Especially in cases where the calling context is under srcu_read_lock().

> Actually, process context feels wrong for this (at least for kvfree_rcu()
> completions), given that the completions need to run with essentially
> realtime priority - that's really a job for softirq context, so maybe we
> just want to ditch the workqueues.

Sometimes you do want to run the object processing at real-time priority,
hence the rcutree.kthread_prio kernel-boot parameter.  But most of the
time you really do not want that stuff running at real-time priority.
Trust me, you really do not want to process a million objects
while running at real-time priority if the application has any sort of
latency constraint, and most do these days.

> But running completions in call_rcu() context (not existing call_rcu(),
> since we need to annotate paths where this is ok, but the moral
> equivalent) does seem to be a lovely tool for avoiding use of softirq
> context while ensuring they still get done :)

There are always the rcuc and rcuo kthreads, but they still run the
callbacks in BH-disabled context.  In your case, you don't need to
deal with deferrals from irq-disabled contexts, so workqueues should be
a better bet.

> > > Yes, and in my latest version that's stricter about avoiding preemption,
> > > the lock is only necessary for dequeue_from_all() - all other options
> > > are careful to only work on the CPU local version now.
> > 
> > Using migrate_disable() or similar?
> 
> Nope, just local_irq_save(). Gotta disable preemption anyways :)

As long as you don't disable preemption across object processing.  ;-)

> > Suppose that there has been a huge pile of objects queued up over the past
> > while, so that both ->objs[] elements are quite full.  Suppose that,
> > just by chance, the ->obj[0] ->seq is the first to have its grace
> > period complete.  Suppose further that just at that point in time, the
> > object-queuing rate suddenly decreases.  So much so, that at least one
> > full grace period elapses between queuing of individual objects.
> 
> Not sure what you mean by full? This seems like a rather vague thought
> experiment :)

A sharp sudden change in load is an important validation technique for
RCU.  And not just RCU, but also throughout engineering.  For example,
in mechanical engineering, they often use hammers of various sizes for
this purpose.

> Anyways, I've now got things ordered so objs[0] is always newer, objs[1]
> (if nonempty) completes first, and we can pick the one we need with
> simple array indexing.
> 
> Very nice improvement on code size (balances out all the inlining I've
> added).

Again, as noted in earlier discussion, the return values from
get_state_synchronize_rcu() are not guaranteed to be fully ordered.
Yes, they happen to be fully ordered now, but...

Module boundaries are an important maintainability tool, even within
the confines of a given source directory.  ;-)

> > Perhaps there are things we could do to make the pages of pointers better.
> > Or maybe we should adopt radix tree, though it still seems like a very
> > strange fit.  Some careful performance analysis will need to drive this
> > choice.  Avoiding ever forgetting the "status quo" option.
> 
> To my eyes, the kernel tendency to always reach for linked lists first
> seems strange :)
> 
> I just don't use linked lists that much in my own code.

OK, but they still have their strengths and their uses.

> > > > Also, I am not seeing a ->pending() field in the pending_rcu_items_seq
> > > > structure.  Again, what am I missing?
> > > 
> > > Not necessary, since it's pending iff it has items - except you indicate
> > > a callback, so perhaps you're thinking of something else?
> > 
> > My typo, ->process(), not ->pending().  Apologies for my confusion!
> 
> It's in the parent, non-percpu struct :)

Thank you, and once I started searching for the correct string, I did
find it.

> > > It's a bit unfortunate that we still don't have a way of checking
> > > (outside of debug code) "are we in sleepable process context?" - that'll
> > > have to be passed in.
> > 
> > That might arrive with lazy preemption.  It would untangle quite a few
> > knots in many parts of RCU, that is for sure.
> 
> yeah, trouble is it would make spinlocks/local_irq_save() a good deal
> more expensive :/

There appears to be some PowerPC heartburn with it, but yes, making that
information explicit is not free.

> > My advice is as follows:
> > 
> > 1.	Pick up the cookie, whether from get_synchronize_srcu() or
> > 	poll_state_synchronize_rcu().
> 
> Ok, so it sounds like we're thinking along the same lines then. That's
> the approach I came up with last night as well.
> 
> > 
> > 2.	If that cookie matches one of the ->objs[]->seq, add it to
> > 	that ->objs[].  Null the pointer or whatever to record that
> > 	it is already taken care of.  Or just take an early exit.
> > 
> > 	If under heavy load, this will be the common-case code path.
> > 
> > 3.	Only then try processing the ->objs[] elements.
> > 
> > 4.	Yes, then you need to queue the thing, but you can make
> > 	#2 above be an inline function to avoid duplicating code.
> > 	And you can check to see if the cookie already expired,
> > 	in which case you can just process it immediately.
> > 
> > 	You only need to retry in case of memory-allocation failure.
> > 
> > Does that make sense?
> 
> That's roughly what the code is doing now (as of last night), with the
> exception that it's still always checking for elements to process first.

OK, I am good with that being configurable, at least unless or until
one option proves uniformly better than the other.
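
As a concrete example of the step-4 early exit above, the check could be
factored out along these lines (a hypothetical helper, not part of the
patch below, and it assumes the SRCU case, i.e. a non-NULL srcu_struct):

	static bool process_if_expired(struct pending_rcu_items *pending,
				       struct rcu_head *obj, unsigned long seq)
	{
		/* Grace period already over?  Then process immediately. */
		if (!poll_state_synchronize_srcu(pending->srcu, seq))
			return false;
		pending->process(pending, obj);
		return true;
	}

If that returns true, there is nothing to queue and no callback to arm.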

> Will have to make that configurable for kvfree_call_rcu()/call_rcu().
> 
> > > > > +	for (objs = p->objs; objs < p->objs + ARRAY_SIZE(p->objs); objs++)
> > > > > +		if (objs->nr && objs->seq == seq)
> > > > 
> > > > OK, the purpose of same_state_synchronize_srcu() is to make that
> > > > equality comparison work no matter what debugging information we might
> > > > need to put in the cookie returned from get_state_synchronize_srcu().
> > > > Or, for that matter, poll_state_synchronize_srcu().
> > > > 
> > > > So please use same_state_synchronize_srcu(objs->seq, seq) here.
> > > 
> > > Done
> > 
> > Thank you!
> 
> heh, and then deleted, because I switched to grabbing the raw counter
> (and doing the masking myself).
> 
> But I'm now masking off the debug bits first thing, so - tell me what
> you think :)

I am sorry, but we really need to maintain some modularity here, especially
given that this is not a fastpath.

> > You can probably instead call get_state_synchronize_srcu() to begin
> > with, and let the later call_srcu() start the grace period if needed.
> > The only exception I can imagine to this right off-hand is under heavy
> > load, where you might need to start the grace period sooner rather than
> > later.  But that is easy -- if you have too many objects piled up, call
> > poll_state_synchronize_srcu() instead of get_state_synchronize_srcu().
> 
> I think there's another issue, where we may have items for two different
> grace periods when we need to arm the callback - and then we want it to
> fire when the _older_ grace period completes.
> 
> Do you have a call_rcu() for a specific grace period seq? :)

I do not see why you would need that in this case.

If you use get_state_synchronize_srcu() to pick up the initial cookie,
you have that cookie.  If !rcu_armed, you invoke call_srcu(), which
will respond to exactly the grace period that you need it to.  You only
ever have the one SRCU callback, so the delays from the end of that
grace period to the invocation of that callback is down in the noise.
Within that callback, you check whether there are additional objects
in need of another grace period, and if so, another call_srcu() will do
exactly the additional grace period you need.

Yes, there is a race condition where objects arrive between the grace
period ending and the callback being invoked, but those will be taken
care of at the end of the next grace period.

What am I missing here?

							Thanx, Paul

Patch

diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile
index 0ab533a2b03b..c8394ec6f22f 100644
--- a/fs/bcachefs/Makefile
+++ b/fs/bcachefs/Makefile
@@ -68,6 +68,7 @@  bcachefs-y		:=	\
 	opts.o			\
 	printbuf.o		\
 	quota.o			\
+	rcu_pending.o		\
 	rebalance.o		\
 	recovery.o		\
 	recovery_passes.o	\
diff --git a/fs/bcachefs/rcu_pending.c b/fs/bcachefs/rcu_pending.c
new file mode 100644
index 000000000000..9b2f4da94425
--- /dev/null
+++ b/fs/bcachefs/rcu_pending.c
@@ -0,0 +1,298 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/generic-radix-tree.h>
+#include <linux/percpu.h>
+#include <linux/srcu.h>
+#include "rcu_pending.h"
+
+static inline unsigned long __get_state_synchronize_rcu(struct srcu_struct *ssp)
+{
+	return ssp
+		? get_state_synchronize_srcu(ssp)
+		: get_state_synchronize_rcu();
+}
+
+static inline unsigned long __start_poll_synchronize_rcu(struct srcu_struct *ssp)
+{
+	return ssp
+		? start_poll_synchronize_srcu(ssp)
+		: start_poll_synchronize_rcu();
+}
+
+static inline void __rcu_barrier(struct srcu_struct *ssp)
+{
+	return ssp
+		? srcu_barrier(ssp)
+		: rcu_barrier();
+}
+
+struct pending_rcu_items_seq {
+	GENRADIX(struct rcu_head *)	objs;
+	size_t				nr;
+	struct rcu_head			*list;
+	unsigned long			seq;
+};
+
+struct pending_rcu_items_pcpu {
+	struct pending_rcu_items	*parent;
+	spinlock_t			lock;
+	bool				rcu_armed;
+	struct pending_rcu_items_seq	objs[2];
+	struct rcu_head			rcu;
+	struct work_struct		work;
+};
+
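+/*
+ * Steal the contents of the first per-seq bucket whose grace period has
+ * completed, so that the caller can process it after dropping the lock.
+ */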
+static bool get_finished_items(struct pending_rcu_items *pending,
+			       struct pending_rcu_items_pcpu *p,
+			       struct pending_rcu_items_seq *out)
+{
+	for (struct pending_rcu_items_seq *objs = p->objs;
+	     objs < p->objs + ARRAY_SIZE(p->objs); objs++)
+		if ((objs->nr || objs->list) &&
+		    poll_state_synchronize_srcu(pending->srcu, objs->seq)) {
+			*out = (struct pending_rcu_items_seq) {
+				/*
+				 * the genradix can only be modified with atomic instructions,
+				 * since we allocate new nodes without
+				 * pending_rcu_items_pcpu.lock
+				 */
+				.objs.tree.root	= xchg(&objs->objs.tree.root, NULL),
+				.nr		= objs->nr,
+				.list		= objs->list,
+			};
+			objs->nr	= 0;
+			objs->list	= NULL;
+			return true;
+		}
+	return false;
+}
+
+static void process_finished_items(struct pending_rcu_items *pending,
+				   struct pending_rcu_items_seq *objs)
+{
+	for (size_t i = 0; i < objs->nr; i++)
+		pending->process(pending, *genradix_ptr(&objs->objs, i));
+	genradix_free(&objs->objs);
+
+	while (objs->list) {
+		struct rcu_head *obj = objs->list;
+		objs->list = obj->next;
+		pending->process(pending, obj);
+	}
+}
+
+static void pending_rcu_items_rcu_cb(struct rcu_head *rcu)
+{
+	struct pending_rcu_items_pcpu *p =
+		container_of(rcu, struct pending_rcu_items_pcpu, rcu);
+
+	schedule_work(&p->work);
+}
+
+static bool __pending_rcu_items_has_pending(struct pending_rcu_items_pcpu *p)
+{
+	for (struct pending_rcu_items_seq *objs = p->objs;
+	     objs < p->objs + ARRAY_SIZE(p->objs); objs++)
+		if (objs->nr || objs->list)
+			return true;
+	return false;
+}
+
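+/*
+ * Workqueue path, kicked off by the RCU/SRCU callback: process everything
+ * whose grace period has completed, then re-arm the callback if objects
+ * are still pending.
+ */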
+static void pending_rcu_items_work(struct work_struct *work)
+{
+	struct pending_rcu_items_pcpu *p =
+		container_of(work, struct pending_rcu_items_pcpu, work);
+	struct pending_rcu_items *pending = p->parent;
+again:
+	spin_lock_irq(&p->lock);
+	struct pending_rcu_items_seq finished;
+	while (get_finished_items(pending, p, &finished)) {
+		spin_unlock_irq(&p->lock);
+		process_finished_items(pending, &finished);
+		goto again;
+	}
+
+	BUG_ON(!p->rcu_armed);
+	p->rcu_armed = __pending_rcu_items_has_pending(p);
+	if (p->rcu_armed)
+		call_srcu(pending->srcu, &p->rcu, pending_rcu_items_rcu_cb);
+	spin_unlock_irq(&p->lock);
+}
+
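+/*
+ * Queue @obj to be processed once the current grace period completes:
+ * drain any finished buckets, then add @obj to the bucket matching the
+ * current grace-period cookie, starting a new grace period and claiming
+ * an empty bucket if necessary.  If the genradix allocation fails even
+ * with GFP_KERNEL, fall back to the linked list.
+ */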
+void bch2_pending_rcu_item_enqueue(struct pending_rcu_items *pending, struct rcu_head *obj)
+{
+	struct pending_rcu_items_pcpu *p = raw_cpu_ptr(pending->p);
+	bool alloc_failed = false;
+	unsigned long flags;
+retry:
+	spin_lock_irqsave(&p->lock, flags);
+
+	struct pending_rcu_items_seq finished;
+	while (get_finished_items(pending, p, &finished)) {
+		spin_unlock_irqrestore(&p->lock, flags);
+		process_finished_items(pending, &finished);
+		goto retry;
+	}
+
+	struct pending_rcu_items_seq *objs;
+
+	unsigned long seq = __get_state_synchronize_rcu(pending->srcu);
+	for (objs = p->objs; objs < p->objs + ARRAY_SIZE(p->objs); objs++)
+		if (objs->nr && objs->seq == seq)
+			goto add;
+
+	seq = __start_poll_synchronize_rcu(pending->srcu);
+	for (objs = p->objs; objs < p->objs + ARRAY_SIZE(p->objs); objs++)
+		if (!objs->nr) {
+			objs->seq = seq;
+			goto add;
+		}
+
+	BUG();
+	struct rcu_head **entry;
+add:
+	entry = genradix_ptr_alloc(&objs->objs, objs->nr, GFP_ATOMIC|__GFP_NOWARN);
+	if (likely(entry)) {
+		*entry = obj;
+		objs->nr++;
+	} else if (likely(!alloc_failed)) {
+		spin_unlock_irqrestore(&p->lock, flags);
+		alloc_failed = !genradix_ptr_alloc(&objs->objs, objs->nr, GFP_KERNEL);
+		goto retry;
+	} else {
+		obj->next = objs->list;
+		objs->list = obj;
+	}
+
+	if (!p->rcu_armed) {
+		call_srcu(pending->srcu, &p->rcu, pending_rcu_items_rcu_cb);
+		p->rcu_armed = true;
+	}
+	spin_unlock_irqrestore(&p->lock, flags);
+}
+
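+/*
+ * Take back an object that is still waiting for its grace period,
+ * starting with the most recently enqueued bucket.
+ */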
+static struct rcu_head *pending_rcu_item_pcpu_dequeue(struct pending_rcu_items_pcpu *p)
+{
+	struct rcu_head *ret = NULL;
+
+	spin_lock_irq(&p->lock);
+	unsigned idx = p->objs[1].seq > p->objs[0].seq;
+
+	for (unsigned i = 0; i < 2; i++, idx ^= 1) {
+		struct pending_rcu_items_seq *objs = p->objs + idx;
+
+		if (objs->nr) {
+			ret = *genradix_ptr(&objs->objs, --objs->nr);
+			break;
+		}
+
+		if (objs->list) {
+			ret = objs->list;
+			objs->list = ret->next;
+			break;
+		}
+	}
+	spin_unlock_irq(&p->lock);
+
+	return ret;
+}
+
+struct rcu_head *bch2_pending_rcu_item_dequeue(struct pending_rcu_items *pending)
+{
+	return pending_rcu_item_pcpu_dequeue(raw_cpu_ptr(pending->p));
+}
+
+struct rcu_head *bch2_pending_rcu_item_dequeue_from_all(struct pending_rcu_items *pending)
+{
+	struct rcu_head *ret = NULL;
+	int cpu;
+	for_each_possible_cpu(cpu) {
+		ret = pending_rcu_item_pcpu_dequeue(per_cpu_ptr(pending->p, cpu));
+		if (ret)
+			break;
+	}
+	return ret;
+}
+
+static bool pending_rcu_items_has_pending(struct pending_rcu_items *pending)
+{
+	int cpu;
+	for_each_possible_cpu(cpu) {
+		struct pending_rcu_items_pcpu *p = per_cpu_ptr(pending->p, cpu);
+		spin_lock_irq(&p->lock);
+		if (__pending_rcu_items_has_pending(p)) {
+			spin_unlock_irq(&p->lock);
+			return true;
+		}
+		spin_unlock_irq(&p->lock);
+	}
+
+	return false;
+}
+
+void bch2_pending_rcu_items_exit(struct pending_rcu_items *pending)
+{
+	if (!pending->p)
+		return;
+
+	while (pending_rcu_items_has_pending(pending))
+		__rcu_barrier(pending->srcu);
+
+	int cpu;
+	for_each_possible_cpu(cpu) {
+		struct pending_rcu_items_pcpu *p = per_cpu_ptr(pending->p, cpu);
+
+		flush_work(&p->work);
+
+		WARN_ON(p->objs[0].nr);
+		WARN_ON(p->objs[1].nr);
+		WARN_ON(p->objs[0].list);
+		WARN_ON(p->objs[1].list);
+
+		genradix_free(&p->objs[0].objs);
+		genradix_free(&p->objs[1].objs);
+	}
+	free_percpu(pending->p);
+}
+
+int bch2_pending_rcu_items_init(struct pending_rcu_items *pending,
+				struct srcu_struct *srcu,
+				pending_rcu_item_process_fn process)
+{
+	pending->p = alloc_percpu(struct pending_rcu_items_pcpu);
+	if (!pending->p)
+		return -ENOMEM;
+
+	int cpu;
+	for_each_possible_cpu(cpu) {
+		struct pending_rcu_items_pcpu *p = per_cpu_ptr(pending->p, cpu);
+		p->parent	= pending;
+		spin_lock_init(&p->lock);
+		INIT_WORK(&p->work, pending_rcu_items_work);
+	}
+
+	pending->srcu = srcu;
+	pending->process = process;
+
+	return 0;
+}
diff --git a/fs/bcachefs/rcu_pending.h b/fs/bcachefs/rcu_pending.h
new file mode 100644
index 000000000000..e45ede074443
--- /dev/null
+++ b/fs/bcachefs/rcu_pending.h
@@ -0,0 +1,25 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_RCU_PENDING_H
+#define _BCACHEFS_RCU_PENDING_H
+
+struct pending_rcu_items;
+typedef void (*pending_rcu_item_process_fn)(struct pending_rcu_items *, struct rcu_head *);
+
+struct pending_rcu_items_pcpu;
+
+struct pending_rcu_items {
+	struct pending_rcu_items_pcpu __percpu *p;
+	struct srcu_struct		*srcu;
+	pending_rcu_item_process_fn	process;
+};
+
+void bch2_pending_rcu_item_enqueue(struct pending_rcu_items *pending, struct rcu_head *obj);
+struct rcu_head *bch2_pending_rcu_item_dequeue(struct pending_rcu_items *pending);
+struct rcu_head *bch2_pending_rcu_item_dequeue_from_all(struct pending_rcu_items *pending);
+
+void bch2_pending_rcu_items_exit(struct pending_rcu_items *pending);
+int bch2_pending_rcu_items_init(struct pending_rcu_items *pending,
+				struct srcu_struct *srcu,
+				pending_rcu_item_process_fn process);
+
+#endif /* _BCACHEFS_RCU_PENDING_H */