diff mbox series

[v2,1/2] btrfs: slightly loose the requirement for qgroup removal

Message ID 3fa525bceeec6096ddd131da98036e07b9947c9c.1713519718.git.wqu@suse.com (mailing list archive)
State New, archived
Headers show
Series btrfs: qgroup: stale qgroups related impromvents | expand

Commit Message

Qu Wenruo April 19, 2024, 9:46 a.m. UTC
[BUG]
Currently if one is utilizing "qgroups/drop_subtree_threshold" sysfs,
and a snapshot with level higher than that value is dropped, btrfs will
not be able to delete the qgroup until next qgroup rescan:

  uuid=ffffffff-eeee-dddd-cccc-000000000000

  wipefs -fa $dev
  mkfs.btrfs -f $dev -O quota -s 4k -n 4k -U $uuid
  mount $dev $mnt

  btrfs subvolume create $mnt/subv1/
  for (( i = 0; i < 1024; i++ )); do
  	xfs_io -f -c "pwrite 0 2k" $mnt/subv1/file_$i > /dev/null
  done
  sync
  btrfs subv snapshot $mnt/subv1 $mnt/snapshot
  btrfs quota enable $mnt
  btrfs quota rescan -w $mnt
  sync
  echo 1 > /sys/fs/btrfs/$uuid/qgroups/drop_subtree_threshold
  btrfs subvolume delete $mnt/snapshot
  btrfs subv sync $mnt
  btrfs qgroup show -prce --sync $mnt
  btrfs qgroup destroy 0/257 $mnt
  umount $mnt

The final qgroup removal would fail with the following error:

  ERROR: unable to destroy quota group: Device or resource busy

[CAUSE]
The above script would generate a subvolume of level 2, then snapshot
it, enable qgroup, set the drop_subtree_threshold, then drop the
snapshot.

Since the subvolume drop would meet the threshold, qgroup would be
marked inconsistent and skip accounting to avoid hanging the system at
transaction commit.

But currently we do not allow a qgroup with any rfer/excl numbers to be
dropped, and this is not really compatible with the new
drop_subtree_threshold behavior.

[FIX]
Instead of a strong requirement for zero rfer/excl numbers, just check
if there is any child for higher level qgroup, and for subvolume qgroups
check if there is a corresponding subvolume for it.

For rsv values, do extra warnings, and for rfer/excl numbers, only do the
warning if we're in full accounting mode and the qgroup is consistent.

Signed-off-by: Qu Wenruo <wqu@suse.com>
---
 fs/btrfs/qgroup.c | 69 ++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 62 insertions(+), 7 deletions(-)

Comments

Boris Burkov April 29, 2024, 12:47 p.m. UTC | #1
On Fri, Apr 19, 2024 at 07:16:52PM +0930, Qu Wenruo wrote:
> [BUG]
> Currently if one is utilizing "qgroups/drop_subtree_threshold" sysfs,
> and a snapshot with level higher than that value is dropped, btrfs will
> not be able to delete the qgroup until next qgroup rescan:
> 
>   uuid=ffffffff-eeee-dddd-cccc-000000000000
> 
>   wipefs -fa $dev
>   mkfs.btrfs -f $dev -O quota -s 4k -n 4k -U $uuid
>   mount $dev $mnt
> 
>   btrfs subvolume create $mnt/subv1/
>   for (( i = 0; i < 1024; i++ )); do
>   	xfs_io -f -c "pwrite 0 2k" $mnt/subv1/file_$i > /dev/null
>   done
>   sync
>   btrfs subv snapshot $mnt/subv1 $mnt/snapshot
>   btrfs quota enable $mnt
>   btrfs quota rescan -w $mnt
>   sync
>   echo 1 > /sys/fs/btrfs/$uuid/qgroups/drop_subtree_threshold
>   btrfs subvolume delete $mnt/snapshot
>   btrfs subv sync $mnt
>   btrfs qgroup show -prce --sync $mnt
>   btrfs qgroup destroy 0/257 $mnt
>   umount $mnt
> 
> The final qgroup removal would fail with the following error:
> 
>   ERROR: unable to destroy quota group: Device or resource busy
> 
> [CAUSE]
> The above script would generate a subvolume of level 2, then snapshot
> it, enable qgroup, set the drop_subtree_threshold, then drop the
> snapshot.
> 
> Since the subvolume drop would meet the threshold, qgroup would be
> marked inconsistent and skip accounting to avoid hanging the system at
> transaction commit.
> 
> But currently we do not allow a qgroup with any rfer/excl numbers to be
> dropped, and this is not really compatible with the new
> drop_subtree_threshold behavior.
> 
> [FIX]
> Instead of a strong requirement for zero rfer/excl numbers, just check
> if there is any child for higher level qgroup, and for subvolume qgroups
> check if there is a corresponding subvolume for it.
> 
> For rsv values, do extra warnings, and for rfer/excl numbers, only do the
> warning if we're in full accounting mode and the qgroup is consistent.
> 
> Signed-off-by: Qu Wenruo <wqu@suse.com>
> ---
>  fs/btrfs/qgroup.c | 69 ++++++++++++++++++++++++++++++++++++++++++-----
>  1 file changed, 62 insertions(+), 7 deletions(-)
> 
> diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
> index 9a9f84c4d3b8..2ea16a07a7d4 100644
> --- a/fs/btrfs/qgroup.c
> +++ b/fs/btrfs/qgroup.c
> @@ -1736,13 +1736,38 @@ int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
>  	return ret;
>  }
>  
> -static bool qgroup_has_usage(struct btrfs_qgroup *qgroup)
> +static bool can_delete_qgroup(struct btrfs_fs_info *fs_info,
> +			      struct btrfs_qgroup *qgroup)
>  {
> -	return (qgroup->rfer > 0 || qgroup->rfer_cmpr > 0 ||
> -		qgroup->excl > 0 || qgroup->excl_cmpr > 0 ||
> -		qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] > 0 ||
> -		qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] > 0 ||
> -		qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS] > 0);
> +	struct btrfs_key key;
> +	struct btrfs_path *path;
> +	int ret;
> +
> +	/* For higher level qgroup, we can only delete it if it has no child. */
> +	if (btrfs_qgroup_level(qgroup->qgroupid)) {
> +		if (!list_empty(&qgroup->members))
> +			return false;
> +		return true;
> +	}
> +
> +	/*
> +	 * For level-0 qgroups, we can only delete it if it has no subvolume
> +	 * for it.
> +	 * This means even a subvolume is unlinked but not yet fully dropped,
> +	 * we can not delete the qgroup.
> +	 */

This was what I originally considered for normal qgroups before observing
that usage is 0 anyway. I didn't know about the drop tree threshold,
my mistake.

With that said, I support this change for non-squota qgroups.

For squota, I think this behavior would be OK, but undesirable, IMO. Any
parent qgroup would still have its usage incremented against the limit,
but it would be impossible to find any information on where it was from.

How do you feel about making this patch add the new logic and make it
conditional on qgroup mode?

Thanks,
Boris

> +	key.objectid = qgroup->qgroupid;
> +	key.type = BTRFS_ROOT_ITEM_KEY;
> +	key.offset = -1ULL;
> +	path = btrfs_alloc_path();
> +	if (!path)
> +		return false;
> +
> +	ret = btrfs_find_root(fs_info->tree_root, &key, path, NULL, NULL);
> +	btrfs_free_path(path);
> +	if (ret > 0)
> +		return true;
> +	return false;
>  }
>  
>  int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
> @@ -1764,7 +1789,7 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
>  		goto out;
>  	}
>  
> -	if (is_fstree(qgroupid) && qgroup_has_usage(qgroup)) {
> +	if (!can_delete_qgroup(fs_info, qgroup)) {
>  		ret = -EBUSY;
>  		goto out;
>  	}
> @@ -1789,6 +1814,36 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
>  	}
>  
>  	spin_lock(&fs_info->qgroup_lock);
> +	/*
> +	 * Warn on reserved space. The subvolume should has no child nor
> +	 * corresponding subvolume.
> +	 * Thus its reserved space should all be zero, no matter if qgroup
> +	 * is consistent or the mode.
> +	 */
> +	WARN_ON(qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] ||
> +		qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] ||
> +		qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS]);
> +	/*
> +	 * The same for rfer/excl numbers, but that's only if our qgroup is
> +	 * consistent and if it's in regular qgroup mode.
> +	 * For simple mode it's not as accurate thus we can hit non-zero values
> +	 * very frequently.
> +	 */
> +	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL &&
> +	    !(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT)) {
> +		if (WARN_ON(qgroup->rfer || qgroup->excl ||
> +			    qgroup->rfer_cmpr || qgroup->excl_cmpr)) {
> +			btrfs_warn_rl(fs_info,
> +"to be deleted qgroup %u/%llu has non-zero numbers, rfer %llu rfer_cmpr %llu excl %llu excl_cmpr %llu",
> +				      btrfs_qgroup_level(qgroup->qgroupid),
> +				      btrfs_qgroup_subvolid(qgroup->qgroupid),
> +				      qgroup->rfer,
> +				      qgroup->rfer_cmpr,
> +				      qgroup->excl,
> +				      qgroup->excl_cmpr);
> +			qgroup_mark_inconsistent(fs_info);
> +		}
> +	}

If you go ahead with making it conditional, I would fold this warning
into the check logic. Either way is fine, of course.

>  	del_qgroup_rb(fs_info, qgroupid);
>  	spin_unlock(&fs_info->qgroup_lock);
>  
> -- 
> 2.44.0
>
Qu Wenruo April 29, 2024, 10 p.m. UTC | #2
在 2024/4/29 22:17, Boris Burkov 写道:
> On Fri, Apr 19, 2024 at 07:16:52PM +0930, Qu Wenruo wrote:
>> [BUG]
>> Currently if one is utilizing "qgroups/drop_subtree_threshold" sysfs,
>> and a snapshot with level higher than that value is dropped, btrfs will
>> not be able to delete the qgroup until next qgroup rescan:
>>
>>    uuid=ffffffff-eeee-dddd-cccc-000000000000
>>
>>    wipefs -fa $dev
>>    mkfs.btrfs -f $dev -O quota -s 4k -n 4k -U $uuid
>>    mount $dev $mnt
>>
>>    btrfs subvolume create $mnt/subv1/
>>    for (( i = 0; i < 1024; i++ )); do
>>    	xfs_io -f -c "pwrite 0 2k" $mnt/subv1/file_$i > /dev/null
>>    done
>>    sync
>>    btrfs subv snapshot $mnt/subv1 $mnt/snapshot
>>    btrfs quota enable $mnt
>>    btrfs quota rescan -w $mnt
>>    sync
>>    echo 1 > /sys/fs/btrfs/$uuid/qgroups/drop_subtree_threshold
>>    btrfs subvolume delete $mnt/snapshot
>>    btrfs subv sync $mnt
>>    btrfs qgroup show -prce --sync $mnt
>>    btrfs qgroup destroy 0/257 $mnt
>>    umount $mnt
>>
>> The final qgroup removal would fail with the following error:
>>
>>    ERROR: unable to destroy quota group: Device or resource busy
>>
>> [CAUSE]
>> The above script would generate a subvolume of level 2, then snapshot
>> it, enable qgroup, set the drop_subtree_threshold, then drop the
>> snapshot.
>>
>> Since the subvolume drop would meet the threshold, qgroup would be
>> marked inconsistent and skip accounting to avoid hanging the system at
>> transaction commit.
>>
>> But currently we do not allow a qgroup with any rfer/excl numbers to be
>> dropped, and this is not really compatible with the new
>> drop_subtree_threshold behavior.
>>
>> [FIX]
>> Instead of a strong requirement for zero rfer/excl numbers, just check
>> if there is any child for higher level qgroup, and for subvolume qgroups
>> check if there is a corresponding subvolume for it.
>>
>> For rsv values, do extra warnings, and for rfer/excl numbers, only do the
>> warning if we're in full accounting mode and the qgroup is consistent.
>>
>> Signed-off-by: Qu Wenruo <wqu@suse.com>
>> ---
>>   fs/btrfs/qgroup.c | 69 ++++++++++++++++++++++++++++++++++++++++++-----
>>   1 file changed, 62 insertions(+), 7 deletions(-)
>>
>> diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
>> index 9a9f84c4d3b8..2ea16a07a7d4 100644
>> --- a/fs/btrfs/qgroup.c
>> +++ b/fs/btrfs/qgroup.c
>> @@ -1736,13 +1736,38 @@ int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
>>   	return ret;
>>   }
>>
>> -static bool qgroup_has_usage(struct btrfs_qgroup *qgroup)
>> +static bool can_delete_qgroup(struct btrfs_fs_info *fs_info,
>> +			      struct btrfs_qgroup *qgroup)
>>   {
>> -	return (qgroup->rfer > 0 || qgroup->rfer_cmpr > 0 ||
>> -		qgroup->excl > 0 || qgroup->excl_cmpr > 0 ||
>> -		qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] > 0 ||
>> -		qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] > 0 ||
>> -		qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS] > 0);
>> +	struct btrfs_key key;
>> +	struct btrfs_path *path;
>> +	int ret;
>> +
>> +	/* For higher level qgroup, we can only delete it if it has no child. */
>> +	if (btrfs_qgroup_level(qgroup->qgroupid)) {
>> +		if (!list_empty(&qgroup->members))
>> +			return false;
>> +		return true;
>> +	}
>> +
>> +	/*
>> +	 * For level-0 qgroups, we can only delete it if it has no subvolume
>> +	 * for it.
>> +	 * This means even a subvolume is unlinked but not yet fully dropped,
>> +	 * we can not delete the qgroup.
>> +	 */
>
> This was what I originally considered for normal qgroups before observing
> that usage is 0 anyway. I didn't know about the drop tree threshold,
> my mistake.
>
> With that said, I support this change for non-squota qgroups.
>
> For squota, I think this behavior would be OK, but undesirable, IMO. Any
> parent qgroup would still have its usage incremented against the limit,
> but it would be impossible to find any information on where it was from.

That's indeed another problem.

For regular qgroup that could only happen when qgroup is inconsistent,
and we're fine to drop the qgroup without updating the parent.

But for squota, there is no inconsistent state, meaning we should also
drop all the numbers from parent too.

>
> How do you feel about making this patch add the new logic and make it
> conditional on qgroup mode?

I'm totally fine to do it conditional, although I'd also like to get
feedback on dropping the numbers from parent qgroup (so we can do it for
both qgroup and squota).

Would the auto drop for parent numbers be a solution?

Thanks,
Qu
>
> Thanks,
> Boris
>
>> +	key.objectid = qgroup->qgroupid;
>> +	key.type = BTRFS_ROOT_ITEM_KEY;
>> +	key.offset = -1ULL;
>> +	path = btrfs_alloc_path();
>> +	if (!path)
>> +		return false;
>> +
>> +	ret = btrfs_find_root(fs_info->tree_root, &key, path, NULL, NULL);
>> +	btrfs_free_path(path);
>> +	if (ret > 0)
>> +		return true;
>> +	return false;
>>   }
>>
>>   int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
>> @@ -1764,7 +1789,7 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
>>   		goto out;
>>   	}
>>
>> -	if (is_fstree(qgroupid) && qgroup_has_usage(qgroup)) {
>> +	if (!can_delete_qgroup(fs_info, qgroup)) {
>>   		ret = -EBUSY;
>>   		goto out;
>>   	}
>> @@ -1789,6 +1814,36 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
>>   	}
>>
>>   	spin_lock(&fs_info->qgroup_lock);
>> +	/*
>> +	 * Warn on reserved space. The subvolume should has no child nor
>> +	 * corresponding subvolume.
>> +	 * Thus its reserved space should all be zero, no matter if qgroup
>> +	 * is consistent or the mode.
>> +	 */
>> +	WARN_ON(qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] ||
>> +		qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] ||
>> +		qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS]);
>> +	/*
>> +	 * The same for rfer/excl numbers, but that's only if our qgroup is
>> +	 * consistent and if it's in regular qgroup mode.
>> +	 * For simple mode it's not as accurate thus we can hit non-zero values
>> +	 * very frequently.
>> +	 */
>> +	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL &&
>> +	    !(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT)) {
>> +		if (WARN_ON(qgroup->rfer || qgroup->excl ||
>> +			    qgroup->rfer_cmpr || qgroup->excl_cmpr)) {
>> +			btrfs_warn_rl(fs_info,
>> +"to be deleted qgroup %u/%llu has non-zero numbers, rfer %llu rfer_cmpr %llu excl %llu excl_cmpr %llu",
>> +				      btrfs_qgroup_level(qgroup->qgroupid),
>> +				      btrfs_qgroup_subvolid(qgroup->qgroupid),
>> +				      qgroup->rfer,
>> +				      qgroup->rfer_cmpr,
>> +				      qgroup->excl,
>> +				      qgroup->excl_cmpr);
>> +			qgroup_mark_inconsistent(fs_info);
>> +		}
>> +	}
>
> If you go ahead with making it conditional, I would fold this warning
> into the check logic. Either way is fine, of course.
>
>>   	del_qgroup_rb(fs_info, qgroupid);
>>   	spin_unlock(&fs_info->qgroup_lock);
>>
>> --
>> 2.44.0
>>
>
Boris Burkov April 29, 2024, 10:19 p.m. UTC | #3
On Tue, Apr 30, 2024 at 07:30:58AM +0930, Qu Wenruo wrote:
> 
> 
> 在 2024/4/29 22:17, Boris Burkov 写道:
> > On Fri, Apr 19, 2024 at 07:16:52PM +0930, Qu Wenruo wrote:
> > > [BUG]
> > > Currently if one is utilizing "qgroups/drop_subtree_threshold" sysfs,
> > > and a snapshot with level higher than that value is dropped, btrfs will
> > > not be able to delete the qgroup until next qgroup rescan:
> > > 
> > >    uuid=ffffffff-eeee-dddd-cccc-000000000000
> > > 
> > >    wipefs -fa $dev
> > >    mkfs.btrfs -f $dev -O quota -s 4k -n 4k -U $uuid
> > >    mount $dev $mnt
> > > 
> > >    btrfs subvolume create $mnt/subv1/
> > >    for (( i = 0; i < 1024; i++ )); do
> > >    	xfs_io -f -c "pwrite 0 2k" $mnt/subv1/file_$i > /dev/null
> > >    done
> > >    sync
> > >    btrfs subv snapshot $mnt/subv1 $mnt/snapshot
> > >    btrfs quota enable $mnt
> > >    btrfs quota rescan -w $mnt
> > >    sync
> > >    echo 1 > /sys/fs/btrfs/$uuid/qgroups/drop_subtree_threshold
> > >    btrfs subvolume delete $mnt/snapshot
> > >    btrfs subv sync $mnt
> > >    btrfs qgroup show -prce --sync $mnt
> > >    btrfs qgroup destroy 0/257 $mnt
> > >    umount $mnt
> > > 
> > > The final qgroup removal would fail with the following error:
> > > 
> > >    ERROR: unable to destroy quota group: Device or resource busy
> > > 
> > > [CAUSE]
> > > The above script would generate a subvolume of level 2, then snapshot
> > > it, enable qgroup, set the drop_subtree_threshold, then drop the
> > > snapshot.
> > > 
> > > Since the subvolume drop would meet the threshold, qgroup would be
> > > marked inconsistent and skip accounting to avoid hanging the system at
> > > transaction commit.
> > > 
> > > But currently we do not allow a qgroup with any rfer/excl numbers to be
> > > dropped, and this is not really compatible with the new
> > > drop_subtree_threshold behavior.
> > > 
> > > [FIX]
> > > Instead of a strong requirement for zero rfer/excl numbers, just check
> > > if there is any child for higher level qgroup, and for subvolume qgroups
> > > check if there is a corresponding subvolume for it.
> > > 
> > > For rsv values, do extra warnings, and for rfer/excl numbers, only do the
> > > warning if we're in full accounting mode and the qgroup is consistent.
> > > 
> > > Signed-off-by: Qu Wenruo <wqu@suse.com>
> > > ---
> > >   fs/btrfs/qgroup.c | 69 ++++++++++++++++++++++++++++++++++++++++++-----
> > >   1 file changed, 62 insertions(+), 7 deletions(-)
> > > 
> > > diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
> > > index 9a9f84c4d3b8..2ea16a07a7d4 100644
> > > --- a/fs/btrfs/qgroup.c
> > > +++ b/fs/btrfs/qgroup.c
> > > @@ -1736,13 +1736,38 @@ int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
> > >   	return ret;
> > >   }
> > > 
> > > -static bool qgroup_has_usage(struct btrfs_qgroup *qgroup)
> > > +static bool can_delete_qgroup(struct btrfs_fs_info *fs_info,
> > > +			      struct btrfs_qgroup *qgroup)
> > >   {
> > > -	return (qgroup->rfer > 0 || qgroup->rfer_cmpr > 0 ||
> > > -		qgroup->excl > 0 || qgroup->excl_cmpr > 0 ||
> > > -		qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] > 0 ||
> > > -		qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] > 0 ||
> > > -		qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS] > 0);
> > > +	struct btrfs_key key;
> > > +	struct btrfs_path *path;
> > > +	int ret;
> > > +
> > > +	/* For higher level qgroup, we can only delete it if it has no child. */
> > > +	if (btrfs_qgroup_level(qgroup->qgroupid)) {
> > > +		if (!list_empty(&qgroup->members))
> > > +			return false;
> > > +		return true;
> > > +	}
> > > +
> > > +	/*
> > > +	 * For level-0 qgroups, we can only delete it if it has no subvolume
> > > +	 * for it.
> > > +	 * This means even a subvolume is unlinked but not yet fully dropped,
> > > +	 * we can not delete the qgroup.
> > > +	 */
> > 
> > This was what I originally considered for normal qgroups before observing
> > that usage is 0 anyway. I didn't know about the drop tree threshold,
> > my mistake.
> > 
> > With that said, I support this change for non-squota qgroups.
> > 
> > For squota, I think this behavior would be OK, but undesirable, IMO. Any
> > parent qgroup would still have its usage incremented against the limit,
> > but it would be impossible to find any information on where it was from.
> 
> That's indeed another problem.
> 
> For regular qgroup that could only happen when qgroup is inconsistent,
> and we're fine to drop the qgroup without updating the parent.
> 
> But for squota, there is no inconsistent state, meaning we should also
> drop all the numbers from parent too.
> 
> > 
> > How do you feel about making this patch add the new logic and make it
> > conditional on qgroup mode?
> 
> I'm totally fine to do it conditional, although I'd also like to get
> feedback on dropping the numbers from parent qgroup (so we can do it for
> both qgroup and squota).
> 
> Would the auto drop for parent numbers be a solution?

It's a good question. It never occurred to me until this discussion
today.

Thinking out loud, squotas as a feature is already "comfortable" with
unaccounted space. If you enable it on a live FS, all the extents that
already exist are unaccounted, so there is no objection on that front.

I think from a container isolation perspective, the current behavior
makes more sense than auto dropping from the parent on subvol delete to
allow cleaning up the qgroup.

Suppose we create a container wrapping parent qgroup with ID 1/100. To
enforce a limit on the container customer, we set some limit on 1/100.
Then we create a container subvol and put 0/svid0 into 1/100. The
customer gets to do stuff in the subvol. They are a fancy customer and
create a subvol svid1, snapshot it svid2, and delete svid1.

svid1 and svid2 are created in the subvol of id svid0, so auto-inheritance
ensured that 1/100 was the parent of 0/svid1 and 0/svid2, but now
deleting svid1 frees all that customer usage from 1/100 and allows the
customer to escape the limit. This is obviously quite undesirable, and
wouldn't require a particularly malicious customer to hit.

Boris

> 
> Thanks,
> Qu
> > 
> > Thanks,
> > Boris
> > 
> > > +	key.objectid = qgroup->qgroupid;
> > > +	key.type = BTRFS_ROOT_ITEM_KEY;
> > > +	key.offset = -1ULL;
> > > +	path = btrfs_alloc_path();
> > > +	if (!path)
> > > +		return false;
> > > +
> > > +	ret = btrfs_find_root(fs_info->tree_root, &key, path, NULL, NULL);
> > > +	btrfs_free_path(path);
> > > +	if (ret > 0)
> > > +		return true;
> > > +	return false;
> > >   }
> > > 
> > >   int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
> > > @@ -1764,7 +1789,7 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
> > >   		goto out;
> > >   	}
> > > 
> > > -	if (is_fstree(qgroupid) && qgroup_has_usage(qgroup)) {
> > > +	if (!can_delete_qgroup(fs_info, qgroup)) {
> > >   		ret = -EBUSY;
> > >   		goto out;
> > >   	}
> > > @@ -1789,6 +1814,36 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
> > >   	}
> > > 
> > >   	spin_lock(&fs_info->qgroup_lock);
> > > +	/*
> > > +	 * Warn on reserved space. The subvolume should has no child nor
> > > +	 * corresponding subvolume.
> > > +	 * Thus its reserved space should all be zero, no matter if qgroup
> > > +	 * is consistent or the mode.
> > > +	 */
> > > +	WARN_ON(qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] ||
> > > +		qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] ||
> > > +		qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS]);
> > > +	/*
> > > +	 * The same for rfer/excl numbers, but that's only if our qgroup is
> > > +	 * consistent and if it's in regular qgroup mode.
> > > +	 * For simple mode it's not as accurate thus we can hit non-zero values
> > > +	 * very frequently.
> > > +	 */
> > > +	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL &&
> > > +	    !(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT)) {
> > > +		if (WARN_ON(qgroup->rfer || qgroup->excl ||
> > > +			    qgroup->rfer_cmpr || qgroup->excl_cmpr)) {
> > > +			btrfs_warn_rl(fs_info,
> > > +"to be deleted qgroup %u/%llu has non-zero numbers, rfer %llu rfer_cmpr %llu excl %llu excl_cmpr %llu",
> > > +				      btrfs_qgroup_level(qgroup->qgroupid),
> > > +				      btrfs_qgroup_subvolid(qgroup->qgroupid),
> > > +				      qgroup->rfer,
> > > +				      qgroup->rfer_cmpr,
> > > +				      qgroup->excl,
> > > +				      qgroup->excl_cmpr);
> > > +			qgroup_mark_inconsistent(fs_info);
> > > +		}
> > > +	}
> > 
> > If you go ahead with making it conditional, I would fold this warning
> > into the check logic. Either way is fine, of course.
> > 
> > >   	del_qgroup_rb(fs_info, qgroupid);
> > >   	spin_unlock(&fs_info->qgroup_lock);
> > > 
> > > --
> > > 2.44.0
> > > 
> >
Qu Wenruo April 29, 2024, 10:29 p.m. UTC | #4
在 2024/4/30 07:49, Boris Burkov 写道:
> On Tue, Apr 30, 2024 at 07:30:58AM +0930, Qu Wenruo wrote:
[...]
>>
>> I'm totally fine to do it conditional, although I'd also like to get
>> feedback on dropping the numbers from parent qgroup (so we can do it for
>> both qgroup and squota).
>>
>> Would the auto drop for parent numbers be a solution?
> 
> It's a good question. It never occurred to me until this discussion
> today.
> 
> Thinking out loud, squotas as a feature is already "comfortable" with
> unaccounted space. If you enable it on a live FS, all the extents that
> already exist are unaccounted, so there is no objection on that front.
> 
> I think from a container isolation perspective, the current behavior
> makes more sense than auto dropping from the parent on subvol delete to
> allow cleaning up the qgroup.
> 
> Suppose we create a container wrapping parent qgroup with ID 1/100. To
> enforce a limit on the container customer, we set some limit on 1/100.
> Then we create a container subvol and put 0/svid0 into 1/100. The
> customer gets to do stuff in the subvol. They are a fancy customer and
> create a subvol svid1, snapshot it svid2, and delete svid1.
> 
> svid1 and svid2 are created in the subvol of id svid0, so auto-inheritance
> ensured that 1/100 was the parent of 0/svid1 and 0/svid2, but now
> deleting svid1 frees all that customer usage from 1/100 and allows the
> customer to escape the limit. This is obviously quite undesirable, and
> wouldn't require a particularly malicious customer to hit.

OK, got your point. Thanks for the detailed explanation, and I can not 
come up with any alternative so far.

So I'll make the qgroup drop to have an extra condition for squota, so 
that a squota qgroup can only be dropped when:

1) The subvolume is fully dropped
    The same one for both regular qgroup and squota

2) The number are all zero
    This would be squota specific one.

So that the numbers would still be correct while regular qgroup can 
still handle auto-deletion for inconsistent case.

Thanks,
Qu

> 
> Boris
> 
>>
>> Thanks,
>> Qu
>>>
>>> Thanks,
>>> Boris
>>>
>>>> +	key.objectid = qgroup->qgroupid;
>>>> +	key.type = BTRFS_ROOT_ITEM_KEY;
>>>> +	key.offset = -1ULL;
>>>> +	path = btrfs_alloc_path();
>>>> +	if (!path)
>>>> +		return false;
>>>> +
>>>> +	ret = btrfs_find_root(fs_info->tree_root, &key, path, NULL, NULL);
>>>> +	btrfs_free_path(path);
>>>> +	if (ret > 0)
>>>> +		return true;
>>>> +	return false;
>>>>    }
>>>>
>>>>    int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
>>>> @@ -1764,7 +1789,7 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
>>>>    		goto out;
>>>>    	}
>>>>
>>>> -	if (is_fstree(qgroupid) && qgroup_has_usage(qgroup)) {
>>>> +	if (!can_delete_qgroup(fs_info, qgroup)) {
>>>>    		ret = -EBUSY;
>>>>    		goto out;
>>>>    	}
>>>> @@ -1789,6 +1814,36 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
>>>>    	}
>>>>
>>>>    	spin_lock(&fs_info->qgroup_lock);
>>>> +	/*
>>>> +	 * Warn on reserved space. The subvolume should has no child nor
>>>> +	 * corresponding subvolume.
>>>> +	 * Thus its reserved space should all be zero, no matter if qgroup
>>>> +	 * is consistent or the mode.
>>>> +	 */
>>>> +	WARN_ON(qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] ||
>>>> +		qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] ||
>>>> +		qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS]);
>>>> +	/*
>>>> +	 * The same for rfer/excl numbers, but that's only if our qgroup is
>>>> +	 * consistent and if it's in regular qgroup mode.
>>>> +	 * For simple mode it's not as accurate thus we can hit non-zero values
>>>> +	 * very frequently.
>>>> +	 */
>>>> +	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL &&
>>>> +	    !(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT)) {
>>>> +		if (WARN_ON(qgroup->rfer || qgroup->excl ||
>>>> +			    qgroup->rfer_cmpr || qgroup->excl_cmpr)) {
>>>> +			btrfs_warn_rl(fs_info,
>>>> +"to be deleted qgroup %u/%llu has non-zero numbers, rfer %llu rfer_cmpr %llu excl %llu excl_cmpr %llu",
>>>> +				      btrfs_qgroup_level(qgroup->qgroupid),
>>>> +				      btrfs_qgroup_subvolid(qgroup->qgroupid),
>>>> +				      qgroup->rfer,
>>>> +				      qgroup->rfer_cmpr,
>>>> +				      qgroup->excl,
>>>> +				      qgroup->excl_cmpr);
>>>> +			qgroup_mark_inconsistent(fs_info);
>>>> +		}
>>>> +	}
>>>
>>> If you go ahead with making it conditional, I would fold this warning
>>> into the check logic. Either way is fine, of course.
>>>
>>>>    	del_qgroup_rb(fs_info, qgroupid);
>>>>    	spin_unlock(&fs_info->qgroup_lock);
>>>>
>>>> --
>>>> 2.44.0
>>>>
>>>
Boris Burkov April 29, 2024, 10:38 p.m. UTC | #5
On Tue, Apr 30, 2024 at 07:59:17AM +0930, Qu Wenruo wrote:
> 
> 
> 在 2024/4/30 07:49, Boris Burkov 写道:
> > On Tue, Apr 30, 2024 at 07:30:58AM +0930, Qu Wenruo wrote:
> [...]
> > > 
> > > I'm totally fine to do it conditional, although I'd also like to get
> > > feedback on dropping the numbers from parent qgroup (so we can do it for
> > > both qgroup and squota).
> > > 
> > > Would the auto drop for parent numbers be a solution?
> > 
> > It's a good question. It never occurred to me until this discussion
> > today.
> > 
> > Thinking out loud, squotas as a feature is already "comfortable" with
> > unaccounted space. If you enable it on a live FS, all the extents that
> > already exist are unaccounted, so there is no objection on that front.
> > 
> > I think from a container isolation perspective, the current behavior
> > makes more sense than auto dropping from the parent on subvol delete to
> > allow cleaning up the qgroup.
> > 
> > Suppose we create a container wrapping parent qgroup with ID 1/100. To
> > enforce a limit on the container customer, we set some limit on 1/100.
> > Then we create a container subvol and put 0/svid0 into 1/100. The
> > customer gets to do stuff in the subvol. They are a fancy customer and
> > create a subvol svid1, snapshot it svid2, and delete svid1.
> > 
> > svid1 and svid2 are created in the subvol of id svid0, so auto-inheritance
> > ensured that 1/100 was the parent of 0/svid1 and 0/svid2, but now
> > deleting svid1 frees all that customer usage from 1/100 and allows the
> > customer to escape the limit. This is obviously quite undesirable, and
> > wouldn't require a particularly malicious customer to hit.
> 
> OK, got your point. Thanks for the detailed explanation, and I can not come
> up with any alternative so far.
> 
> So I'll make the qgroup drop to have an extra condition for squota, so that
> a squota qgroup can only be dropped when:
> 
> 1) The subvolume is fully dropped
>    The same one for both regular qgroup and squota
> 
> 2) The number are all zero
>    This would be squota specific one.
> 
> So that the numbers would still be correct while regular qgroup can still
> handle auto-deletion for inconsistent case.

That sounds like a good design to me. And as I mentioned, you might be
able to share the conditions for squota failing with EBUSY and normal
qgroup printing a debug message about being inconsistent

> 
> Thanks,
> Qu
> 
> > 
> > Boris
> > 
> > > 
> > > Thanks,
> > > Qu
> > > > 
> > > > Thanks,
> > > > Boris
> > > > 
> > > > > +	key.objectid = qgroup->qgroupid;
> > > > > +	key.type = BTRFS_ROOT_ITEM_KEY;
> > > > > +	key.offset = -1ULL;
> > > > > +	path = btrfs_alloc_path();
> > > > > +	if (!path)
> > > > > +		return false;
> > > > > +
> > > > > +	ret = btrfs_find_root(fs_info->tree_root, &key, path, NULL, NULL);
> > > > > +	btrfs_free_path(path);
> > > > > +	if (ret > 0)
> > > > > +		return true;
> > > > > +	return false;
> > > > >    }
> > > > > 
> > > > >    int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
> > > > > @@ -1764,7 +1789,7 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
> > > > >    		goto out;
> > > > >    	}
> > > > > 
> > > > > -	if (is_fstree(qgroupid) && qgroup_has_usage(qgroup)) {
> > > > > +	if (!can_delete_qgroup(fs_info, qgroup)) {
> > > > >    		ret = -EBUSY;
> > > > >    		goto out;
> > > > >    	}
> > > > > @@ -1789,6 +1814,36 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
> > > > >    	}
> > > > > 
> > > > >    	spin_lock(&fs_info->qgroup_lock);
> > > > > +	/*
> > > > > +	 * Warn on reserved space. The subvolume should has no child nor
> > > > > +	 * corresponding subvolume.
> > > > > +	 * Thus its reserved space should all be zero, no matter if qgroup
> > > > > +	 * is consistent or the mode.
> > > > > +	 */
> > > > > +	WARN_ON(qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] ||
> > > > > +		qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] ||
> > > > > +		qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS]);
> > > > > +	/*
> > > > > +	 * The same for rfer/excl numbers, but that's only if our qgroup is
> > > > > +	 * consistent and if it's in regular qgroup mode.
> > > > > +	 * For simple mode it's not as accurate thus we can hit non-zero values
> > > > > +	 * very frequently.
> > > > > +	 */
> > > > > +	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL &&
> > > > > +	    !(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT)) {
> > > > > +		if (WARN_ON(qgroup->rfer || qgroup->excl ||
> > > > > +			    qgroup->rfer_cmpr || qgroup->excl_cmpr)) {
> > > > > +			btrfs_warn_rl(fs_info,
> > > > > +"to be deleted qgroup %u/%llu has non-zero numbers, rfer %llu rfer_cmpr %llu excl %llu excl_cmpr %llu",
> > > > > +				      btrfs_qgroup_level(qgroup->qgroupid),
> > > > > +				      btrfs_qgroup_subvolid(qgroup->qgroupid),
> > > > > +				      qgroup->rfer,
> > > > > +				      qgroup->rfer_cmpr,
> > > > > +				      qgroup->excl,
> > > > > +				      qgroup->excl_cmpr);
> > > > > +			qgroup_mark_inconsistent(fs_info);
> > > > > +		}
> > > > > +	}
> > > > 
> > > > If you go ahead with making it conditional, I would fold this warning
> > > > into the check logic. Either way is fine, of course.
> > > > 
> > > > >    	del_qgroup_rb(fs_info, qgroupid);
> > > > >    	spin_unlock(&fs_info->qgroup_lock);
> > > > > 
> > > > > --
> > > > > 2.44.0
> > > > > 
> > > >
diff mbox series

Patch

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 9a9f84c4d3b8..2ea16a07a7d4 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1736,13 +1736,38 @@  int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
 	return ret;
 }
 
-static bool qgroup_has_usage(struct btrfs_qgroup *qgroup)
+static bool can_delete_qgroup(struct btrfs_fs_info *fs_info,
+			      struct btrfs_qgroup *qgroup)
 {
-	return (qgroup->rfer > 0 || qgroup->rfer_cmpr > 0 ||
-		qgroup->excl > 0 || qgroup->excl_cmpr > 0 ||
-		qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] > 0 ||
-		qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] > 0 ||
-		qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS] > 0);
+	struct btrfs_key key;
+	struct btrfs_path *path;
+	int ret;
+
+	/* For higher level qgroup, we can only delete it if it has no child. */
+	if (btrfs_qgroup_level(qgroup->qgroupid)) {
+		if (!list_empty(&qgroup->members))
+			return false;
+		return true;
+	}
+
+	/*
+	 * For level-0 qgroups, we can only delete it if it has no subvolume
+	 * for it.
+	 * This means even a subvolume is unlinked but not yet fully dropped,
+	 * we can not delete the qgroup.
+	 */
+	key.objectid = qgroup->qgroupid;
+	key.type = BTRFS_ROOT_ITEM_KEY;
+	key.offset = -1ULL;
+	path = btrfs_alloc_path();
+	if (!path)
+		return false;
+
+	ret = btrfs_find_root(fs_info->tree_root, &key, path, NULL, NULL);
+	btrfs_free_path(path);
+	if (ret > 0)
+		return true;
+	return false;
 }
 
 int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
@@ -1764,7 +1789,7 @@  int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
 		goto out;
 	}
 
-	if (is_fstree(qgroupid) && qgroup_has_usage(qgroup)) {
+	if (!can_delete_qgroup(fs_info, qgroup)) {
 		ret = -EBUSY;
 		goto out;
 	}
@@ -1789,6 +1814,36 @@  int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
 	}
 
 	spin_lock(&fs_info->qgroup_lock);
+	/*
+	 * Warn on reserved space. The subvolume should has no child nor
+	 * corresponding subvolume.
+	 * Thus its reserved space should all be zero, no matter if qgroup
+	 * is consistent or the mode.
+	 */
+	WARN_ON(qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] ||
+		qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] ||
+		qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS]);
+	/*
+	 * The same for rfer/excl numbers, but that's only if our qgroup is
+	 * consistent and if it's in regular qgroup mode.
+	 * For simple mode it's not as accurate thus we can hit non-zero values
+	 * very frequently.
+	 */
+	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL &&
+	    !(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT)) {
+		if (WARN_ON(qgroup->rfer || qgroup->excl ||
+			    qgroup->rfer_cmpr || qgroup->excl_cmpr)) {
+			btrfs_warn_rl(fs_info,
+"to be deleted qgroup %u/%llu has non-zero numbers, rfer %llu rfer_cmpr %llu excl %llu excl_cmpr %llu",
+				      btrfs_qgroup_level(qgroup->qgroupid),
+				      btrfs_qgroup_subvolid(qgroup->qgroupid),
+				      qgroup->rfer,
+				      qgroup->rfer_cmpr,
+				      qgroup->excl,
+				      qgroup->excl_cmpr);
+			qgroup_mark_inconsistent(fs_info);
+		}
+	}
 	del_qgroup_rb(fs_info, qgroupid);
 	spin_unlock(&fs_info->qgroup_lock);