diff mbox series

[v5] btrfs-progs: add --subvol option to mkfs.btrfs

Message ID 20240808171721.370556-1-maharmstone@fb.com (mailing list archive)
State New, archived
Headers show
Series [v5] btrfs-progs: add --subvol option to mkfs.btrfs | expand

Commit Message

Mark Harmstone Aug. 8, 2024, 5:17 p.m. UTC
This patch adds a --subvol option, which tells mkfs.btrfs to create the
specified directories as subvolumes.

Given a populated directory img, the command

$ mkfs.btrfs --rootdir img --subvol img/usr --subvol img/home --subvol img/home/username /dev/loop0

will create subvolumes usr and home within the FS root, and subvolume
username within the home subvolume. It will fail if any of the
directories do not yet exist.

Signed-off-by: Mark Harmstone <maharmstone@fb.com>
---
 mkfs/main.c                                 | 146 ++++++++++++++++++--
 mkfs/rootdir.c                              | 131 ++++++++++++++----
 mkfs/rootdir.h                              |   9 +-
 tests/mkfs-tests/036-rootdir-subvol/test.sh |  33 +++++
 4 files changed, 277 insertions(+), 42 deletions(-)
 create mode 100755 tests/mkfs-tests/036-rootdir-subvol/test.sh

Changelog:

Patch 2:
* Rebased against upstream changes
* Rewrote so that directory sizes are correct within transactions
* Changed --subvol so that it is relative to cwd rather than rootdir, so
that in future we might allow out-of-tree subvols

Patch 3:
* Changed btrfs_mkfs_fill_dir so it doesn't start a transaction itself
* Moved subvol creation and linking into traverse_directory
* Removed depth calculation code, no longer needed

Patch 4:
* Rebased against upstream changes

Patch 5:
* Removed some useless calls to list_empty

Comments

David Sterba Aug. 9, 2024, 7:31 p.m. UTC | #1
On Thu, Aug 08, 2024 at 06:17:16PM +0100, Mark Harmstone wrote:
> This patch adds a --subvol option, which tells mkfs.btrfs to create the
> specified directories as subvolumes.
> 
> Given a populated directory img, the command
> 
> $ mkfs.btrfs --rootdir img --subvol img/usr --subvol img/home --subvol img/home/username /dev/loop0
> 
> will create subvolumes usr and home within the FS root, and subvolume
> username within the home subvolume. It will fail if any of the
> directories do not yet exist.

Can this be fixed so the intermediate directories are created if they
don't exist? So for example

mkfs.btrfs --rootdir dir --subvolume /var/lib/images

where dir contains only /var. I don't think it's that common but we
should not make users type something can be done programmaticaly.

> Signed-off-by: Mark Harmstone <maharmstone@fb.com>
> ---
>  mkfs/main.c                                 | 146 ++++++++++++++++++--
>  mkfs/rootdir.c                              | 131 ++++++++++++++----
>  mkfs/rootdir.h                              |   9 +-
>  tests/mkfs-tests/036-rootdir-subvol/test.sh |  33 +++++
>  4 files changed, 277 insertions(+), 42 deletions(-)
>  create mode 100755 tests/mkfs-tests/036-rootdir-subvol/test.sh
> 
> Changelog:
> 
> Patch 2:
> * Rebased against upstream changes
> * Rewrote so that directory sizes are correct within transactions
> * Changed --subvol so that it is relative to cwd rather than rootdir, so
> that in future we might allow out-of-tree subvols
> 
> Patch 3:
> * Changed btrfs_mkfs_fill_dir so it doesn't start a transaction itself
> * Moved subvol creation and linking into traverse_directory
> * Removed depth calculation code, no longer needed
> 
> Patch 4:
> * Rebased against upstream changes
> 
> Patch 5:
> * Removed some useless calls to list_empty
> 
> diff --git a/mkfs/main.c b/mkfs/main.c
> index b24b148d..ebf2a9c0 100644
> --- a/mkfs/main.c
> +++ b/mkfs/main.c
> @@ -440,6 +440,7 @@ static const char * const mkfs_usage[] = {
>  	"Creation:",
>  	OPTLINE("-b|--byte-count SIZE", "set size of each device to SIZE (filesystem size is sum of all device sizes)"),
>  	OPTLINE("-r|--rootdir DIR", "copy files from DIR to the image root directory"),
> +	OPTLINE("-u|--subvol SUBDIR", "create SUBDIR as subvolume rather than normal directory"),

Please mention that it can be specified multiple times.

>  	OPTLINE("--shrink", "(with --rootdir) shrink the filled filesystem to minimal size"),
>  	OPTLINE("-K|--nodiscard", "do not perform whole device TRIM"),
>  	OPTLINE("-f|--force", "force overwrite of existing filesystem"),
> @@ -1055,6 +1056,9 @@ int BOX_MAIN(mkfs)(int argc, char **argv)
>  	char *label = NULL;
>  	int nr_global_roots = sysconf(_SC_NPROCESSORS_ONLN);
>  	char *source_dir = NULL;
> +	size_t source_dir_len = 0;
> +	struct rootdir_subvol *rds;

Please use a more descriptive variable name.

> +	LIST_HEAD(subvols);
>  
>  	cpu_detect_flags();
>  	hash_init_accel();
> @@ -1085,6 +1089,7 @@ int BOX_MAIN(mkfs)(int argc, char **argv)
>  			{ "data", required_argument, NULL, 'd' },
>  			{ "version", no_argument, NULL, 'V' },
>  			{ "rootdir", required_argument, NULL, 'r' },
> +			{ "subvol", required_argument, NULL, 'u' },
>  			{ "nodiscard", no_argument, NULL, 'K' },
>  			{ "features", required_argument, NULL, 'O' },
>  			{ "runtime-features", required_argument, NULL, 'R' },
> @@ -1102,7 +1107,7 @@ int BOX_MAIN(mkfs)(int argc, char **argv)
>  			{ NULL, 0, NULL, 0}
>  		};
>  
> -		c = getopt_long(argc, argv, "A:b:fl:n:s:m:d:L:R:O:r:U:VvMKq",
> +		c = getopt_long(argc, argv, "A:b:fl:n:s:m:d:L:R:O:r:U:VvMKqu:",
>  				long_options, NULL);
>  		if (c < 0)
>  			break;
> @@ -1208,6 +1213,22 @@ int BOX_MAIN(mkfs)(int argc, char **argv)
>  				free(source_dir);
>  				source_dir = strdup(optarg);
>  				break;
> +			case 'u': {
> +				struct rootdir_subvol *s;

Please avoid sinle letter variables (except 'i' for indexing).

> +
> +				s = malloc(sizeof(struct rootdir_subvol));
> +				if (!s) {
> +					error("out of memory");

There's the error_msg ERROR_MSG_MEMORY template for errors, please use it.

> +					ret = 1;
> +					goto error;
> +				}
> +
> +				s->dir = strdup(optarg);
> +				s->full_path = NULL;
> +
> +				list_add_tail(&s->list, &subvols);
> +				break;
> +				}
>  			case 'U':
>  				strncpy_null(fs_uuid, optarg, BTRFS_UUID_UNPARSED_SIZE);
>  				break;
> @@ -1272,6 +1293,71 @@ int BOX_MAIN(mkfs)(int argc, char **argv)
>  		ret = 1;
>  		goto error;
>  	}
> +	if (!list_empty(&subvols) && source_dir == NULL) {
> +		error("the option --subvol must be used with --rootdir");

This may be also allowed, e.g. for use case that creates the skeleton of
the subvolumes but does not fill it with files that can be later
copied over the existing structure. As this may be more comples doing
that in another patch is ok.

> +		ret = 1;
> +		goto error;
> +	}
> +
> +	if (source_dir) {
> +		char *canonical = realpath(source_dir, NULL);
> +
> +		if (!canonical) {
> +			error("could not get canonical path to %s", source_dir);
> +			ret = 1;
> +			goto error;
> +		}
> +
> +		free(source_dir);
> +		source_dir = canonical;
> +		source_dir_len = strlen(source_dir);
> +	}
> +
> +	list_for_each_entry(rds, &subvols, list) {
> +		char *path;
> +		struct rootdir_subvol *rds2;
> +
> +		if (!path_exists(rds->dir)) {
> +			error("subvol %s does not exist", rds->dir);

In error messages please use 'subvolume'

> +			ret = 1;
> +			goto error;
> +		}
> +
> +		if (!path_is_dir(rds->dir)) {
> +			error("subvol %s is not a directory", rds->dir);
> +			ret = 1;
> +			goto error;
> +		}
> +
> +		path = realpath(rds->dir, NULL);
> +
> +		if (!path) {
> +			error("could not get canonical path to %s", rds->dir);
> +			ret = 1;
> +			goto error;
> +		}
> +
> +		rds->full_path = path;
> +
> +		if (strlen(path) < source_dir_len + 1 ||
> +		    memcmp(path, source_dir, source_dir_len) ||

Please use explicit == 0 or != 0 for memcmp (and strcmp).

> +		    path[source_dir_len] != '/') {
> +			error("subvol %s is not a child of %s", rds->dir,
> +			      source_dir);
> +			ret = 1;
> +			goto error;
> +		}
> +
> +		for (rds2 = list_first_entry(&subvols, struct rootdir_subvol, list);
> +		     rds2 != rds; rds2 = list_next_entry(rds2, list)) {
> +			if (!strcmp(rds2->full_path, path)) {
> +				error("subvol %s specified more than once",
> +					rds->dir);

I wonder if this should be a hard error, mentioning the subvolume twice
does not change anything and it's slightly more convenient for example
when the subvolume list is generated

> +				ret = 1;
> +				goto error;
> +			}
> +		}
> +	}
>  
>  	if (*fs_uuid) {
>  		uuid_t dummy_uuid;
> @@ -1821,24 +1907,37 @@ raid_groups:
>  		error_msg(ERROR_MSG_START_TRANS, "%m");
>  		goto out;
>  	}
> -	ret = btrfs_rebuild_uuid_tree(fs_info);
> -	if (ret < 0)
> -		goto out;
> -
> -	ret = cleanup_temp_chunks(fs_info, &allocation, data_profile,
> -				  metadata_profile, metadata_profile);
> -	if (ret < 0) {
> -		error("failed to cleanup temporary chunks: %d", ret);
> -		goto out;
> -	}
>  
>  	if (source_dir) {
>  		pr_verbose(LOG_DEFAULT, "Rootdir from:       %s\n", source_dir);
> -		ret = btrfs_mkfs_fill_dir(source_dir, root);
> +
> +		trans = btrfs_start_transaction(root, 1);
> +		if (IS_ERR(trans)) {
> +			errno = -PTR_ERR(trans);
> +			error_msg(ERROR_MSG_START_TRANS, "%m");
> +			goto out;
> +		}
> +
> +		ret = btrfs_mkfs_fill_dir(trans, source_dir, root,
> +					  &subvols);
>  		if (ret) {
>  			error("error while filling filesystem: %d", ret);
> +			btrfs_abort_transaction(trans, ret);
> +			goto out;
> +		}
> +
> +		ret = btrfs_commit_transaction(trans, root);
> +		if (ret) {
> +			errno = -ret;
> +			error_msg(ERROR_MSG_COMMIT_TRANS, "%m");
>  			goto out;
>  		}
> +
> +		list_for_each_entry(rds, &subvols, list) {
> +			pr_verbose(LOG_DEFAULT, "  Subvol from:      %s\n",

The messages is probably copied from the 'Rootdir from' but here the
'from' does not make sense as it's only creating the subvolume, not
copying files "from" it.

> +				   rds->full_path);
> +		}
> +
>  		if (shrink_rootdir) {
>  			pr_verbose(LOG_DEFAULT, "  Shrink:           yes\n");
>  			ret = btrfs_mkfs_shrink_fs(fs_info, &shrink_size,
> @@ -1853,6 +1952,17 @@ raid_groups:
>  		}
>  	}
>  
> +	ret = btrfs_rebuild_uuid_tree(fs_info);
> +	if (ret < 0)
> +		goto out;
> +
> +	ret = cleanup_temp_chunks(fs_info, &allocation, data_profile,
> +				  metadata_profile, metadata_profile);
> +	if (ret < 0) {
> +		error("failed to cleanup temporary chunks: %d", ret);
> +		goto out;
> +	}
> +
>  	if (features.runtime_flags & BTRFS_FEATURE_RUNTIME_QUOTA ||
>  	    features.incompat_flags & BTRFS_FEATURE_INCOMPAT_SIMPLE_QUOTA) {
>  		ret = setup_quota_root(fs_info);
> @@ -1946,6 +2056,18 @@ error:
>  	free(label);
>  	free(source_dir);
>  
> +	while (!list_empty(&subvols)) {
> +		struct rootdir_subvol *head = list_entry(subvols.next,
> +					      struct rootdir_subvol,
> +					      list);
> +
> +		free(head->dir);
> +		free(head->full_path);
> +
> +		list_del(&head->list);
> +		free(head);
> +	}
> +
>  	return !!ret;
>  
>  success:
> diff --git a/mkfs/rootdir.c b/mkfs/rootdir.c
> index 05787dc3..8f5658e1 100644
> --- a/mkfs/rootdir.c
> +++ b/mkfs/rootdir.c
> @@ -40,6 +40,8 @@
>  #include "common/messages.h"
>  #include "common/utils.h"
>  #include "common/extent-tree-utils.h"
> +#include "common/root-tree-utils.h"
> +#include "common/path-utils.h"
>  #include "mkfs/rootdir.h"
>  
>  static u32 fs_block_size;
> @@ -68,6 +70,7 @@ static u64 ftw_data_size;
>  struct inode_entry {
>  	/* The inode number inside btrfs. */
>  	u64 ino;
> +	struct btrfs_root *root;
>  	struct list_head list;
>  };
>  
> @@ -91,6 +94,8 @@ static struct rootdir_path current_path = {
>  };
>  
>  static struct btrfs_trans_handle *g_trans = NULL;
> +static struct list_head *g_subvols;
> +static u64 next_subvol_id = BTRFS_FIRST_FREE_OBJECTID;
>  
>  static inline struct inode_entry *rootdir_path_last(struct rootdir_path *path)
>  {
> @@ -111,13 +116,15 @@ static void rootdir_path_pop(struct rootdir_path *path)
>  	free(last);
>  }
>  
> -static int rootdir_path_push(struct rootdir_path *path, u64 ino)
> +static int rootdir_path_push(struct rootdir_path *path, struct btrfs_root *root,
> +			     u64 ino)
>  {
>  	struct inode_entry *new;
>  
>  	new = malloc(sizeof(*new));
>  	if (!new)
>  		return -ENOMEM;
> +	new->root = root;
>  	new->ino = ino;
>  	list_add_tail(&new->list, &path->inode_list);
>  	path->level++;
> @@ -409,13 +416,83 @@ static u8 ftype_to_btrfs_type(mode_t ftype)
>  	return BTRFS_FT_UNKNOWN;
>  }
>  
> +static int ftw_add_subvol(const char *full_path, const struct stat *st,
> +			  int typeflag, struct FTW *ftwbuf,
> +			  struct rootdir_subvol *s)

No single letter parameters

> +{
> +	int ret;
> +	struct btrfs_key key;
> +	struct btrfs_root *new_root;
> +	struct inode_entry *parent;
> +	struct btrfs_inode_item inode_item = { 0 };
> +	u64 subvol_id, ino;
> +
> +	subvol_id = next_subvol_id++;
> +
> +	ret = btrfs_make_subvolume(g_trans, subvol_id);
> +	if (ret < 0) {
> +		error("failed to create subvolume: %d", ret);
> +		return ret;
> +	}
> +
> +	key.objectid = subvol_id;
> +	key.type = BTRFS_ROOT_ITEM_KEY;
> +	key.offset = (u64)-1;
> +
> +	new_root = btrfs_read_fs_root(g_trans->fs_info, &key);
> +	if (IS_ERR(new_root)) {
> +		error("unable to fs read root: %lu", PTR_ERR(new_root));
> +		return -PTR_ERR(new_root);
> +	}
> +
> +	parent = rootdir_path_last(&current_path);
> +
> +	ret = btrfs_link_subvolume(g_trans, parent->root, parent->ino,
> +				   path_basename(s->full_path),
> +				   strlen(path_basename(s->full_path)), new_root);
> +	if (ret) {
> +		error("unable to link subvolume %s", path_basename(s->full_path));
> +		return ret;
> +	}
> +
> +	ino = btrfs_root_dirid(&new_root->root_item);
> +
> +	ret = add_xattr_item(g_trans, new_root, ino, full_path);
> +	if (ret < 0) {
> +		errno = -ret;
> +		error("failed to add xattr item for the top level inode in subvol %llu: %m",

Please also add the full_path to the error message

> +		      subvol_id);
> +		return ret;
> +	}
> +	stat_to_inode_item(&inode_item, st);
> +
> +	btrfs_set_stack_inode_nlink(&inode_item, 1);
> +	ret = update_inode_item(g_trans, new_root, &inode_item, ino);
> +	if (ret < 0) {
> +		errno = -ret;
> +		error("failed to update root dir for root %llu: %m", subvol_id);
> +		return ret;
> +	}
> +
> +	ret = rootdir_path_push(&current_path, new_root, ino);
> +	if (ret < 0) {
> +		errno = -ret;
> +		error("failed to allocate new entry for subvol %llu ('%s'): %m",
> +		      subvol_id, full_path);
> +		return ret;
> +	}
> +
> +	return 0;
> +}
> +
>  static int ftw_add_inode(const char *full_path, const struct stat *st,
>  			 int typeflag, struct FTW *ftwbuf)
>  {
>  	struct btrfs_fs_info *fs_info = g_trans->fs_info;
> -	struct btrfs_root *root = fs_info->fs_root;
> +	struct btrfs_root *root;
>  	struct btrfs_inode_item inode_item = { 0 };
>  	struct inode_entry *parent;
> +	struct rootdir_subvol *rds;
>  	u64 ino;
>  	int ret;
>  
> @@ -436,7 +513,10 @@ static int ftw_add_inode(const char *full_path, const struct stat *st,
>  
>  	/* The rootdir itself. */
>  	if (unlikely(ftwbuf->level == 0)) {
> -		u64 root_ino = btrfs_root_dirid(&root->root_item);
> +		u64 root_ino;
> +
> +		root = fs_info->fs_root;
> +		root_ino = btrfs_root_dirid(&root->root_item);
>  
>  		UASSERT(S_ISDIR(st->st_mode));
>  		UASSERT(current_path.level == 0);
> @@ -462,7 +542,7 @@ static int ftw_add_inode(const char *full_path, const struct stat *st,
>  		}
>  
>  		/* Push (and initialize) the rootdir directory into the stack. */
> -		ret = rootdir_path_push(&current_path,
> +		ret = rootdir_path_push(&current_path, root,
>  					btrfs_root_dirid(&root->root_item));
>  		if (ret < 0) {
>  			errno = -ret;
> @@ -511,6 +591,18 @@ static int ftw_add_inode(const char *full_path, const struct stat *st,
>  	while (current_path.level > ftwbuf->level)
>  		rootdir_path_pop(&current_path);
>  
> +	if (S_ISDIR(st->st_mode)) {
> +		list_for_each_entry(rds, g_subvols, list) {
> +			if (!strcmp(full_path, rds->full_path)) {

Please use == 0 or != for stcmp.

> +				return ftw_add_subvol(full_path, st, typeflag,
> +						      ftwbuf, rds);
> +			}
> +		}
> +	}
> +
> +	parent = rootdir_path_last(&current_path);
> +	root = parent->root;
> +
>  	ret = btrfs_find_free_objectid(g_trans, root,
>  				       BTRFS_FIRST_FREE_OBJECTID, &ino);
>  	if (ret < 0) {
> @@ -529,7 +621,6 @@ static int ftw_add_inode(const char *full_path, const struct stat *st,
>  		return ret;
>  	}
>  
> -	parent = rootdir_path_last(&current_path);
>  	ret = btrfs_add_link(g_trans, root, ino, parent->ino,
>  			     full_path + ftwbuf->base,
>  			     strlen(full_path) - ftwbuf->base,
> @@ -556,7 +647,7 @@ static int ftw_add_inode(const char *full_path, const struct stat *st,
>  		return ret;
>  	}
>  	if (S_ISDIR(st->st_mode)) {
> -		ret = rootdir_path_push(&current_path, ino);
> +		ret = rootdir_path_push(&current_path, root, ino);
>  		if (ret < 0) {
>  			errno = -ret;
>  			error("failed to allocate new entry for inode %llu ('%s'): %m",
> @@ -597,49 +688,31 @@ static int ftw_add_inode(const char *full_path, const struct stat *st,
>  	return 0;
>  };
>  
> -int btrfs_mkfs_fill_dir(const char *source_dir, struct btrfs_root *root)
> +int btrfs_mkfs_fill_dir(struct btrfs_trans_handle *trans, const char *source_dir,
> +			struct btrfs_root *root, struct list_head *subvols)
>  {
>  	int ret;
> -	struct btrfs_trans_handle *trans;
>  	struct stat root_st;
>  
>  	ret = lstat(source_dir, &root_st);
>  	if (ret) {
>  		error("unable to lstat %s: %m", source_dir);
> -		ret = -errno;
> -		goto out;
> -	}
> -
> -	trans = btrfs_start_transaction(root, 1);
> -	if (IS_ERR(trans)) {
> -		ret = PTR_ERR(trans);
> -		errno = -ret;
> -		error_msg(ERROR_MSG_START_TRANS, "%m");
> -		goto fail;
> +		return -errno;
>  	}
>  
>  	g_trans = trans;
> +	g_subvols = subvols;
>  	INIT_LIST_HEAD(&current_path.inode_list);
>  
>  	ret = nftw(source_dir, ftw_add_inode, 32, FTW_PHYS);
>  	if (ret) {
>  		error("unable to traverse directory %s: %d", source_dir, ret);
> -		goto fail;
> -	}
> -	ret = btrfs_commit_transaction(trans, root);
> -	if (ret) {
> -		errno = -ret;
> -		error_msg(ERROR_MSG_COMMIT_TRANS, "%m");
> -		goto out;
> +		return ret;
>  	}
>  	while (current_path.level > 0)
>  		rootdir_path_pop(&current_path);
>  
>  	return 0;
> -fail:
> -	btrfs_abort_transaction(trans, ret);
> -out:
> -	return ret;
>  }
>  
>  static int ftw_add_entry_size(const char *fpath, const struct stat *st,
> diff --git a/mkfs/rootdir.h b/mkfs/rootdir.h
> index 4233431a..128e9e09 100644
> --- a/mkfs/rootdir.h
> +++ b/mkfs/rootdir.h
> @@ -28,7 +28,14 @@
>  struct btrfs_fs_info;
>  struct btrfs_root;
>  
> -int btrfs_mkfs_fill_dir(const char *source_dir, struct btrfs_root *root);
> +struct rootdir_subvol {
> +	struct list_head list;
> +	char *dir;
> +	char *full_path;
> +};
> +
> +int btrfs_mkfs_fill_dir(struct btrfs_trans_handle *trans, const char *source_dir,
> +			struct btrfs_root *root, struct list_head *subvols);
>  u64 btrfs_mkfs_size_dir(const char *dir_name, u32 sectorsize, u64 min_dev_size,
>  			u64 meta_profile, u64 data_profile);
>  int btrfs_mkfs_shrink_fs(struct btrfs_fs_info *fs_info, u64 *new_size_ret,
> diff --git a/tests/mkfs-tests/036-rootdir-subvol/test.sh b/tests/mkfs-tests/036-rootdir-subvol/test.sh
> new file mode 100755
> index 00000000..ccd6893f
> --- /dev/null
> +++ b/tests/mkfs-tests/036-rootdir-subvol/test.sh
> @@ -0,0 +1,33 @@
> +#!/bin/bash
> +# smoke test for mkfs.btrfs --subvol option
> +
> +source "$TEST_TOP/common" || exit
> +
> +check_prereq mkfs.btrfs
> +check_prereq btrfs
> +
> +setup_root_helper
> +prepare_test_dev
> +
> +tmp=$(_mktemp_dir mkfs-rootdir)
> +
> +touch $tmp/foo
> +mkdir $tmp/dir
> +mkdir $tmp/dir/subvol
> +touch $tmp/dir/subvol/bar

Please quote all shell variables
(https://github.com/kdave/btrfs-progs/tree/devel/tests#coding-style-best-practices)

> +
> +run_check_mkfs_test_dev --rootdir "$tmp" --subvol "$tmp/dir/subvol"
> +run_check $SUDO_HELPER "$TOP/btrfs" check "$TEST_DEV"
> +
> +run_check_mount_test_dev
> +run_check_stdout $SUDO_HELPER "$TOP/btrfs" subvolume list "$TEST_MNT" | \
> +	cut -d\  -f9 > "$tmp/output"
> +run_check_umount_test_dev
> +
> +result=$(cat "$tmp/output")
> +
> +if [ "$result" != "dir/subvol" ]; then
> +	_fail "dir/subvol not in subvolume list"
> +fi
> +
> +rm -rf -- "$tmp"
> -- 
> 2.44.2
> 
>
Goffredo Baroncelli Aug. 10, 2024, 9:53 a.m. UTC | #2
On 09/08/2024 21.31, David Sterba wrote:
> On Thu, Aug 08, 2024 at 06:17:16PM +0100, Mark Harmstone wrote:
>> This patch adds a --subvol option, which tells mkfs.btrfs to create the
>> specified directories as subvolumes.
>>
>> Given a populated directory img, the command
>>
>> $ mkfs.btrfs --rootdir img --subvol img/usr --subvol img/home --subvol img/home/username /dev/loop0
>>
>> will create subvolumes usr and home within the FS root, and subvolume
>> username within the home subvolume. It will fail if any of the
>> directories do not yet exist.
> 
> Can this be fixed so the intermediate directories are created if they
> don't exist? So for example
> 
> mkfs.btrfs --rootdir dir --subvolume /var/lib/images
> 
> where dir contains only /var. I don't think it's that common but we
> should not make users type something can be done programmaticaly.

Can we go a bit further ? I mean get rid of --rootdir completely, so
a filesystem with "a default" subvolume can be created without
passing --rootdir.

However, this would lead to the queston: which user and permission has to be set to those subvolumes ?
So I think that we need a further parameter like "--subvol-perm" and "--subvol-owner"...

BR
Qu Wenruo Aug. 10, 2024, 10:40 p.m. UTC | #3
在 2024/8/10 19:23, Goffredo Baroncelli 写道:
> On 09/08/2024 21.31, David Sterba wrote:
>> On Thu, Aug 08, 2024 at 06:17:16PM +0100, Mark Harmstone wrote:
>>> This patch adds a --subvol option, which tells mkfs.btrfs to create the
>>> specified directories as subvolumes.
>>>
>>> Given a populated directory img, the command
>>>
>>> $ mkfs.btrfs --rootdir img --subvol img/usr --subvol img/home
>>> --subvol img/home/username /dev/loop0
>>>
>>> will create subvolumes usr and home within the FS root, and subvolume
>>> username within the home subvolume. It will fail if any of the
>>> directories do not yet exist.
>>
>> Can this be fixed so the intermediate directories are created if they
>> don't exist? So for example
>>
>> mkfs.btrfs --rootdir dir --subvolume /var/lib/images
>>
>> where dir contains only /var. I don't think it's that common but we
>> should not make users type something can be done programmaticaly.
>
> Can we go a bit further ? I mean get rid of --rootdir completely, so
> a filesystem with "a default" subvolume can be created without
> passing --rootdir.

Personally speaking, I would prefer the current scheme way more than the
out of tree subvolumes.

It's super easy to have something like this:

rootdir
|- dir1
|- dir2

Then you specify "--rootdir rootdir and --subvolume /somewhereelse/dir1"

This is going to lead filename conflicts and mostly an EEXIST to end the
operation.


 From my understand, the "--rootdir" along with "--subvol" is mostly
used to populate a fs image for distro building.

If you really want just a single subvolume, why won't "--rootdir rootdir
--subvol dir1" work for you?

If your only goal is to reduce parameters, then your next question is
already answering why the idea is a bad one.

>
> However, this would lead to the queston: which user and permission has
> to be set to those subvolumes ?
> So I think that we need a further parameter like "--subvol-perm" and
> "--subvol-owner"...

Nope, the current code is already handling that way better.
The user/owner and modes are from the rootdir.

Meanwhile your idea is just asking for extra problems.

Thanks,
Qu

>
> BR
>
Goffredo Baroncelli Aug. 12, 2024, 11:42 a.m. UTC | #4
On 11/08/2024 00.40, Qu Wenruo wrote:
> 
> 
> 在 2024/8/10 19:23, Goffredo Baroncelli 写道:
>> On 09/08/2024 21.31, David Sterba wrote:
>>> On Thu, Aug 08, 2024 at 06:17:16PM +0100, Mark Harmstone wrote:
>>>> This patch adds a --subvol option, which tells mkfs.btrfs to create the
>>>> specified directories as subvolumes.
>>>>
>>>> Given a populated directory img, the command
>>>>
>>>> $ mkfs.btrfs --rootdir img --subvol img/usr --subvol img/home
>>>> --subvol img/home/username /dev/loop0
>>>>
>>>> will create subvolumes usr and home within the FS root, and subvolume
>>>> username within the home subvolume. It will fail if any of the
>>>> directories do not yet exist.
>>>
>>> Can this be fixed so the intermediate directories are created if they
>>> don't exist? So for example
>>>
>>> mkfs.btrfs --rootdir dir --subvolume /var/lib/images
>>>
>>> where dir contains only /var. I don't think it's that common but we
>>> should not make users type something can be done programmaticaly.
>>
>> Can we go a bit further ? I mean get rid of --rootdir completely, so
>> a filesystem with "a default" subvolume can be created without
>> passing --rootdir.
> 
> Personally speaking, I would prefer the current scheme way more than the
> out of tree subvolumes.
> 
> It's super easy to have something like this:
> 
> rootdir
> |- dir1
> |- dir2
> 
> Then you specify "--rootdir rootdir and --subvolume /somewhereelse/dir1"
> 
> This is going to lead filename conflicts and mostly an EEXIST to end the
> operation.

I am not sure to fully understand to what you means as "filename conflicts";
anyhow, now you have the ENOEXIST problem :-)

> 
> 
>  From my understand, the "--rootdir" along with "--subvol" is mostly
> used to populate a fs image for distro building.
> 
> If you really want just a single subvolume, why won't "--rootdir rootdir
> --subvol dir1" work for you?
> 
> If your only goal is to reduce parameters, then your next question is
> already answering why the idea is a bad one.


The use case that I have in my mind is to create a filesystem with a default
non root sub-volume, and nothing more. This would prevent the typical problem
that you face when you populate the root-subvolume, then snapshot it and then
revert to an old snapshot, swapping the subvolumes.
It is not easy to swap the root subvolume, because it is a special in the sense
that it cannot be deleted and it is the root of all subvolumes.

Being so I think that it is better to avoid to have the root subvolume as
default sub-volume.

For my goal (having a filesystem with a non root default sub-volume) creating a
template filesystem is waste of effort.

Now these two use cases (my one, and the one related to this patch) have a lot
in common. So I trying to find a way so the two can be joined.

> 
>>
>> However, this would lead to the queston: which user and permission has
>> to be set to those subvolumes ?
>> So I think that we need a further parameter like "--subvol-perm" and
>> "--subvol-owner"...
> 
> Nope, the current code is already handling that way better.
> The user/owner and modes are from the rootdir.
> 
> Meanwhile your idea is just asking for extra problems.
> 
> Thanks,
> Qu
> 
>>
>> BR
>>
Mark Harmstone Aug. 12, 2024, 2:39 p.m. UTC | #5
As Qu said, this patch is just for in-tree subvols. Having out-of-tree 
subvols is something I do intend to look at, but it's sufficiently 
complicated that it would need a separate patch. You have to worry about 
name collisions, missing intermediary directories, etc.

> On 09/08/2024 21.31, David Sterba wrote:
> Can this be fixed so the intermediate directories are created if they
> don't exist? So for example
>
> mkfs.btrfs --rootdir dir --subvolume /var/lib/images
>
> where dir contains only /var. I don't think it's that common but we
> should not make users type something can be done programmaticaly.

--subvol is relative to the working directory, not the root dir, so if 
you tried this you'd get an error that /var/lib/images wasn't a child of 
dir.

Mark
Qu Wenruo Aug. 12, 2024, 10:08 p.m. UTC | #6
在 2024/8/12 21:12, Goffredo Baroncelli 写道:
> On 11/08/2024 00.40, Qu Wenruo wrote:
[...]
>>
>> Personally speaking, I would prefer the current scheme way more than the
>> out of tree subvolumes.
>>
>> It's super easy to have something like this:
>>
>> rootdir
>> |- dir1
>> |- dir2
>>
>> Then you specify "--rootdir rootdir and --subvolume /somewhereelse/dir1"
>>
>> This is going to lead filename conflicts and mostly an EEXIST to end the
>> operation.
>
> I am not sure to fully understand to what you means as "filename
> conflicts";
> anyhow, now you have the ENOEXIST problem :-)

I mean, if you support specifying a out of rootdir subvolume, along with
rootdir, then it's going to cause conflicts (the file from rootdir with
the same filename conflicts with the out-of-rootdir subvolume)
>
>>
>>
>>  From my understand, the "--rootdir" along with "--subvol" is mostly
>> used to populate a fs image for distro building.
>>
>> If you really want just a single subvolume, why won't "--rootdir rootdir
>> --subvol dir1" work for you?
>>
>> If your only goal is to reduce parameters, then your next question is
>> already answering why the idea is a bad one.
>
>
> The use case that I have in my mind is to create a filesystem with a
> default
> non root sub-volume, and nothing more.

You ignored all the things like owner/group/privilege bits and maybe
even xattrs (for SELINUX) that will be needed.


> This would prevent the typical
> problem
> that you face when you populate the root-subvolume, then snapshot it and
> then
> revert to an old snapshot, swapping the subvolumes.
> It is not easy to swap the root subvolume, because it is a special in
> the sense
> that it cannot be deleted and it is the root of all subvolumes.

A seemingly simple solution is not always that simple.

And I can always argue that, for your "simple" subvolume case, why not
just mount it and create one?
Especially that one needs to set the default subvolume of the fs anyway.

>
> Being so I think that it is better to avoid to have the root subvolume as
> default sub-volume.
>
> For my goal (having a filesystem with a non root default sub-volume)
> creating a
> template filesystem is waste of effort.
>
> Now these two use cases (my one, and the one related to this patch) have
> a lot
> in common. So I trying to find a way so the two can be joined.

Unfortunately, as long as you're creating a subvolume, you need all the
details you ignored (which makes the use case looking simple).

In that case, it's just a subset of the feature of Mark's patch.

Meanwhile Mark's solution provides all the information needed, thus I
see no reason to introduce extra interfaces especially when Mark's
solution is already a superset.

Thanks,
Qu
>
>>
>>>
>>> However, this would lead to the queston: which user and permission has
>>> to be set to those subvolumes ?
>>> So I think that we need a further parameter like "--subvol-perm" and
>>> "--subvol-owner"...
>>
>> Nope, the current code is already handling that way better.
>> The user/owner and modes are from the rootdir.
>>
>> Meanwhile your idea is just asking for extra problems.
>>
>> Thanks,
>> Qu
>>
>>>
>>> BR
>>>
>
>
Anand Jain Aug. 13, 2024, 12:52 a.m. UTC | #7
Nice feature.

>> +	OPTLINE("-u|--subvol SUBDIR", "create SUBDIR as subvolume rather than normal directory"),
> 
> Please mention that it can be specified multiple times.

Nitpick..

Should '--subvol' be '--subvolume' for consistency? use the full names.

Thx.
Mark Harmstone Aug. 13, 2024, 10:38 a.m. UTC | #8
On 13/8/24 01:52, Anand Jain wrote:
> > 
> 
> Nice feature.
> 
>>> +    OPTLINE("-u|--subvol SUBDIR", "create SUBDIR as subvolume rather 
>>> than normal directory"),
>>
>> Please mention that it can be specified multiple times.
> 
> Nitpick..
> 
> Should '--subvol' be '--subvolume' for consistency? use the full names.
> 
> Thx.

Maybe, but we have --rootdir rather than --rootdirectory.

Mark
David Sterba Aug. 14, 2024, 9:47 p.m. UTC | #9
On Tue, Aug 13, 2024 at 08:52:04AM +0800, Anand Jain wrote:
> 
> Nice feature.
> 
> >> +	OPTLINE("-u|--subvol SUBDIR", "create SUBDIR as subvolume rather than normal directory"),
> > 
> > Please mention that it can be specified multiple times.
> 
> Nitpick..
> 
> Should '--subvol' be '--subvolume' for consistency? use the full names.

For convenience both can be accepted, there are aliases for options. The
subcommand names are recognized if abbreviated and still unique, I'm not
sure getopt does the same for long options, if not adding another line
to define another name is no problem.
David Sterba Aug. 14, 2024, 9:57 p.m. UTC | #10
On Tue, Aug 13, 2024 at 07:38:22AM +0930, Qu Wenruo wrote:
> 在 2024/8/12 21:12, Goffredo Baroncelli 写道:
> > On 11/08/2024 00.40, Qu Wenruo wrote:
> [...]
> >>
> >> Personally speaking, I would prefer the current scheme way more than the
> >> out of tree subvolumes.
> >>
> >> It's super easy to have something like this:
> >>
> >> rootdir
> >> |- dir1
> >> |- dir2
> >>
> >> Then you specify "--rootdir rootdir and --subvolume /somewhereelse/dir1"
> >>
> >> This is going to lead filename conflicts and mostly an EEXIST to end the
> >> operation.
> >
> > I am not sure to fully understand to what you means as "filename
> > conflicts";
> > anyhow, now you have the ENOEXIST problem :-)
> 
> I mean, if you support specifying a out of rootdir subvolume, along with
> rootdir, then it's going to cause conflicts (the file from rootdir with
> the same filename conflicts with the out-of-rootdir subvolume)
> >
> >>
> >>
> >>  From my understand, the "--rootdir" along with "--subvol" is mostly
> >> used to populate a fs image for distro building.
> >>
> >> If you really want just a single subvolume, why won't "--rootdir rootdir
> >> --subvol dir1" work for you?
> >>
> >> If your only goal is to reduce parameters, then your next question is
> >> already answering why the idea is a bad one.
> >
> >
> > The use case that I have in my mind is to create a filesystem with a
> > default
> > non root sub-volume, and nothing more.
> 
> You ignored all the things like owner/group/privilege bits and maybe
> even xattrs (for SELINUX) that will be needed.

There are probably two main use cases:

- create something simple, with files copied from a directory and now
  newly also to make some directories subvolumes

- create a filesystem where any detail can be specified, like uid/gid,
  access rights, ACL, XATTR, ...

So far the focus is on option 1 and it seems we have the most common use
covered. For option 2 the command line options are probably the wron
interface.

I suggest to use an approach using a list of definitions, e.g. in a
file, where each line says what should be done on the filesystem. This
is not a new idea, XFS has a protofile, which is ancient and we don't
have to copy the exact format, but it's the same idea.

The structure is like this:
  COMMAND OPTIONS...

and we can define anything, like "ROOTDIR dir" that will mimick the
--rootdir option, but also "CHMOD RWX PATH", to do mkfs-emulated
"chmod RWX PATH". And custom commands like default subvolume.

This is obviously more work than option 1 so does not need to be
implemented right now. For the command-line options the most common use
case should work. If we allow to take the proto file from stdin it can
be created on the fly too.
Qu Wenruo Aug. 14, 2024, 10:11 p.m. UTC | #11
在 2024/8/15 07:27, David Sterba 写道:
[...]
>>
>> You ignored all the things like owner/group/privilege bits and maybe
>> even xattrs (for SELINUX) that will be needed.
>
> There are probably two main use cases:
>
> - create something simple, with files copied from a directory and now
>    newly also to make some directories subvolumes
>
> - create a filesystem where any detail can be specified, like uid/gid,
>    access rights, ACL, XATTR, ...
>
> So far the focus is on option 1 and it seems we have the most common use
> covered. For option 2 the command line options are probably the wron
> interface.

Nope, the current implementation (inherited from the --rootdir option)
handles both.

We have everything, except the hardlinks, copied from the rootdir,
including uid/gid/access mode, XATTR.

>
> I suggest to use an approach using a list of definitions, e.g. in a
> file, where each line says what should be done on the filesystem. This
> is not a new idea, XFS has a protofile, which is ancient and we don't
> have to copy the exact format, but it's the same idea.

I think it's asking for problems that is already resolved.

The current rootdir is already providing every functionality, except
subvolume related ones (subvolume, snapshot, default subvolume etc).

And Mark's excellent work handles the missing subvolume part, and he is
also working on the default subvolume part too.

So far I really see no reason to introduce a completely new and complex
(and conflicting with --rootdir) interface, while everything can be done
by --rootdir + --subvol already.

The only exception is snapshot, but for mkosi usage, I didn't see much
need for a snapshot.

Thanks,
Qu

>
> The structure is like this:
>    COMMAND OPTIONS...
>
> and we can define anything, like "ROOTDIR dir" that will mimick the
> --rootdir option, but also "CHMOD RWX PATH", to do mkfs-emulated
> "chmod RWX PATH". And custom commands like default subvolume.
>
> This is obviously more work than option 1 so does not need to be
> implemented right now. For the command-line options the most common use
> case should work. If we allow to take the proto file from stdin it can
> be created on the fly too.
David Sterba Aug. 14, 2024, 10:20 p.m. UTC | #12
On Thu, Aug 15, 2024 at 07:41:03AM +0930, Qu Wenruo wrote:
> 
> 
> 在 2024/8/15 07:27, David Sterba 写道:
> [...]
> >>
> >> You ignored all the things like owner/group/privilege bits and maybe
> >> even xattrs (for SELINUX) that will be needed.
> >
> > There are probably two main use cases:
> >
> > - create something simple, with files copied from a directory and now
> >    newly also to make some directories subvolumes
> >
> > - create a filesystem where any detail can be specified, like uid/gid,
> >    access rights, ACL, XATTR, ...
> >
> > So far the focus is on option 1 and it seems we have the most common use
> > covered. For option 2 the command line options are probably the wron
> > interface.
> 
> Nope, the current implementation (inherited from the --rootdir option)
> handles both.
> 
> We have everything, except the hardlinks, copied from the rootdir,
> including uid/gid/access mode, XATTR.

For rootdir it copies exactly the uid/git values. The use case I have in
mind is to allow to change it to eg. root:root although the source files
are onwed by a "user:group".

> > I suggest to use an approach using a list of definitions, e.g. in a
> > file, where each line says what should be done on the filesystem. This
> > is not a new idea, XFS has a protofile, which is ancient and we don't
> > have to copy the exact format, but it's the same idea.
> 
> I think it's asking for problems that is already resolved.
> 
> The current rootdir is already providing every functionality, except
> subvolume related ones (subvolume, snapshot, default subvolume etc).
> 
> And Mark's excellent work handles the missing subvolume part, and he is
> also working on the default subvolume part too.
> 
> So far I really see no reason to introduce a completely new and complex
> (and conflicting with --rootdir) interface, while everything can be done
> by --rootdir + --subvol already.
> 
> The only exception is snapshot, but for mkosi usage, I didn't see much
> need for a snapshot.

I don't agree that everything is possible with --rootdir, like the
example above with the different user:group. Setting a root user
requires root but for cases where the image is built by an unprivileged
user it does not work. The difference is between actually modifying the
existing file (e.g. under SElinux or similar), but allowing the user to
create the filesystem image without limitations.
diff mbox series

Patch

diff --git a/mkfs/main.c b/mkfs/main.c
index b24b148d..ebf2a9c0 100644
--- a/mkfs/main.c
+++ b/mkfs/main.c
@@ -440,6 +440,7 @@  static const char * const mkfs_usage[] = {
 	"Creation:",
 	OPTLINE("-b|--byte-count SIZE", "set size of each device to SIZE (filesystem size is sum of all device sizes)"),
 	OPTLINE("-r|--rootdir DIR", "copy files from DIR to the image root directory"),
+	OPTLINE("-u|--subvol SUBDIR", "create SUBDIR as subvolume rather than normal directory"),
 	OPTLINE("--shrink", "(with --rootdir) shrink the filled filesystem to minimal size"),
 	OPTLINE("-K|--nodiscard", "do not perform whole device TRIM"),
 	OPTLINE("-f|--force", "force overwrite of existing filesystem"),
@@ -1055,6 +1056,9 @@  int BOX_MAIN(mkfs)(int argc, char **argv)
 	char *label = NULL;
 	int nr_global_roots = sysconf(_SC_NPROCESSORS_ONLN);
 	char *source_dir = NULL;
+	size_t source_dir_len = 0;
+	struct rootdir_subvol *rds;
+	LIST_HEAD(subvols);
 
 	cpu_detect_flags();
 	hash_init_accel();
@@ -1085,6 +1089,7 @@  int BOX_MAIN(mkfs)(int argc, char **argv)
 			{ "data", required_argument, NULL, 'd' },
 			{ "version", no_argument, NULL, 'V' },
 			{ "rootdir", required_argument, NULL, 'r' },
+			{ "subvol", required_argument, NULL, 'u' },
 			{ "nodiscard", no_argument, NULL, 'K' },
 			{ "features", required_argument, NULL, 'O' },
 			{ "runtime-features", required_argument, NULL, 'R' },
@@ -1102,7 +1107,7 @@  int BOX_MAIN(mkfs)(int argc, char **argv)
 			{ NULL, 0, NULL, 0}
 		};
 
-		c = getopt_long(argc, argv, "A:b:fl:n:s:m:d:L:R:O:r:U:VvMKq",
+		c = getopt_long(argc, argv, "A:b:fl:n:s:m:d:L:R:O:r:U:VvMKqu:",
 				long_options, NULL);
 		if (c < 0)
 			break;
@@ -1208,6 +1213,22 @@  int BOX_MAIN(mkfs)(int argc, char **argv)
 				free(source_dir);
 				source_dir = strdup(optarg);
 				break;
+			case 'u': {
+				struct rootdir_subvol *s;
+
+				s = malloc(sizeof(struct rootdir_subvol));
+				if (!s) {
+					error("out of memory");
+					ret = 1;
+					goto error;
+				}
+
+				s->dir = strdup(optarg);
+				s->full_path = NULL;
+
+				list_add_tail(&s->list, &subvols);
+				break;
+				}
 			case 'U':
 				strncpy_null(fs_uuid, optarg, BTRFS_UUID_UNPARSED_SIZE);
 				break;
@@ -1272,6 +1293,71 @@  int BOX_MAIN(mkfs)(int argc, char **argv)
 		ret = 1;
 		goto error;
 	}
+	if (!list_empty(&subvols) && source_dir == NULL) {
+		error("the option --subvol must be used with --rootdir");
+		ret = 1;
+		goto error;
+	}
+
+	if (source_dir) {
+		char *canonical = realpath(source_dir, NULL);
+
+		if (!canonical) {
+			error("could not get canonical path to %s", source_dir);
+			ret = 1;
+			goto error;
+		}
+
+		free(source_dir);
+		source_dir = canonical;
+		source_dir_len = strlen(source_dir);
+	}
+
+	list_for_each_entry(rds, &subvols, list) {
+		char *path;
+		struct rootdir_subvol *rds2;
+
+		if (!path_exists(rds->dir)) {
+			error("subvol %s does not exist", rds->dir);
+			ret = 1;
+			goto error;
+		}
+
+		if (!path_is_dir(rds->dir)) {
+			error("subvol %s is not a directory", rds->dir);
+			ret = 1;
+			goto error;
+		}
+
+		path = realpath(rds->dir, NULL);
+
+		if (!path) {
+			error("could not get canonical path to %s", rds->dir);
+			ret = 1;
+			goto error;
+		}
+
+		rds->full_path = path;
+
+		if (strlen(path) < source_dir_len + 1 ||
+		    memcmp(path, source_dir, source_dir_len) ||
+		    path[source_dir_len] != '/') {
+			error("subvol %s is not a child of %s", rds->dir,
+			      source_dir);
+			ret = 1;
+			goto error;
+		}
+
+		for (rds2 = list_first_entry(&subvols, struct rootdir_subvol, list);
+		     rds2 != rds; rds2 = list_next_entry(rds2, list)) {
+			if (!strcmp(rds2->full_path, path)) {
+				error("subvol %s specified more than once",
+					rds->dir);
+				ret = 1;
+				goto error;
+			}
+		}
+	}
 
 	if (*fs_uuid) {
 		uuid_t dummy_uuid;
@@ -1821,24 +1907,37 @@  raid_groups:
 		error_msg(ERROR_MSG_START_TRANS, "%m");
 		goto out;
 	}
-	ret = btrfs_rebuild_uuid_tree(fs_info);
-	if (ret < 0)
-		goto out;
-
-	ret = cleanup_temp_chunks(fs_info, &allocation, data_profile,
-				  metadata_profile, metadata_profile);
-	if (ret < 0) {
-		error("failed to cleanup temporary chunks: %d", ret);
-		goto out;
-	}
 
 	if (source_dir) {
 		pr_verbose(LOG_DEFAULT, "Rootdir from:       %s\n", source_dir);
-		ret = btrfs_mkfs_fill_dir(source_dir, root);
+
+		trans = btrfs_start_transaction(root, 1);
+		if (IS_ERR(trans)) {
+			errno = -PTR_ERR(trans);
+			error_msg(ERROR_MSG_START_TRANS, "%m");
+			goto out;
+		}
+
+		ret = btrfs_mkfs_fill_dir(trans, source_dir, root,
+					  &subvols);
 		if (ret) {
 			error("error while filling filesystem: %d", ret);
+			btrfs_abort_transaction(trans, ret);
+			goto out;
+		}
+
+		ret = btrfs_commit_transaction(trans, root);
+		if (ret) {
+			errno = -ret;
+			error_msg(ERROR_MSG_COMMIT_TRANS, "%m");
 			goto out;
 		}
+
+		list_for_each_entry(rds, &subvols, list) {
+			pr_verbose(LOG_DEFAULT, "  Subvol from:      %s\n",
+				   rds->full_path);
+		}
+
 		if (shrink_rootdir) {
 			pr_verbose(LOG_DEFAULT, "  Shrink:           yes\n");
 			ret = btrfs_mkfs_shrink_fs(fs_info, &shrink_size,
@@ -1853,6 +1952,17 @@  raid_groups:
 		}
 	}
 
+	ret = btrfs_rebuild_uuid_tree(fs_info);
+	if (ret < 0)
+		goto out;
+
+	ret = cleanup_temp_chunks(fs_info, &allocation, data_profile,
+				  metadata_profile, metadata_profile);
+	if (ret < 0) {
+		error("failed to cleanup temporary chunks: %d", ret);
+		goto out;
+	}
+
 	if (features.runtime_flags & BTRFS_FEATURE_RUNTIME_QUOTA ||
 	    features.incompat_flags & BTRFS_FEATURE_INCOMPAT_SIMPLE_QUOTA) {
 		ret = setup_quota_root(fs_info);
@@ -1946,6 +2056,18 @@  error:
 	free(label);
 	free(source_dir);
 
+	while (!list_empty(&subvols)) {
+		struct rootdir_subvol *head = list_entry(subvols.next,
+					      struct rootdir_subvol,
+					      list);
+
+		free(head->dir);
+		free(head->full_path);
+
+		list_del(&head->list);
+		free(head);
+	}
+
 	return !!ret;
 
 success:
diff --git a/mkfs/rootdir.c b/mkfs/rootdir.c
index 05787dc3..8f5658e1 100644
--- a/mkfs/rootdir.c
+++ b/mkfs/rootdir.c
@@ -40,6 +40,8 @@ 
 #include "common/messages.h"
 #include "common/utils.h"
 #include "common/extent-tree-utils.h"
+#include "common/root-tree-utils.h"
+#include "common/path-utils.h"
 #include "mkfs/rootdir.h"
 
 static u32 fs_block_size;
@@ -68,6 +70,7 @@  static u64 ftw_data_size;
 struct inode_entry {
 	/* The inode number inside btrfs. */
 	u64 ino;
+	struct btrfs_root *root;
 	struct list_head list;
 };
 
@@ -91,6 +94,8 @@  static struct rootdir_path current_path = {
 };
 
 static struct btrfs_trans_handle *g_trans = NULL;
+static struct list_head *g_subvols;
+static u64 next_subvol_id = BTRFS_FIRST_FREE_OBJECTID;
 
 static inline struct inode_entry *rootdir_path_last(struct rootdir_path *path)
 {
@@ -111,13 +116,15 @@  static void rootdir_path_pop(struct rootdir_path *path)
 	free(last);
 }
 
-static int rootdir_path_push(struct rootdir_path *path, u64 ino)
+static int rootdir_path_push(struct rootdir_path *path, struct btrfs_root *root,
+			     u64 ino)
 {
 	struct inode_entry *new;
 
 	new = malloc(sizeof(*new));
 	if (!new)
 		return -ENOMEM;
+	new->root = root;
 	new->ino = ino;
 	list_add_tail(&new->list, &path->inode_list);
 	path->level++;
@@ -409,13 +416,83 @@  static u8 ftype_to_btrfs_type(mode_t ftype)
 	return BTRFS_FT_UNKNOWN;
 }
 
+static int ftw_add_subvol(const char *full_path, const struct stat *st,
+			  int typeflag, struct FTW *ftwbuf,
+			  struct rootdir_subvol *s)
+{
+	int ret;
+	struct btrfs_key key;
+	struct btrfs_root *new_root;
+	struct inode_entry *parent;
+	struct btrfs_inode_item inode_item = { 0 };
+	u64 subvol_id, ino;
+
+	subvol_id = next_subvol_id++;
+
+	ret = btrfs_make_subvolume(g_trans, subvol_id);
+	if (ret < 0) {
+		error("failed to create subvolume: %d", ret);
+		return ret;
+	}
+
+	key.objectid = subvol_id;
+	key.type = BTRFS_ROOT_ITEM_KEY;
+	key.offset = (u64)-1;
+
+	new_root = btrfs_read_fs_root(g_trans->fs_info, &key);
+	if (IS_ERR(new_root)) {
+		error("unable to fs read root: %lu", PTR_ERR(new_root));
+		return -PTR_ERR(new_root);
+	}
+
+	parent = rootdir_path_last(&current_path);
+
+	ret = btrfs_link_subvolume(g_trans, parent->root, parent->ino,
+				   path_basename(s->full_path),
+				   strlen(path_basename(s->full_path)), new_root);
+	if (ret) {
+		error("unable to link subvolume %s", path_basename(s->full_path));
+		return ret;
+	}
+
+	ino = btrfs_root_dirid(&new_root->root_item);
+
+	ret = add_xattr_item(g_trans, new_root, ino, full_path);
+	if (ret < 0) {
+		errno = -ret;
+		error("failed to add xattr item for the top level inode in subvol %llu: %m",
+		      subvol_id);
+		return ret;
+	}
+	stat_to_inode_item(&inode_item, st);
+
+	btrfs_set_stack_inode_nlink(&inode_item, 1);
+	ret = update_inode_item(g_trans, new_root, &inode_item, ino);
+	if (ret < 0) {
+		errno = -ret;
+		error("failed to update root dir for root %llu: %m", subvol_id);
+		return ret;
+	}
+
+	ret = rootdir_path_push(&current_path, new_root, ino);
+	if (ret < 0) {
+		errno = -ret;
+		error("failed to allocate new entry for subvol %llu ('%s'): %m",
+		      subvol_id, full_path);
+		return ret;
+	}
+
+	return 0;
+}
+
 static int ftw_add_inode(const char *full_path, const struct stat *st,
 			 int typeflag, struct FTW *ftwbuf)
 {
 	struct btrfs_fs_info *fs_info = g_trans->fs_info;
-	struct btrfs_root *root = fs_info->fs_root;
+	struct btrfs_root *root;
 	struct btrfs_inode_item inode_item = { 0 };
 	struct inode_entry *parent;
+	struct rootdir_subvol *rds;
 	u64 ino;
 	int ret;
 
@@ -436,7 +513,10 @@  static int ftw_add_inode(const char *full_path, const struct stat *st,
 
 	/* The rootdir itself. */
 	if (unlikely(ftwbuf->level == 0)) {
-		u64 root_ino = btrfs_root_dirid(&root->root_item);
+		u64 root_ino;
+
+		root = fs_info->fs_root;
+		root_ino = btrfs_root_dirid(&root->root_item);
 
 		UASSERT(S_ISDIR(st->st_mode));
 		UASSERT(current_path.level == 0);
@@ -462,7 +542,7 @@  static int ftw_add_inode(const char *full_path, const struct stat *st,
 		}
 
 		/* Push (and initialize) the rootdir directory into the stack. */
-		ret = rootdir_path_push(&current_path,
+		ret = rootdir_path_push(&current_path, root,
 					btrfs_root_dirid(&root->root_item));
 		if (ret < 0) {
 			errno = -ret;
@@ -511,6 +591,18 @@  static int ftw_add_inode(const char *full_path, const struct stat *st,
 	while (current_path.level > ftwbuf->level)
 		rootdir_path_pop(&current_path);
 
+	if (S_ISDIR(st->st_mode)) {
+		list_for_each_entry(rds, g_subvols, list) {
+			if (!strcmp(full_path, rds->full_path)) {
+				return ftw_add_subvol(full_path, st, typeflag,
+						      ftwbuf, rds);
+			}
+		}
+	}
+
+	parent = rootdir_path_last(&current_path);
+	root = parent->root;
+
 	ret = btrfs_find_free_objectid(g_trans, root,
 				       BTRFS_FIRST_FREE_OBJECTID, &ino);
 	if (ret < 0) {
@@ -529,7 +621,6 @@  static int ftw_add_inode(const char *full_path, const struct stat *st,
 		return ret;
 	}
 
-	parent = rootdir_path_last(&current_path);
 	ret = btrfs_add_link(g_trans, root, ino, parent->ino,
 			     full_path + ftwbuf->base,
 			     strlen(full_path) - ftwbuf->base,
@@ -556,7 +647,7 @@  static int ftw_add_inode(const char *full_path, const struct stat *st,
 		return ret;
 	}
 	if (S_ISDIR(st->st_mode)) {
-		ret = rootdir_path_push(&current_path, ino);
+		ret = rootdir_path_push(&current_path, root, ino);
 		if (ret < 0) {
 			errno = -ret;
 			error("failed to allocate new entry for inode %llu ('%s'): %m",
@@ -597,49 +688,31 @@  static int ftw_add_inode(const char *full_path, const struct stat *st,
 	return 0;
 };
 
-int btrfs_mkfs_fill_dir(const char *source_dir, struct btrfs_root *root)
+int btrfs_mkfs_fill_dir(struct btrfs_trans_handle *trans, const char *source_dir,
+			struct btrfs_root *root, struct list_head *subvols)
 {
 	int ret;
-	struct btrfs_trans_handle *trans;
 	struct stat root_st;
 
 	ret = lstat(source_dir, &root_st);
 	if (ret) {
 		error("unable to lstat %s: %m", source_dir);
-		ret = -errno;
-		goto out;
-	}
-
-	trans = btrfs_start_transaction(root, 1);
-	if (IS_ERR(trans)) {
-		ret = PTR_ERR(trans);
-		errno = -ret;
-		error_msg(ERROR_MSG_START_TRANS, "%m");
-		goto fail;
+		return -errno;
 	}
 
 	g_trans = trans;
+	g_subvols = subvols;
 	INIT_LIST_HEAD(&current_path.inode_list);
 
 	ret = nftw(source_dir, ftw_add_inode, 32, FTW_PHYS);
 	if (ret) {
 		error("unable to traverse directory %s: %d", source_dir, ret);
-		goto fail;
-	}
-	ret = btrfs_commit_transaction(trans, root);
-	if (ret) {
-		errno = -ret;
-		error_msg(ERROR_MSG_COMMIT_TRANS, "%m");
-		goto out;
+		return ret;
 	}
 	while (current_path.level > 0)
 		rootdir_path_pop(&current_path);
 
 	return 0;
-fail:
-	btrfs_abort_transaction(trans, ret);
-out:
-	return ret;
 }
 
 static int ftw_add_entry_size(const char *fpath, const struct stat *st,
diff --git a/mkfs/rootdir.h b/mkfs/rootdir.h
index 4233431a..128e9e09 100644
--- a/mkfs/rootdir.h
+++ b/mkfs/rootdir.h
@@ -28,7 +28,14 @@ 
 struct btrfs_fs_info;
 struct btrfs_root;
 
-int btrfs_mkfs_fill_dir(const char *source_dir, struct btrfs_root *root);
+struct rootdir_subvol {
+	struct list_head list;
+	char *dir;
+	char *full_path;
+};
+
+int btrfs_mkfs_fill_dir(struct btrfs_trans_handle *trans, const char *source_dir,
+			struct btrfs_root *root, struct list_head *subvols);
 u64 btrfs_mkfs_size_dir(const char *dir_name, u32 sectorsize, u64 min_dev_size,
 			u64 meta_profile, u64 data_profile);
 int btrfs_mkfs_shrink_fs(struct btrfs_fs_info *fs_info, u64 *new_size_ret,
diff --git a/tests/mkfs-tests/036-rootdir-subvol/test.sh b/tests/mkfs-tests/036-rootdir-subvol/test.sh
new file mode 100755
index 00000000..ccd6893f
--- /dev/null
+++ b/tests/mkfs-tests/036-rootdir-subvol/test.sh
@@ -0,0 +1,33 @@ 
+#!/bin/bash
+# smoke test for mkfs.btrfs --subvol option
+
+source "$TEST_TOP/common" || exit
+
+check_prereq mkfs.btrfs
+check_prereq btrfs
+
+setup_root_helper
+prepare_test_dev
+
+tmp=$(_mktemp_dir mkfs-rootdir)
+
+touch $tmp/foo
+mkdir $tmp/dir
+mkdir $tmp/dir/subvol
+touch $tmp/dir/subvol/bar
+
+run_check_mkfs_test_dev --rootdir "$tmp" --subvol "$tmp/dir/subvol"
+run_check $SUDO_HELPER "$TOP/btrfs" check "$TEST_DEV"
+
+run_check_mount_test_dev
+run_check_stdout $SUDO_HELPER "$TOP/btrfs" subvolume list "$TEST_MNT" | \
+	cut -d\  -f9 > "$tmp/output"
+run_check_umount_test_dev
+
+result=$(cat "$tmp/output")
+
+if [ "$result" != "dir/subvol" ]; then
+	_fail "dir/subvol not in subvolume list"
+fi
+
+rm -rf -- "$tmp"