diff mbox

[RFC,7/7] Btrfs: introduce BTRFS_IOC_SEND for btrfs send/receive (part 2)

Message ID 1341409108-13567-8-git-send-email-ablock84@googlemail.com (mailing list archive)
State New, archived
Headers show

Commit Message

Alexander Block July 4, 2012, 1:38 p.m. UTC
This is the second part of the split BTRFS_IOC_SEND patch which
contains the actual send logic.

Signed-off-by: Alexander Block <ablock84@googlemail.com>
---
 fs/btrfs/ioctl.c |    3 +
 fs/btrfs/send.c  | 3246 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/send.h  |    4 +
 3 files changed, 3253 insertions(+)

Comments

Alex Lyakas July 10, 2012, 3:26 p.m. UTC | #1
Alexander,
this focuses on area of sending file extents:

> +static int is_extent_unchanged(struct send_ctx *sctx,
> +                              struct btrfs_path *left_path,
> +                              struct btrfs_key *ekey)
> +{
> +       int ret = 0;
> +       struct btrfs_key key;
> +       struct btrfs_path *path = NULL;
> +       struct extent_buffer *eb;
> +       int slot;
> +       struct btrfs_key found_key;
> +       struct btrfs_file_extent_item *ei;
> +       u64 left_disknr;
> +       u64 right_disknr;
> +       u64 left_offset;
> +       u64 right_offset;
> +       u64 left_len;
> +       u64 right_len;
> +       u8 left_type;
> +       u8 right_type;
> +
> +       path = alloc_path_for_send();
> +       if (!path)
> +               return -ENOMEM;
> +
> +       eb = left_path->nodes[0];
> +       slot = left_path->slots[0];
> +
> +       ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
> +       left_type = btrfs_file_extent_type(eb, ei);
> +       left_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
> +       left_len = btrfs_file_extent_num_bytes(eb, ei);
> +       left_offset = btrfs_file_extent_offset(eb, ei);
> +
> +       if (left_type != BTRFS_FILE_EXTENT_REG) {
> +               ret = 0;
> +               goto out;
> +       }
> +
> +       key.objectid = ekey->objectid;
> +       key.type = BTRFS_EXTENT_DATA_KEY;
> +       key.offset = ekey->offset;
> +
> +       while (1) {
> +               ret = btrfs_search_slot_for_read(sctx->parent_root, &key, path,
> +                               0, 0);
> +               if (ret < 0)
> +                       goto out;
> +               if (ret) {
> +                       ret = 0;
> +                       goto out;
> +               }
> +               btrfs_item_key_to_cpu(path->nodes[0], &found_key,
> +                               path->slots[0]);
> +               if (found_key.objectid != key.objectid ||
> +                   found_key.type != key.type) {
> +                       ret = 0;
> +                       goto out;
> +               }
> +
> +               eb = path->nodes[0];
> +               slot = path->slots[0];
> +
> +               ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
> +               right_type = btrfs_file_extent_type(eb, ei);
> +               right_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
> +               right_len = btrfs_file_extent_num_bytes(eb, ei);
> +               right_offset = btrfs_file_extent_offset(eb, ei);
> +               btrfs_release_path(path);
> +
> +               if (right_type != BTRFS_FILE_EXTENT_REG) {
> +                       ret = 0;
> +                       goto out;
> +               }
> +
> +               if (left_disknr != right_disknr) {
> +                       ret = 0;
> +                       goto out;
> +               }
> +
> +               key.offset = found_key.offset + right_len;
> +               if (key.offset >= ekey->offset + left_len) {
> +                       ret = 1;
> +                       goto out;
> +               }
> +       }
> +
> +out:
> +       btrfs_free_path(path);
> +       return ret;
> +}
> +

Should we always treat left extent with bytenr==0 as not changed?
Because right now, it simply reads and sends data of such extent,
while bytenr==0 means "no data allocated here". Since we always do
send_truncate() afterwards, file size will always be correct, so we
can just skip bytenr==0 extents.
Same is true for BTRFS_FILE_EXTENT_PREALLOC extents, I think. Those
also don't contain real data.
So something like:
if (left_disknr == 0 || left_type == BTRFS_FILE_EXTENT_PREALLOC) {
	ret = 1;
	goto out;
}
before we check for BTRFS_FILE_EXTENT_REG.

Now I have a question about the rest of the logic that decides that
extent is unchanged. I understand that if we see the same extent (same
disk_bytenr) shared between parent_root and send_root, then it must
contain the same data, even in nodatacow mode, because on a first
write to such shared extent, it is cow'ed even with nodatacow.

However, shouldn't we check btrfs_file_extent_offset(), to make sure
that both send_root and parent_root point at the same offset into
extent from the same file offset? Because if extent_offset values are
different, then the data of the file might different, even though we
are talking about the same extent.

So I am thinking about something like:

- ekey.offset points at data at logical address
left_disknr+left_offset (logical address within CHUNK_ITEM address
space) for left_len bytes
- found_key.offset points at data at logical address
right_disknr+right_offset for right_len
- we know that found_key.offset <= ekey.offset

So we need to ensure that left_disknr==right_disknr and also:
right_disknr+right_offset + (ekey.offset - found_key.offset) ==
left_disknr+left_offset
or does this while loop somehow ensure this equation?

However, I must admit I don't fully understand the logic behind
deciding that extent is unchanged. Can you pls explain what this tries
to accomplish, and why it decides that extent is unchanged here:
key.offset = found_key.offset + right_len;
if (key.offset >= ekey->offset + left_len) {
	ret = 1;
	goto out;
}

Also: when searching for the next extent, should we use
btrfs_file_extent_num_bytes() or btrfs_file_extent_disk_num_bytes()?
They are not equal sometimes...not sure at which offset the next
extent (if any) should be. What about holes in files? Then we will
have non-consecutive offsets.

Thanks,
Alex.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Arne Jansen July 23, 2012, 11:16 a.m. UTC | #2
This is a first review run. I ask for more comments in several places.
Maybe these comments can help to dive deeper into a functional review
in a second run.
I'd really appreciate it if you could write a few pages about the
concepts of how you decide what to send and when.
It seems there's still a lot of headroom for performance optimizations
cpu/seek-wise.
All in all I really like this work.

On 04.07.2012 15:38, Alexander Block wrote:
> This is the second part of the split BTRFS_IOC_SEND patch which
> contains the actual send logic.
> 
> Signed-off-by: Alexander Block <ablock84@googlemail.com>
> ---
>  fs/btrfs/ioctl.c |    3 +
>  fs/btrfs/send.c  | 3246 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  fs/btrfs/send.h  |    4 +
>  3 files changed, 3253 insertions(+)
> 
> diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
> index 8d258cb..9173867 100644
> --- a/fs/btrfs/ioctl.c
> +++ b/fs/btrfs/ioctl.c
> @@ -54,6 +54,7 @@
>  #include "inode-map.h"
>  #include "backref.h"
>  #include "rcu-string.h"
> +#include "send.h"
>  
>  /* Mask out flags that are inappropriate for the given type of inode. */
>  static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
> @@ -3567,6 +3568,8 @@ long btrfs_ioctl(struct file *file, unsigned int
>  		return btrfs_ioctl_balance_progress(root, argp);
>  	case BTRFS_IOC_SET_RECEIVED_SUBVOL:
>  		return btrfs_ioctl_set_received_subvol(file, argp);
> +	case BTRFS_IOC_SEND:
> +		return btrfs_ioctl_send(file, argp);
>  	case BTRFS_IOC_GET_DEV_STATS:
>  		return btrfs_ioctl_get_dev_stats(root, argp, 0);
>  	case BTRFS_IOC_GET_AND_RESET_DEV_STATS:
> diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
> index 47a2557..4d3fcfc 100644
> --- a/fs/btrfs/send.c
> +++ b/fs/btrfs/send.c
> @@ -1007,3 +1007,3249 @@ out:
>  	return ret;
>  }
>  
> +struct backref_ctx {
> +	struct send_ctx *sctx;
> +
> +	/* number of total found references */
> +	u64 found;
> +
> +	/*
> +	 * used for clones found in send_root. clones found behind cur_objectid
> +	 * and cur_offset are not considered as allowed clones.
> +	 */
> +	u64 cur_objectid;
> +	u64 cur_offset;
> +
> +	/* may be truncated in case it's the last extent in a file */
> +	u64 extent_len;
> +
> +	/* Just to check for bugs in backref resolving */
> +	int found_in_send_root;
> +};
> +
> +static int __clone_root_cmp_bsearch(const void *key, const void *elt)
> +{
> +	u64 root = (u64)key;
> +	struct clone_root *cr = (struct clone_root *)elt;
> +
> +	if (root < cr->root->objectid)
> +		return -1;
> +	if (root > cr->root->objectid)
> +		return 1;
> +	return 0;
> +}
> +
> +static int __clone_root_cmp_sort(const void *e1, const void *e2)
> +{
> +	struct clone_root *cr1 = (struct clone_root *)e1;
> +	struct clone_root *cr2 = (struct clone_root *)e2;
> +
> +	if (cr1->root->objectid < cr2->root->objectid)
> +		return -1;
> +	if (cr1->root->objectid > cr2->root->objectid)
> +		return 1;
> +	return 0;
> +}
> +
> +/*
> + * Called for every backref that is found for the current extent.

Comment: results are collected in sctx->clone_roots->ino/offset/found_refs

> + */
> +static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
> +{
> +	struct backref_ctx *bctx = ctx_;
> +	struct clone_root *found;
> +	int ret;
> +	u64 i_size;
> +
> +	/* First check if the root is in the list of accepted clone sources */
> +	found = bsearch((void *)root, bctx->sctx->clone_roots,
> +			bctx->sctx->clone_roots_cnt,
> +			sizeof(struct clone_root),
> +			__clone_root_cmp_bsearch);
> +	if (!found)
> +		return 0;
> +
> +	if (found->root == bctx->sctx->send_root &&
> +	    ino == bctx->cur_objectid &&
> +	    offset == bctx->cur_offset) {
> +		bctx->found_in_send_root = 1;

found_in_send_root_and_cur_ino_offset?

> +	}
> +
> +	/*
> +	 * There are inodes that have extents that lie behind it's i_size. Don't
                                                              its
> +	 * accept clones from these extents.
> +	 */
> +	ret = get_inode_info(found->root, ino, &i_size, NULL, NULL, NULL, NULL);
> +	if (ret < 0)
> +		return ret;
> +
> +	if (offset + bctx->extent_len > i_size)
> +		return 0;
> +
> +	/*
> +	 * Make sure we don't consider clones from send_root that are
> +	 * behind the current inode/offset.
> +	 */
> +	if (found->root == bctx->sctx->send_root) {
> +		/*
> +		 * TODO for the moment we don't accept clones from the inode
> +		 * that is currently send. We may change this when
> +		 * BTRFS_IOC_CLONE_RANGE supports cloning from and to the same
> +		 * file.
> +		 */
> +		if (ino >= bctx->cur_objectid)
> +			return 0;
> +		/*if (ino > ctx->cur_objectid)
> +			return 0;
> +		if (offset + ctx->extent_len > ctx->cur_offset)
> +			return 0;*/

#if 0 ... #else ... #endif

> +
> +		bctx->found++;
> +		found->found_refs++;
> +		found->ino = ino;
> +		found->offset = offset;

only the last ino is kept?

> +		return 0;
> +	}
> +
> +	bctx->found++;
> +	found->found_refs++;
> +	if (ino < found->ino) {
> +		found->ino = ino;
> +		found->offset = offset;

whereas here only the lowest ino is kept. Why?

> +	} else if (found->ino == ino) {
> +		/*
> +		 * same extent found more then once in the same file.
> +		 */
> +		if (found->offset > offset + bctx->extent_len)
> +			found->offset = offset;

This is unclear to me. Seems to mean something like
'find the lowest offset', but not exactly. Some explanation
would be good.

> +	}
> +
> +	return 0;
> +}
> +
> +/*
> + * path must point to the extent item when called.
> + */

What is the purpose of this function? I probably will figure it out
when reading on, but a comment would be nice here.

> +static int find_extent_clone(struct send_ctx *sctx,
> +			     struct btrfs_path *path,
> +			     u64 ino, u64 data_offset,
> +			     u64 ino_size,
> +			     struct clone_root **found)
> +{
> +	int ret;
> +	int extent_type;
> +	u64 logical;
> +	u64 num_bytes;
> +	u64 extent_item_pos;
> +	struct btrfs_file_extent_item *fi;
> +	struct extent_buffer *eb = path->nodes[0];
> +	struct backref_ctx backref_ctx;

currently it's still small enough to keep in on stack, maybe a
comment in struct backref_ctx that it is kept on stack would be
nice.

> +	struct clone_root *cur_clone_root;
> +	struct btrfs_key found_key;
> +	struct btrfs_path *tmp_path;
> +	u32 i;
> +
> +	tmp_path = alloc_path_for_send();
> +	if (!tmp_path)
> +		return -ENOMEM;
> +
> +	if (data_offset >= ino_size) {
> +		/*
> +		 * There may be extents that lie behind the file's size.
> +		 * I at least had this in combination with snapshotting while
> +		 * writing large files.
> +		 */
> +		ret = 0;
> +		goto out;
> +	}
> +
> +	fi = btrfs_item_ptr(eb, path->slots[0],
> +			struct btrfs_file_extent_item);
> +	extent_type = btrfs_file_extent_type(eb, fi);
> +	if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
> +		ret = -ENOENT;
> +		goto out;
> +	}
> +
> +	num_bytes = btrfs_file_extent_num_bytes(eb, fi);
> +	logical = btrfs_file_extent_disk_bytenr(eb, fi);
> +	if (logical == 0) {
> +		ret = -ENOENT;
> +		goto out;
> +	}
> +	logical += btrfs_file_extent_offset(eb, fi);
> +
> +	ret = extent_from_logical(sctx->send_root->fs_info,
> +			logical, tmp_path, &found_key);
> +	btrfs_release_path(tmp_path);
> +
> +	if (ret < 0)
> +		goto out;
> +	if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
> +		ret = -EIO;
> +		goto out;
> +	}
> +
> +	/*
> +	 * Setup the clone roots.
> +	 */
> +	for (i = 0; i < sctx->clone_roots_cnt; i++) {
> +		cur_clone_root = sctx->clone_roots + i;
> +		cur_clone_root->ino = (u64)-1;
> +		cur_clone_root->offset = 0;
> +		cur_clone_root->found_refs = 0;
> +	}
> +
> +	backref_ctx.sctx = sctx;
> +	backref_ctx.found = 0;
> +	backref_ctx.cur_objectid = ino;
> +	backref_ctx.cur_offset = data_offset;
> +	backref_ctx.found_in_send_root = 0;
> +	backref_ctx.extent_len = num_bytes;
> +
> +	/*
> +	 * The last extent of a file may be too large due to page alignment.
> +	 * We need to adjust extent_len in this case so that the checks in
> +	 * __iterate_backrefs work.
> +	 */
> +	if (data_offset + num_bytes >= ino_size)
> +		backref_ctx.extent_len = ino_size - data_offset;
> +
> +	/*
> +	 * Now collect all backrefs.
> +	 */
> +	extent_item_pos = logical - found_key.objectid;
> +	ret = iterate_extent_inodes(sctx->send_root->fs_info,
> +					found_key.objectid, extent_item_pos, 1,
> +					__iterate_backrefs, &backref_ctx);
> +	if (ret < 0)
> +		goto out;
> +
> +	if (!backref_ctx.found_in_send_root) {
> +		/* found a bug in backref code? */
> +		ret = -EIO;
> +		printk(KERN_ERR "btrfs: ERROR did not find backref in "
> +				"send_root. inode=%llu, offset=%llu, "
> +				"logical=%llu\n",
> +				ino, data_offset, logical);
> +		goto out;
> +	}
> +
> +verbose_printk(KERN_DEBUG "btrfs: find_extent_clone: data_offset=%llu, "
> +		"ino=%llu, "
> +		"num_bytes=%llu, logical=%llu\n",
> +		data_offset, ino, num_bytes, logical);
> +
> +	if (!backref_ctx.found)
> +		verbose_printk("btrfs:    no clones found\n");
> +
> +	cur_clone_root = NULL;
> +	for (i = 0; i < sctx->clone_roots_cnt; i++) {
> +		if (sctx->clone_roots[i].found_refs) {
> +			if (!cur_clone_root)
> +				cur_clone_root = sctx->clone_roots + i;
> +			else if (sctx->clone_roots[i].root == sctx->send_root)
> +				/* prefer clones from send_root over others */
> +				cur_clone_root = sctx->clone_roots + i;
> +			break;

If you break after the first found ref, you might miss the send_root.

> +		}
> +
> +	}
> +
> +	if (cur_clone_root) {
> +		*found = cur_clone_root;
> +		ret = 0;
> +	} else {
> +		ret = -ENOENT;
> +	}
> +
> +out:
> +	btrfs_free_path(tmp_path);
> +	return ret;
> +}
> +
> +static int read_symlink(struct send_ctx *sctx,
> +			struct btrfs_root *root,
> +			u64 ino,
> +			struct fs_path *dest)
> +{
> +	int ret;
> +	struct btrfs_path *path;
> +	struct btrfs_key key;
> +	struct btrfs_file_extent_item *ei;
> +	u8 type;
> +	u8 compression;
> +	unsigned long off;
> +	int len;
> +
> +	path = alloc_path_for_send();
> +	if (!path)
> +		return -ENOMEM;
> +
> +	key.objectid = ino;
> +	key.type = BTRFS_EXTENT_DATA_KEY;
> +	key.offset = 0;
> +	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
> +	if (ret < 0)
> +		goto out;
> +	BUG_ON(ret);
> +
> +	ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
> +			struct btrfs_file_extent_item);
> +	type = btrfs_file_extent_type(path->nodes[0], ei);
> +	compression = btrfs_file_extent_compression(path->nodes[0], ei);
> +	BUG_ON(type != BTRFS_FILE_EXTENT_INLINE);
> +	BUG_ON(compression);
> +
> +	off = btrfs_file_extent_inline_start(ei);
> +	len = btrfs_file_extent_inline_len(path->nodes[0], ei);
> +
> +	ret = fs_path_add_from_extent_buffer(dest, path->nodes[0], off, len);
> +	if (ret < 0)
> +		goto out;

superfluous

> +
> +out:
> +	btrfs_free_path(path);
> +	return ret;
> +}
> +
> +/*
> + * Helper function to generate a file name that is unique in the root of
> + * send_root and parent_root. This is used to generate names for orphan inodes.
> + */
> +static int gen_unique_name(struct send_ctx *sctx,
> +			   u64 ino, u64 gen,
> +			   struct fs_path *dest)
> +{
> +	int ret = 0;
> +	struct btrfs_path *path;
> +	struct btrfs_dir_item *di;
> +	char tmp[64];
> +	int len;
> +	u64 idx = 0;
> +
> +	path = alloc_path_for_send();
> +	if (!path)
> +		return -ENOMEM;
> +
> +	while (1) {
> +		len = snprintf(tmp, sizeof(tmp) - 1, "o%llu-%llu-%llu",
> +				ino, gen, idx);

wouldn't it be easier to just take a uuid? This would save you a lot
of code and especially the need to verify that the name is really
unique, saving seeks.

> +		if (len >= sizeof(tmp)) {
> +			/* should really not happen */
> +			ret = -EOVERFLOW;
> +			goto out;
> +		}
> +
> +		di = btrfs_lookup_dir_item(NULL, sctx->send_root,
> +				path, BTRFS_FIRST_FREE_OBJECTID,
> +				tmp, strlen(tmp), 0);
> +		btrfs_release_path(path);
> +		if (IS_ERR(di)) {
> +			ret = PTR_ERR(di);
> +			goto out;
> +		}
> +		if (di) {
> +			/* not unique, try again */
> +			idx++;
> +			continue;
> +		}
> +
> +		if (!sctx->parent_root) {
> +			/* unique */
> +			ret = 0;
> +			break;
> +		}
> +
> +		di = btrfs_lookup_dir_item(NULL, sctx->parent_root,
> +				path, BTRFS_FIRST_FREE_OBJECTID,
> +				tmp, strlen(tmp), 0);
> +		btrfs_release_path(path);
> +		if (IS_ERR(di)) {
> +			ret = PTR_ERR(di);
> +			goto out;
> +		}
> +		if (di) {
> +			/* not unique, try again */
> +			idx++;
> +			continue;
> +		}
> +		/* unique */
> +		break;
> +	}
> +
> +	ret = fs_path_add(dest, tmp, strlen(tmp));
> +
> +out:
> +	btrfs_free_path(path);
> +	return ret;
> +}
> +
> +enum inode_state {
> +	inode_state_no_change,
> +	inode_state_will_create,
> +	inode_state_did_create,
> +	inode_state_will_delete,
> +	inode_state_did_delete,
> +};
> +
> +static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen)

don't you want to return a enum inode_state instead of int?

> +{
> +	int ret;
> +	int left_ret;
> +	int right_ret;
> +	u64 left_gen;
> +	u64 right_gen;
> +
> +	ret = get_inode_info(sctx->send_root, ino, NULL, &left_gen, NULL, NULL,
> +			NULL);
> +	if (ret < 0 && ret != -ENOENT)
> +		goto out;
> +	left_ret = ret;
> +
> +	if (!sctx->parent_root) {
> +		right_ret = -ENOENT;
> +	} else {
> +		ret = get_inode_info(sctx->parent_root, ino, NULL, &right_gen,
> +				NULL, NULL, NULL);
> +		if (ret < 0 && ret != -ENOENT)
> +			goto out;
> +		right_ret = ret;
> +	}
> +
> +	if (!left_ret && !right_ret) {
> +		if (left_gen == gen && right_gen == gen)

Please also use {} here

> +			ret = inode_state_no_change;
> +		else if (left_gen == gen) {
> +			if (ino < sctx->send_progress)
> +				ret = inode_state_did_create;
> +			else
> +				ret = inode_state_will_create;
> +		} else if (right_gen == gen) {
> +			if (ino < sctx->send_progress)
> +				ret = inode_state_did_delete;
> +			else
> +				ret = inode_state_will_delete;
> +		} else  {
> +			ret = -ENOENT;
> +		}
> +	} else if (!left_ret) {
> +		if (left_gen == gen) {
> +			if (ino < sctx->send_progress)
> +				ret = inode_state_did_create;
> +			else
> +				ret = inode_state_will_create;
> +		} else {
> +			ret = -ENOENT;
> +		}
> +	} else if (!right_ret) {
> +		if (right_gen == gen) {
> +			if (ino < sctx->send_progress)
> +				ret = inode_state_did_delete;
> +			else
> +				ret = inode_state_will_delete;
> +		} else {
> +			ret = -ENOENT;
> +		}
> +	} else {
> +		ret = -ENOENT;
> +	}
> +
> +out:
> +	return ret;
> +}
> +
> +static int is_inode_existent(struct send_ctx *sctx, u64 ino, u64 gen)
> +{
> +	int ret;
> +
> +	ret = get_cur_inode_state(sctx, ino, gen);
> +	if (ret < 0)
> +		goto out;
> +
> +	if (ret == inode_state_no_change ||
> +	    ret == inode_state_did_create ||
> +	    ret == inode_state_will_delete)
> +		ret = 1;
> +	else
> +		ret = 0;
> +
> +out:
> +	return ret;
> +}
> +
> +/*
> + * Helper function to lookup a dir item in a dir.
> + */
> +static int lookup_dir_item_inode(struct btrfs_root *root,
> +				 u64 dir, const char *name, int name_len,
> +				 u64 *found_inode,
> +				 u8 *found_type)
> +{
> +	int ret = 0;
> +	struct btrfs_dir_item *di;
> +	struct btrfs_key key;
> +	struct btrfs_path *path;
> +
> +	path = alloc_path_for_send();
> +	if (!path)
> +		return -ENOMEM;
> +
> +	di = btrfs_lookup_dir_item(NULL, root, path,
> +			dir, name, name_len, 0);
> +	if (!di) {
> +		ret = -ENOENT;
> +		goto out;
> +	}
> +	if (IS_ERR(di)) {
> +		ret = PTR_ERR(di);
> +		goto out;
> +	}
> +	btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
> +	*found_inode = key.objectid;
> +	*found_type = btrfs_dir_type(path->nodes[0], di);
> +
> +out:
> +	btrfs_free_path(path);
> +	return ret;
> +}
> +
> +static int get_first_ref(struct send_ctx *sctx,

The name does not reflect well what the function does.
It's more like get_first_parent_dir or get_first_inode_ref

> +			 struct btrfs_root *root, u64 ino,
> +			 u64 *dir, u64 *dir_gen, struct fs_path *name)
> +{
> +	int ret;
> +	struct btrfs_key key;
> +	struct btrfs_key found_key;
> +	struct btrfs_path *path;
> +	struct btrfs_inode_ref *iref;
> +	int len;
> +
> +	path = alloc_path_for_send();
> +	if (!path)
> +		return -ENOMEM;
> +
> +	key.objectid = ino;
> +	key.type = BTRFS_INODE_REF_KEY;
> +	key.offset = 0;
> +
> +	ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
> +	if (ret < 0)
> +		goto out;
> +	if (!ret)
> +		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
> +				path->slots[0]);
> +	if (ret || found_key.objectid != key.objectid ||
> +	    found_key.type != key.type) {
> +		ret = -ENOENT;
> +		goto out;
> +	}
> +
> +	iref = btrfs_item_ptr(path->nodes[0], path->slots[0],
> +			struct btrfs_inode_ref);
> +	len = btrfs_inode_ref_name_len(path->nodes[0], iref);
> +	ret = fs_path_add_from_extent_buffer(name, path->nodes[0],
> +			(unsigned long)(iref + 1), len);
> +	if (ret < 0)
> +		goto out;
> +	btrfs_release_path(path);
> +
> +	ret = get_inode_info(root, found_key.offset, NULL, dir_gen, NULL, NULL,
> +			NULL);
> +	if (ret < 0)
> +		goto out;
> +
> +	*dir = found_key.offset;
> +
> +out:
> +	btrfs_free_path(path);
> +	return ret;
> +}
> +
> +static int is_first_ref(struct send_ctx *sctx,
> +			struct btrfs_root *root,
> +			u64 ino, u64 dir,
> +			const char *name, int name_len)
> +{
> +	int ret;
> +	struct fs_path *tmp_name;
> +	u64 tmp_dir;
> +	u64 tmp_dir_gen;
> +
> +	tmp_name = fs_path_alloc(sctx);
> +	if (!tmp_name)
> +		return -ENOMEM;
> +
> +	ret = get_first_ref(sctx, root, ino, &tmp_dir, &tmp_dir_gen, tmp_name);
> +	if (ret < 0)
> +		goto out;
> +
> +	if (name_len != fs_path_len(tmp_name)) {
> +		ret = 0;
> +		goto out;
> +	}
> +
> +	ret = memcmp(tmp_name->start, name, name_len);

or just ret = !memcmp...?

> +	if (ret)
> +		ret = 0;
> +	else
> +		ret = 1;
> +
> +out:
> +	fs_path_free(sctx, tmp_name);
> +	return ret;
> +}
> +
> +static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
> +			      const char *name, int name_len,
> +			      u64 *who_ino, u64 *who_gen)
> +{
> +	int ret = 0;
> +	u64 other_inode = 0;
> +	u8 other_type = 0;
> +
> +	if (!sctx->parent_root)
> +		goto out;
> +
> +	ret = is_inode_existent(sctx, dir, dir_gen);
> +	if (ret <= 0)
> +		goto out;
> +
> +	ret = lookup_dir_item_inode(sctx->parent_root, dir, name, name_len,
> +			&other_inode, &other_type);
> +	if (ret < 0 && ret != -ENOENT)
> +		goto out;
> +	if (ret) {
> +		ret = 0;
> +		goto out;
> +	}
> +
> +	if (other_inode > sctx->send_progress) {

I haven't really grasped what this function does (a comment would be
nice), but I have a feeling that renames might break things when the
parent is not a direct ancestor. Maybe it gets clearer when I read
on ;)

> +		ret = get_inode_info(sctx->parent_root, other_inode, NULL,
> +				who_gen, NULL, NULL, NULL);
> +		if (ret < 0)
> +			goto out;
> +
> +		ret = 1;
> +		*who_ino = other_inode;
> +	} else {
> +		ret = 0;
> +	}
> +
> +out:
> +	return ret;
> +}
> +
> +static int did_overwrite_ref(struct send_ctx *sctx,
> +			    u64 dir, u64 dir_gen,
> +			    u64 ino, u64 ino_gen,
> +			    const char *name, int name_len)
> +{
> +	int ret = 0;
> +	u64 gen;
> +	u64 ow_inode;
> +	u8 other_type;
> +
> +	if (!sctx->parent_root)
> +		goto out;
> +
> +	ret = is_inode_existent(sctx, dir, dir_gen);
> +	if (ret <= 0)
> +		goto out;
> +
> +	/* check if the ref was overwritten by another ref */
> +	ret = lookup_dir_item_inode(sctx->send_root, dir, name, name_len,
> +			&ow_inode, &other_type);
> +	if (ret < 0 && ret != -ENOENT)
> +		goto out;
> +	if (ret) {
> +		/* was never and will never be overwritten */
> +		ret = 0;
> +		goto out;
> +	}
> +
> +	ret = get_inode_info(sctx->send_root, ow_inode, NULL, &gen, NULL, NULL,
> +			NULL);
> +	if (ret < 0)
> +		goto out;
> +
> +	if (ow_inode == ino && gen == ino_gen) {
> +		ret = 0;
> +		goto out;
> +	}
> +
> +	/* we know that it is or will be overwritten. check this now */
> +	if (ow_inode < sctx->send_progress)
> +		ret = 1;
> +	else
> +		ret = 0;
> +
> +out:
> +	return ret;
> +}
> +
> +static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen)
> +{
> +	int ret = 0;
> +	struct fs_path *name = NULL;
> +	u64 dir;
> +	u64 dir_gen;
> +
> +	if (!sctx->parent_root)
> +		goto out;
> +
> +	name = fs_path_alloc(sctx);
> +	if (!name)
> +		return -ENOMEM;
> +
> +	ret = get_first_ref(sctx, sctx->parent_root, ino, &dir, &dir_gen, name);
> +	if (ret < 0)
> +		goto out;
> +
> +	ret = did_overwrite_ref(sctx, dir, dir_gen, ino, gen,
> +			name->start, fs_path_len(name));

> +	if (ret < 0)
> +		goto out;

superfluous

> +
> +out:
> +	fs_path_free(sctx, name);
> +	return ret;
> +}
> +
> +static int name_cache_insert(struct send_ctx *sctx,
> +			     struct name_cache_entry *nce)
> +{
> +	int ret = 0;
> +	struct name_cache_entry **ncea;
> +
> +	ncea = radix_tree_lookup(&sctx->name_cache, nce->ino);

attention: radix_trees take an unsigned long as index, and ino
is a u64. You're in trouble on 32 bit.

> +	if (ncea) {
> +		if (!ncea[0])
> +			ncea[0] = nce;
> +		else if (!ncea[1])
> +			ncea[1] = nce;
> +		else
> +			BUG();
> +	} else {
> +		ncea = kmalloc(sizeof(void *) * 2, GFP_NOFS);
> +		if (!ncea)
> +			return -ENOMEM;
> +
> +		ncea[0] = nce;
> +		ncea[1] = NULL;
> +		ret = radix_tree_insert(&sctx->name_cache, nce->ino, ncea);
> +		if (ret < 0)
> +			return ret;
> +	}
> +	list_add_tail(&nce->list, &sctx->name_cache_list);
> +	sctx->name_cache_size++;
> +
> +	return ret;
> +}
> +
> +static void name_cache_delete(struct send_ctx *sctx,
> +			      struct name_cache_entry *nce)
> +{
> +	struct name_cache_entry **ncea;
> +
> +	ncea = radix_tree_lookup(&sctx->name_cache, nce->ino);
> +	BUG_ON(!ncea);
> +
> +	if (ncea[0] == nce)
> +		ncea[0] = NULL;
> +	else if (ncea[1] == nce)
> +		ncea[1] = NULL;
> +	else
> +		BUG();
> +
> +	if (!ncea[0] && !ncea[1]) {
> +		radix_tree_delete(&sctx->name_cache, nce->ino);
> +		kfree(ncea);
> +	}
> +
> +	list_del(&nce->list);
> +
> +	sctx->name_cache_size--;
> +}
> +
> +static struct name_cache_entry *name_cache_search(struct send_ctx *sctx,
> +						    u64 ino, u64 gen)
> +{
> +	struct name_cache_entry **ncea;
> +
> +	ncea = radix_tree_lookup(&sctx->name_cache, ino);
> +	if (!ncea)
> +		return NULL;
> +
> +	if (ncea[0] && ncea[0]->gen == gen)
> +		return ncea[0];
> +	else if (ncea[1] && ncea[1]->gen == gen)
> +		return ncea[1];
> +	return NULL;
> +}
> +
> +static void name_cache_used(struct send_ctx *sctx, struct name_cache_entry *nce)
> +{
> +	list_del(&nce->list);
> +	list_add_tail(&nce->list, &sctx->name_cache_list);
> +}
> +
> +static void name_cache_clean_unused(struct send_ctx *sctx)
> +{
> +	struct name_cache_entry *nce;
> +
> +	if (sctx->name_cache_size < SEND_CTX_NAME_CACHE_CLEAN_SIZE)
> +		return;

superfluous, the while condition below is enough.

> +
> +	while (sctx->name_cache_size > SEND_CTX_MAX_NAME_CACHE_SIZE) {
> +		nce = list_entry(sctx->name_cache_list.next,
> +				struct name_cache_entry, list);
> +		name_cache_delete(sctx, nce);
> +		kfree(nce);
> +	}
> +}
> +
> +static void name_cache_free(struct send_ctx *sctx)
> +{
> +	struct name_cache_entry *nce;
> +	struct name_cache_entry *tmp;
> +
> +	list_for_each_entry_safe(nce, tmp, &sctx->name_cache_list, list) {

it's easier to just always delete the head until the list is empty.
Saves you the tmp-var.

> +		name_cache_delete(sctx, nce);
> +	}
> +}
> +
> +static int __get_cur_name_and_parent(struct send_ctx *sctx,
> +				     u64 ino, u64 gen,
> +				     u64 *parent_ino,
> +				     u64 *parent_gen,
> +				     struct fs_path *dest)
> +{
> +	int ret;
> +	int nce_ret;
> +	struct btrfs_path *path = NULL;
> +	struct name_cache_entry *nce = NULL;
> +
> +	nce = name_cache_search(sctx, ino, gen);
> +	if (nce) {
> +		if (ino < sctx->send_progress && nce->need_later_update) {
> +			name_cache_delete(sctx, nce);
> +			kfree(nce);
> +			nce = NULL;
> +		} else {
> +			name_cache_used(sctx, nce);
> +			*parent_ino = nce->parent_ino;
> +			*parent_gen = nce->parent_gen;
> +			ret = fs_path_add(dest, nce->name, nce->name_len);
> +			if (ret < 0)
> +				goto out;
> +			ret = nce->ret;
> +			goto out;
> +		}
> +	}
> +
> +	path = alloc_path_for_send();
> +	if (!path)
> +		return -ENOMEM;
> +
> +	ret = is_inode_existent(sctx, ino, gen);
> +	if (ret < 0)
> +		goto out;
> +
> +	if (!ret) {
> +		ret = gen_unique_name(sctx, ino, gen, dest);
> +		if (ret < 0)
> +			goto out;
> +		ret = 1;
> +		goto out_cache;
> +	}
> +
> +	if (ino < sctx->send_progress)
> +		ret = get_first_ref(sctx, sctx->send_root, ino,
> +				parent_ino, parent_gen, dest);
> +	else
> +		ret = get_first_ref(sctx, sctx->parent_root, ino,
> +				parent_ino, parent_gen, dest);
> +	if (ret < 0)
> +		goto out;
> +
> +	ret = did_overwrite_ref(sctx, *parent_ino, *parent_gen, ino, gen,
> +			dest->start, dest->end - dest->start);
> +	if (ret < 0)
> +		goto out;
> +	if (ret) {
> +		fs_path_reset(dest);
> +		ret = gen_unique_name(sctx, ino, gen, dest);
> +		if (ret < 0)
> +			goto out;
> +		ret = 1;
> +	}
> +
> +out_cache:
> +	nce = kmalloc(sizeof(*nce) + fs_path_len(dest) + 1, GFP_NOFS);
> +	if (!nce) {
> +		ret = -ENOMEM;
> +		goto out;
> +	}
> +
> +	nce->ino = ino;
> +	nce->gen = gen;
> +	nce->parent_ino = *parent_ino;
> +	nce->parent_gen = *parent_gen;
> +	nce->name_len = fs_path_len(dest);
> +	nce->ret = ret;

This is a bit too magic for me. ret == 1 iff it's a unique_name?

> +	strcpy(nce->name, dest->start);
> +	memset(&nce->use_list, 0, sizeof(nce->use_list));

use_list is unused, anyway, it's a strange way to initialize a
list_head. There's the INIT_LIST_HEAD macro.

> +
> +	if (ino < sctx->send_progress)
> +		nce->need_later_update = 0;
> +	else
> +		nce->need_later_update = 1;
> +
> +	nce_ret = name_cache_insert(sctx, nce);
> +	if (nce_ret < 0)
> +		ret = nce_ret;
> +	name_cache_clean_unused(sctx);
> +
> +out:
> +	btrfs_free_path(path);
> +	return ret;
> +}
> +
> +/*
> + * Magic happens here. This function returns the first ref to an inode as it
> + * would look like while receiving the stream at this point in time.
> + * We walk the path up to the root. For every inode in between, we check if it
> + * was already processed/sent. If yes, we continue with the parent as found
> + * in send_root. If not, we continue with the parent as found in parent_root.
> + * If we encounter an inode that was deleted at this point in time, we use the
> + * inodes "orphan" name instead of the real name and stop. Same with new inodes
> + * that were not created yet and overwritten inodes/refs.
> + *
> + * When do we have have orphan inodes:
> + * 1. When an inode is freshly created and thus no valid refs are available yet
> + * 2. When a directory lost all it's refs (deleted) but still has dir items
> + *    inside which were not processed yet (pending for move/delete). If anyone
> + *    tried to get the path to the dir items, it would get a path inside that
> + *    orphan directory.
> + * 3. When an inode is moved around or gets new links, it may overwrite the ref
> + *    of an unprocessed inode. If in that case the first ref would be
> + *    overwritten, the overwritten inode gets "orphanized". Later when we
> + *    process this overwritten inode, it is restored at a new place by moving
> + *    the orphan inode.
> + *
> + * sctx->send_progress tells this function at which point in time receiving
> + * would be.
> + */

Thanks for the comment :)

> +static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
> +			struct fs_path *dest)
> +{
> +	int ret = 0;
> +	struct fs_path *name = NULL;
> +	u64 parent_inode = 0;
> +	u64 parent_gen = 0;
> +	int stop = 0;
> +
> +	name = fs_path_alloc(sctx);
> +	if (!name) {
> +		ret = -ENOMEM;
> +		goto out;
> +	}
> +
> +	dest->reversed = 1;
> +	fs_path_reset(dest);
> +
> +	while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) {
> +		fs_path_reset(name);
> +
> +		ret = __get_cur_name_and_parent(sctx, ino, gen,
> +				&parent_inode, &parent_gen, name);
> +		if (ret < 0)
> +			goto out;
> +		if (ret)
> +			stop = 1;
> +
> +		ret = fs_path_add_path(dest, name);
> +		if (ret < 0)
> +			goto out;
> +
> +		ino = parent_inode;
> +		gen = parent_gen;
> +	}
> +
> +out:
> +	fs_path_free(sctx, name);
> +	if (!ret)
> +		fs_path_unreverse(dest);
> +	return ret;
> +}
> +
> +/*
> + * Called for regular files when sending extents data. Opens a struct file
> + * to read from the file.
> + */
> +static int open_cur_inode_file(struct send_ctx *sctx)
> +{
> +	int ret = 0;
> +	struct btrfs_key key;
> +	struct vfsmount *mnt;
> +	struct inode *inode;
> +	struct dentry *dentry;
> +	struct file *filp;
> +	int new = 0;
> +
> +	if (sctx->cur_inode_filp)
> +		goto out;
> +
> +	key.objectid = sctx->cur_ino;
> +	key.type = BTRFS_INODE_ITEM_KEY;
> +	key.offset = 0;
> +
> +	inode = btrfs_iget(sctx->send_root->fs_info->sb, &key, sctx->send_root,
> +			&new);
> +	if (IS_ERR(inode)) {
> +		ret = PTR_ERR(inode);
> +		goto out;
> +	}
> +
> +	dentry = d_obtain_alias(inode);
> +	inode = NULL;
> +	if (IS_ERR(dentry)) {
> +		ret = PTR_ERR(dentry);
> +		goto out;
> +	}
> +
> +	mnt = mntget(sctx->mnt);
> +	filp = dentry_open(dentry, mnt, O_RDONLY | O_LARGEFILE, current_cred());
> +	dentry = NULL;
> +	mnt = NULL;

It would be good if this part could be reviewed by someone with
deep VFS knowledge. Maybe you can split those parts into a
separate patch and send it to the appropriate people for review.

> +	if (IS_ERR(filp)) {
> +		ret = PTR_ERR(filp);
> +		goto out;
> +	}
> +	sctx->cur_inode_filp = filp;
> +
> +out:
> +	/*
> +	 * no xxxput required here as every vfs op
> +	 * does it by itself on failure
> +	 */
> +	return ret;
> +}
> +
> +/*
> + * Closes the struct file that was created in open_cur_inode_file
> + */
> +static int close_cur_inode_file(struct send_ctx *sctx)
> +{
> +	int ret = 0;
> +
> +	if (!sctx->cur_inode_filp)
> +		goto out;
> +
> +	ret = filp_close(sctx->cur_inode_filp, NULL);
> +	sctx->cur_inode_filp = NULL;
> +
> +out:
> +	return ret;
> +}
> +
> +/*
> + * Sends a BTRFS_SEND_C_SUBVOL command/item to userspace
> + */
> +static int send_subvol_begin(struct send_ctx *sctx)
> +{
> +	int ret;
> +	struct btrfs_root *send_root = sctx->send_root;
> +	struct btrfs_root *parent_root = sctx->parent_root;
> +	struct btrfs_path *path;
> +	struct btrfs_key key;
> +	struct btrfs_root_ref *ref;
> +	struct extent_buffer *leaf;
> +	char *name = NULL;
> +	int namelen;
> +
> +	path = alloc_path_for_send();
> +	if (!path)
> +		return -ENOMEM;
> +
> +	name = kmalloc(BTRFS_PATH_NAME_MAX, GFP_NOFS);
> +	if (!name) {
> +		btrfs_free_path(path);
> +		return -ENOMEM;
> +	}
> +
> +	key.objectid = send_root->objectid;
> +	key.type = BTRFS_ROOT_BACKREF_KEY;
> +	key.offset = 0;
> +
> +	ret = btrfs_search_slot_for_read(send_root->fs_info->tree_root,
> +				&key, path, 1, 0);
> +	if (ret < 0)
> +		goto out;
> +	if (ret) {
> +		ret = -ENOENT;
> +		goto out;
> +	}
> +
> +	leaf = path->nodes[0];
> +	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
> +	if (key.type != BTRFS_ROOT_BACKREF_KEY ||
> +	    key.objectid != send_root->objectid) {
> +		ret = -ENOENT;
> +		goto out;
> +	}

It looks like we could use a helper for finding the first entry
with a specific objectid+key...

> +	ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
> +	namelen = btrfs_root_ref_name_len(leaf, ref);
> +	read_extent_buffer(leaf, name, (unsigned long)(ref + 1), namelen);
> +	btrfs_release_path(path);
> +
> +	if (ret < 0)
> +		goto out;

How can ret be < 0 here?

> +
> +	if (parent_root) {
> +		ret = begin_cmd(sctx, BTRFS_SEND_C_SNAPSHOT);
> +		if (ret < 0)
> +			goto out;
> +	} else {
> +		ret = begin_cmd(sctx, BTRFS_SEND_C_SUBVOL);
> +		if (ret < 0)
> +			goto out;
> +	}
> +
> +	TLV_PUT_STRING(sctx, BTRFS_SEND_A_PATH, name, namelen);

It's called PATH, but it seems to be only the last path component.
What about subvols that are anchored deeper in the dir tree?

> +	TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID,
> +			sctx->send_root->root_item.uuid);
> +	TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID,
> +			sctx->send_root->root_item.ctransid);
> +	if (parent_root) {

The name of the parent is not sent?

> +		TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
> +				sctx->parent_root->root_item.uuid);
> +		TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
> +				sctx->parent_root->root_item.ctransid);
> +	}
> +
> +	ret = send_cmd(sctx);
> +
> +tlv_put_failure:
> +out:
> +	btrfs_free_path(path);
> +	kfree(name);
> +	return ret;
> +}
> +
> +static int send_truncate(struct send_ctx *sctx, u64 ino, u64 gen, u64 size)
> +{
> +	int ret = 0;
> +	struct fs_path *p;
> +
> +verbose_printk("btrfs: send_truncate %llu size=%llu\n", ino, size);
> +
> +	p = fs_path_alloc(sctx);
> +	if (!p)
> +		return -ENOMEM;
> +
> +	ret = begin_cmd(sctx, BTRFS_SEND_C_TRUNCATE);
> +	if (ret < 0)
> +		goto out;
> +
> +	ret = get_cur_path(sctx, ino, gen, p);
> +	if (ret < 0)
> +		goto out;
> +	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
> +	TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, size);
> +
> +	ret = send_cmd(sctx);
> +
> +tlv_put_failure:
> +out:
> +	fs_path_free(sctx, p);
> +	return ret;
> +}
> +
> +static int send_chmod(struct send_ctx *sctx, u64 ino, u64 gen, u64 mode)
> +{
> +	int ret = 0;
> +	struct fs_path *p;
> +
> +verbose_printk("btrfs: send_chmod %llu mode=%llu\n", ino, mode);
> +
> +	p = fs_path_alloc(sctx);
> +	if (!p)
> +		return -ENOMEM;
> +
> +	ret = begin_cmd(sctx, BTRFS_SEND_C_CHMOD);
> +	if (ret < 0)
> +		goto out;
> +
> +	ret = get_cur_path(sctx, ino, gen, p);
> +	if (ret < 0)
> +		goto out;
> +	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
> +	TLV_PUT_U64(sctx, BTRFS_SEND_A_MODE, mode & 07777);

Four sevens intentional? 07777 also keeps the setuid/setgid/sticky
bits, not just the rwx permission bits.

> +
> +	ret = send_cmd(sctx);
> +
> +tlv_put_failure:
> +out:
> +	fs_path_free(sctx, p);
> +	return ret;
> +}
> +
> +static int send_chown(struct send_ctx *sctx, u64 ino, u64 gen, u64 uid, u64 gid)
> +{
> +	int ret = 0;
> +	struct fs_path *p;
> +
> +verbose_printk("btrfs: send_chown %llu uid=%llu, gid=%llu\n", ino, uid, gid);
> +
> +	p = fs_path_alloc(sctx);
> +	if (!p)
> +		return -ENOMEM;
> +
> +	ret = begin_cmd(sctx, BTRFS_SEND_C_CHOWN);
> +	if (ret < 0)
> +		goto out;
> +
> +	ret = get_cur_path(sctx, ino, gen, p);
> +	if (ret < 0)
> +		goto out;
> +	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
> +	TLV_PUT_U64(sctx, BTRFS_SEND_A_UID, uid);
> +	TLV_PUT_U64(sctx, BTRFS_SEND_A_GID, gid);
> +
> +	ret = send_cmd(sctx);
> +
> +tlv_put_failure:
> +out:
> +	fs_path_free(sctx, p);
> +	return ret;
> +}
> +
> +static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen)
> +{
> +	int ret = 0;
> +	struct fs_path *p = NULL;
> +	struct btrfs_inode_item *ii;
> +	struct btrfs_path *path = NULL;
> +	struct extent_buffer *eb;
> +	struct btrfs_key key;
> +	int slot;
> +
> +verbose_printk("btrfs: send_utimes %llu\n", ino);
> +
> +	p = fs_path_alloc(sctx);
> +	if (!p)
> +		return -ENOMEM;
> +
> +	path = alloc_path_for_send();
> +	if (!path) {
> +		ret = -ENOMEM;
> +		goto out;
> +	}
> +
> +	key.objectid = ino;
> +	key.type = BTRFS_INODE_ITEM_KEY;
> +	key.offset = 0;
> +	ret = btrfs_search_slot(NULL, sctx->send_root, &key, path, 0, 0);
> +	if (ret < 0)
> +		goto out;

You don't check for existence. I guess you know the inode exists,
otherwise you wouldn't end up here...

> +
> +	eb = path->nodes[0];
> +	slot = path->slots[0];
> +	ii = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
> +
> +	ret = begin_cmd(sctx, BTRFS_SEND_C_UTIMES);
> +	if (ret < 0)
> +		goto out;
> +
> +	ret = get_cur_path(sctx, ino, gen, p);
> +	if (ret < 0)
> +		goto out;
> +	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
> +	TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_ATIME, eb,
> +			btrfs_inode_atime(ii));
> +	TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_MTIME, eb,
> +			btrfs_inode_mtime(ii));
> +	TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb,
> +			btrfs_inode_ctime(ii));
> +	/* TODO otime? */

yes, please :)

> +
> +	ret = send_cmd(sctx);
> +
> +tlv_put_failure:
> +out:
> +	fs_path_free(sctx, p);
> +	btrfs_free_path(path);
> +	return ret;
> +}
> +
> +/*
> + * Sends a BTRFS_SEND_C_MKXXX or SYMLINK command to user space. We don't have
> + * a valid path yet because we did not process the refs yet. So, the inode
> + * is created as orphan.
> + */
> +static int send_create_inode(struct send_ctx *sctx, struct btrfs_path *path,
> +			     struct btrfs_key *key)
> +{
> +	int ret = 0;
> +	struct extent_buffer *eb = path->nodes[0];
> +	struct btrfs_inode_item *ii;
> +	struct fs_path *p;
> +	int slot = path->slots[0];
> +	int cmd;
> +	u64 mode;
> +
> +verbose_printk("btrfs: send_create_inode %llu\n", sctx->cur_ino);
> +
> +	p = fs_path_alloc(sctx);
> +	if (!p)
> +		return -ENOMEM;
> +
> +	ii = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
> +	mode = btrfs_inode_mode(eb, ii);
> +
> +	if (S_ISREG(mode))
> +		cmd = BTRFS_SEND_C_MKFILE;
> +	else if (S_ISDIR(mode))
> +		cmd = BTRFS_SEND_C_MKDIR;
> +	else if (S_ISLNK(mode))
> +		cmd = BTRFS_SEND_C_SYMLINK;
> +	else if (S_ISCHR(mode) || S_ISBLK(mode))
> +		cmd = BTRFS_SEND_C_MKNOD;
> +	else if (S_ISFIFO(mode))
> +		cmd = BTRFS_SEND_C_MKFIFO;
> +	else if (S_ISSOCK(mode))
> +		cmd = BTRFS_SEND_C_MKSOCK;
> +	else {

Normally you'd put braces around all branches if one of them needs them.

> +		printk(KERN_WARNING "btrfs: unexpected inode type %o",
> +				(int)(mode & S_IFMT));
> +		ret = -ENOTSUPP;
> +		goto out;
> +	}
> +
> +	ret = begin_cmd(sctx, cmd);
> +	if (ret < 0)
> +		goto out;
> +
> +	ret = gen_unique_name(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
> +	if (ret < 0)
> +		goto out;
> +
> +	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
> +
> +	if (S_ISLNK(mode)) {
> +		fs_path_reset(p);
> +		ret = read_symlink(sctx, sctx->send_root, sctx->cur_ino, p);
> +		if (ret < 0)
> +			goto out;
> +		TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, p);
> +	} else if (S_ISCHR(mode) || S_ISBLK(mode) ||
> +		   S_ISFIFO(mode) || S_ISSOCK(mode)) {
> +		TLV_PUT_U64(sctx, BTRFS_SEND_A_RDEV, btrfs_inode_rdev(eb, ii));
> +	}
> +
> +	ret = send_cmd(sctx);
> +	if (ret < 0)
> +		goto out;
> +
> +
> +tlv_put_failure:
> +out:
> +	fs_path_free(sctx, p);
> +	return ret;
> +}
> +
> +struct recorded_ref {
> +	struct list_head list;
> +	char *dir_path;
> +	char *name;
> +	struct fs_path *full_path;
> +	u64 dir;
> +	u64 dir_gen;
> +	int dir_path_len;
> +	int name_len;
> +};
> +
> +/*
> + * We need to process new refs before deleted refs, but compare_tree gives us
> + * everything mixed. So we first record all refs and later process them.
> + * This function is a helper to record one ref.
> + */
> +static int record_ref(struct list_head *head, u64 dir,
> +		      u64 dir_gen, struct fs_path *path)
> +{
> +	struct recorded_ref *ref;
> +	char *tmp;
> +
> +	ref = kmalloc(sizeof(*ref), GFP_NOFS);
> +	if (!ref)
> +		return -ENOMEM;
> +
> +	ref->dir = dir;
> +	ref->dir_gen = dir_gen;
> +	ref->full_path = path;
> +
> +	tmp = strrchr(ref->full_path->start, '/');
> +	if (!tmp) {
> +		ref->name_len = ref->full_path->end - ref->full_path->start;
> +		ref->name = ref->full_path->start;
> +		ref->dir_path_len = 0;
> +		ref->dir_path = ref->full_path->start;
> +	} else {
> +		tmp++;
> +		ref->name_len = ref->full_path->end - tmp;
> +		ref->name = tmp;
> +		ref->dir_path = ref->full_path->start;
> +		ref->dir_path_len = ref->full_path->end -
> +				ref->full_path->start - 1 - ref->name_len;
> +	}
> +
> +	list_add_tail(&ref->list, head);
> +	return 0;
> +}
> +
> +static void __free_recorded_refs(struct send_ctx *sctx, struct list_head *head)
> +{
> +	struct recorded_ref *cur;
> +	struct recorded_ref *tmp;
> +
> +	list_for_each_entry_safe(cur, tmp, head, list) {
> +		fs_path_free(sctx, cur->full_path);
> +		kfree(cur);
> +	}
> +	INIT_LIST_HEAD(head);

This is a bit non-obvious. You use the _safe macro as if you were
going to delete each entry, but then you don't delete it and
instead just reset the head. I'd prefer a while (!list_empty())
loop with list_del here.

> +}
> +
> +static void free_recorded_refs(struct send_ctx *sctx)
> +{
> +	__free_recorded_refs(sctx, &sctx->new_refs);
> +	__free_recorded_refs(sctx, &sctx->deleted_refs);
> +}
> +
> +/*
> + * Renames/moves a file/dir to it's orphan name. Used when the first
                                  its

> + * ref of an unprocessed inode gets overwritten and for all non empty
> + * directories.
> + */
> +static int orphanize_inode(struct send_ctx *sctx, u64 ino, u64 gen,
> +			  struct fs_path *path)
> +{
> +	int ret;
> +	struct fs_path *orphan;
> +
> +	orphan = fs_path_alloc(sctx);
> +	if (!orphan)
> +		return -ENOMEM;
> +
> +	ret = gen_unique_name(sctx, ino, gen, orphan);
> +	if (ret < 0)
> +		goto out;
> +
> +	ret = send_rename(sctx, path, orphan);
> +
> +out:
> +	fs_path_free(sctx, orphan);
> +	return ret;
> +}
> +
> +/*
> + * Returns 1 if a directory can be removed at this point in time.
> + * We check this by iterating all dir items and checking if the inode behind
> + * the dir item was already processed.
> + */
> +static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 send_progress)
> +{
> +	int ret = 0;
> +	struct btrfs_root *root = sctx->parent_root;
> +	struct btrfs_path *path;
> +	struct btrfs_key key;
> +	struct btrfs_key found_key;
> +	struct btrfs_key loc;
> +	struct btrfs_dir_item *di;
> +
> +	path = alloc_path_for_send();
> +	if (!path)
> +		return -ENOMEM;
> +
> +	key.objectid = dir;
> +	key.type = BTRFS_DIR_INDEX_KEY;
> +	key.offset = 0;
> +
> +	while (1) {
> +		ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
> +		if (ret < 0)
> +			goto out;
> +		if (!ret) {
> +			btrfs_item_key_to_cpu(path->nodes[0], &found_key,
> +					path->slots[0]);
> +		}
> +		if (ret || found_key.objectid != key.objectid ||
> +		    found_key.type != key.type) {
> +			break;
> +		}

Another case for the above-mentioned helper...

> +
> +		di = btrfs_item_ptr(path->nodes[0], path->slots[0],
> +				struct btrfs_dir_item);
> +		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc);
> +
> +		if (loc.objectid > send_progress) {
> +			ret = 0;
> +			goto out;
> +		}
> +
> +		btrfs_release_path(path);
> +		key.offset = found_key.offset + 1;
> +	}
> +
> +	ret = 1;
> +
> +out:
> +	btrfs_free_path(path);
> +	return ret;
> +}
> +
> +/*
> + * This does all the move/link/unlink/rmdir magic.
> + */
> +static int process_recorded_refs(struct send_ctx *sctx)
> +{
> +	int ret = 0;
> +	struct recorded_ref *cur;
> +	struct ulist *check_dirs = NULL;
> +	struct ulist_iterator uit;
> +	struct ulist_node *un;
> +	struct fs_path *valid_path = NULL;
> +	u64 ow_inode;
> +	u64 ow_gen;
> +	int did_overwrite = 0;
> +	int is_orphan = 0;
> +
> +verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
> +
> +	valid_path = fs_path_alloc(sctx);
> +	if (!valid_path) {
> +		ret = -ENOMEM;
> +		goto out;
> +	}
> +
> +	check_dirs = ulist_alloc(GFP_NOFS);
> +	if (!check_dirs) {
> +		ret = -ENOMEM;
> +		goto out;
> +	}
> +
> +	/*
> +	 * First, check if the first ref of the current inode was overwritten
> +	 * before. If yes, we know that the current inode was already orphanized
> +	 * and thus use the orphan name. If not, we can use get_cur_path to
> +	 * get the path of the first ref as it would like while receiving at
> +	 * this point in time.
> +	 * New inodes are always orphan at the beginning, so force to use the
> +	 * orphan name in this case.
> +	 * The first ref is stored in valid_path and will be updated if it
> +	 * gets moved around.
> +	 */
> +	if (!sctx->cur_inode_new) {
> +		ret = did_overwrite_first_ref(sctx, sctx->cur_ino,
> +				sctx->cur_inode_gen);
> +		if (ret < 0)
> +			goto out;
> +		if (ret)
> +			did_overwrite = 1;
> +	}
> +	if (sctx->cur_inode_new || did_overwrite) {
> +		ret = gen_unique_name(sctx, sctx->cur_ino,
> +				sctx->cur_inode_gen, valid_path);
> +		if (ret < 0)
> +			goto out;
> +		is_orphan = 1;
> +	} else {
> +		ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen,
> +				valid_path);
> +		if (ret < 0)
> +			goto out;
> +	}
> +
> +	list_for_each_entry(cur, &sctx->new_refs, list) {
> +		/*
> +		 * Check if this new ref would overwrite the first ref of
> +		 * another unprocessed inode. If yes, orphanize the
> +		 * overwritten inode. If we find an overwritten ref that is
> +		 * not the first ref, simply unlink it.
> +		 */
> +		ret = will_overwrite_ref(sctx, cur->dir, cur->dir_gen,
> +				cur->name, cur->name_len,
> +				&ow_inode, &ow_gen);
> +		if (ret < 0)
> +			goto out;
> +		if (ret) {
> +			ret = is_first_ref(sctx, sctx->parent_root,
> +					ow_inode, cur->dir, cur->name,
> +					cur->name_len);
> +			if (ret < 0)
> +				goto out;
> +			if (ret) {
> +				ret = orphanize_inode(sctx, ow_inode, ow_gen,
> +						cur->full_path);
> +				if (ret < 0)
> +					goto out;
> +			} else {
> +				ret = send_unlink(sctx, cur->full_path);
> +				if (ret < 0)
> +					goto out;
> +			}
> +		}
> +
> +		/*
> +		 * link/move the ref to the new place. If we have an orphan
> +		 * inode, move it and update valid_path. If not, link or move
> +		 * it depending on the inode mode.
> +		 */
> +		if (is_orphan) {
> +			ret = send_rename(sctx, valid_path, cur->full_path);
> +			if (ret < 0)
> +				goto out;
> +			is_orphan = 0;
> +			ret = fs_path_copy(valid_path, cur->full_path);
> +			if (ret < 0)
> +				goto out;
> +		} else {
> +			if (S_ISDIR(sctx->cur_inode_mode)) {

Why not save a level of indentation here by using "else if"?

> +				/*
> +				 * Dirs can't be linked, so move it. For moved
> +				 * dirs, we always have one new and one deleted
> +				 * ref. The deleted ref is ignored later.
> +				 */
> +				ret = send_rename(sctx, valid_path,
> +						cur->full_path);
> +				if (ret < 0)
> +					goto out;
> +				ret = fs_path_copy(valid_path, cur->full_path);
> +				if (ret < 0)
> +					goto out;
> +			} else {
> +				ret = send_link(sctx, valid_path,
> +						cur->full_path);
> +				if (ret < 0)
> +					goto out;
> +			}
> +		}
> +		ret = ulist_add(check_dirs, cur->dir, cur->dir_gen,

Careful: aux is only an unsigned long, meant to be as large as a
pointer — storing a u64 gen in it can truncate on 32-bit systems.

> +				GFP_NOFS);
> +		if (ret < 0)
> +			goto out;
> +	}
> +
> +	if (S_ISDIR(sctx->cur_inode_mode) && sctx->cur_inode_deleted) {
> +		/*
> +		 * Check if we can already rmdir the directory. If not,
> +		 * orphanize it. For every dir item inside that gets deleted
> +		 * later, we do this check again and rmdir it then if possible.
> +		 * See the use of check_dirs for more details.
> +		 */
> +		ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_ino);
> +		if (ret < 0)
> +			goto out;
> +		if (ret) {
> +			ret = send_rmdir(sctx, valid_path);
> +			if (ret < 0)
> +				goto out;
> +		} else if (!is_orphan) {
> +			ret = orphanize_inode(sctx, sctx->cur_ino,
> +					sctx->cur_inode_gen, valid_path);
> +			if (ret < 0)
> +				goto out;
> +			is_orphan = 1;
> +		}
> +
> +		list_for_each_entry(cur, &sctx->deleted_refs, list) {
> +			ret = ulist_add(check_dirs, cur->dir, cur->dir_gen,
> +					GFP_NOFS);
> +			if (ret < 0)
> +				goto out;
> +		}
> +	} else if (!S_ISDIR(sctx->cur_inode_mode)) {
> +		/*
> +		 * We have a non dir inode. Go through all deleted refs and
> +		 * unlink them if they were not already overwritten by other
> +		 * inodes.
> +		 */
> +		list_for_each_entry(cur, &sctx->deleted_refs, list) {
> +			ret = did_overwrite_ref(sctx, cur->dir, cur->dir_gen,
> +					sctx->cur_ino, sctx->cur_inode_gen,
> +					cur->name, cur->name_len);
> +			if (ret < 0)
> +				goto out;
> +			if (!ret) {
> +				ret = send_unlink(sctx, cur->full_path);
> +				if (ret < 0)
> +					goto out;
> +			}
> +			ret = ulist_add(check_dirs, cur->dir, cur->dir_gen,
> +					GFP_NOFS);
> +			if (ret < 0)
> +				goto out;
> +		}
> +
> +		/*
> +		 * If the inode is still orphan, unlink the orphan. This may
> +		 * happen when a previous inode did overwrite the first ref
> +		 * of this inode and no new refs were added for the current
> +		 * inode.
> +		 */
> +		if (is_orphan) {
> +			ret = send_unlink(sctx, valid_path);
> +			if (ret < 0)
> +				goto out;
> +		}
> +	}
> +
> +	/*
> +	 * We did collect all parent dirs where cur_inode was once located. We
> +	 * now go through all these dirs and check if they are pending for
> +	 * deletion and if it's finally possible to perform the rmdir now.
> +	 * We also update the inode stats of the parent dirs here.
> +	 */
> +	ULIST_ITER_INIT(&uit);
> +	while ((un = ulist_next(check_dirs, &uit))) {
> +		if (un->val > sctx->cur_ino)
> +			continue;
> +
> +		ret = get_cur_inode_state(sctx, un->val, un->aux);
> +		if (ret < 0)
> +			goto out;
> +
> +		if (ret == inode_state_did_create ||
> +		    ret == inode_state_no_change) {
> +			/* TODO delayed utimes */
> +			ret = send_utimes(sctx, un->val, un->aux);
> +			if (ret < 0)
> +				goto out;
> +		} else if (ret == inode_state_did_delete) {
> +			ret = can_rmdir(sctx, un->val, sctx->cur_ino);
> +			if (ret < 0)
> +				goto out;
> +			if (ret) {
> +				ret = get_cur_path(sctx, un->val, un->aux,
> +						valid_path);
> +				if (ret < 0)
> +					goto out;
> +				ret = send_rmdir(sctx, valid_path);
> +				if (ret < 0)
> +					goto out;
> +			}
> +		}
> +	}
> +
> +	/*
> +	 * Current inode is now at it's new position, so we must increase
                                   its
> +	 * send_progress
> +	 */
> +	sctx->send_progress = sctx->cur_ino + 1;

Is this the right place for it, or should it be done at the
calling site?

> +
> +	ret = 0;
> +
> +out:
> +	free_recorded_refs(sctx);
> +	ulist_free(check_dirs);
> +	fs_path_free(sctx, valid_path);
> +	return ret;
> +}
> +
> +static int __record_new_ref(int num, u64 dir, int index,
> +			    struct fs_path *name,
> +			    void *ctx)
> +{
> +	int ret = 0;
> +	struct send_ctx *sctx = ctx;
> +	struct fs_path *p;
> +	u64 gen;
> +
> +	p = fs_path_alloc(sctx);
> +	if (!p)
> +		return -ENOMEM;
> +
> +	ret = get_inode_info(sctx->send_root, dir, NULL, &gen, NULL, NULL,
> +			NULL);
> +	if (ret < 0)
> +		goto out;
> +
> +	ret = get_cur_path(sctx, dir, gen, p);
> +	if (ret < 0)
> +		goto out;
> +	ret = fs_path_add_path(p, name);
> +	if (ret < 0)
> +		goto out;
> +
> +	ret = record_ref(&sctx->new_refs, dir, gen, p);
> +
> +out:
> +	if (ret)
> +		fs_path_free(sctx, p);
> +	return ret;
> +}
> +
> +static int __record_deleted_ref(int num, u64 dir, int index,
> +				struct fs_path *name,
> +				void *ctx)
> +{
> +	int ret = 0;
> +	struct send_ctx *sctx = ctx;
> +	struct fs_path *p;
> +	u64 gen;
> +
> +	p = fs_path_alloc(sctx);
> +	if (!p)
> +		return -ENOMEM;
> +
> +	ret = get_inode_info(sctx->parent_root, dir, NULL, &gen, NULL, NULL,
> +			NULL);
> +	if (ret < 0)
> +		goto out;
> +
> +	ret = get_cur_path(sctx, dir, gen, p);
> +	if (ret < 0)
> +		goto out;
> +	ret = fs_path_add_path(p, name);
> +	if (ret < 0)
> +		goto out;
> +
> +	ret = record_ref(&sctx->deleted_refs, dir, gen, p);
> +
> +out:
> +	if (ret)
> +		fs_path_free(sctx, p);
> +	return ret;
> +}
> +
> +static int record_new_ref(struct send_ctx *sctx)
> +{
> +	int ret;
> +
> +	ret = iterate_inode_ref(sctx, sctx->send_root, sctx->left_path,
> +			sctx->cmp_key, 0, __record_new_ref, sctx);
> +
> +	return ret;
> +}
> +
> +static int record_deleted_ref(struct send_ctx *sctx)
> +{
> +	int ret;
> +
> +	ret = iterate_inode_ref(sctx, sctx->parent_root, sctx->right_path,
> +			sctx->cmp_key, 0, __record_deleted_ref, sctx);
> +	return ret;
> +}
> +
> +struct find_ref_ctx {
> +	u64 dir;
> +	struct fs_path *name;
> +	int found_idx;
> +};
> +
> +static int __find_iref(int num, u64 dir, int index,
> +		       struct fs_path *name,
> +		       void *ctx_)
> +{
> +	struct find_ref_ctx *ctx = ctx_;
> +
> +	if (dir == ctx->dir && fs_path_len(name) == fs_path_len(ctx->name) &&
> +	    strncmp(name->start, ctx->name->start, fs_path_len(name)) == 0) {
> +		ctx->found_idx = num;
> +		return 1;
> +	}
> +	return 0;
> +}
> +
> +static int find_iref(struct send_ctx *sctx,
> +		     struct btrfs_root *root,
> +		     struct btrfs_path *path,
> +		     struct btrfs_key *key,
> +		     u64 dir, struct fs_path *name)
> +{
> +	int ret;
> +	struct find_ref_ctx ctx;
> +
> +	ctx.dir = dir;
> +	ctx.name = name;
> +	ctx.found_idx = -1;
> +
> +	ret = iterate_inode_ref(sctx, root, path, key, 0, __find_iref, &ctx);
> +	if (ret < 0)
> +		return ret;
> +
> +	if (ctx.found_idx == -1)
> +		return -ENOENT;
> +
> +	return ctx.found_idx;
> +}
> +
> +static int __record_changed_new_ref(int num, u64 dir, int index,
> +				    struct fs_path *name,
> +				    void *ctx)
> +{
> +	int ret;
> +	struct send_ctx *sctx = ctx;
> +
> +	ret = find_iref(sctx, sctx->parent_root, sctx->right_path,
> +			sctx->cmp_key, dir, name);
> +	if (ret == -ENOENT)
> +		ret = __record_new_ref(num, dir, index, name, sctx);
> +	else if (ret > 0)
> +		ret = 0;
> +
> +	return ret;
> +}
> +
> +static int __record_changed_deleted_ref(int num, u64 dir, int index,
> +					struct fs_path *name,
> +					void *ctx)
> +{
> +	int ret;
> +	struct send_ctx *sctx = ctx;
> +
> +	ret = find_iref(sctx, sctx->send_root, sctx->left_path, sctx->cmp_key,
> +			dir, name);
> +	if (ret == -ENOENT)
> +		ret = __record_deleted_ref(num, dir, index, name, sctx);
> +	else if (ret > 0)
> +		ret = 0;
> +
> +	return ret;
> +}
> +
> +static int record_changed_ref(struct send_ctx *sctx)
> +{
> +	int ret = 0;
> +
> +	ret = iterate_inode_ref(sctx, sctx->send_root, sctx->left_path,
> +			sctx->cmp_key, 0, __record_changed_new_ref, sctx);
> +	if (ret < 0)
> +		goto out;
> +	ret = iterate_inode_ref(sctx, sctx->parent_root, sctx->right_path,
> +			sctx->cmp_key, 0, __record_changed_deleted_ref, sctx);
> +
> +out:
> +	return ret;
> +}
> +
> +/*
> + * Record and process all refs at once. Needed when an inode changes the
> + * generation number, which means that it was deleted and recreated.
> + */
> +static int process_all_refs(struct send_ctx *sctx,
> +			    enum btrfs_compare_tree_result cmd)
> +{
> +	int ret;
> +	struct btrfs_root *root;
> +	struct btrfs_path *path;
> +	struct btrfs_key key;
> +	struct btrfs_key found_key;
> +	struct extent_buffer *eb;
> +	int slot;
> +	iterate_inode_ref_t cb;
> +
> +	path = alloc_path_for_send();
> +	if (!path)
> +		return -ENOMEM;
> +
> +	if (cmd == BTRFS_COMPARE_TREE_NEW) {
> +		root = sctx->send_root;
> +		cb = __record_new_ref;
> +	} else if (cmd == BTRFS_COMPARE_TREE_DELETED) {
> +		root = sctx->parent_root;
> +		cb = __record_deleted_ref;
> +	} else {
> +		BUG();
> +	}
> +
> +	key.objectid = sctx->cmp_key->objectid;
> +	key.type = BTRFS_INODE_REF_KEY;
> +	key.offset = 0;
> +	while (1) {
> +		ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
> +		if (ret < 0) {
> +			btrfs_release_path(path);

not needed

> +			goto out;
> +		}
> +		if (ret) {
> +			btrfs_release_path(path);

ditto

> +			break;
> +		}
> +
> +		eb = path->nodes[0];
> +		slot = path->slots[0];
> +		btrfs_item_key_to_cpu(eb, &found_key, slot);
> +
> +		if (found_key.objectid != key.objectid ||
> +		    found_key.type != key.type) {
> +			btrfs_release_path(path);

and here

> +			break;
> +		}

helper :)

> +
> +		ret = iterate_inode_ref(sctx, sctx->parent_root, path,
> +				&found_key, 0, cb, sctx);
> +		btrfs_release_path(path);
> +		if (ret < 0)
> +			goto out;
> +
> +		key.offset = found_key.offset + 1;
> +	}
> +
> +	ret = process_recorded_refs(sctx);
> +
> +out:
> +	btrfs_free_path(path);
> +	return ret;
> +}
> +
> +static int send_set_xattr(struct send_ctx *sctx,
> +			  struct fs_path *path,
> +			  const char *name, int name_len,
> +			  const char *data, int data_len)
> +{
> +	int ret = 0;
> +
> +	ret = begin_cmd(sctx, BTRFS_SEND_C_SET_XATTR);
> +	if (ret < 0)
> +		goto out;
> +
> +	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
> +	TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len);
> +	TLV_PUT(sctx, BTRFS_SEND_A_XATTR_DATA, data, data_len);
> +
> +	ret = send_cmd(sctx);
> +
> +tlv_put_failure:
> +out:
> +	return ret;
> +}
> +
> +static int send_remove_xattr(struct send_ctx *sctx,
> +			  struct fs_path *path,
> +			  const char *name, int name_len)
> +{
> +	int ret = 0;
> +
> +	ret = begin_cmd(sctx, BTRFS_SEND_C_REMOVE_XATTR);
> +	if (ret < 0)
> +		goto out;
> +
> +	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
> +	TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len);
> +
> +	ret = send_cmd(sctx);
> +
> +tlv_put_failure:
> +out:
> +	return ret;
> +}
> +
> +static int __process_new_xattr(int num, const char *name, int name_len,
> +			       const char *data, int data_len,
> +			       u8 type, void *ctx)
> +{
> +	int ret;
> +	struct send_ctx *sctx = ctx;
> +	struct fs_path *p;
> +	posix_acl_xattr_header dummy_acl;
> +
> +	p = fs_path_alloc(sctx);
> +	if (!p)
> +		return -ENOMEM;
> +
> +	/*
> +	 * This hack is needed because empty acl's are stored as zero byte
> +	 * data in xattrs. Problem with that is, that receiving these zero byte
> +	 * acl's will fail later. To fix this, we send a dummy acl list that
> +	 * only contains the version number and no entries.
> +	 */
> +	if (!strncmp(name, XATTR_NAME_POSIX_ACL_ACCESS, name_len) ||
> +	    !strncmp(name, XATTR_NAME_POSIX_ACL_DEFAULT, name_len)) {
> +		if (data_len == 0) {
> +			dummy_acl.a_version =
> +					cpu_to_le32(POSIX_ACL_XATTR_VERSION);
> +			data = (char *)&dummy_acl;
> +			data_len = sizeof(dummy_acl);
> +		}
> +	}
> +
> +	ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
> +	if (ret < 0)
> +		goto out;
> +
> +	ret = send_set_xattr(sctx, p, name, name_len, data, data_len);
> +
> +out:
> +	fs_path_free(sctx, p);
> +	return ret;
> +}
> +
> +static int __process_deleted_xattr(int num, const char *name, int name_len,
> +				   const char *data, int data_len,
> +				   u8 type, void *ctx)
> +{
> +	int ret;
> +	struct send_ctx *sctx = ctx;
> +	struct fs_path *p;
> +
> +	p = fs_path_alloc(sctx);
> +	if (!p)
> +		return -ENOMEM;
> +
> +	ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
> +	if (ret < 0)
> +		goto out;
> +
> +	ret = send_remove_xattr(sctx, p, name, name_len);
> +
> +out:
> +	fs_path_free(sctx, p);
> +	return ret;
> +}
> +
> +static int process_new_xattr(struct send_ctx *sctx)
> +{
> +	int ret = 0;
> +
> +	ret = iterate_dir_item(sctx, sctx->send_root, sctx->left_path,
> +			sctx->cmp_key, __process_new_xattr, sctx);
> +
> +	return ret;
> +}
> +
> +static int process_deleted_xattr(struct send_ctx *sctx)
> +{
> +	int ret;
> +
> +	ret = iterate_dir_item(sctx, sctx->parent_root, sctx->right_path,
> +			sctx->cmp_key, __process_deleted_xattr, sctx);
> +
> +	return ret;
> +}
> +
> +struct find_xattr_ctx {
> +	const char *name;
> +	int name_len;
> +	int found_idx;
> +	char *found_data;
> +	int found_data_len;
> +};
> +
> +static int __find_xattr(int num, const char *name, int name_len,
> +			const char *data, int data_len,
> +			u8 type, void *vctx)
> +{
> +	struct find_xattr_ctx *ctx = vctx;
> +
> +	if (name_len == ctx->name_len &&
> +	    strncmp(name, ctx->name, name_len) == 0) {
> +		ctx->found_idx = num;
> +		ctx->found_data_len = data_len;
> +		ctx->found_data = kmalloc(data_len, GFP_NOFS);
> +		if (!ctx->found_data)
> +			return -ENOMEM;
> +		memcpy(ctx->found_data, data, data_len);
> +		return 1;
> +	}
> +	return 0;
> +}
> +
> +static int find_xattr(struct send_ctx *sctx,
> +		      struct btrfs_root *root,
> +		      struct btrfs_path *path,
> +		      struct btrfs_key *key,
> +		      const char *name, int name_len,
> +		      char **data, int *data_len)
> +{
> +	int ret;
> +	struct find_xattr_ctx ctx;
> +
> +	ctx.name = name;
> +	ctx.name_len = name_len;
> +	ctx.found_idx = -1;
> +	ctx.found_data = NULL;
> +	ctx.found_data_len = 0;
> +
> +	ret = iterate_dir_item(sctx, root, path, key, __find_xattr, &ctx);
> +	if (ret < 0)
> +		return ret;
> +
> +	if (ctx.found_idx == -1)
> +		return -ENOENT;
> +	if (data) {
> +		*data = ctx.found_data;
> +		*data_len = ctx.found_data_len;
> +	} else {
> +		kfree(ctx.found_data);
> +	}
> +	return ctx.found_idx;
> +}
> +
> +
> +static int __process_changed_new_xattr(int num, const char *name, int name_len,
> +				       const char *data, int data_len,
> +				       u8 type, void *ctx)
> +{
> +	int ret;
> +	struct send_ctx *sctx = ctx;
> +	char *found_data = NULL;
> +	int found_data_len  = 0;
> +	struct fs_path *p = NULL;
> +
> +	ret = find_xattr(sctx, sctx->parent_root, sctx->right_path,
> +			sctx->cmp_key, name, name_len, &found_data,
> +			&found_data_len);
> +	if (ret == -ENOENT) {
> +		ret = __process_new_xattr(num, name, name_len, data, data_len,
> +				type, ctx);
> +	} else if (ret >= 0) {
> +		if (data_len != found_data_len ||
> +		    memcmp(data, found_data, data_len)) {
> +			ret = __process_new_xattr(num, name, name_len, data,
> +					data_len, type, ctx);
> +		} else {
> +			ret = 0;
> +		}
> +	}
> +
> +	kfree(found_data);
> +	fs_path_free(sctx, p);
> +	return ret;
> +}
> +
> +static int __process_changed_deleted_xattr(int num, const char *name,
> +					   int name_len,
> +					   const char *data, int data_len,
> +					   u8 type, void *ctx)
> +{
> +	int ret;
> +	struct send_ctx *sctx = ctx;
> +
> +	ret = find_xattr(sctx, sctx->send_root, sctx->left_path, sctx->cmp_key,
> +			name, name_len, NULL, NULL);
> +	if (ret == -ENOENT)
> +		ret = __process_deleted_xattr(num, name, name_len, data,
> +				data_len, type, ctx);
> +	else if (ret >= 0)
> +		ret = 0;
> +
> +	return ret;
> +}
> +
> +static int process_changed_xattr(struct send_ctx *sctx)
> +{
> +	int ret = 0;
> +
> +	ret = iterate_dir_item(sctx, sctx->send_root, sctx->left_path,
> +			sctx->cmp_key, __process_changed_new_xattr, sctx);
> +	if (ret < 0)
> +		goto out;
> +	ret = iterate_dir_item(sctx, sctx->parent_root, sctx->right_path,
> +			sctx->cmp_key, __process_changed_deleted_xattr, sctx);
> +
> +out:
> +	return ret;
> +}
> +
> +static int process_all_new_xattrs(struct send_ctx *sctx)
> +{
> +	int ret;
> +	struct btrfs_root *root;
> +	struct btrfs_path *path;
> +	struct btrfs_key key;
> +	struct btrfs_key found_key;
> +	struct extent_buffer *eb;
> +	int slot;
> +
> +	path = alloc_path_for_send();
> +	if (!path)
> +		return -ENOMEM;
> +
> +	root = sctx->send_root;
> +
> +	key.objectid = sctx->cmp_key->objectid;
> +	key.type = BTRFS_XATTR_ITEM_KEY;
> +	key.offset = 0;
> +	while (1) {
> +		ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
> +		if (ret < 0)
> +			goto out;
> +		if (ret) {
> +			ret = 0;
> +			goto out;
> +		}
> +
> +		eb = path->nodes[0];
> +		slot = path->slots[0];
> +		btrfs_item_key_to_cpu(eb, &found_key, slot);
> +
> +		if (found_key.objectid != key.objectid ||
> +		    found_key.type != key.type) {
> +			ret = 0;
> +			goto out;
> +		}

helper...

> +
> +		ret = iterate_dir_item(sctx, root, path, &found_key,
> +				__process_new_xattr, sctx);
> +		if (ret < 0)
> +			goto out;
> +
> +		btrfs_release_path(path);
> +		key.offset = found_key.offset + 1;
> +	}
> +
> +out:
> +	btrfs_free_path(path);
> +	return ret;
> +}
> +
> +/*
> + * Read some bytes from the current inode/file and send a write command to
> + * user space.
> + */
> +static int send_write(struct send_ctx *sctx, u64 offset, u32 len)
> +{
> +	int ret = 0;
> +	struct fs_path *p;
> +	loff_t pos = offset;
> +	int readed;
> +	mm_segment_t old_fs;
> +
> +	p = fs_path_alloc(sctx);
> +	if (!p)
> +		return -ENOMEM;
> +
> +	/*
> +	 * vfs normally only accepts user space buffers for security reasons.
> +	 * we only read from the file and also only provide the read_buf buffer
> +	 * to vfs. As this buffer does not come from a user space call, it's
> +	 * ok to temporarily allow kernel space buffers.
> +	 */
> +	old_fs = get_fs();
> +	set_fs(KERNEL_DS);
> +
> +verbose_printk("btrfs: send_write offset=%llu, len=%d\n", offset, len);
> +
> +	ret = open_cur_inode_file(sctx);
> +	if (ret < 0)
> +		goto out;
> +
> +	ret = vfs_read(sctx->cur_inode_filp, sctx->read_buf, len, &pos);
> +	if (ret < 0)
> +		goto out;
> +	readed = ret;

num_read?

> +	if (!readed)
> +		goto out;
> +
> +	ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
> +	if (ret < 0)
> +		goto out;
> +
> +	ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
> +	if (ret < 0)
> +		goto out;
> +
> +	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
> +	TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
> +	TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, readed);
> +
> +	ret = send_cmd(sctx);
> +
> +tlv_put_failure:
> +out:
> +	fs_path_free(sctx, p);
> +	set_fs(old_fs);
> +	if (ret < 0)
> +		return ret;
> +	return readed;
> +}
> +
> +/*
> + * Send a clone command to user space.
> + */
> +static int send_clone(struct send_ctx *sctx,
> +		      u64 offset, u32 len,
> +		      struct clone_root *clone_root)
> +{
> +	int ret = 0;
> +	struct btrfs_root *clone_root2 = clone_root->root;

a name from hell :)

> +	struct fs_path *p;
> +	u64 gen;
> +
> +verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, "
> +	       "clone_inode=%llu, clone_offset=%llu\n", offset, len,
> +		clone_root->root->objectid, clone_root->ino,
> +		clone_root->offset);
> +
> +	p = fs_path_alloc(sctx);
> +	if (!p)
> +		return -ENOMEM;
> +
> +	ret = begin_cmd(sctx, BTRFS_SEND_C_CLONE);
> +	if (ret < 0)
> +		goto out;
> +
> +	ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
> +	if (ret < 0)
> +		goto out;
> +
> +	TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
> +	TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_LEN, len);
> +	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
> +
> +	if (clone_root2 == sctx->send_root) {
> +		ret = get_inode_info(sctx->send_root, clone_root->ino, NULL,
> +				&gen, NULL, NULL, NULL);
> +		if (ret < 0)
> +			goto out;
> +		ret = get_cur_path(sctx, clone_root->ino, gen, p);
> +	} else {
> +		ret = get_inode_path(sctx, clone_root2, clone_root->ino, p);
> +	}
> +	if (ret < 0)
> +		goto out;
> +
> +	TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
> +			clone_root2->root_item.uuid);
> +	TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
> +			clone_root2->root_item.ctransid);
> +	TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p);
> +	TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_OFFSET,
> +			clone_root->offset);
> +
> +	ret = send_cmd(sctx);
> +
> +tlv_put_failure:
> +out:
> +	fs_path_free(sctx, p);
> +	return ret;
> +}
> +
> +static int send_write_or_clone(struct send_ctx *sctx,
> +			       struct btrfs_path *path,
> +			       struct btrfs_key *key,
> +			       struct clone_root *clone_root)
> +{
> +	int ret = 0;
> +	struct btrfs_file_extent_item *ei;
> +	u64 offset = key->offset;
> +	u64 pos = 0;
> +	u64 len;
> +	u32 l;
> +	u8 type;
> +
> +	ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
> +			struct btrfs_file_extent_item);
> +	type = btrfs_file_extent_type(path->nodes[0], ei);
> +	if (type == BTRFS_FILE_EXTENT_INLINE)
> +		len = btrfs_file_extent_inline_len(path->nodes[0], ei);
> +	else
> +		len = btrfs_file_extent_num_bytes(path->nodes[0], ei);

BTRFS_FILE_EXTENT_PREALLOC?

> +
> +	if (offset + len > sctx->cur_inode_size)
> +		len = sctx->cur_inode_size - offset;
> +	if (len == 0) {
> +		ret = 0;
> +		goto out;
> +	}
> +
> +	if (!clone_root) {
> +		while (pos < len) {
> +			l = len - pos;
> +			if (l > BTRFS_SEND_READ_SIZE)
> +				l = BTRFS_SEND_READ_SIZE;
> +			ret = send_write(sctx, pos + offset, l);
> +			if (ret < 0)
> +				goto out;
> +			if (!ret)
> +				break;
> +			pos += ret;
> +		}
> +		ret = 0;
> +	} else {
> +		ret = send_clone(sctx, offset, len, clone_root);
> +	}
> +
> +out:
> +	return ret;
> +}
> +
> +static int is_extent_unchanged(struct send_ctx *sctx,
> +			       struct btrfs_path *left_path,
> +			       struct btrfs_key *ekey)
> +{
> +	int ret = 0;
> +	struct btrfs_key key;
> +	struct btrfs_path *path = NULL;
> +	struct extent_buffer *eb;
> +	int slot;
> +	struct btrfs_key found_key;
> +	struct btrfs_file_extent_item *ei;
> +	u64 left_disknr;
> +	u64 right_disknr;
> +	u64 left_offset;
> +	u64 right_offset;
> +	u64 left_len;
> +	u64 right_len;
> +	u8 left_type;
> +	u8 right_type;
> +
> +	path = alloc_path_for_send();
> +	if (!path)
> +		return -ENOMEM;
> +
> +	eb = left_path->nodes[0];
> +	slot = left_path->slots[0];
> +
> +	ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
> +	left_type = btrfs_file_extent_type(eb, ei);
> +	left_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
> +	left_len = btrfs_file_extent_num_bytes(eb, ei);
> +	left_offset = btrfs_file_extent_offset(eb, ei);
> +
> +	if (left_type != BTRFS_FILE_EXTENT_REG) {
> +		ret = 0;
> +		goto out;
> +	}
> +
> +	key.objectid = ekey->objectid;
> +	key.type = BTRFS_EXTENT_DATA_KEY;
> +	key.offset = ekey->offset;
> +
> +	while (1) {
> +		ret = btrfs_search_slot_for_read(sctx->parent_root, &key, path,
> +				0, 0);
> +		if (ret < 0)
> +			goto out;
> +		if (ret) {
> +			ret = 0;
> +			goto out;
> +		}
> +		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
> +				path->slots[0]);
> +		if (found_key.objectid != key.objectid ||
> +		    found_key.type != key.type) {
> +			ret = 0;
> +			goto out;
> +		}
> +

helper...

> +		eb = path->nodes[0];
> +		slot = path->slots[0];
> +
> +		ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
> +		right_type = btrfs_file_extent_type(eb, ei);
> +		right_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
> +		right_len = btrfs_file_extent_num_bytes(eb, ei);
> +		right_offset = btrfs_file_extent_offset(eb, ei);
> +		btrfs_release_path(path);
> +
> +		if (right_type != BTRFS_FILE_EXTENT_REG) {
> +			ret = 0;
> +			goto out;
> +		}
> +
> +		if (left_disknr != right_disknr) {
> +			ret = 0;
> +			goto out;
> +		}
> +
> +		key.offset = found_key.offset + right_len;
> +		if (key.offset >= ekey->offset + left_len) {
> +			ret = 1;
> +			goto out;
> +		}
> +	}
> +
> +out:
> +	btrfs_free_path(path);
> +	return ret;
> +}
> +
> +static int process_extent(struct send_ctx *sctx,
> +			  struct btrfs_path *path,
> +			  struct btrfs_key *key)
> +{
> +	int ret = 0;
> +	struct clone_root *found_clone = NULL;
> +
> +	if (S_ISLNK(sctx->cur_inode_mode))
> +		return 0;
> +
> +	if (sctx->parent_root && !sctx->cur_inode_new) {
> +		ret = is_extent_unchanged(sctx, path, key);
> +		if (ret < 0)
> +			goto out;
> +		if (ret) {
> +			ret = 0;
> +			goto out;
> +		}
> +	}
> +
> +	ret = find_extent_clone(sctx, path, key->objectid, key->offset,
> +			sctx->cur_inode_size, &found_clone);
> +	if (ret != -ENOENT && ret < 0)
> +		goto out;
> +
> +	ret = send_write_or_clone(sctx, path, key, found_clone);
> +
> +out:
> +	return ret;
> +}
> +
> +static int process_all_extents(struct send_ctx *sctx)
> +{
> +	int ret;
> +	struct btrfs_root *root;
> +	struct btrfs_path *path;
> +	struct btrfs_key key;
> +	struct btrfs_key found_key;
> +	struct extent_buffer *eb;
> +	int slot;
> +
> +	root = sctx->send_root;
> +	path = alloc_path_for_send();
> +	if (!path)
> +		return -ENOMEM;
> +
> +	key.objectid = sctx->cmp_key->objectid;
> +	key.type = BTRFS_EXTENT_DATA_KEY;
> +	key.offset = 0;
> +	while (1) {
> +		ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
> +		if (ret < 0)
> +			goto out;
> +		if (ret) {
> +			ret = 0;
> +			goto out;
> +		}
> +
> +		eb = path->nodes[0];
> +		slot = path->slots[0];
> +		btrfs_item_key_to_cpu(eb, &found_key, slot);
> +
> +		if (found_key.objectid != key.objectid ||
> +		    found_key.type != key.type) {
> +			ret = 0;
> +			goto out;
> +		}
> +
> +		ret = process_extent(sctx, path, &found_key);
> +		if (ret < 0)
> +			goto out;
> +
> +		btrfs_release_path(path);
> +		key.offset = found_key.offset + 1;
> +	}
> +
> +out:
> +	btrfs_free_path(path);
> +	return ret;
> +}
> +
> +static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end)
> +{
> +	int ret = 0;
> +
> +	if (sctx->cur_ino == 0)
> +		goto out;
> +	if (!at_end && sctx->cur_ino == sctx->cmp_key->objectid &&
> +	    sctx->cmp_key->type <= BTRFS_INODE_REF_KEY)
> +		goto out;
> +	if (list_empty(&sctx->new_refs) && list_empty(&sctx->deleted_refs))
> +		goto out;
> +
> +	ret = process_recorded_refs(sctx);
> +
> +out:
> +	return ret;
> +}
> +
> +static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
> +{
> +	int ret = 0;
> +	u64 left_mode;
> +	u64 left_uid;
> +	u64 left_gid;
> +	u64 right_mode;
> +	u64 right_uid;
> +	u64 right_gid;
> +	int need_chmod = 0;
> +	int need_chown = 0;
> +
> +	ret = process_recorded_refs_if_needed(sctx, at_end);
> +	if (ret < 0)
> +		goto out;
> +
> +	if (sctx->cur_ino == 0 || sctx->cur_inode_deleted)
> +		goto out;
> +	if (!at_end && sctx->cmp_key->objectid == sctx->cur_ino)
> +		goto out;
> +
> +	ret = get_inode_info(sctx->send_root, sctx->cur_ino, NULL, NULL,
> +			&left_mode, &left_uid, &left_gid);
> +	if (ret < 0)
> +		goto out;
> +
> +	if (!S_ISLNK(sctx->cur_inode_mode)) {
> +		if (!sctx->parent_root || sctx->cur_inode_new) {
> +			need_chmod = 1;
> +			need_chown = 1;
> +		} else {
> +			ret = get_inode_info(sctx->parent_root, sctx->cur_ino,
> +					NULL, NULL, &right_mode, &right_uid,
> +					&right_gid);
> +			if (ret < 0)
> +				goto out;
> +
> +			if (left_uid != right_uid || left_gid != right_gid)
> +				need_chown = 1;
> +			if (left_mode != right_mode)
> +				need_chmod = 1;
> +		}
> +	}
> +
> +	if (S_ISREG(sctx->cur_inode_mode)) {
> +		ret = send_truncate(sctx, sctx->cur_ino, sctx->cur_inode_gen,
> +				sctx->cur_inode_size);
> +		if (ret < 0)
> +			goto out;
> +	}
> +
> +	if (need_chown) {
> +		ret = send_chown(sctx, sctx->cur_ino, sctx->cur_inode_gen,
> +				left_uid, left_gid);
> +		if (ret < 0)
> +			goto out;
> +	}
> +	if (need_chmod) {
> +		ret = send_chmod(sctx, sctx->cur_ino, sctx->cur_inode_gen,
> +				left_mode);
> +		if (ret < 0)
> +			goto out;
> +	}
> +
> +	/*
> +	 * Need to send that every time, no matter if it actually changed
> +	 * between the two trees as we have done changes to the inode before.
> +	 */
> +	ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen);
> +	if (ret < 0)
> +		goto out;
> +
> +out:
> +	return ret;
> +}
> +
> +static int changed_inode(struct send_ctx *sctx,
> +			 enum btrfs_compare_tree_result result)
> +{
> +	int ret = 0;
> +	struct btrfs_key *key = sctx->cmp_key;
> +	struct btrfs_inode_item *left_ii = NULL;
> +	struct btrfs_inode_item *right_ii = NULL;
> +	u64 left_gen = 0;
> +	u64 right_gen = 0;
> +
> +	ret = close_cur_inode_file(sctx);
> +	if (ret < 0)
> +		goto out;
> +
> +	sctx->cur_ino = key->objectid;
> +	sctx->cur_inode_new_gen = 0;
> +	sctx->send_progress = sctx->cur_ino;
> +
> +	if (result == BTRFS_COMPARE_TREE_NEW ||
> +	    result == BTRFS_COMPARE_TREE_CHANGED) {
> +		left_ii = btrfs_item_ptr(sctx->left_path->nodes[0],
> +				sctx->left_path->slots[0],
> +				struct btrfs_inode_item);
> +		left_gen = btrfs_inode_generation(sctx->left_path->nodes[0],
> +				left_ii);
> +	} else {
> +		right_ii = btrfs_item_ptr(sctx->right_path->nodes[0],
> +				sctx->right_path->slots[0],
> +				struct btrfs_inode_item);
> +		right_gen = btrfs_inode_generation(sctx->right_path->nodes[0],
> +				right_ii);
> +	}
> +	if (result == BTRFS_COMPARE_TREE_CHANGED) {
> +		right_ii = btrfs_item_ptr(sctx->right_path->nodes[0],
> +				sctx->right_path->slots[0],
> +				struct btrfs_inode_item);
> +
> +		right_gen = btrfs_inode_generation(sctx->right_path->nodes[0],
> +				right_ii);
> +		if (left_gen != right_gen)
> +			sctx->cur_inode_new_gen = 1;
> +	}
> +
> +	if (result == BTRFS_COMPARE_TREE_NEW) {
> +		sctx->cur_inode_gen = left_gen;
> +		sctx->cur_inode_new = 1;
> +		sctx->cur_inode_deleted = 0;
> +		sctx->cur_inode_size = btrfs_inode_size(
> +				sctx->left_path->nodes[0], left_ii);
> +		sctx->cur_inode_mode = btrfs_inode_mode(
> +				sctx->left_path->nodes[0], left_ii);
> +		if (sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)
> +			ret = send_create_inode(sctx, sctx->left_path,
> +					sctx->cmp_key);
> +	} else if (result == BTRFS_COMPARE_TREE_DELETED) {
> +		sctx->cur_inode_gen = right_gen;
> +		sctx->cur_inode_new = 0;
> +		sctx->cur_inode_deleted = 1;
> +		sctx->cur_inode_size = btrfs_inode_size(
> +				sctx->right_path->nodes[0], right_ii);
> +		sctx->cur_inode_mode = btrfs_inode_mode(
> +				sctx->right_path->nodes[0], right_ii);
> +	} else if (result == BTRFS_COMPARE_TREE_CHANGED) {
> +		if (sctx->cur_inode_new_gen) {
> +			sctx->cur_inode_gen = right_gen;
> +			sctx->cur_inode_new = 0;
> +			sctx->cur_inode_deleted = 1;
> +			sctx->cur_inode_size = btrfs_inode_size(
> +					sctx->right_path->nodes[0], right_ii);
> +			sctx->cur_inode_mode = btrfs_inode_mode(
> +					sctx->right_path->nodes[0], right_ii);
> +			ret = process_all_refs(sctx,
> +					BTRFS_COMPARE_TREE_DELETED);
> +			if (ret < 0)
> +				goto out;
> +
> +			sctx->cur_inode_gen = left_gen;
> +			sctx->cur_inode_new = 1;
> +			sctx->cur_inode_deleted = 0;
> +			sctx->cur_inode_size = btrfs_inode_size(
> +					sctx->left_path->nodes[0], left_ii);
> +			sctx->cur_inode_mode = btrfs_inode_mode(
> +					sctx->left_path->nodes[0], left_ii);
> +			ret = send_create_inode(sctx, sctx->left_path,
> +					sctx->cmp_key);
> +			if (ret < 0)
> +				goto out;
> +
> +			ret = process_all_refs(sctx, BTRFS_COMPARE_TREE_NEW);
> +			if (ret < 0)
> +				goto out;
> +			ret = process_all_extents(sctx);
> +			if (ret < 0)
> +				goto out;
> +			ret = process_all_new_xattrs(sctx);
> +			if (ret < 0)
> +				goto out;
> +		} else {
> +			sctx->cur_inode_gen = left_gen;
> +			sctx->cur_inode_new = 0;
> +			sctx->cur_inode_new_gen = 0;
> +			sctx->cur_inode_deleted = 0;
> +			sctx->cur_inode_size = btrfs_inode_size(
> +					sctx->left_path->nodes[0], left_ii);
> +			sctx->cur_inode_mode = btrfs_inode_mode(
> +					sctx->left_path->nodes[0], left_ii);
> +		}
> +	}
> +
> +out:
> +	return ret;
> +}
> +
> +static int changed_ref(struct send_ctx *sctx,
> +		       enum btrfs_compare_tree_result result)
> +{
> +	int ret = 0;
> +
> +	BUG_ON(sctx->cur_ino != sctx->cmp_key->objectid);
> +
> +	if (!sctx->cur_inode_new_gen &&
> +	    sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) {
> +		if (result == BTRFS_COMPARE_TREE_NEW)
> +			ret = record_new_ref(sctx);
> +		else if (result == BTRFS_COMPARE_TREE_DELETED)
> +			ret = record_deleted_ref(sctx);
> +		else if (result == BTRFS_COMPARE_TREE_CHANGED)
> +			ret = record_changed_ref(sctx);
> +	}
> +
> +	return ret;
> +}
> +
> +static int changed_xattr(struct send_ctx *sctx,
> +			 enum btrfs_compare_tree_result result)
> +{
> +	int ret = 0;
> +
> +	BUG_ON(sctx->cur_ino != sctx->cmp_key->objectid);
> +
> +	if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
> +		if (result == BTRFS_COMPARE_TREE_NEW)
> +			ret = process_new_xattr(sctx);
> +		else if (result == BTRFS_COMPARE_TREE_DELETED)
> +			ret = process_deleted_xattr(sctx);
> +		else if (result == BTRFS_COMPARE_TREE_CHANGED)
> +			ret = process_changed_xattr(sctx);
> +	}
> +
> +	return ret;
> +}
> +
> +static int changed_extent(struct send_ctx *sctx,
> +			  enum btrfs_compare_tree_result result)
> +{
> +	int ret = 0;
> +
> +	BUG_ON(sctx->cur_ino != sctx->cmp_key->objectid);
> +
> +	if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
> +		if (result != BTRFS_COMPARE_TREE_DELETED)
> +			ret = process_extent(sctx, sctx->left_path,
> +					sctx->cmp_key);
> +	}
> +
> +	return ret;
> +}
> +
> +
> +static int changed_cb(struct btrfs_root *left_root,
> +		      struct btrfs_root *right_root,
> +		      struct btrfs_path *left_path,
> +		      struct btrfs_path *right_path,
> +		      struct btrfs_key *key,
> +		      enum btrfs_compare_tree_result result,
> +		      void *ctx)
> +{
> +	int ret = 0;
> +	struct send_ctx *sctx = ctx;
> +
> +	sctx->left_path = left_path;
> +	sctx->right_path = right_path;
> +	sctx->cmp_key = key;
> +
> +	ret = finish_inode_if_needed(sctx, 0);
> +	if (ret < 0)
> +		goto out;
> +
> +	if (key->type == BTRFS_INODE_ITEM_KEY)
> +		ret = changed_inode(sctx, result);
> +	else if (key->type == BTRFS_INODE_REF_KEY)
> +		ret = changed_ref(sctx, result);
> +	else if (key->type == BTRFS_XATTR_ITEM_KEY)
> +		ret = changed_xattr(sctx, result);
> +	else if (key->type == BTRFS_EXTENT_DATA_KEY)
> +		ret = changed_extent(sctx, result);
> +
> +out:
> +	return ret;
> +}
> +
> +static int full_send_tree(struct send_ctx *sctx)
> +{
> +	int ret;
> +	struct btrfs_trans_handle *trans = NULL;
> +	struct btrfs_root *send_root = sctx->send_root;
> +	struct btrfs_key key;
> +	struct btrfs_key found_key;
> +	struct btrfs_path *path;
> +	struct extent_buffer *eb;
> +	int slot;
> +	u64 start_ctransid;
> +	u64 ctransid;
> +
> +	path = alloc_path_for_send();
> +	if (!path)
> +		return -ENOMEM;
> +
> +	spin_lock(&send_root->root_times_lock);
> +	start_ctransid = btrfs_root_ctransid(&send_root->root_item);
> +	spin_unlock(&send_root->root_times_lock);
> +
> +	key.objectid = BTRFS_FIRST_FREE_OBJECTID;
> +	key.type = BTRFS_INODE_ITEM_KEY;
> +	key.offset = 0;
> +
> +join_trans:
> +	/*
> +	 * We need to make sure the transaction does not get committed
> +	 * while we do anything on commit roots. Join a transaction to prevent
> +	 * this.
> +	 */
> +	trans = btrfs_join_transaction(send_root);
> +	if (IS_ERR(trans)) {
> +		ret = PTR_ERR(trans);
> +		trans = NULL;
> +		goto out;
> +	}
> +
> +	/*
> +	 * Make sure the tree has not changed
> +	 */
> +	spin_lock(&send_root->root_times_lock);
> +	ctransid = btrfs_root_ctransid(&send_root->root_item);
> +	spin_unlock(&send_root->root_times_lock);
> +
> +	if (ctransid != start_ctransid) {
> +		WARN(1, KERN_WARNING "btrfs: the root that you're trying to "
> +				     "send was modified in between. This is "
> +				     "probably a bug.\n");

What is the purpose of getting the ctransid outside the
transaction anyway?

> +		ret = -EIO;
> +		goto out;
> +	}
> +
> +	ret = btrfs_search_slot_for_read(send_root, &key, path, 1, 0);
> +	if (ret < 0)
> +		goto out;
> +	if (ret)
> +		goto out_finish;
> +
> +	while (1) {
> +		/*
> +		 * When someone want to commit while we iterate, end the
> +		 * joined transaction and rejoin.
> +		 */
> +		if (btrfs_should_end_transaction(trans, send_root)) {
> +			ret = btrfs_end_transaction(trans, send_root);
> +			trans = NULL;
> +			if (ret < 0)
> +				goto out;
> +			btrfs_release_path(path);
> +			goto join_trans;
> +		}
> +
> +		eb = path->nodes[0];
> +		slot = path->slots[0];
> +		btrfs_item_key_to_cpu(eb, &found_key, slot);
> +
> +		ret = changed_cb(send_root, NULL, path, NULL,
> +				&found_key, BTRFS_COMPARE_TREE_NEW, sctx);
> +		if (ret < 0)
> +			goto out;
> +
> +		key.objectid = found_key.objectid;
> +		key.type = found_key.type;
> +		key.offset = found_key.offset + 1;

shouldn't this just be before the goto join_trans?

> +
> +		ret = btrfs_next_item(send_root, path);
> +		if (ret < 0)
> +			goto out;
> +		if (ret) {
> +			ret  = 0;
> +			break;
> +		}
> +	}
> +
> +out_finish:
> +	ret = finish_inode_if_needed(sctx, 1);
> +
> +out:
> +	btrfs_free_path(path);
> +	if (trans) {
> +		if (!ret)
> +			ret = btrfs_end_transaction(trans, send_root);
> +		else
> +			btrfs_end_transaction(trans, send_root);
> +	}
> +	return ret;
> +}
> +
> +static int send_subvol(struct send_ctx *sctx)
> +{
> +	int ret;
> +
> +	ret = send_header(sctx);
> +	if (ret < 0)
> +		goto out;
> +
> +	ret = send_subvol_begin(sctx);
> +	if (ret < 0)
> +		goto out;
> +
> +	if (sctx->parent_root) {
> +		ret = btrfs_compare_trees(sctx->send_root, sctx->parent_root,
> +				changed_cb, sctx);
> +		if (ret < 0)
> +			goto out;
> +		ret = finish_inode_if_needed(sctx, 1);
> +		if (ret < 0)
> +			goto out;
> +	} else {
> +		ret = full_send_tree(sctx);
> +		if (ret < 0)
> +			goto out;
> +	}
> +
> +out:
> +	if (!ret)
> +		ret = close_cur_inode_file(sctx);
> +	else
> +		close_cur_inode_file(sctx);
> +
> +	free_recorded_refs(sctx);
> +	return ret;
> +}
> +
> +long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
> +{
> +	int ret = 0;
> +	struct btrfs_root *send_root;
> +	struct btrfs_root *clone_root;
> +	struct btrfs_fs_info *fs_info;
> +	struct btrfs_ioctl_send_args *arg = NULL;
> +	struct btrfs_key key;
> +	struct file *filp = NULL;
> +	struct send_ctx *sctx = NULL;
> +	u32 i;
> +	u64 *clone_sources_tmp = NULL;
> +
> +	if (!capable(CAP_SYS_ADMIN))
> +		return -EPERM;
> +
> +	send_root = BTRFS_I(fdentry(mnt_file)->d_inode)->root;
> +	fs_info = send_root->fs_info;
> +
> +	arg = memdup_user(arg_, sizeof(*arg));
> +	if (IS_ERR(arg)) {
> +		ret = PTR_ERR(arg);
> +		arg = NULL;
> +		goto out;
> +	}
> +
> +	if (!access_ok(VERIFY_READ, arg->clone_sources,
> +			sizeof(*arg->clone_sources *
> +			arg->clone_sources_count))) {
> +		ret = -EFAULT;
> +		goto out;
> +	}
> +
> +	sctx = kzalloc(sizeof(struct send_ctx), GFP_NOFS);
> +	if (!sctx) {
> +		ret = -ENOMEM;
> +		goto out;
> +	}
> +
> +	INIT_LIST_HEAD(&sctx->new_refs);
> +	INIT_LIST_HEAD(&sctx->deleted_refs);
> +	INIT_RADIX_TREE(&sctx->name_cache, GFP_NOFS);
> +	INIT_LIST_HEAD(&sctx->name_cache_list);
> +
> +	sctx->send_filp = fget(arg->send_fd);
> +	if (IS_ERR(sctx->send_filp)) {
> +		ret = PTR_ERR(sctx->send_filp);
> +		goto out;
> +	}
> +
> +	sctx->mnt = mnt_file->f_path.mnt;
> +
> +	sctx->send_root = send_root;
> +	sctx->clone_roots_cnt = arg->clone_sources_count;
> +
> +	sctx->send_max_size = BTRFS_SEND_BUF_SIZE;
> +	sctx->send_buf = vmalloc(sctx->send_max_size);
> +	if (!sctx->send_buf) {
> +		ret = -ENOMEM;
> +		goto out;
> +	}
> +
> +	sctx->read_buf = vmalloc(BTRFS_SEND_READ_SIZE);
> +	if (!sctx->read_buf) {
> +		ret = -ENOMEM;
> +		goto out;
> +	}
> +
> +	sctx->clone_roots = vzalloc(sizeof(struct clone_root) *
> +			(arg->clone_sources_count + 1));
> +	if (!sctx->clone_roots) {
> +		ret = -ENOMEM;
> +		goto out;
> +	}
> +
> +	if (arg->clone_sources_count) {
> +		clone_sources_tmp = vmalloc(arg->clone_sources_count *
> +				sizeof(*arg->clone_sources));
> +		if (!clone_sources_tmp) {
> +			ret = -ENOMEM;
> +			goto out;
> +		}
> +
> +		ret = copy_from_user(clone_sources_tmp, arg->clone_sources,
> +				arg->clone_sources_count *
> +				sizeof(*arg->clone_sources));
> +		if (ret) {
> +			ret = -EFAULT;
> +			goto out;
> +		}
> +
> +		for (i = 0; i < arg->clone_sources_count; i++) {
> +			key.objectid = clone_sources_tmp[i];
> +			key.type = BTRFS_ROOT_ITEM_KEY;
> +			key.offset = (u64)-1;
> +			clone_root = btrfs_read_fs_root_no_name(fs_info, &key);
> +			if (!clone_root) {
> +				ret = -EINVAL;
> +				goto out;
> +			}
> +			if (IS_ERR(clone_root)) {
> +				ret = PTR_ERR(clone_root);
> +				goto out;
> +			}
> +			sctx->clone_roots[i].root = clone_root;
> +		}
> +		vfree(clone_sources_tmp);
> +		clone_sources_tmp = NULL;
> +	}
> +
> +	if (arg->parent_root) {
> +		key.objectid = arg->parent_root;
> +		key.type = BTRFS_ROOT_ITEM_KEY;
> +		key.offset = (u64)-1;
> +		sctx->parent_root = btrfs_read_fs_root_no_name(fs_info, &key);
> +		if (!sctx->parent_root) {
> +			ret = -EINVAL;
> +			goto out;
> +		}
> +	}
> +
> +	/*
> +	 * Clones from send_root are allowed, but only if the clone source
> +	 * is behind the current send position. This is checked while searching
> +	 * for possible clone sources.
> +	 */
> +	sctx->clone_roots[sctx->clone_roots_cnt++].root = sctx->send_root;
> +
> +	/* We do a bsearch later */
> +	sort(sctx->clone_roots, sctx->clone_roots_cnt,
> +			sizeof(*sctx->clone_roots), __clone_root_cmp_sort,
> +			NULL);
> +
> +	ret = send_subvol(sctx);
> +	if (ret < 0)
> +		goto out;
> +
> +	ret = begin_cmd(sctx, BTRFS_SEND_C_END);
> +	if (ret < 0)
> +		goto out;
> +	ret = send_cmd(sctx);
> +	if (ret < 0)
> +		goto out;
> +
> +out:
> +	if (filp)
> +		fput(filp);
> +	kfree(arg);
> +	vfree(clone_sources_tmp);
> +
> +	if (sctx) {
> +		if (sctx->send_filp)
> +			fput(sctx->send_filp);
> +
> +		vfree(sctx->clone_roots);
> +		vfree(sctx->send_buf);
> +		vfree(sctx->read_buf);
> +
> +		name_cache_free(sctx);
> +
> +		kfree(sctx);
> +	}
> +
> +	return ret;
> +}
> diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h
> index a4c23ee..53f8ee7 100644
> --- a/fs/btrfs/send.h
> +++ b/fs/btrfs/send.h
> @@ -124,3 +124,7 @@ enum {
>  	__BTRFS_SEND_A_MAX,
>  };
>  #define BTRFS_SEND_A_MAX (__BTRFS_SEND_A_MAX - 1)
> +
> +#ifdef __KERNEL__
> +long btrfs_ioctl_send(struct file *mnt_file, void __user *arg);
> +#endif

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alex Lyakas July 23, 2012, 3:17 p.m. UTC | #3
Hi Alexander,
I did some testing of the case where same inode, but with a different
generation, exists both in send_root and in parent_root.
I know that this can happen primarily when "inode_cache" option is
enabled. So first I just tested some differential sends, where parent
and root are unrelated subvolumes. Here are some issues:

1) The top subvolume inode (ino=BTRFS_FIRST_FREE_OBJECTID) is also
treated as deleted + recreated. So the code goes into process_all_refs()
path and does several strange things, such as trying to orphanize the
top inode. Also get_cur_path() always returns "" for the top subvolume
(without checking whether it is an orphan).  Another complication for
the top inode is that its parent dir is itself.
I made the following fix:
@@ -3782,7 +3972,13 @@ static int changed_inode(struct send_ctx *sctx,

                right_gen = btrfs_inode_generation(sctx->right_path->nodes[0],
                                right_ii);
-               if (left_gen != right_gen)
+               if (left_gen != right_gen && sctx->cur_ino !=
BTRFS_FIRST_FREE_OBJECTID)
                        sctx->cur_inode_new_gen = 1;

So basically, don't try to delete and re-create it, but treat it like
a change. Since the top subvolume inode is S_IFDIR, and dir can have
only one hardlink (and hopefully it is always ".."), we will never
need to change anything for this INODE_REF. I also added:

@@ -2526,6 +2615,14 @@ static int process_recorded_refs(struct send_ctx *sctx)
        int did_overwrite = 0;
        int is_orphan = 0;

+       BUG_ON(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID);

2) After I fixed this, I hit another issue, where inodes under the top
subvolume dir, attempt to rmdir() the top dir, while iterating over
check_dirs in process_recorded_refs(), because (top_dir_ino,
top_dir_gen) indicate that it was deleted. So I added:

@@ -2714,10 +2857,19 @@ verbose_printk("btrfs: process_recorded_refs
%llu\n", sctx->cur_ino);
         */
        ULIST_ITER_INIT(&uit);
        while ((un = ulist_next(check_dirs, &uit))) {
+               /* Do not attempt to rmdir the top subvolume dir */
+               if (un->val == BTRFS_FIRST_FREE_OBJECTID)
+                       continue;
+
                if (un->val > sctx->cur_ino)
                        continue;

3) process_recorded_refs() always increments the send_progress:
	/*
	 * Current inode is now at it's new position, so we must increase
	 * send_progress
	 */
	sctx->send_progress = sctx->cur_ino + 1;

However, in the changed_inode() path I am testing, process_all_refs()
is called twice with the same inode (once for deleted inode, once for
the recreated inode), so after the first call, send_progress is
incremented and doesn't match the inode anymore. I don't think I hit
any issues because of this, just that it's confusing.

4)

> +/*
> + * Record and process all refs at once. Needed when an inode changes the
> + * generation number, which means that it was deleted and recreated.
> + */
> +static int process_all_refs(struct send_ctx *sctx,
> +                           enum btrfs_compare_tree_result cmd)
> +{
> +       int ret;
> +       struct btrfs_root *root;
> +       struct btrfs_path *path;
> +       struct btrfs_key key;
> +       struct btrfs_key found_key;
> +       struct extent_buffer *eb;
> +       int slot;
> +       iterate_inode_ref_t cb;
> +
> +       path = alloc_path_for_send();
> +       if (!path)
> +               return -ENOMEM;
> +
> +       if (cmd == BTRFS_COMPARE_TREE_NEW) {
> +               root = sctx->send_root;
> +               cb = __record_new_ref;
> +       } else if (cmd == BTRFS_COMPARE_TREE_DELETED) {
> +               root = sctx->parent_root;
> +               cb = __record_deleted_ref;
> +       } else {
> +               BUG();
> +       }
> +
> +       key.objectid = sctx->cmp_key->objectid;
> +       key.type = BTRFS_INODE_REF_KEY;
> +       key.offset = 0;
> +       while (1) {
> +               ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
> +               if (ret < 0) {
> +                       btrfs_release_path(path);
> +                       goto out;
> +               }
> +               if (ret) {
> +                       btrfs_release_path(path);
> +                       break;
> +               }
> +
> +               eb = path->nodes[0];
> +               slot = path->slots[0];
> +               btrfs_item_key_to_cpu(eb, &found_key, slot);
> +
> +               if (found_key.objectid != key.objectid ||
> +                   found_key.type != key.type) {
> +                       btrfs_release_path(path);
> +                       break;
> +               }
> +
> +               ret = iterate_inode_ref(sctx, sctx->parent_root, path,
> +                               &found_key, 0, cb, sctx);

Shouldn't it be the root that you calculated eariler and not
sctx->parent_root? I guess in this case it doesn't matter, because
"resolve" is 0, and the passed root is only used for resolve. But
still confusing.

5) When I started testing with "inode_cache" enabled, I hit another
issue. When this mount option is enabled, then FREE_INO and FREE_SPACE
items now appear in the file tree. As a result, the code tries to
create the FREE_INO item with an orphan name, then tries to find its
INODE_REF, but fails because it has no INODE_REFs. So

@@ -3923,6 +4127,13 @@ static int changed_cb(struct btrfs_root *left_root,
        int ret = 0;
        struct send_ctx *sctx = ctx;

+       /* Ignore non-FS objects */
+       if (key->objectid == BTRFS_FREE_INO_OBJECTID ||
+               key->objectid == BTRFS_FREE_SPACE_OBJECTID)
+               return 0;

makes sense?

Thanks,
Alex.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alex Lyakas July 23, 2012, 3:28 p.m. UTC | #4
Hi Arne,

(pls don't take this as if I pretend to have understood the code
better than you, because I have a list of questions for Alexander
too).

>> +/*
>> + * Helper function to generate a file name that is unique in the root of
>> + * send_root and parent_root. This is used to generate names for orphan inodes.
>> + */
>> +static int gen_unique_name(struct send_ctx *sctx,
>> +                        u64 ino, u64 gen,
>> +                        struct fs_path *dest)
>> +{
>> +     int ret = 0;
>> +     struct btrfs_path *path;
>> +     struct btrfs_dir_item *di;
>> +     char tmp[64];
>> +     int len;
>> +     u64 idx = 0;
>> +
>> +     path = alloc_path_for_send();
>> +     if (!path)
>> +             return -ENOMEM;
>> +
>> +     while (1) {
>> +             len = snprintf(tmp, sizeof(tmp) - 1, "o%llu-%llu-%llu",
>> +                             ino, gen, idx);
>
> wouldn't it be easier to just take a uuid? This would save you a lot
> of code and especially the need to verify that the name is really
> unique, saving seeks.

As far as I understand the logic of orphans, the unique name should
depend only on the send_root and parent_root contents, which are both
frozen. So when you re-generate this name for a particular (ino,gen),
you must receive the same exact name every time. If the user has kind
of oXXX-YY-Z file(s) in the top dir by accident, then they are the
same every time we recalculate the orhpan name, so we get the same
result every time. Does it make sense?
So did you mean to generate a uuid here, and save it somewhere
in-memory, and later look it up based on (ino,gen)? Or you mean some
other improvement?

Thanks,
Alex.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexander Block July 25, 2012, 1:37 p.m. UTC | #5
On Tue, Jul 10, 2012 at 5:26 PM, Alex Lyakas
<alex.bolshoy.btrfs@gmail.com> wrote:
> Alexander,
> this focuses on area of sending file extents:
>
>> +static int is_extent_unchanged(struct send_ctx *sctx,
>> +                              struct btrfs_path *left_path,
>> +                              struct btrfs_key *ekey)
>> +{
>> +       int ret = 0;
>> +       struct btrfs_key key;
>> +       struct btrfs_path *path = NULL;
>> +       struct extent_buffer *eb;
>> +       int slot;
>> +       struct btrfs_key found_key;
>> +       struct btrfs_file_extent_item *ei;
>> +       u64 left_disknr;
>> +       u64 right_disknr;
>> +       u64 left_offset;
>> +       u64 right_offset;
>> +       u64 left_len;
>> +       u64 right_len;
>> +       u8 left_type;
>> +       u8 right_type;
>> +
>> +       path = alloc_path_for_send();
>> +       if (!path)
>> +               return -ENOMEM;
>> +
>> +       eb = left_path->nodes[0];
>> +       slot = left_path->slots[0];
>> +
>> +       ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
>> +       left_type = btrfs_file_extent_type(eb, ei);
>> +       left_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
>> +       left_len = btrfs_file_extent_num_bytes(eb, ei);
>> +       left_offset = btrfs_file_extent_offset(eb, ei);
>> +
>> +       if (left_type != BTRFS_FILE_EXTENT_REG) {
>> +               ret = 0;
>> +               goto out;
>> +       }
>> +
>> +       key.objectid = ekey->objectid;
>> +       key.type = BTRFS_EXTENT_DATA_KEY;
>> +       key.offset = ekey->offset;
>> +
>> +       while (1) {
>> +               ret = btrfs_search_slot_for_read(sctx->parent_root, &key, path,
>> +                               0, 0);
>> +               if (ret < 0)
>> +                       goto out;
>> +               if (ret) {
>> +                       ret = 0;
>> +                       goto out;
>> +               }
>> +               btrfs_item_key_to_cpu(path->nodes[0], &found_key,
>> +                               path->slots[0]);
>> +               if (found_key.objectid != key.objectid ||
>> +                   found_key.type != key.type) {
>> +                       ret = 0;
>> +                       goto out;
>> +               }
>> +
>> +               eb = path->nodes[0];
>> +               slot = path->slots[0];
>> +
>> +               ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
>> +               right_type = btrfs_file_extent_type(eb, ei);
>> +               right_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
>> +               right_len = btrfs_file_extent_num_bytes(eb, ei);
>> +               right_offset = btrfs_file_extent_offset(eb, ei);
>> +               btrfs_release_path(path);
>> +
>> +               if (right_type != BTRFS_FILE_EXTENT_REG) {
>> +                       ret = 0;
>> +                       goto out;
>> +               }
>> +
>> +               if (left_disknr != right_disknr) {
>> +                       ret = 0;
>> +                       goto out;
>> +               }
>> +
>> +               key.offset = found_key.offset + right_len;
>> +               if (key.offset >= ekey->offset + left_len) {
>> +                       ret = 1;
>> +                       goto out;
>> +               }
>> +       }
>> +
>> +out:
>> +       btrfs_free_path(path);
>> +       return ret;
>> +}
>> +
>
> Should we always treat left extent with bytenr==0 as not changed?
No, as we may have bytenr!=0 on the right side.
> Because right now, it simply reads and sends data of such extent,
> while bytenr==0 means "no data allocated here". Since we always do
> send_truncate() afterwards, file size will always be correct, so we
> can just skip bytenr==0 extents.
This is something that could be done for full sends only. Full sends
however do not call is_extent_unchanged, so the optimization is
something for process_extent.
In the incremental case, it may happen that left_disknr==0 and
right_disknr!=0 or vice versa, so we need to do the compare no matter
if one of them is ==0. process_extents could then again do some
optimization and send a special command to instruct a preallocated
zero block.
> Same is true for BTRFS_FILE_EXTENT_PREALLOC extents, I think. Those
> also don't contain real data.
> So something like:
> if (left_disknr == 0 || left_type == BTRFS_FILE_EXTENT_REG) {
>         ret = 1;
>         goto out;
> }
Do you mean "|| left_type == BTRFS_FILE_EXTENT_PREALLOC"?
> before we check for BTRFS_FILE_EXTENT_REG.
>
> Now I have a question about the rest of the logic that decides that
> extent is unchanged. I understand that if we see the same extent (same
> disk_bytenr) shared between parent_root and send_root, then it must
> contain the same data, even in nodatacow mode, because on a first
> write to such shared extent, it is cow'ed even with nodatacow.
>
> However, shouldn't we check btrfs_file_extent_offset(), to make sure
> that both send_root and parent_root point at the same offset into
> extent from the same file offset? Because if extent_offset values are
> different, then the data of the file might different, even though we
> are talking about the same extent.
>
> So I am thinking about something like:
>
> - ekey.offset points at data at logical address
> left_disknr+left_offset (logical address within CHUNK_ITEM address
> space) for left_len bytes
> - found_key.offset points at data at logical address
> right_disknr+right_offset for right_len
> - we know that found_key.offset <= ekey.offset
>
> So we need to ensure that left_disknr==right_disknr and also:
> right_disknr+right_offset + (ekey.offset - found_key.offset) ==
> left_disknr+left_offset
> or does this while loop somehow ensures this equation?
Ay...you're absolutely right :) Fixed that and pushing later today.
>
> However, I must admit I don't fully understand the logic behind
> deciding that extent is unchanged. Can you pls explain what this tries
> to accomplish, and why it decides that extent is unchanged here:
> key.offset = found_key.offset + right_len;
This line is to advance to the next extent on the right side.
> if (key.offset >= ekey->offset + left_len) {
>         ret = 1;
>         goto out;
> }
This if checks if the advancing would put us behind the left extent.
If that is true, we're done with the extent that we're checking now.
As we did not bail out before, we know that the extent is unchanged.
>
> Also: when searching for the next extent, should we use
> btrfs_file_extent_num_bytes() or btrfs_file_extent_disk_num_bytes()?
> They are not equal sometimes...not sure at which offset the next
> extent (if any) should be. What about holes in files? Then we will
> have non-consecutive offsets.
We have to use num_bytes, as it is the *uncompressed* number of bytes.
We're working on file extents, and their offsets are always
uncompressed. Also, num_bytes is the number of bytes after splitting,
while disk_num_bytes is always the size of the whole extent on disk.

I have changed the way I iterate the extents now. I use
btrfs_next_item instead of advancing key.offset now. Also, I have
added some ASCII graphics to illustrate what happens. I hope that
helps understanding this. Will push that later today.
>
> Thanks,
> Alex.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alex Lyakas July 25, 2012, 5:20 p.m. UTC | #6
Alexander,

>> Same is true for BTRFS_FILE_EXTENT_PREALLOC extents, I think. Those
>> also don't contain real data.
>> So something like:
>> if (left_disknr == 0 || left_type == BTRFS_FILE_EXTENT_REG) {
>>         ret = 1;
>>         goto out;
>> }
> Do you mean "|| left_type == BTRFS_FILE_EXTENT_PREALLOC"?

I see your point about bytenr==0, I missed that on the parent tree it
can be something else.

As for PREALLOC: can it happen that on differential send we see extent
of type BTRFS_FILE_EXTENT_PREALLOC? And can it happen that parent had
some real data extent in that place? I don't know the answer, but if
yes, then we must treat PREALLOC as normal extent. So this case is
similar to bytenr==0.

Thanks,
Alex.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexander Block July 25, 2012, 5:41 p.m. UTC | #7
On Wed, Jul 25, 2012 at 7:20 PM, Alex Lyakas
<alex.bolshoy.btrfs@gmail.com> wrote:
> Alexander,
>
>>> Same is true for BTRFS_FILE_EXTENT_PREALLOC extents, I think. Those
>>> also don't contain real data.
>>> So something like:
>>> if (left_disknr == 0 || left_type == BTRFS_FILE_EXTENT_REG) {
>>>         ret = 1;
>>>         goto out;
>>> }
>> Do you mean "|| left_type == BTRFS_FILE_EXTENT_PREALLOC"?
>
> I see your point about bytenr==0, I missed that on the parent tree it
> can be something else.
>
> As for PREALLOC: can it happen that on differential send we see extent
> of type BTRFS_FILE_EXTENT_PREALLOC? And can it happen that parent had
> some real data extent in that place? I don't know the answer, but if
> yes, then we must treat PREALLOC as normal extent. So this case is
> similar to bytenr==0.
>
I also don't know if that may happen. Currently, only REG extents are
checked by is_extent_unchanged. All other types are regarded as
changed and will be sent. So in the worst case the stream gets larget
then it should be, but we won't loose data. I need to leave in a few
minutes and will continue working on btrfs send/receive v2 later
today. We should probably postpone "optimizations" (actually bug
fixing) here for later...don't know if I find enough time to
investigate more.

> Thanks,
> Alex.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexander Block July 28, 2012, 1:49 p.m. UTC | #8
On 07/23/2012 01:16 PM, Arne Jansen wrote:
> This is a first review run. I ask for more comments in several places.
> Maybe these comments can help to dive deeper into a functional review
> in a second run.
> I'd really appreciate it if you could write a few pages about the
> concepts how you decide what to send and when.
> It seems there's still a lot of headroom for performance optimizations
> cpu/seek-wise.
I started to document stuff in
http://btrfs.wiki.kernel.org/index.php/Btrfs_Send/Receive
There is also a collection for optimizations that I have in mind for later.
> All in all I really like this work.
>
> On 04.07.2012 15:38, Alexander Block wrote:
>> This is the second part of the splitted BTRFS_IOC_SEND patch which
>> contains the actual send logic.
>>
>> Signed-off-by: Alexander Block<ablock84@googlemail.com>
>> ---
>>   fs/btrfs/ioctl.c |    3 +
>>   fs/btrfs/send.c  | 3246 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>   fs/btrfs/send.h  |    4 +
>>   3 files changed, 3253 insertions(+)
>>
>> diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
>> index 8d258cb..9173867 100644
>> --- a/fs/btrfs/ioctl.c
>> +++ b/fs/btrfs/ioctl.c
>> @@ -54,6 +54,7 @@
>>   #include "inode-map.h"
>>   #include "backref.h"
>>   #include "rcu-string.h"
>> +#include "send.h"
>>
>>   /* Mask out flags that are inappropriate for the given type of inode. */
>>   static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
>> @@ -3567,6 +3568,8 @@ long btrfs_ioctl(struct file *file, unsigned int
>>   		return btrfs_ioctl_balance_progress(root, argp);
>>   	case BTRFS_IOC_SET_RECEIVED_SUBVOL:
>>   		return btrfs_ioctl_set_received_subvol(file, argp);
>> +	case BTRFS_IOC_SEND:
>> +		return btrfs_ioctl_send(file, argp);
>>   	case BTRFS_IOC_GET_DEV_STATS:
>>   		return btrfs_ioctl_get_dev_stats(root, argp, 0);
>>   	case BTRFS_IOC_GET_AND_RESET_DEV_STATS:
>> diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
>> index 47a2557..4d3fcfc 100644
>> --- a/fs/btrfs/send.c
>> +++ b/fs/btrfs/send.c
>> @@ -1007,3 +1007,3249 @@ out:
>>   	return ret;
>>   }
>>
>> +struct backref_ctx {
>> +	struct send_ctx *sctx;
>> +
>> +	/* number of total found references */
>> +	u64 found;
>> +
>> +	/*
>> +	 * used for clones found in send_root. clones found behind cur_objectid
>> +	 * and cur_offset are not considered as allowed clones.
>> +	 */
>> +	u64 cur_objectid;
>> +	u64 cur_offset;
>> +
>> +	/* may be truncated in case it's the last extent in a file */
>> +	u64 extent_len;
>> +
>> +	/* Just to check for bugs in backref resolving */
>> +	int found_in_send_root;
>> +};
>> +
>> +static int __clone_root_cmp_bsearch(const void *key, const void *elt)
>> +{
>> +	u64 root = (u64)key;
>> +	struct clone_root *cr = (struct clone_root *)elt;
>> +
>> +	if (root<  cr->root->objectid)
>> +		return -1;
>> +	if (root>  cr->root->objectid)
>> +		return 1;
>> +	return 0;
>> +}
>> +
>> +static int __clone_root_cmp_sort(const void *e1, const void *e2)
>> +{
>> +	struct clone_root *cr1 = (struct clone_root *)e1;
>> +	struct clone_root *cr2 = (struct clone_root *)e2;
>> +
>> +	if (cr1->root->objectid<  cr2->root->objectid)
>> +		return -1;
>> +	if (cr1->root->objectid>  cr2->root->objectid)
>> +		return 1;
>> +	return 0;
>> +}
>> +
>> +/*
>> + * Called for every backref that is found for the current extent.
>
> Comment: results are collected in sctx->clone_roots->ino/offset/found_refs
>
>> + */
>> +static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
>> +{
>> +	struct backref_ctx *bctx = ctx_;
>> +	struct clone_root *found;
>> +	int ret;
>> +	u64 i_size;
>> +
>> +	/* First check if the root is in the list of accepted clone sources */
>> +	found = bsearch((void *)root, bctx->sctx->clone_roots,
>> +			bctx->sctx->clone_roots_cnt,
>> +			sizeof(struct clone_root),
>> +			__clone_root_cmp_bsearch);
>> +	if (!found)
>> +		return 0;
>> +
>> +	if (found->root == bctx->sctx->send_root&&
>> +	    ino == bctx->cur_objectid&&
>> +	    offset == bctx->cur_offset) {
>> +		bctx->found_in_send_root = 1;
>
> found_in_send_root_and_cur_ino_offset?
I renamed it to found_itself. Hope that's more clear.
>
>> +	}
>> +
>> +	/*
>> +	 * There are inodes that have extents that lie behind it's i_size. Don't
>                                                                its
Fixed.
>> +	 * accept clones from these extents.
>> +	 */
>> +	ret = get_inode_info(found->root, ino,&i_size, NULL, NULL, NULL, NULL);
>> +	if (ret<  0)
>> +		return ret;
>> +
>> +	if (offset + bctx->extent_len>  i_size)
>> +		return 0;
>> +
>> +	/*
>> +	 * Make sure we don't consider clones from send_root that are
>> +	 * behind the current inode/offset.
>> +	 */
>> +	if (found->root == bctx->sctx->send_root) {
>> +		/*
>> +		 * TODO for the moment we don't accept clones from the inode
>> +		 * that is currently send. We may change this when
>> +		 * BTRFS_IOC_CLONE_RANGE supports cloning from and to the same
>> +		 * file.
>> +		 */
>> +		if (ino>= bctx->cur_objectid)
>> +			return 0;
>> +		/*if (ino>  ctx->cur_objectid)
>> +			return 0;
>> +		if (offset + ctx->extent_len>  ctx->cur_offset)
>> +			return 0;*/
>
> #if 0 ... #else ... #endif
Fixed.
>
>> +
>> +		bctx->found++;
>> +		found->found_refs++;
>> +		found->ino = ino;
>> +		found->offset = offset;
>
> only the last ino is kept?
>
I removed that return path. Now the code below the if handles that too.
>> +		return 0;
>> +	}
>> +
>> +	bctx->found++;
>> +	found->found_refs++;
>> +	if (ino<  found->ino) {
>> +		found->ino = ino;
>> +		found->offset = offset;
>
> whereas here only the lowest ino is kept. Why?
>
I take the lowest because then we have the same file as clone source 
every time when extents got cloned multiple times.
>> +	} else if (found->ino == ino) {
>> +		/*
>> +		 * same extent found more then once in the same file.
>> +		 */
>> +		if (found->offset>  offset + bctx->extent_len)
>> +			found->offset = offset;
>
> This is unclear to me. Seems to mean something like
> 'find the lowest offset', but not exactly. Some explaination
> would be good.
Hmm...I don't remember why it was needed. I remember that I added it 
later for some reason but can't see now why.
>
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>> +/*
>> + * path must point to the extent item when called.
>> + */
>
> What is the purpose of this function? I probably will figure it out
> when reading on, but a comment would be nice here.
>
Added a description of the function.
>> +static int find_extent_clone(struct send_ctx *sctx,
>> +			     struct btrfs_path *path,
>> +			     u64 ino, u64 data_offset,
>> +			     u64 ino_size,
>> +			     struct clone_root **found)
>> +{
>> +	int ret;
>> +	int extent_type;
>> +	u64 logical;
>> +	u64 num_bytes;
>> +	u64 extent_item_pos;
>> +	struct btrfs_file_extent_item *fi;
>> +	struct extent_buffer *eb = path->nodes[0];
>> +	struct backref_ctx backref_ctx;
>
> currently it's still small enough to keep in on stack, maybe a
> comment in struct backref_ctx that it is kept on stack would be
> nice.
>
To make sure, I've removed it from the stack and use kmalloc now.
>> +	struct clone_root *cur_clone_root;
>> +	struct btrfs_key found_key;
>> +	struct btrfs_path *tmp_path;
>> +	u32 i;
>> +
>> +	tmp_path = alloc_path_for_send();
>> +	if (!tmp_path)
>> +		return -ENOMEM;
>> +
>> +	if (data_offset>= ino_size) {
>> +		/*
>> +		 * There may be extents that lie behind the file's size.
>> +		 * I at least had this in combination with snapshotting while
>> +		 * writing large files.
>> +		 */
>> +		ret = 0;
>> +		goto out;
>> +	}
>> +
>> +	fi = btrfs_item_ptr(eb, path->slots[0],
>> +			struct btrfs_file_extent_item);
>> +	extent_type = btrfs_file_extent_type(eb, fi);
>> +	if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
>> +		ret = -ENOENT;
>> +		goto out;
>> +	}
>> +
>> +	num_bytes = btrfs_file_extent_num_bytes(eb, fi);
>> +	logical = btrfs_file_extent_disk_bytenr(eb, fi);
>> +	if (logical == 0) {
>> +		ret = -ENOENT;
>> +		goto out;
>> +	}
>> +	logical += btrfs_file_extent_offset(eb, fi);
>> +
>> +	ret = extent_from_logical(sctx->send_root->fs_info,
>> +			logical, tmp_path,&found_key);
>> +	btrfs_release_path(tmp_path);
>> +
>> +	if (ret<  0)
>> +		goto out;
>> +	if (ret&  BTRFS_EXTENT_FLAG_TREE_BLOCK) {
>> +		ret = -EIO;
>> +		goto out;
>> +	}
>> +
>> +	/*
>> +	 * Setup the clone roots.
>> +	 */
>> +	for (i = 0; i<  sctx->clone_roots_cnt; i++) {
>> +		cur_clone_root = sctx->clone_roots + i;
>> +		cur_clone_root->ino = (u64)-1;
>> +		cur_clone_root->offset = 0;
>> +		cur_clone_root->found_refs = 0;
>> +	}
>> +
>> +	backref_ctx.sctx = sctx;
>> +	backref_ctx.found = 0;
>> +	backref_ctx.cur_objectid = ino;
>> +	backref_ctx.cur_offset = data_offset;
>> +	backref_ctx.found_in_send_root = 0;
>> +	backref_ctx.extent_len = num_bytes;
>> +
>> +	/*
>> +	 * The last extent of a file may be too large due to page alignment.
>> +	 * We need to adjust extent_len in this case so that the checks in
>> +	 * __iterate_backrefs work.
>> +	 */
>> +	if (data_offset + num_bytes>= ino_size)
>> +		backref_ctx.extent_len = ino_size - data_offset;
>> +
>> +	/*
>> +	 * Now collect all backrefs.
>> +	 */
>> +	extent_item_pos = logical - found_key.objectid;
>> +	ret = iterate_extent_inodes(sctx->send_root->fs_info,
>> +					found_key.objectid, extent_item_pos, 1,
>> +					__iterate_backrefs,&backref_ctx);
>> +	if (ret<  0)
>> +		goto out;
>> +
>> +	if (!backref_ctx.found_in_send_root) {
>> +		/* found a bug in backref code? */
>> +		ret = -EIO;
>> +		printk(KERN_ERR "btrfs: ERROR did not find backref in "
>> +				"send_root. inode=%llu, offset=%llu, "
>> +				"logical=%llu\n",
>> +				ino, data_offset, logical);
>> +		goto out;
>> +	}
>> +
>> +verbose_printk(KERN_DEBUG "btrfs: find_extent_clone: data_offset=%llu, "
>> +		"ino=%llu, "
>> +		"num_bytes=%llu, logical=%llu\n",
>> +		data_offset, ino, num_bytes, logical);
>> +
>> +	if (!backref_ctx.found)
>> +		verbose_printk("btrfs:    no clones found\n");
>> +
>> +	cur_clone_root = NULL;
>> +	for (i = 0; i<  sctx->clone_roots_cnt; i++) {
>> +		if (sctx->clone_roots[i].found_refs) {
>> +			if (!cur_clone_root)
>> +				cur_clone_root = sctx->clone_roots + i;
>> +			else if (sctx->clone_roots[i].root == sctx->send_root)
>> +				/* prefer clones from send_root over others */
>> +				cur_clone_root = sctx->clone_roots + i;
>> +			break;
>
> If you break after the first found ref, you might miss the send_root.
>
Ay, that's true. Removed the break.
>> +		}
>> +
>> +	}
>> +
>> +	if (cur_clone_root) {
>> +		*found = cur_clone_root;
>> +		ret = 0;
>> +	} else {
>> +		ret = -ENOENT;
>> +	}
>> +
>> +out:
>> +	btrfs_free_path(tmp_path);
>> +	return ret;
>> +}
>> +
>> +static int read_symlink(struct send_ctx *sctx,
>> +			struct btrfs_root *root,
>> +			u64 ino,
>> +			struct fs_path *dest)
>> +{
>> +	int ret;
>> +	struct btrfs_path *path;
>> +	struct btrfs_key key;
>> +	struct btrfs_file_extent_item *ei;
>> +	u8 type;
>> +	u8 compression;
>> +	unsigned long off;
>> +	int len;
>> +
>> +	path = alloc_path_for_send();
>> +	if (!path)
>> +		return -ENOMEM;
>> +
>> +	key.objectid = ino;
>> +	key.type = BTRFS_EXTENT_DATA_KEY;
>> +	key.offset = 0;
>> +	ret = btrfs_search_slot(NULL, root,&key, path, 0, 0);
>> +	if (ret<  0)
>> +		goto out;
>> +	BUG_ON(ret);
>> +
>> +	ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
>> +			struct btrfs_file_extent_item);
>> +	type = btrfs_file_extent_type(path->nodes[0], ei);
>> +	compression = btrfs_file_extent_compression(path->nodes[0], ei);
>> +	BUG_ON(type != BTRFS_FILE_EXTENT_INLINE);
>> +	BUG_ON(compression);
>> +
>> +	off = btrfs_file_extent_inline_start(ei);
>> +	len = btrfs_file_extent_inline_len(path->nodes[0], ei);
>> +
>> +	ret = fs_path_add_from_extent_buffer(dest, path->nodes[0], off, len);
>> +	if (ret<  0)
>> +		goto out;
>
> superfluous
>
>> +
>> +out:
>> +	btrfs_free_path(path);
>> +	return ret;
>> +}
>> +
>> +/*
>> + * Helper function to generate a file name that is unique in the root of
>> + * send_root and parent_root. This is used to generate names for orphan inodes.
>> + */
>> +static int gen_unique_name(struct send_ctx *sctx,
>> +			   u64 ino, u64 gen,
>> +			   struct fs_path *dest)
>> +{
>> +	int ret = 0;
>> +	struct btrfs_path *path;
>> +	struct btrfs_dir_item *di;
>> +	char tmp[64];
>> +	int len;
>> +	u64 idx = 0;
>> +
>> +	path = alloc_path_for_send();
>> +	if (!path)
>> +		return -ENOMEM;
>> +
>> +	while (1) {
>> +		len = snprintf(tmp, sizeof(tmp) - 1, "o%llu-%llu-%llu",
>> +				ino, gen, idx);
>
> wouldn't it be easier to just take a uuid? This would save you a lot
> of code and especially the need to verify that the name is really
> unique, saving seeks.
The answer from Alex Lyakas is correct here. We generate unique names 
that must be the same for every call with the same ino/gen combination.
>
>> +		if (len>= sizeof(tmp)) {
>> +			/* should really not happen */
>> +			ret = -EOVERFLOW;
>> +			goto out;
>> +		}
>> +
>> +		di = btrfs_lookup_dir_item(NULL, sctx->send_root,
>> +				path, BTRFS_FIRST_FREE_OBJECTID,
>> +				tmp, strlen(tmp), 0);
>> +		btrfs_release_path(path);
>> +		if (IS_ERR(di)) {
>> +			ret = PTR_ERR(di);
>> +			goto out;
>> +		}
>> +		if (di) {
>> +			/* not unique, try again */
>> +			idx++;
>> +			continue;
>> +		}
>> +
>> +		if (!sctx->parent_root) {
>> +			/* unique */
>> +			ret = 0;
>> +			break;
>> +		}
>> +
>> +		di = btrfs_lookup_dir_item(NULL, sctx->parent_root,
>> +				path, BTRFS_FIRST_FREE_OBJECTID,
>> +				tmp, strlen(tmp), 0);
>> +		btrfs_release_path(path);
>> +		if (IS_ERR(di)) {
>> +			ret = PTR_ERR(di);
>> +			goto out;
>> +		}
>> +		if (di) {
>> +			/* not unique, try again */
>> +			idx++;
>> +			continue;
>> +		}
>> +		/* unique */
>> +		break;
>> +	}
>> +
>> +	ret = fs_path_add(dest, tmp, strlen(tmp));
>> +
>> +out:
>> +	btrfs_free_path(path);
>> +	return ret;
>> +}
>> +
>> +enum inode_state {
>> +	inode_state_no_change,
>> +	inode_state_will_create,
>> +	inode_state_did_create,
>> +	inode_state_will_delete,
>> +	inode_state_did_delete,
>> +};
>> +
>> +static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen)
>
> don't you want to return a enum inode_state instead of int?
>
The function also returns error codes. Is it in such cases still 
preferred to return an enum?
>> +{
>> +	int ret;
>> +	int left_ret;
>> +	int right_ret;
>> +	u64 left_gen;
>> +	u64 right_gen;
>> +
>> +	ret = get_inode_info(sctx->send_root, ino, NULL,&left_gen, NULL, NULL,
>> +			NULL);
>> +	if (ret<  0&&  ret != -ENOENT)
>> +		goto out;
>> +	left_ret = ret;
>> +
>> +	if (!sctx->parent_root) {
>> +		right_ret = -ENOENT;
>> +	} else {
>> +		ret = get_inode_info(sctx->parent_root, ino, NULL,&right_gen,
>> +				NULL, NULL, NULL);
>> +		if (ret<  0&&  ret != -ENOENT)
>> +			goto out;
>> +		right_ret = ret;
>> +	}
>> +
>> +	if (!left_ret&&  !right_ret) {
>> +		if (left_gen == gen&&  right_gen == gen)
>
> Please also use {} here
>
Fixed.
>> +			ret = inode_state_no_change;
>> +		else if (left_gen == gen) {
>> +			if (ino<  sctx->send_progress)
>> +				ret = inode_state_did_create;
>> +			else
>> +				ret = inode_state_will_create;
>> +		} else if (right_gen == gen) {
>> +			if (ino<  sctx->send_progress)
>> +				ret = inode_state_did_delete;
>> +			else
>> +				ret = inode_state_will_delete;
>> +		} else  {
>> +			ret = -ENOENT;
>> +		}
>> +	} else if (!left_ret) {
>> +		if (left_gen == gen) {
>> +			if (ino<  sctx->send_progress)
>> +				ret = inode_state_did_create;
>> +			else
>> +				ret = inode_state_will_create;
>> +		} else {
>> +			ret = -ENOENT;
>> +		}
>> +	} else if (!right_ret) {
>> +		if (right_gen == gen) {
>> +			if (ino<  sctx->send_progress)
>> +				ret = inode_state_did_delete;
>> +			else
>> +				ret = inode_state_will_delete;
>> +		} else {
>> +			ret = -ENOENT;
>> +		}
>> +	} else {
>> +		ret = -ENOENT;
>> +	}
>> +
>> +out:
>> +	return ret;
>> +}
>> +
>> +static int is_inode_existent(struct send_ctx *sctx, u64 ino, u64 gen)
>> +{
>> +	int ret;
>> +
>> +	ret = get_cur_inode_state(sctx, ino, gen);
>> +	if (ret<  0)
>> +		goto out;
>> +
>> +	if (ret == inode_state_no_change ||
>> +	    ret == inode_state_did_create ||
>> +	    ret == inode_state_will_delete)
>> +		ret = 1;
>> +	else
>> +		ret = 0;
>> +
>> +out:
>> +	return ret;
>> +}
>> +
>> +/*
>> + * Helper function to lookup a dir item in a dir.
>> + */
>> +static int lookup_dir_item_inode(struct btrfs_root *root,
>> +				 u64 dir, const char *name, int name_len,
>> +				 u64 *found_inode,
>> +				 u8 *found_type)
>> +{
>> +	int ret = 0;
>> +	struct btrfs_dir_item *di;
>> +	struct btrfs_key key;
>> +	struct btrfs_path *path;
>> +
>> +	path = alloc_path_for_send();
>> +	if (!path)
>> +		return -ENOMEM;
>> +
>> +	di = btrfs_lookup_dir_item(NULL, root, path,
>> +			dir, name, name_len, 0);
>> +	if (!di) {
>> +		ret = -ENOENT;
>> +		goto out;
>> +	}
>> +	if (IS_ERR(di)) {
>> +		ret = PTR_ERR(di);
>> +		goto out;
>> +	}
>> +	btrfs_dir_item_key_to_cpu(path->nodes[0], di,&key);
>> +	*found_inode = key.objectid;
>> +	*found_type = btrfs_dir_type(path->nodes[0], di);
>> +
>> +out:
>> +	btrfs_free_path(path);
>> +	return ret;
>> +}
>> +
>> +static int get_first_ref(struct send_ctx *sctx,
>
> The name does not reflect well what the function does.
> It's more like get_first_parent_dir or get_first_inode_ref
>
I did not rename it, as we have many more uses of xxx_ref function names, 
all of which would need to be renamed. I added a comment which should help 
readers understand the purpose of this function.
>> +			 struct btrfs_root *root, u64 ino,
>> +			 u64 *dir, u64 *dir_gen, struct fs_path *name)
>> +{
>> +	int ret;
>> +	struct btrfs_key key;
>> +	struct btrfs_key found_key;
>> +	struct btrfs_path *path;
>> +	struct btrfs_inode_ref *iref;
>> +	int len;
>> +
>> +	path = alloc_path_for_send();
>> +	if (!path)
>> +		return -ENOMEM;
>> +
>> +	key.objectid = ino;
>> +	key.type = BTRFS_INODE_REF_KEY;
>> +	key.offset = 0;
>> +
>> +	ret = btrfs_search_slot_for_read(root,&key, path, 1, 0);
>> +	if (ret<  0)
>> +		goto out;
>> +	if (!ret)
>> +		btrfs_item_key_to_cpu(path->nodes[0],&found_key,
>> +				path->slots[0]);
>> +	if (ret || found_key.objectid != key.objectid ||
>> +	    found_key.type != key.type) {
>> +		ret = -ENOENT;
>> +		goto out;
>> +	}
>> +
>> +	iref = btrfs_item_ptr(path->nodes[0], path->slots[0],
>> +			struct btrfs_inode_ref);
>> +	len = btrfs_inode_ref_name_len(path->nodes[0], iref);
>> +	ret = fs_path_add_from_extent_buffer(name, path->nodes[0],
>> +			(unsigned long)(iref + 1), len);
>> +	if (ret<  0)
>> +		goto out;
>> +	btrfs_release_path(path);
>> +
>> +	ret = get_inode_info(root, found_key.offset, NULL, dir_gen, NULL, NULL,
>> +			NULL);
>> +	if (ret<  0)
>> +		goto out;
>> +
>> +	*dir = found_key.offset;
>> +
>> +out:
>> +	btrfs_free_path(path);
>> +	return ret;
>> +}
>> +
>> +static int is_first_ref(struct send_ctx *sctx,
>> +			struct btrfs_root *root,
>> +			u64 ino, u64 dir,
>> +			const char *name, int name_len)
>> +{
>> +	int ret;
>> +	struct fs_path *tmp_name;
>> +	u64 tmp_dir;
>> +	u64 tmp_dir_gen;
>> +
>> +	tmp_name = fs_path_alloc(sctx);
>> +	if (!tmp_name)
>> +		return -ENOMEM;
>> +
>> +	ret = get_first_ref(sctx, root, ino,&tmp_dir,&tmp_dir_gen, tmp_name);
>> +	if (ret<  0)
>> +		goto out;
>> +
>> +	if (name_len != fs_path_len(tmp_name)) {
>> +		ret = 0;
>> +		goto out;
>> +	}
>> +
>> +	ret = memcmp(tmp_name->start, name, name_len);
>
> or just ret = !memcmp...?
>
Changed to !memcmp.
>> +	if (ret)
>> +		ret = 0;
>> +	else
>> +		ret = 1;
>> +
>> +out:
>> +	fs_path_free(sctx, tmp_name);
>> +	return ret;
>> +}
>> +
>> +static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
>> +			      const char *name, int name_len,
>> +			      u64 *who_ino, u64 *who_gen)
>> +{
>> +	int ret = 0;
>> +	u64 other_inode = 0;
>> +	u8 other_type = 0;
>> +
>> +	if (!sctx->parent_root)
>> +		goto out;
>> +
>> +	ret = is_inode_existent(sctx, dir, dir_gen);
>> +	if (ret<= 0)
>> +		goto out;
>> +
>> +	ret = lookup_dir_item_inode(sctx->parent_root, dir, name, name_len,
>> +			&other_inode,&other_type);
>> +	if (ret<  0&&  ret != -ENOENT)
>> +		goto out;
>> +	if (ret) {
>> +		ret = 0;
>> +		goto out;
>> +	}
>> +
>> +	if (other_inode>  sctx->send_progress) {
>
> I haven't really grasped what this function does (a comment would be
> nice), but I have a feeling that renames might break things when the
> parent is not a direct ancenstor. Maybe it gets clearer when I read
> on ;)
>
Hmm, in my tests it worked. We do have problems when the snapshots have no 
relation at all — those were reported by Alex Lyakas, and I'll go 
through them later.
Added a comment.
>> +		ret = get_inode_info(sctx->parent_root, other_inode, NULL,
>> +				who_gen, NULL, NULL, NULL);
>> +		if (ret<  0)
>> +			goto out;
>> +
>> +		ret = 1;
>> +		*who_ino = other_inode;
>> +	} else {
>> +		ret = 0;
>> +	}
>> +
>> +out:
>> +	return ret;
>> +}
>> +
Added a comment here.
>> +static int did_overwrite_ref(struct send_ctx *sctx,
>> +			    u64 dir, u64 dir_gen,
>> +			    u64 ino, u64 ino_gen,
>> +			    const char *name, int name_len)
>> +{
>> +	int ret = 0;
>> +	u64 gen;
>> +	u64 ow_inode;
>> +	u8 other_type;
>> +
>> +	if (!sctx->parent_root)
>> +		goto out;
>> +
>> +	ret = is_inode_existent(sctx, dir, dir_gen);
>> +	if (ret<= 0)
>> +		goto out;
>> +
>> +	/* check if the ref was overwritten by another ref */
>> +	ret = lookup_dir_item_inode(sctx->send_root, dir, name, name_len,
>> +			&ow_inode,&other_type);
>> +	if (ret<  0&&  ret != -ENOENT)
>> +		goto out;
>> +	if (ret) {
>> +		/* was never and will never be overwritten */
>> +		ret = 0;
>> +		goto out;
>> +	}
>> +
>> +	ret = get_inode_info(sctx->send_root, ow_inode, NULL,&gen, NULL, NULL,
>> +			NULL);
>> +	if (ret<  0)
>> +		goto out;
>> +
>> +	if (ow_inode == ino&&  gen == ino_gen) {
>> +		ret = 0;
>> +		goto out;
>> +	}
>> +
>> +	/* we know that it is or will be overwritten. check this now */
>> +	if (ow_inode<  sctx->send_progress)
>> +		ret = 1;
>> +	else
>> +		ret = 0;
>> +
>> +out:
>> +	return ret;
>> +}
>> +
Added a comment here.
>> +static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen)
>> +{
>> +	int ret = 0;
>> +	struct fs_path *name = NULL;
>> +	u64 dir;
>> +	u64 dir_gen;
>> +
>> +	if (!sctx->parent_root)
>> +		goto out;
>> +
>> +	name = fs_path_alloc(sctx);
>> +	if (!name)
>> +		return -ENOMEM;
>> +
>> +	ret = get_first_ref(sctx, sctx->parent_root, ino,&dir,&dir_gen, name);
>> +	if (ret<  0)
>> +		goto out;
>> +
>> +	ret = did_overwrite_ref(sctx, dir, dir_gen, ino, gen,
>> +			name->start, fs_path_len(name));
>
>> +	if (ret<  0)
>> +		goto out;
>
> superfluous
>
Removed.
>> +
>> +out:
>> +	fs_path_free(sctx, name);
>> +	return ret;
>> +}
>> +
>> +static int name_cache_insert(struct send_ctx *sctx,
>> +			     struct name_cache_entry *nce)
>> +{
>> +	int ret = 0;
>> +	struct name_cache_entry **ncea;
>> +
>> +	ncea = radix_tree_lookup(&sctx->name_cache, nce->ino);
>
> attention: radix_trees take an unsigned long as index, and ino
> is a u64. You're in trouble on 32 bit.
>
Fixed by using only the lower 32 bits as the radix tree index on 32-bit 
kernels, plus an additional list to store entries whose indexes collide.
>> +	if (ncea) {
>> +		if (!ncea[0])
>> +			ncea[0] = nce;
>> +		else if (!ncea[1])
>> +			ncea[1] = nce;
>> +		else
>> +			BUG();
>> +	} else {
>> +		ncea = kmalloc(sizeof(void *) * 2, GFP_NOFS);
>> +		if (!ncea)
>> +			return -ENOMEM;
>> +
>> +		ncea[0] = nce;
>> +		ncea[1] = NULL;
>> +		ret = radix_tree_insert(&sctx->name_cache, nce->ino, ncea);
>> +		if (ret<  0)
>> +			return ret;
>> +	}
>> +	list_add_tail(&nce->list,&sctx->name_cache_list);
>> +	sctx->name_cache_size++;
>> +
>> +	return ret;
>> +}
>> +
>> +static void name_cache_delete(struct send_ctx *sctx,
>> +			      struct name_cache_entry *nce)
>> +{
>> +	struct name_cache_entry **ncea;
>> +
>> +	ncea = radix_tree_lookup(&sctx->name_cache, nce->ino);
>> +	BUG_ON(!ncea);
>> +
>> +	if (ncea[0] == nce)
>> +		ncea[0] = NULL;
>> +	else if (ncea[1] == nce)
>> +		ncea[1] = NULL;
>> +	else
>> +		BUG();
>> +
>> +	if (!ncea[0]&&  !ncea[1]) {
>> +		radix_tree_delete(&sctx->name_cache, nce->ino);
>> +		kfree(ncea);
>> +	}
>> +
>> +	list_del(&nce->list);
>> +
>> +	sctx->name_cache_size--;
>> +}
>> +
>> +static struct name_cache_entry *name_cache_search(struct send_ctx *sctx,
>> +						    u64 ino, u64 gen)
>> +{
>> +	struct name_cache_entry **ncea;
>> +
>> +	ncea = radix_tree_lookup(&sctx->name_cache, ino);
>> +	if (!ncea)
>> +		return NULL;
>> +
>> +	if (ncea[0]&&  ncea[0]->gen == gen)
>> +		return ncea[0];
>> +	else if (ncea[1]&&  ncea[1]->gen == gen)
>> +		return ncea[1];
>> +	return NULL;
>> +}
>> +
>> +static void name_cache_used(struct send_ctx *sctx, struct name_cache_entry *nce)
>> +{
>> +	list_del(&nce->list);
>> +	list_add_tail(&nce->list,&sctx->name_cache_list);
>> +}
>> +
>> +static void name_cache_clean_unused(struct send_ctx *sctx)
>> +{
>> +	struct name_cache_entry *nce;
>> +
>> +	if (sctx->name_cache_size<  SEND_CTX_NAME_CACHE_CLEAN_SIZE)
>> +		return;
>
> superfluous, the while condition below is enough.
>
Please note that the if and the while use different constants. I want to 
trigger cleanup only after some time and then clean up multiple entries 
at once.
>> +
>> +	while (sctx->name_cache_size>  SEND_CTX_MAX_NAME_CACHE_SIZE) {
>> +		nce = list_entry(sctx->name_cache_list.next,
>> +				struct name_cache_entry, list);
>> +		name_cache_delete(sctx, nce);
>> +		kfree(nce);
>> +	}
>> +}
>> +
>> +static void name_cache_free(struct send_ctx *sctx)
>> +{
>> +	struct name_cache_entry *nce;
>> +	struct name_cache_entry *tmp;
>> +
>> +	list_for_each_entry_safe(nce, tmp,&sctx->name_cache_list, list) {
>
> it's easier to just always delete the head until the list is empty.
> Saves you the tmp-var.
>
Changed it to while(!list_empty...
>> +		name_cache_delete(sctx, nce);
>> +	}
>> +}
>> +
>> +static int __get_cur_name_and_parent(struct send_ctx *sctx,
>> +				     u64 ino, u64 gen,
>> +				     u64 *parent_ino,
>> +				     u64 *parent_gen,
>> +				     struct fs_path *dest)
>> +{
>> +	int ret;
>> +	int nce_ret;
>> +	struct btrfs_path *path = NULL;
>> +	struct name_cache_entry *nce = NULL;
>> +
>> +	nce = name_cache_search(sctx, ino, gen);
>> +	if (nce) {
>> +		if (ino<  sctx->send_progress&&  nce->need_later_update) {
>> +			name_cache_delete(sctx, nce);
>> +			kfree(nce);
>> +			nce = NULL;
>> +		} else {
>> +			name_cache_used(sctx, nce);
>> +			*parent_ino = nce->parent_ino;
>> +			*parent_gen = nce->parent_gen;
>> +			ret = fs_path_add(dest, nce->name, nce->name_len);
>> +			if (ret<  0)
>> +				goto out;
>> +			ret = nce->ret;
>> +			goto out;
>> +		}
>> +	}
>> +
>> +	path = alloc_path_for_send();
>> +	if (!path)
>> +		return -ENOMEM;
>> +
>> +	ret = is_inode_existent(sctx, ino, gen);
>> +	if (ret<  0)
>> +		goto out;
>> +
>> +	if (!ret) {
>> +		ret = gen_unique_name(sctx, ino, gen, dest);
>> +		if (ret<  0)
>> +			goto out;
>> +		ret = 1;
>> +		goto out_cache;
>> +	}
>> +
>> +	if (ino<  sctx->send_progress)
>> +		ret = get_first_ref(sctx, sctx->send_root, ino,
>> +				parent_ino, parent_gen, dest);
>> +	else
>> +		ret = get_first_ref(sctx, sctx->parent_root, ino,
>> +				parent_ino, parent_gen, dest);
>> +	if (ret<  0)
>> +		goto out;
>> +
>> +	ret = did_overwrite_ref(sctx, *parent_ino, *parent_gen, ino, gen,
>> +			dest->start, dest->end - dest->start);
>> +	if (ret<  0)
>> +		goto out;
>> +	if (ret) {
>> +		fs_path_reset(dest);
>> +		ret = gen_unique_name(sctx, ino, gen, dest);
>> +		if (ret<  0)
>> +			goto out;
>> +		ret = 1;
>> +	}
>> +
>> +out_cache:
>> +	nce = kmalloc(sizeof(*nce) + fs_path_len(dest) + 1, GFP_NOFS);
>> +	if (!nce) {
>> +		ret = -ENOMEM;
>> +		goto out;
>> +	}
>> +
>> +	nce->ino = ino;
>> +	nce->gen = gen;
>> +	nce->parent_ino = *parent_ino;
>> +	nce->parent_gen = *parent_gen;
>> +	nce->name_len = fs_path_len(dest);
>> +	nce->ret = ret;
>
> This is a bit too magic for me. ret == 1 iff it's a unique_name?
Added more comments. A return value of 1 means that get_cur_path needs to 
stop and that we have an orphan inode.
>
>> +	strcpy(nce->name, dest->start);
>> +	memset(&nce->use_list, 0, sizeof(nce->use_list));
>
> use_list is unused, anyway, it's a strange way to initialize a
> list_head. There's the INIT_LIST_HEAD macro.
use_list is removed now.
>
>> +
>> +	if (ino<  sctx->send_progress)
>> +		nce->need_later_update = 0;
>> +	else
>> +		nce->need_later_update = 1;
>> +
>> +	nce_ret = name_cache_insert(sctx, nce);
>> +	if (nce_ret<  0)
>> +		ret = nce_ret;
>> +	name_cache_clean_unused(sctx);
>> +
>> +out:
>> +	btrfs_free_path(path);
>> +	return ret;
>> +}
>> +
>> +/*
>> + * Magic happens here. This function returns the first ref to an inode as it
>> + * would look like while receiving the stream at this point in time.
>> + * We walk the path up to the root. For every inode in between, we check if it
>> + * was already processed/sent. If yes, we continue with the parent as found
>> + * in send_root. If not, we continue with the parent as found in parent_root.
>> + * If we encounter an inode that was deleted at this point in time, we use the
>> + * inodes "orphan" name instead of the real name and stop. Same with new inodes
>> + * that were not created yet and overwritten inodes/refs.
>> + *
>> + * When do we have have orphan inodes:
>> + * 1. When an inode is freshly created and thus no valid refs are available yet
>> + * 2. When a directory lost all it's refs (deleted) but still has dir items
>> + *    inside which were not processed yet (pending for move/delete). If anyone
>> + *    tried to get the path to the dir items, it would get a path inside that
>> + *    orphan directory.
>> + * 3. When an inode is moved around or gets new links, it may overwrite the ref
>> + *    of an unprocessed inode. If in that case the first ref would be
>> + *    overwritten, the overwritten inode gets "orphanized". Later when we
>> + *    process this overwritten inode, it is restored at a new place by moving
>> + *    the orphan inode.
>> + *
>> + * sctx->send_progress tells this function at which point in time receiving
>> + * would be.
>> + */
>
> Thanks for the comment :)
>
>> +static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
>> +			struct fs_path *dest)
>> +{
>> +	int ret = 0;
>> +	struct fs_path *name = NULL;
>> +	u64 parent_inode = 0;
>> +	u64 parent_gen = 0;
>> +	int stop = 0;
>> +
>> +	name = fs_path_alloc(sctx);
>> +	if (!name) {
>> +		ret = -ENOMEM;
>> +		goto out;
>> +	}
>> +
>> +	dest->reversed = 1;
>> +	fs_path_reset(dest);
>> +
>> +	while (!stop&&  ino != BTRFS_FIRST_FREE_OBJECTID) {
>> +		fs_path_reset(name);
>> +
>> +		ret = __get_cur_name_and_parent(sctx, ino, gen,
>> +				&parent_inode,&parent_gen, name);
>> +		if (ret<  0)
>> +			goto out;
>> +		if (ret)
>> +			stop = 1;
>> +
>> +		ret = fs_path_add_path(dest, name);
>> +		if (ret<  0)
>> +			goto out;
>> +
>> +		ino = parent_inode;
>> +		gen = parent_gen;
>> +	}
>> +
>> +out:
>> +	fs_path_free(sctx, name);
>> +	if (!ret)
>> +		fs_path_unreverse(dest);
>> +	return ret;
>> +}
>> +
>> +/*
>> + * Called for regular files when sending extents data. Opens a struct file
>> + * to read from the file.
>> + */
>> +static int open_cur_inode_file(struct send_ctx *sctx)
>> +{
>> +	int ret = 0;
>> +	struct btrfs_key key;
>> +	struct vfsmount *mnt;
>> +	struct inode *inode;
>> +	struct dentry *dentry;
>> +	struct file *filp;
>> +	int new = 0;
>> +
>> +	if (sctx->cur_inode_filp)
>> +		goto out;
>> +
>> +	key.objectid = sctx->cur_ino;
>> +	key.type = BTRFS_INODE_ITEM_KEY;
>> +	key.offset = 0;
>> +
>> +	inode = btrfs_iget(sctx->send_root->fs_info->sb,&key, sctx->send_root,
>> +			&new);
>> +	if (IS_ERR(inode)) {
>> +		ret = PTR_ERR(inode);
>> +		goto out;
>> +	}
>> +
>> +	dentry = d_obtain_alias(inode);
>> +	inode = NULL;
>> +	if (IS_ERR(dentry)) {
>> +		ret = PTR_ERR(dentry);
>> +		goto out;
>> +	}
>> +
>> +	mnt = mntget(sctx->mnt);
>> +	filp = dentry_open(dentry, mnt, O_RDONLY | O_LARGEFILE, current_cred());
>> +	dentry = NULL;
>> +	mnt = NULL;
>
> It would be good if this part could be reviewed by someone with
> deep vfs knowledge. Maybe you can compile those parts into a
> separate patch and send it to the appropriate ppl for review.
>
Linus had to merge parts of this function and did not complain. I will 
probably still send a new mail regarding this and the other vfs parts.
>> +	if (IS_ERR(filp)) {
>> +		ret = PTR_ERR(filp);
>> +		goto out;
>> +	}
>> +	sctx->cur_inode_filp = filp;
>> +
>> +out:
>> +	/*
>> +	 * no xxxput required here as every vfs op
>> +	 * does it by itself on failure
>> +	 */
>> +	return ret;
>> +}
>> +
>> +/*
>> + * Closes the struct file that was created in open_cur_inode_file
>> + */
>> +static int close_cur_inode_file(struct send_ctx *sctx)
>> +{
>> +	int ret = 0;
>> +
>> +	if (!sctx->cur_inode_filp)
>> +		goto out;
>> +
>> +	ret = filp_close(sctx->cur_inode_filp, NULL);
>> +	sctx->cur_inode_filp = NULL;
>> +
>> +out:
>> +	return ret;
>> +}
>> +
>> +/*
>> + * Sends a BTRFS_SEND_C_SUBVOL command/item to userspace
>> + */
>> +static int send_subvol_begin(struct send_ctx *sctx)
>> +{
>> +	int ret;
>> +	struct btrfs_root *send_root = sctx->send_root;
>> +	struct btrfs_root *parent_root = sctx->parent_root;
>> +	struct btrfs_path *path;
>> +	struct btrfs_key key;
>> +	struct btrfs_root_ref *ref;
>> +	struct extent_buffer *leaf;
>> +	char *name = NULL;
>> +	int namelen;
>> +
>> +	path = alloc_path_for_send();
>> +	if (!path)
>> +		return -ENOMEM;
>> +
>> +	name = kmalloc(BTRFS_PATH_NAME_MAX, GFP_NOFS);
>> +	if (!name) {
>> +		btrfs_free_path(path);
>> +		return -ENOMEM;
>> +	}
>> +
>> +	key.objectid = send_root->objectid;
>> +	key.type = BTRFS_ROOT_BACKREF_KEY;
>> +	key.offset = 0;
>> +
>> +	ret = btrfs_search_slot_for_read(send_root->fs_info->tree_root,
>> +				&key, path, 1, 0);
>> +	if (ret<  0)
>> +		goto out;
>> +	if (ret) {
>> +		ret = -ENOENT;
>> +		goto out;
>> +	}
>> +
>> +	leaf = path->nodes[0];
>> +	btrfs_item_key_to_cpu(leaf,&key, path->slots[0]);
>> +	if (key.type != BTRFS_ROOT_BACKREF_KEY ||
>> +	    key.objectid != send_root->objectid) {
>> +		ret = -ENOENT;
>> +		goto out;
>> +	}
>
> It looks like we could use a helper for finding the first entry
> with a specific objectid+key...
>
Hmm yepp, I have a lot of places where things like this happen. Will do 
that later.
>> +	ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
>> +	namelen = btrfs_root_ref_name_len(leaf, ref);
>> +	read_extent_buffer(leaf, name, (unsigned long)(ref + 1), namelen);
>> +	btrfs_release_path(path);
>> +
>> +	if (ret<  0)
>> +		goto out;
>
> How can ret be<  0 here?
Whoops, a leftover. Removed.
>
>> +
>> +	if (parent_root) {
>> +		ret = begin_cmd(sctx, BTRFS_SEND_C_SNAPSHOT);
>> +		if (ret<  0)
>> +			goto out;
>> +	} else {
>> +		ret = begin_cmd(sctx, BTRFS_SEND_C_SUBVOL);
>> +		if (ret<  0)
>> +			goto out;
>> +	}
>> +
>> +	TLV_PUT_STRING(sctx, BTRFS_SEND_A_PATH, name, namelen);
>
> It's called PATH, but it seems to be only the last path component.
> What about subvols that are ancored deeper in the dir tree?
>
It sounds like the name read via btrfs_root_ref_name_len does not contain 
the full path? If it doesn't, I need to handle that.
>> +	TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID,
>> +			sctx->send_root->root_item.uuid);
>> +	TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID,
>> +			sctx->send_root->root_item.ctransid);
>> +	if (parent_root) {
>
> The name of the parent is not sent?
>
Nope. We can't use it for anything. And when we later allow to receive 
to an arbitrary path, we can't count on the parent path/name but only on 
the uuid of the parent.
>> +		TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
>> +				sctx->parent_root->root_item.uuid);
>> +		TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
>> +				sctx->parent_root->root_item.ctransid);
>> +	}
>> +
>> +	ret = send_cmd(sctx);
>> +
>> +tlv_put_failure:
>> +out:
>> +	btrfs_free_path(path);
>> +	kfree(name);
>> +	return ret;
>> +}
>> +
>> +static int send_truncate(struct send_ctx *sctx, u64 ino, u64 gen, u64 size)
>> +{
>> +	int ret = 0;
>> +	struct fs_path *p;
>> +
>> +verbose_printk("btrfs: send_truncate %llu size=%llu\n", ino, size);
>> +
>> +	p = fs_path_alloc(sctx);
>> +	if (!p)
>> +		return -ENOMEM;
>> +
>> +	ret = begin_cmd(sctx, BTRFS_SEND_C_TRUNCATE);
>> +	if (ret<  0)
>> +		goto out;
>> +
>> +	ret = get_cur_path(sctx, ino, gen, p);
>> +	if (ret<  0)
>> +		goto out;
>> +	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
>> +	TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, size);
>> +
>> +	ret = send_cmd(sctx);
>> +
>> +tlv_put_failure:
>> +out:
>> +	fs_path_free(sctx, p);
>> +	return ret;
>> +}
>> +
>> +static int send_chmod(struct send_ctx *sctx, u64 ino, u64 gen, u64 mode)
>> +{
>> +	int ret = 0;
>> +	struct fs_path *p;
>> +
>> +verbose_printk("btrfs: send_chmod %llu mode=%llu\n", ino, mode);
>> +
>> +	p = fs_path_alloc(sctx);
>> +	if (!p)
>> +		return -ENOMEM;
>> +
>> +	ret = begin_cmd(sctx, BTRFS_SEND_C_CHMOD);
>> +	if (ret<  0)
>> +		goto out;
>> +
>> +	ret = get_cur_path(sctx, ino, gen, p);
>> +	if (ret<  0)
>> +		goto out;
>> +	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
>> +	TLV_PUT_U64(sctx, BTRFS_SEND_A_MODE, mode&  07777);
>
> four 7?
>
>> +
>> +	ret = send_cmd(sctx);
>> +
>> +tlv_put_failure:
>> +out:
>> +	fs_path_free(sctx, p);
>> +	return ret;
>> +}
>> +
>> +static int send_chown(struct send_ctx *sctx, u64 ino, u64 gen, u64 uid, u64 gid)
>> +{
>> +	int ret = 0;
>> +	struct fs_path *p;
>> +
>> +verbose_printk("btrfs: send_chown %llu uid=%llu, gid=%llu\n", ino, uid, gid);
>> +
>> +	p = fs_path_alloc(sctx);
>> +	if (!p)
>> +		return -ENOMEM;
>> +
>> +	ret = begin_cmd(sctx, BTRFS_SEND_C_CHOWN);
>> +	if (ret<  0)
>> +		goto out;
>> +
>> +	ret = get_cur_path(sctx, ino, gen, p);
>> +	if (ret<  0)
>> +		goto out;
>> +	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
>> +	TLV_PUT_U64(sctx, BTRFS_SEND_A_UID, uid);
>> +	TLV_PUT_U64(sctx, BTRFS_SEND_A_GID, gid);
>> +
>> +	ret = send_cmd(sctx);
>> +
>> +tlv_put_failure:
>> +out:
>> +	fs_path_free(sctx, p);
>> +	return ret;
>> +}
>> +
>> +static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen)
>> +{
>> +	int ret = 0;
>> +	struct fs_path *p = NULL;
>> +	struct btrfs_inode_item *ii;
>> +	struct btrfs_path *path = NULL;
>> +	struct extent_buffer *eb;
>> +	struct btrfs_key key;
>> +	int slot;
>> +
>> +verbose_printk("btrfs: send_utimes %llu\n", ino);
>> +
>> +	p = fs_path_alloc(sctx);
>> +	if (!p)
>> +		return -ENOMEM;
>> +
>> +	path = alloc_path_for_send();
>> +	if (!path) {
>> +		ret = -ENOMEM;
>> +		goto out;
>> +	}
>> +
>> +	key.objectid = ino;
>> +	key.type = BTRFS_INODE_ITEM_KEY;
>> +	key.offset = 0;
>> +	ret = btrfs_search_slot(NULL, sctx->send_root,&key, path, 0, 0);
>> +	if (ret<  0)
>> +		goto out;
>
> you don't check for existence. I guess you know it exists, otherwise
> you wouldn't end up here...
>
Yepp, calling this function (and other send_xxx functions) is only 
allowed on send_root and with existing inodes.
>> +
>> +	eb = path->nodes[0];
>> +	slot = path->slots[0];
>> +	ii = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
>> +
>> +	ret = begin_cmd(sctx, BTRFS_SEND_C_UTIMES);
>> +	if (ret<  0)
>> +		goto out;
>> +
>> +	ret = get_cur_path(sctx, ino, gen, p);
>> +	if (ret<  0)
>> +		goto out;
>> +	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
>> +	TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_ATIME, eb,
>> +			btrfs_inode_atime(ii));
>> +	TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_MTIME, eb,
>> +			btrfs_inode_mtime(ii));
>> +	TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb,
>> +			btrfs_inode_ctime(ii));
>> +	/* TODO otime? */
>
> yes, please :)
>
Can't do that for now. We need to wait for the otime patches to come 
into upstream before. I changed the comment to make this more clear and 
also added a TODO to
https://btrfs.wiki.kernel.org/index.php/Btrfs_Send/Receive
>> +
>> +	ret = send_cmd(sctx);
>> +
>> +tlv_put_failure:
>> +out:
>> +	fs_path_free(sctx, p);
>> +	btrfs_free_path(path);
>> +	return ret;
>> +}
>> +
>> +/*
>> + * Sends a BTRFS_SEND_C_MKXXX or SYMLINK command to user space. We don't have
>> + * a valid path yet because we did not process the refs yet. So, the inode
>> + * is created as orphan.
>> + */
>> +static int send_create_inode(struct send_ctx *sctx, struct btrfs_path *path,
>> +			     struct btrfs_key *key)
>> +{
>> +	int ret = 0;
>> +	struct extent_buffer *eb = path->nodes[0];
>> +	struct btrfs_inode_item *ii;
>> +	struct fs_path *p;
>> +	int slot = path->slots[0];
>> +	int cmd;
>> +	u64 mode;
>> +
>> +verbose_printk("btrfs: send_create_inode %llu\n", sctx->cur_ino);
>> +
>> +	p = fs_path_alloc(sctx);
>> +	if (!p)
>> +		return -ENOMEM;
>> +
>> +	ii = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
>> +	mode = btrfs_inode_mode(eb, ii);
>> +
>> +	if (S_ISREG(mode))
>> +		cmd = BTRFS_SEND_C_MKFILE;
>> +	else if (S_ISDIR(mode))
>> +		cmd = BTRFS_SEND_C_MKDIR;
>> +	else if (S_ISLNK(mode))
>> +		cmd = BTRFS_SEND_C_SYMLINK;
>> +	else if (S_ISCHR(mode) || S_ISBLK(mode))
>> +		cmd = BTRFS_SEND_C_MKNOD;
>> +	else if (S_ISFIFO(mode))
>> +		cmd = BTRFS_SEND_C_MKFIFO;
>> +	else if (S_ISSOCK(mode))
>> +		cmd = BTRFS_SEND_C_MKSOCK;
>> +	else {
>
> normally you'd put {} in all cases if you need it for one.
>
Fixed that.
>> +		printk(KERN_WARNING "btrfs: unexpected inode type %o",
>> +				(int)(mode&  S_IFMT));
>> +		ret = -ENOTSUPP;
>> +		goto out;
>> +	}
>> +
>> +	ret = begin_cmd(sctx, cmd);
>> +	if (ret<  0)
>> +		goto out;
>> +
>> +	ret = gen_unique_name(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
>> +	if (ret<  0)
>> +		goto out;
>> +
>> +	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
>> +
>> +	if (S_ISLNK(mode)) {
>> +		fs_path_reset(p);
>> +		ret = read_symlink(sctx, sctx->send_root, sctx->cur_ino, p);
>> +		if (ret<  0)
>> +			goto out;
>> +		TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, p);
>> +	} else if (S_ISCHR(mode) || S_ISBLK(mode) ||
>> +		   S_ISFIFO(mode) || S_ISSOCK(mode)) {
>> +		TLV_PUT_U64(sctx, BTRFS_SEND_A_RDEV, btrfs_inode_rdev(eb, ii));
>> +	}
>> +
>> +	ret = send_cmd(sctx);
>> +	if (ret<  0)
>> +		goto out;
>> +
>> +
>> +tlv_put_failure:
>> +out:
>> +	fs_path_free(sctx, p);
>> +	return ret;
>> +}
>> +
>> +struct recorded_ref {
>> +	struct list_head list;
>> +	char *dir_path;
>> +	char *name;
>> +	struct fs_path *full_path;
>> +	u64 dir;
>> +	u64 dir_gen;
>> +	int dir_path_len;
>> +	int name_len;
>> +};
>> +
>> +/*
>> + * We need to process new refs before deleted refs, but compare_tree gives us
>> + * everything mixed. So we first record all refs and later process them.
>> + * This function is a helper to record one ref.
>> + */
>> +static int record_ref(struct list_head *head, u64 dir,
>> +		      u64 dir_gen, struct fs_path *path)
>> +{
>> +	struct recorded_ref *ref;
>> +	char *tmp;
>> +
>> +	ref = kmalloc(sizeof(*ref), GFP_NOFS);
>> +	if (!ref)
>> +		return -ENOMEM;
>> +
>> +	ref->dir = dir;
>> +	ref->dir_gen = dir_gen;
>> +	ref->full_path = path;
>> +
>> +	tmp = strrchr(ref->full_path->start, '/');
>> +	if (!tmp) {
>> +		ref->name_len = ref->full_path->end - ref->full_path->start;
>> +		ref->name = ref->full_path->start;
>> +		ref->dir_path_len = 0;
>> +		ref->dir_path = ref->full_path->start;
>> +	} else {
>> +		tmp++;
>> +		ref->name_len = ref->full_path->end - tmp;
>> +		ref->name = tmp;
>> +		ref->dir_path = ref->full_path->start;
>> +		ref->dir_path_len = ref->full_path->end -
>> +				ref->full_path->start - 1 - ref->name_len;
>> +	}
>> +
>> +	list_add_tail(&ref->list, head);
>> +	return 0;
>> +}
>> +
>> +static void __free_recorded_refs(struct send_ctx *sctx, struct list_head *head)
>> +{
>> +	struct recorded_ref *cur;
>> +	struct recorded_ref *tmp;
>> +
>> +	list_for_each_entry_safe(cur, tmp, head, list) {
>> +		fs_path_free(sctx, cur->full_path);
>> +		kfree(cur);
>> +	}
>> +	INIT_LIST_HEAD(head);
>
> This is a bit non-obvious. You use the _safe-macro as if you're
> going to delete each entry, but then you don't delete it and
> instead just reset the head. I'd prefer a while(!list_empty())-
> list_del-loop here.
>
Changed to use while(!list_empty...
>> +}
>> +
>> +static void free_recorded_refs(struct send_ctx *sctx)
>> +{
>> +	__free_recorded_refs(sctx,&sctx->new_refs);
>> +	__free_recorded_refs(sctx,&sctx->deleted_refs);
>> +}
>> +
>> +/*
>> + * Renames/moves a file/dir to it's orphan name. Used when the first
>                                    its
>
>> + * ref of an unprocessed inode gets overwritten and for all non empty
>> + * directories.
>> + */
>> +static int orphanize_inode(struct send_ctx *sctx, u64 ino, u64 gen,
>> +			  struct fs_path *path)
>> +{
>> +	int ret;
>> +	struct fs_path *orphan;
>> +
>> +	orphan = fs_path_alloc(sctx);
>> +	if (!orphan)
>> +		return -ENOMEM;
>> +
>> +	ret = gen_unique_name(sctx, ino, gen, orphan);
>> +	if (ret<  0)
>> +		goto out;
>> +
>> +	ret = send_rename(sctx, path, orphan);
>> +
>> +out:
>> +	fs_path_free(sctx, orphan);
>> +	return ret;
>> +}
>> +
>> +/*
>> + * Returns 1 if a directory can be removed at this point in time.
>> + * We check this by iterating all dir items and checking if the inode behind
>> + * the dir item was already processed.
>> + */
>> +static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 send_progress)
>> +{
>> +	int ret = 0;
>> +	struct btrfs_root *root = sctx->parent_root;
>> +	struct btrfs_path *path;
>> +	struct btrfs_key key;
>> +	struct btrfs_key found_key;
>> +	struct btrfs_key loc;
>> +	struct btrfs_dir_item *di;
>> +
>> +	path = alloc_path_for_send();
>> +	if (!path)
>> +		return -ENOMEM;
>> +
>> +	key.objectid = dir;
>> +	key.type = BTRFS_DIR_INDEX_KEY;
>> +	key.offset = 0;
>> +
>> +	while (1) {
>> +		ret = btrfs_search_slot_for_read(root,&key, path, 1, 0);
>> +		if (ret<  0)
>> +			goto out;
>> +		if (!ret) {
>> +			btrfs_item_key_to_cpu(path->nodes[0],&found_key,
>> +					path->slots[0]);
>> +		}
>> +		if (ret || found_key.objectid != key.objectid ||
>> +		    found_key.type != key.type) {
>> +			break;
>> +		}
>
> another case for the above mentioned helper...
>
>> +
>> +		di = btrfs_item_ptr(path->nodes[0], path->slots[0],
>> +				struct btrfs_dir_item);
>> +		btrfs_dir_item_key_to_cpu(path->nodes[0], di,&loc);
>> +
>> +		if (loc.objectid>  send_progress) {
>> +			ret = 0;
>> +			goto out;
>> +		}
>> +
>> +		btrfs_release_path(path);
>> +		key.offset = found_key.offset + 1;
>> +	}
>> +
>> +	ret = 1;
>> +
>> +out:
>> +	btrfs_free_path(path);
>> +	return ret;
>> +}
>> +
>> +/*
>> + * This does all the move/link/unlink/rmdir magic.
>> + */
>> +static int process_recorded_refs(struct send_ctx *sctx)
>> +{
>> +	int ret = 0;
>> +	struct recorded_ref *cur;
>> +	struct ulist *check_dirs = NULL;
>> +	struct ulist_iterator uit;
>> +	struct ulist_node *un;
>> +	struct fs_path *valid_path = NULL;
>> +	u64 ow_inode;
>> +	u64 ow_gen;
>> +	int did_overwrite = 0;
>> +	int is_orphan = 0;
>> +
>> +verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
>> +
>> +	valid_path = fs_path_alloc(sctx);
>> +	if (!valid_path) {
>> +		ret = -ENOMEM;
>> +		goto out;
>> +	}
>> +
>> +	check_dirs = ulist_alloc(GFP_NOFS);
>> +	if (!check_dirs) {
>> +		ret = -ENOMEM;
>> +		goto out;
>> +	}
>> +
>> +	/*
>> +	 * First, check if the first ref of the current inode was overwritten
>> +	 * before. If yes, we know that the current inode was already orphanized
>> +	 * and thus use the orphan name. If not, we can use get_cur_path to
>> +	 * get the path of the first ref as it would like while receiving at
>> +	 * this point in time.
>> +	 * New inodes are always orphan at the beginning, so force to use the
>> +	 * orphan name in this case.
>> +	 * The first ref is stored in valid_path and will be updated if it
>> +	 * gets moved around.
>> +	 */
>> +	if (!sctx->cur_inode_new) {
>> +		ret = did_overwrite_first_ref(sctx, sctx->cur_ino,
>> +				sctx->cur_inode_gen);
>> +		if (ret<  0)
>> +			goto out;
>> +		if (ret)
>> +			did_overwrite = 1;
>> +	}
>> +	if (sctx->cur_inode_new || did_overwrite) {
>> +		ret = gen_unique_name(sctx, sctx->cur_ino,
>> +				sctx->cur_inode_gen, valid_path);
>> +		if (ret<  0)
>> +			goto out;
>> +		is_orphan = 1;
>> +	} else {
>> +		ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen,
>> +				valid_path);
>> +		if (ret<  0)
>> +			goto out;
>> +	}
>> +
>> +	list_for_each_entry(cur,&sctx->new_refs, list) {
>> +		/*
>> +		 * Check if this new ref would overwrite the first ref of
>> +		 * another unprocessed inode. If yes, orphanize the
>> +		 * overwritten inode. If we find an overwritten ref that is
>> +		 * not the first ref, simply unlink it.
>> +		 */
>> +		ret = will_overwrite_ref(sctx, cur->dir, cur->dir_gen,
>> +				cur->name, cur->name_len,
>> +				&ow_inode,&ow_gen);
>> +		if (ret<  0)
>> +			goto out;
>> +		if (ret) {
>> +			ret = is_first_ref(sctx, sctx->parent_root,
>> +					ow_inode, cur->dir, cur->name,
>> +					cur->name_len);
>> +			if (ret<  0)
>> +				goto out;
>> +			if (ret) {
>> +				ret = orphanize_inode(sctx, ow_inode, ow_gen,
>> +						cur->full_path);
>> +				if (ret<  0)
>> +					goto out;
>> +			} else {
>> +				ret = send_unlink(sctx, cur->full_path);
>> +				if (ret<  0)
>> +					goto out;
>> +			}
>> +		}
>> +
>> +		/*
>> +		 * link/move the ref to the new place. If we have an orphan
>> +		 * inode, move it and update valid_path. If not, link or move
>> +		 * it depending on the inode mode.
>> +		 */
>> +		if (is_orphan) {
>> +			ret = send_rename(sctx, valid_path, cur->full_path);
>> +			if (ret<  0)
>> +				goto out;
>> +			is_orphan = 0;
>> +			ret = fs_path_copy(valid_path, cur->full_path);
>> +			if (ret<  0)
>> +				goto out;
>> +		} else {
>> +			if (S_ISDIR(sctx->cur_inode_mode)) {
>
> why not save a level of indentation here by using <else if>?
>
The if does not exist anymore due to a recent patch.
>> +				/*
>> +				 * Dirs can't be linked, so move it. For moved
>> +				 * dirs, we always have one new and one deleted
>> +				 * ref. The deleted ref is ignored later.
>> +				 */
>> +				ret = send_rename(sctx, valid_path,
>> +						cur->full_path);
>> +				if (ret<  0)
>> +					goto out;
>> +				ret = fs_path_copy(valid_path, cur->full_path);
>> +				if (ret<  0)
>> +					goto out;
>> +			} else {
>> +				ret = send_link(sctx, valid_path,
>> +						cur->full_path);
>> +				if (ret<  0)
>> +					goto out;
>> +			}
>> +		}
>> +		ret = ulist_add(check_dirs, cur->dir, cur->dir_gen,
>
> careful, aux is only an unsigned long, meant to be as large as a pointer.
>
Will make aux 64bit.
>> +				GFP_NOFS);
>> +		if (ret<  0)
>> +			goto out;
>> +	}
>> +
>> +	if (S_ISDIR(sctx->cur_inode_mode)&&  sctx->cur_inode_deleted) {
>> +		/*
>> +		 * Check if we can already rmdir the directory. If not,
>> +		 * orphanize it. For every dir item inside that gets deleted
>> +		 * later, we do this check again and rmdir it then if possible.
>> +		 * See the use of check_dirs for more details.
>> +		 */
>> +		ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_ino);
>> +		if (ret<  0)
>> +			goto out;
>> +		if (ret) {
>> +			ret = send_rmdir(sctx, valid_path);
>> +			if (ret<  0)
>> +				goto out;
>> +		} else if (!is_orphan) {
>> +			ret = orphanize_inode(sctx, sctx->cur_ino,
>> +					sctx->cur_inode_gen, valid_path);
>> +			if (ret<  0)
>> +				goto out;
>> +			is_orphan = 1;
>> +		}
>> +
>> +		list_for_each_entry(cur,&sctx->deleted_refs, list) {
>> +			ret = ulist_add(check_dirs, cur->dir, cur->dir_gen,
>> +					GFP_NOFS);
>> +			if (ret<  0)
>> +				goto out;
>> +		}
>> +	} else if (!S_ISDIR(sctx->cur_inode_mode)) {
>> +		/*
>> +		 * We have a non dir inode. Go through all deleted refs and
>> +		 * unlink them if they were not already overwritten by other
>> +		 * inodes.
>> +		 */
>> +		list_for_each_entry(cur,&sctx->deleted_refs, list) {
>> +			ret = did_overwrite_ref(sctx, cur->dir, cur->dir_gen,
>> +					sctx->cur_ino, sctx->cur_inode_gen,
>> +					cur->name, cur->name_len);
>> +			if (ret<  0)
>> +				goto out;
>> +			if (!ret) {
>> +				ret = send_unlink(sctx, cur->full_path);
>> +				if (ret<  0)
>> +					goto out;
>> +			}
>> +			ret = ulist_add(check_dirs, cur->dir, cur->dir_gen,
>> +					GFP_NOFS);
>> +			if (ret<  0)
>> +				goto out;
>> +		}
>> +
>> +		/*
>> +		 * If the inode is still orphan, unlink the orphan. This may
>> +		 * happen when a previous inode did overwrite the first ref
>> +		 * of this inode and no new refs were added for the current
>> +		 * inode.
>> +		 */
>> +		if (is_orphan) {
>> +			ret = send_unlink(sctx, valid_path);
>> +			if (ret<  0)
>> +				goto out;
>> +		}
>> +	}
>> +
>> +	/*
>> +	 * We did collect all parent dirs where cur_inode was once located. We
>> +	 * now go through all these dirs and check if they are pending for
>> +	 * deletion and if it's finally possible to perform the rmdir now.
>> +	 * We also update the inode stats of the parent dirs here.
>> +	 */
>> +	ULIST_ITER_INIT(&uit);
>> +	while ((un = ulist_next(check_dirs,&uit))) {
>> +		if (un->val>  sctx->cur_ino)
>> +			continue;
>> +
>> +		ret = get_cur_inode_state(sctx, un->val, un->aux);
>> +		if (ret<  0)
>> +			goto out;
>> +
>> +		if (ret == inode_state_did_create ||
>> +		    ret == inode_state_no_change) {
>> +			/* TODO delayed utimes */
>> +			ret = send_utimes(sctx, un->val, un->aux);
>> +			if (ret<  0)
>> +				goto out;
>> +		} else if (ret == inode_state_did_delete) {
>> +			ret = can_rmdir(sctx, un->val, sctx->cur_ino);
>> +			if (ret<  0)
>> +				goto out;
>> +			if (ret) {
>> +				ret = get_cur_path(sctx, un->val, un->aux,
>> +						valid_path);
>> +				if (ret<  0)
>> +					goto out;
>> +				ret = send_rmdir(sctx, valid_path);
>> +				if (ret<  0)
>> +					goto out;
>> +			}
>> +		}
>> +	}
>> +
>> +	/*
>> +	 * Current inode is now at it's new position, so we must increase
>                                     its
>> +	 * send_progress
>> +	 */
>> +	sctx->send_progress = sctx->cur_ino + 1;
>
> is this the right place for it, or should be done at the calling
> site?
>
You're right, the caller should update send_progress. Especially as 
currently send_progress gets updated too early in the cur_inode_new_gen 
case (as reported by Alex Lyakas).
>> +
>> +	ret = 0;
>> +
>> +out:
>> +	free_recorded_refs(sctx);
>> +	ulist_free(check_dirs);
>> +	fs_path_free(sctx, valid_path);
>> +	return ret;
>> +}
>> +
>> +static int __record_new_ref(int num, u64 dir, int index,
>> +			    struct fs_path *name,
>> +			    void *ctx)
>> +{
>> +	int ret = 0;
>> +	struct send_ctx *sctx = ctx;
>> +	struct fs_path *p;
>> +	u64 gen;
>> +
>> +	p = fs_path_alloc(sctx);
>> +	if (!p)
>> +		return -ENOMEM;
>> +
>> +	ret = get_inode_info(sctx->send_root, dir, NULL,&gen, NULL, NULL,
>> +			NULL);
>> +	if (ret<  0)
>> +		goto out;
>> +
>> +	ret = get_cur_path(sctx, dir, gen, p);
>> +	if (ret<  0)
>> +		goto out;
>> +	ret = fs_path_add_path(p, name);
>> +	if (ret<  0)
>> +		goto out;
>> +
>> +	ret = record_ref(&sctx->new_refs, dir, gen, p);
>> +
>> +out:
>> +	if (ret)
>> +		fs_path_free(sctx, p);
>> +	return ret;
>> +}
>> +
>> +static int __record_deleted_ref(int num, u64 dir, int index,
>> +				struct fs_path *name,
>> +				void *ctx)
>> +{
>> +	int ret = 0;
>> +	struct send_ctx *sctx = ctx;
>> +	struct fs_path *p;
>> +	u64 gen;
>> +
>> +	p = fs_path_alloc(sctx);
>> +	if (!p)
>> +		return -ENOMEM;
>> +
>> +	ret = get_inode_info(sctx->parent_root, dir, NULL,&gen, NULL, NULL,
>> +			NULL);
>> +	if (ret<  0)
>> +		goto out;
>> +
>> +	ret = get_cur_path(sctx, dir, gen, p);
>> +	if (ret<  0)
>> +		goto out;
>> +	ret = fs_path_add_path(p, name);
>> +	if (ret<  0)
>> +		goto out;
>> +
>> +	ret = record_ref(&sctx->deleted_refs, dir, gen, p);
>> +
>> +out:
>> +	if (ret)
>> +		fs_path_free(sctx, p);
>> +	return ret;
>> +}
>> +
>> +static int record_new_ref(struct send_ctx *sctx)
>> +{
>> +	int ret;
>> +
>> +	ret = iterate_inode_ref(sctx, sctx->send_root, sctx->left_path,
>> +			sctx->cmp_key, 0, __record_new_ref, sctx);
>> +
>> +	return ret;
>> +}
>> +
>> +static int record_deleted_ref(struct send_ctx *sctx)
>> +{
>> +	int ret;
>> +
>> +	ret = iterate_inode_ref(sctx, sctx->parent_root, sctx->right_path,
>> +			sctx->cmp_key, 0, __record_deleted_ref, sctx);
>> +	return ret;
>> +}
>> +
>> +struct find_ref_ctx {
>> +	u64 dir;
>> +	struct fs_path *name;
>> +	int found_idx;
>> +};
>> +
>> +static int __find_iref(int num, u64 dir, int index,
>> +		       struct fs_path *name,
>> +		       void *ctx_)
>> +{
>> +	struct find_ref_ctx *ctx = ctx_;
>> +
>> +	if (dir == ctx->dir&&  fs_path_len(name) == fs_path_len(ctx->name)&&
>> +	    strncmp(name->start, ctx->name->start, fs_path_len(name)) == 0) {
>> +		ctx->found_idx = num;
>> +		return 1;
>> +	}
>> +	return 0;
>> +}
>> +
>> +static int find_iref(struct send_ctx *sctx,
>> +		     struct btrfs_root *root,
>> +		     struct btrfs_path *path,
>> +		     struct btrfs_key *key,
>> +		     u64 dir, struct fs_path *name)
>> +{
>> +	int ret;
>> +	struct find_ref_ctx ctx;
>> +
>> +	ctx.dir = dir;
>> +	ctx.name = name;
>> +	ctx.found_idx = -1;
>> +
>> +	ret = iterate_inode_ref(sctx, root, path, key, 0, __find_iref,&ctx);
>> +	if (ret<  0)
>> +		return ret;
>> +
>> +	if (ctx.found_idx == -1)
>> +		return -ENOENT;
>> +
>> +	return ctx.found_idx;
>> +}
>> +
>> +static int __record_changed_new_ref(int num, u64 dir, int index,
>> +				    struct fs_path *name,
>> +				    void *ctx)
>> +{
>> +	int ret;
>> +	struct send_ctx *sctx = ctx;
>> +
>> +	ret = find_iref(sctx, sctx->parent_root, sctx->right_path,
>> +			sctx->cmp_key, dir, name);
>> +	if (ret == -ENOENT)
>> +		ret = __record_new_ref(num, dir, index, name, sctx);
>> +	else if (ret>  0)
>> +		ret = 0;
>> +
>> +	return ret;
>> +}
>> +
>> +static int __record_changed_deleted_ref(int num, u64 dir, int index,
>> +					struct fs_path *name,
>> +					void *ctx)
>> +{
>> +	int ret;
>> +	struct send_ctx *sctx = ctx;
>> +
>> +	ret = find_iref(sctx, sctx->send_root, sctx->left_path, sctx->cmp_key,
>> +			dir, name);
>> +	if (ret == -ENOENT)
>> +		ret = __record_deleted_ref(num, dir, index, name, sctx);
>> +	else if (ret>  0)
>> +		ret = 0;
>> +
>> +	return ret;
>> +}
>> +
>> +static int record_changed_ref(struct send_ctx *sctx)
>> +{
>> +	int ret = 0;
>> +
>> +	ret = iterate_inode_ref(sctx, sctx->send_root, sctx->left_path,
>> +			sctx->cmp_key, 0, __record_changed_new_ref, sctx);
>> +	if (ret<  0)
>> +		goto out;
>> +	ret = iterate_inode_ref(sctx, sctx->parent_root, sctx->right_path,
>> +			sctx->cmp_key, 0, __record_changed_deleted_ref, sctx);
>> +
>> +out:
>> +	return ret;
>> +}
>> +
>> +/*
>> + * Record and process all refs at once. Needed when an inode changes the
>> + * generation number, which means that it was deleted and recreated.
>> + */
>> +static int process_all_refs(struct send_ctx *sctx,
>> +			    enum btrfs_compare_tree_result cmd)
>> +{
>> +	int ret;
>> +	struct btrfs_root *root;
>> +	struct btrfs_path *path;
>> +	struct btrfs_key key;
>> +	struct btrfs_key found_key;
>> +	struct extent_buffer *eb;
>> +	int slot;
>> +	iterate_inode_ref_t cb;
>> +
>> +	path = alloc_path_for_send();
>> +	if (!path)
>> +		return -ENOMEM;
>> +
>> +	if (cmd == BTRFS_COMPARE_TREE_NEW) {
>> +		root = sctx->send_root;
>> +		cb = __record_new_ref;
>> +	} else if (cmd == BTRFS_COMPARE_TREE_DELETED) {
>> +		root = sctx->parent_root;
>> +		cb = __record_deleted_ref;
>> +	} else {
>> +		BUG();
>> +	}
>> +
>> +	key.objectid = sctx->cmp_key->objectid;
>> +	key.type = BTRFS_INODE_REF_KEY;
>> +	key.offset = 0;
>> +	while (1) {
>> +		ret = btrfs_search_slot_for_read(root,&key, path, 1, 0);
>> +		if (ret<  0) {
>> +			btrfs_release_path(path);
>
> not needed
Removed.
>
>> +			goto out;
>> +		}
>> +		if (ret) {
>> +			btrfs_release_path(path);
>
> ditto
Was needed here as we have a call to process_recorded_refs after the 
break. But I moved the release out of the loop so that we don't have 
that duplicated for the next break.
>
>> +			break;
>> +		}
>> +
>> +		eb = path->nodes[0];
>> +		slot = path->slots[0];
>> +		btrfs_item_key_to_cpu(eb,&found_key, slot);
>> +
>> +		if (found_key.objectid != key.objectid ||
>> +		    found_key.type != key.type) {
>> +			btrfs_release_path(path);
>
> and here
See above.
>
>> +			break;
>> +		}
>
> helper :)
>
>> +
>> +		ret = iterate_inode_ref(sctx, sctx->parent_root, path,
>> +				&found_key, 0, cb, sctx);
>> +		btrfs_release_path(path);
>> +		if (ret<  0)
>> +			goto out;
>> +
>> +		key.offset = found_key.offset + 1;
>> +	}
>> +
>> +	ret = process_recorded_refs(sctx);
>> +
>> +out:
>> +	btrfs_free_path(path);
>> +	return ret;
>> +}
>> +
>> +static int send_set_xattr(struct send_ctx *sctx,
>> +			  struct fs_path *path,
>> +			  const char *name, int name_len,
>> +			  const char *data, int data_len)
>> +{
>> +	int ret = 0;
>> +
>> +	ret = begin_cmd(sctx, BTRFS_SEND_C_SET_XATTR);
>> +	if (ret<  0)
>> +		goto out;
>> +
>> +	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
>> +	TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len);
>> +	TLV_PUT(sctx, BTRFS_SEND_A_XATTR_DATA, data, data_len);
>> +
>> +	ret = send_cmd(sctx);
>> +
>> +tlv_put_failure:
>> +out:
>> +	return ret;
>> +}
>> +
>> +static int send_remove_xattr(struct send_ctx *sctx,
>> +			  struct fs_path *path,
>> +			  const char *name, int name_len)
>> +{
>> +	int ret = 0;
>> +
>> +	ret = begin_cmd(sctx, BTRFS_SEND_C_REMOVE_XATTR);
>> +	if (ret<  0)
>> +		goto out;
>> +
>> +	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
>> +	TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len);
>> +
>> +	ret = send_cmd(sctx);
>> +
>> +tlv_put_failure:
>> +out:
>> +	return ret;
>> +}
>> +
>> +static int __process_new_xattr(int num, const char *name, int name_len,
>> +			       const char *data, int data_len,
>> +			       u8 type, void *ctx)
>> +{
>> +	int ret;
>> +	struct send_ctx *sctx = ctx;
>> +	struct fs_path *p;
>> +	posix_acl_xattr_header dummy_acl;
>> +
>> +	p = fs_path_alloc(sctx);
>> +	if (!p)
>> +		return -ENOMEM;
>> +
>> +	/*
>> +	 * This hack is needed because empty acl's are stored as zero byte
>> +	 * data in xattrs. Problem with that is, that receiving these zero byte
>> +	 * acl's will fail later. To fix this, we send a dummy acl list that
>> +	 * only contains the version number and no entries.
>> +	 */
>> +	if (!strncmp(name, XATTR_NAME_POSIX_ACL_ACCESS, name_len) ||
>> +	    !strncmp(name, XATTR_NAME_POSIX_ACL_DEFAULT, name_len)) {
>> +		if (data_len == 0) {
>> +			dummy_acl.a_version =
>> +					cpu_to_le32(POSIX_ACL_XATTR_VERSION);
>> +			data = (char *)&dummy_acl;
>> +			data_len = sizeof(dummy_acl);
>> +		}
>> +	}
>> +
>> +	ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
>> +	if (ret<  0)
>> +		goto out;
>> +
>> +	ret = send_set_xattr(sctx, p, name, name_len, data, data_len);
>> +
>> +out:
>> +	fs_path_free(sctx, p);
>> +	return ret;
>> +}
>> +
>> +static int __process_deleted_xattr(int num, const char *name, int name_len,
>> +				   const char *data, int data_len,
>> +				   u8 type, void *ctx)
>> +{
>> +	int ret;
>> +	struct send_ctx *sctx = ctx;
>> +	struct fs_path *p;
>> +
>> +	p = fs_path_alloc(sctx);
>> +	if (!p)
>> +		return -ENOMEM;
>> +
>> +	ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
>> +	if (ret<  0)
>> +		goto out;
>> +
>> +	ret = send_remove_xattr(sctx, p, name, name_len);
>> +
>> +out:
>> +	fs_path_free(sctx, p);
>> +	return ret;
>> +}
>> +
>> +static int process_new_xattr(struct send_ctx *sctx)
>> +{
>> +	int ret = 0;
>> +
>> +	ret = iterate_dir_item(sctx, sctx->send_root, sctx->left_path,
>> +			sctx->cmp_key, __process_new_xattr, sctx);
>> +
>> +	return ret;
>> +}
>> +
>> +static int process_deleted_xattr(struct send_ctx *sctx)
>> +{
>> +	int ret;
>> +
>> +	ret = iterate_dir_item(sctx, sctx->parent_root, sctx->right_path,
>> +			sctx->cmp_key, __process_deleted_xattr, sctx);
>> +
>> +	return ret;
>> +}
>> +
>> +struct find_xattr_ctx {
>> +	const char *name;
>> +	int name_len;
>> +	int found_idx;
>> +	char *found_data;
>> +	int found_data_len;
>> +};
>> +
>> +static int __find_xattr(int num, const char *name, int name_len,
>> +			const char *data, int data_len,
>> +			u8 type, void *vctx)
>> +{
>> +	struct find_xattr_ctx *ctx = vctx;
>> +
>> +	if (name_len == ctx->name_len&&
>> +	    strncmp(name, ctx->name, name_len) == 0) {
>> +		ctx->found_idx = num;
>> +		ctx->found_data_len = data_len;
>> +		ctx->found_data = kmalloc(data_len, GFP_NOFS);
>> +		if (!ctx->found_data)
>> +			return -ENOMEM;
>> +		memcpy(ctx->found_data, data, data_len);
>> +		return 1;
>> +	}
>> +	return 0;
>> +}
>> +
>> +static int find_xattr(struct send_ctx *sctx,
>> +		      struct btrfs_root *root,
>> +		      struct btrfs_path *path,
>> +		      struct btrfs_key *key,
>> +		      const char *name, int name_len,
>> +		      char **data, int *data_len)
>> +{
>> +	int ret;
>> +	struct find_xattr_ctx ctx;
>> +
>> +	ctx.name = name;
>> +	ctx.name_len = name_len;
>> +	ctx.found_idx = -1;
>> +	ctx.found_data = NULL;
>> +	ctx.found_data_len = 0;
>> +
>> +	ret = iterate_dir_item(sctx, root, path, key, __find_xattr,&ctx);
>> +	if (ret<  0)
>> +		return ret;
>> +
>> +	if (ctx.found_idx == -1)
>> +		return -ENOENT;
>> +	if (data) {
>> +		*data = ctx.found_data;
>> +		*data_len = ctx.found_data_len;
>> +	} else {
>> +		kfree(ctx.found_data);
>> +	}
>> +	return ctx.found_idx;
>> +}
>> +
>> +
>> +static int __process_changed_new_xattr(int num, const char *name, int name_len,
>> +				       const char *data, int data_len,
>> +				       u8 type, void *ctx)
>> +{
>> +	int ret;
>> +	struct send_ctx *sctx = ctx;
>> +	char *found_data = NULL;
>> +	int found_data_len  = 0;
>> +	struct fs_path *p = NULL;
>> +
>> +	ret = find_xattr(sctx, sctx->parent_root, sctx->right_path,
>> +			sctx->cmp_key, name, name_len,&found_data,
>> +			&found_data_len);
>> +	if (ret == -ENOENT) {
>> +		ret = __process_new_xattr(num, name, name_len, data, data_len,
>> +				type, ctx);
>> +	} else if (ret>= 0) {
>> +		if (data_len != found_data_len ||
>> +		    memcmp(data, found_data, data_len)) {
>> +			ret = __process_new_xattr(num, name, name_len, data,
>> +					data_len, type, ctx);
>> +		} else {
>> +			ret = 0;
>> +		}
>> +	}
>> +
>> +	kfree(found_data);
>> +	fs_path_free(sctx, p);
>> +	return ret;
>> +}
>> +
>> +static int __process_changed_deleted_xattr(int num, const char *name,
>> +					   int name_len,
>> +					   const char *data, int data_len,
>> +					   u8 type, void *ctx)
>> +{
>> +	int ret;
>> +	struct send_ctx *sctx = ctx;
>> +
>> +	ret = find_xattr(sctx, sctx->send_root, sctx->left_path, sctx->cmp_key,
>> +			name, name_len, NULL, NULL);
>> +	if (ret == -ENOENT)
>> +		ret = __process_deleted_xattr(num, name, name_len, data,
>> +				data_len, type, ctx);
>> +	else if (ret>= 0)
>> +		ret = 0;
>> +
>> +	return ret;
>> +}
>> +
>> +static int process_changed_xattr(struct send_ctx *sctx)
>> +{
>> +	int ret = 0;
>> +
>> +	ret = iterate_dir_item(sctx, sctx->send_root, sctx->left_path,
>> +			sctx->cmp_key, __process_changed_new_xattr, sctx);
>> +	if (ret<  0)
>> +		goto out;
>> +	ret = iterate_dir_item(sctx, sctx->parent_root, sctx->right_path,
>> +			sctx->cmp_key, __process_changed_deleted_xattr, sctx);
>> +
>> +out:
>> +	return ret;
>> +}
>> +
>> +static int process_all_new_xattrs(struct send_ctx *sctx)
>> +{
>> +	int ret;
>> +	struct btrfs_root *root;
>> +	struct btrfs_path *path;
>> +	struct btrfs_key key;
>> +	struct btrfs_key found_key;
>> +	struct extent_buffer *eb;
>> +	int slot;
>> +
>> +	path = alloc_path_for_send();
>> +	if (!path)
>> +		return -ENOMEM;
>> +
>> +	root = sctx->send_root;
>> +
>> +	key.objectid = sctx->cmp_key->objectid;
>> +	key.type = BTRFS_XATTR_ITEM_KEY;
>> +	key.offset = 0;
>> +	while (1) {
>> +		ret = btrfs_search_slot_for_read(root,&key, path, 1, 0);
>> +		if (ret<  0)
>> +			goto out;
>> +		if (ret) {
>> +			ret = 0;
>> +			goto out;
>> +		}
>> +
>> +		eb = path->nodes[0];
>> +		slot = path->slots[0];
>> +		btrfs_item_key_to_cpu(eb,&found_key, slot);
>> +
>> +		if (found_key.objectid != key.objectid ||
>> +		    found_key.type != key.type) {
>> +			ret = 0;
>> +			goto out;
>> +		}
>
> helper...
>
>> +
>> +		ret = iterate_dir_item(sctx, root, path,&found_key,
>> +				__process_new_xattr, sctx);
>> +		if (ret<  0)
>> +			goto out;
>> +
>> +		btrfs_release_path(path);
>> +		key.offset = found_key.offset + 1;
>> +	}
>> +
>> +out:
>> +	btrfs_free_path(path);
>> +	return ret;
>> +}
>> +
>> +/*
>> + * Read some bytes from the current inode/file and send a write command to
>> + * user space.
>> + */
>> +static int send_write(struct send_ctx *sctx, u64 offset, u32 len)
>> +{
>> +	int ret = 0;
>> +	struct fs_path *p;
>> +	loff_t pos = offset;
>> +	int readed;
>> +	mm_segment_t old_fs;
>> +
>> +	p = fs_path_alloc(sctx);
>> +	if (!p)
>> +		return -ENOMEM;
>> +
>> +	/*
>> +	 * vfs normally only accepts user space buffers for security reasons.
>> +	 * we only read from the file and also only provide the read_buf buffer
>> +	 * to vfs. As this buffer does not come from a user space call, it's
>> +	 * ok to temporary allow kernel space buffers.
>> +	 */
>> +	old_fs = get_fs();
>> +	set_fs(KERNEL_DS);
>> +
>> +verbose_printk("btrfs: send_write offset=%llu, len=%d\n", offset, len);
>> +
>> +	ret = open_cur_inode_file(sctx);
>> +	if (ret<  0)
>> +		goto out;
>> +
>> +	ret = vfs_read(sctx->cur_inode_filp, sctx->read_buf, len,&pos);
>> +	if (ret<  0)
>> +		goto out;
>> +	readed = ret;
>
> num_read?
>
renamed.
>> +	if (!readed)
>> +		goto out;
>> +
>> +	ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
>> +	if (ret<  0)
>> +		goto out;
>> +
>> +	ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
>> +	if (ret<  0)
>> +		goto out;
>> +
>> +	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
>> +	TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
>> +	TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, readed);
>> +
>> +	ret = send_cmd(sctx);
>> +
>> +tlv_put_failure:
>> +out:
>> +	fs_path_free(sctx, p);
>> +	set_fs(old_fs);
>> +	if (ret<  0)
>> +		return ret;
>> +	return readed;
>> +}
>> +
>> +/*
>> + * Send a clone command to user space.
>> + */
>> +static int send_clone(struct send_ctx *sctx,
>> +		      u64 offset, u32 len,
>> +		      struct clone_root *clone_root)
>> +{
>> +	int ret = 0;
>> +	struct btrfs_root *clone_root2 = clone_root->root;
>
> a name from hell :)
>
Removed that one completely; I'm now using clone_root->root below.
>> +	struct fs_path *p;
>> +	u64 gen;
>> +
>> +verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, "
>> +	       "clone_inode=%llu, clone_offset=%llu\n", offset, len,
>> +		clone_root->root->objectid, clone_root->ino,
>> +		clone_root->offset);
>> +
>> +	p = fs_path_alloc(sctx);
>> +	if (!p)
>> +		return -ENOMEM;
>> +
>> +	ret = begin_cmd(sctx, BTRFS_SEND_C_CLONE);
>> +	if (ret<  0)
>> +		goto out;
>> +
>> +	ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
>> +	if (ret<  0)
>> +		goto out;
>> +
>> +	TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
>> +	TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_LEN, len);
>> +	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
>> +
>> +	if (clone_root2 == sctx->send_root) {
>> +		ret = get_inode_info(sctx->send_root, clone_root->ino, NULL,
>> +				&gen, NULL, NULL, NULL);
>> +		if (ret<  0)
>> +			goto out;
>> +		ret = get_cur_path(sctx, clone_root->ino, gen, p);
>> +	} else {
>> +		ret = get_inode_path(sctx, clone_root2, clone_root->ino, p);
>> +	}
>> +	if (ret<  0)
>> +		goto out;
>> +
>> +	TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
>> +			clone_root2->root_item.uuid);
>> +	TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
>> +			clone_root2->root_item.ctransid);
>> +	TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p);
>> +	TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_OFFSET,
>> +			clone_root->offset);
>> +
>> +	ret = send_cmd(sctx);
>> +
>> +tlv_put_failure:
>> +out:
>> +	fs_path_free(sctx, p);
>> +	return ret;
>> +}
>> +
>> +static int send_write_or_clone(struct send_ctx *sctx,
>> +			       struct btrfs_path *path,
>> +			       struct btrfs_key *key,
>> +			       struct clone_root *clone_root)
>> +{
>> +	int ret = 0;
>> +	struct btrfs_file_extent_item *ei;
>> +	u64 offset = key->offset;
>> +	u64 pos = 0;
>> +	u64 len;
>> +	u32 l;
>> +	u8 type;
>> +
>> +	ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
>> +			struct btrfs_file_extent_item);
>> +	type = btrfs_file_extent_type(path->nodes[0], ei);
>> +	if (type == BTRFS_FILE_EXTENT_INLINE)
>> +		len = btrfs_file_extent_inline_len(path->nodes[0], ei);
>> +	else
>> +		len = btrfs_file_extent_num_bytes(path->nodes[0], ei);
>
> BTRFS_FILE_EXTENT_PREALLOC?
>
Isn't num_bytes also valid for PREALLOC?
>> +
>> +	if (offset + len>  sctx->cur_inode_size)
>> +		len = sctx->cur_inode_size - offset;
>> +	if (len == 0) {
>> +		ret = 0;
>> +		goto out;
>> +	}
>> +
>> +	if (!clone_root) {
>> +		while (pos<  len) {
>> +			l = len - pos;
>> +			if (l>  BTRFS_SEND_READ_SIZE)
>> +				l = BTRFS_SEND_READ_SIZE;
>> +			ret = send_write(sctx, pos + offset, l);
>> +			if (ret<  0)
>> +				goto out;
>> +			if (!ret)
>> +				break;
>> +			pos += ret;
>> +		}
>> +		ret = 0;
>> +	} else {
>> +		ret = send_clone(sctx, offset, len, clone_root);
>> +	}
>> +
>> +out:
>> +	return ret;
>> +}
>> +
>> +static int is_extent_unchanged(struct send_ctx *sctx,
>> +			       struct btrfs_path *left_path,
>> +			       struct btrfs_key *ekey)
>> +{
>> +	int ret = 0;
>> +	struct btrfs_key key;
>> +	struct btrfs_path *path = NULL;
>> +	struct extent_buffer *eb;
>> +	int slot;
>> +	struct btrfs_key found_key;
>> +	struct btrfs_file_extent_item *ei;
>> +	u64 left_disknr;
>> +	u64 right_disknr;
>> +	u64 left_offset;
>> +	u64 right_offset;
>> +	u64 left_len;
>> +	u64 right_len;
>> +	u8 left_type;
>> +	u8 right_type;
>> +
>> +	path = alloc_path_for_send();
>> +	if (!path)
>> +		return -ENOMEM;
>> +
>> +	eb = left_path->nodes[0];
>> +	slot = left_path->slots[0];
>> +
>> +	ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
>> +	left_type = btrfs_file_extent_type(eb, ei);
>> +	left_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
>> +	left_len = btrfs_file_extent_num_bytes(eb, ei);
>> +	left_offset = btrfs_file_extent_offset(eb, ei);
>> +
>> +	if (left_type != BTRFS_FILE_EXTENT_REG) {
>> +		ret = 0;
>> +		goto out;
>> +	}
>> +
>> +	key.objectid = ekey->objectid;
>> +	key.type = BTRFS_EXTENT_DATA_KEY;
>> +	key.offset = ekey->offset;
>> +
>> +	while (1) {
>> +		ret = btrfs_search_slot_for_read(sctx->parent_root,&key, path,
>> +				0, 0);
>> +		if (ret<  0)
>> +			goto out;
>> +		if (ret) {
>> +			ret = 0;
>> +			goto out;
>> +		}
>> +		btrfs_item_key_to_cpu(path->nodes[0],&found_key,
>> +				path->slots[0]);
>> +		if (found_key.objectid != key.objectid ||
>> +		    found_key.type != key.type) {
>> +			ret = 0;
>> +			goto out;
>> +		}
>> +
>
> helper...
>
>> +		eb = path->nodes[0];
>> +		slot = path->slots[0];
>> +
>> +		ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
>> +		right_type = btrfs_file_extent_type(eb, ei);
>> +		right_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
>> +		right_len = btrfs_file_extent_num_bytes(eb, ei);
>> +		right_offset = btrfs_file_extent_offset(eb, ei);
>> +		btrfs_release_path(path);
>> +
>> +		if (right_type != BTRFS_FILE_EXTENT_REG) {
>> +			ret = 0;
>> +			goto out;
>> +		}
>> +
>> +		if (left_disknr != right_disknr) {
>> +			ret = 0;
>> +			goto out;
>> +		}
>> +
>> +		key.offset = found_key.offset + right_len;
>> +		if (key.offset>= ekey->offset + left_len) {
>> +			ret = 1;
>> +			goto out;
>> +		}
>> +	}
>> +
>> +out:
>> +	btrfs_free_path(path);
>> +	return ret;
>> +}
>> +
>> +static int process_extent(struct send_ctx *sctx,
>> +			  struct btrfs_path *path,
>> +			  struct btrfs_key *key)
>> +{
>> +	int ret = 0;
>> +	struct clone_root *found_clone = NULL;
>> +
>> +	if (S_ISLNK(sctx->cur_inode_mode))
>> +		return 0;
>> +
>> +	if (sctx->parent_root&&  !sctx->cur_inode_new) {
>> +		ret = is_extent_unchanged(sctx, path, key);
>> +		if (ret<  0)
>> +			goto out;
>> +		if (ret) {
>> +			ret = 0;
>> +			goto out;
>> +		}
>> +	}
>> +
>> +	ret = find_extent_clone(sctx, path, key->objectid, key->offset,
>> +			sctx->cur_inode_size,&found_clone);
>> +	if (ret != -ENOENT&&  ret<  0)
>> +		goto out;
>> +
>> +	ret = send_write_or_clone(sctx, path, key, found_clone);
>> +
>> +out:
>> +	return ret;
>> +}
>> +
>> +static int process_all_extents(struct send_ctx *sctx)
>> +{
>> +	int ret;
>> +	struct btrfs_root *root;
>> +	struct btrfs_path *path;
>> +	struct btrfs_key key;
>> +	struct btrfs_key found_key;
>> +	struct extent_buffer *eb;
>> +	int slot;
>> +
>> +	root = sctx->send_root;
>> +	path = alloc_path_for_send();
>> +	if (!path)
>> +		return -ENOMEM;
>> +
>> +	key.objectid = sctx->cmp_key->objectid;
>> +	key.type = BTRFS_EXTENT_DATA_KEY;
>> +	key.offset = 0;
>> +	while (1) {
>> +		ret = btrfs_search_slot_for_read(root,&key, path, 1, 0);
>> +		if (ret<  0)
>> +			goto out;
>> +		if (ret) {
>> +			ret = 0;
>> +			goto out;
>> +		}
>> +
>> +		eb = path->nodes[0];
>> +		slot = path->slots[0];
>> +		btrfs_item_key_to_cpu(eb,&found_key, slot);
>> +
>> +		if (found_key.objectid != key.objectid ||
>> +		    found_key.type != key.type) {
>> +			ret = 0;
>> +			goto out;
>> +		}
>> +
>> +		ret = process_extent(sctx, path,&found_key);
>> +		if (ret<  0)
>> +			goto out;
>> +
>> +		btrfs_release_path(path);
>> +		key.offset = found_key.offset + 1;
>> +	}
>> +
>> +out:
>> +	btrfs_free_path(path);
>> +	return ret;
>> +}
>> +
>> +static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end)
>> +{
>> +	int ret = 0;
>> +
>> +	if (sctx->cur_ino == 0)
>> +		goto out;
>> +	if (!at_end&&  sctx->cur_ino == sctx->cmp_key->objectid&&
>> +	    sctx->cmp_key->type<= BTRFS_INODE_REF_KEY)
>> +		goto out;
>> +	if (list_empty(&sctx->new_refs)&&  list_empty(&sctx->deleted_refs))
>> +		goto out;
>> +
>> +	ret = process_recorded_refs(sctx);
>> +
>> +out:
>> +	return ret;
>> +}
>> +
>> +static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
>> +{
>> +	int ret = 0;
>> +	u64 left_mode;
>> +	u64 left_uid;
>> +	u64 left_gid;
>> +	u64 right_mode;
>> +	u64 right_uid;
>> +	u64 right_gid;
>> +	int need_chmod = 0;
>> +	int need_chown = 0;
>> +
>> +	ret = process_recorded_refs_if_needed(sctx, at_end);
>> +	if (ret<  0)
>> +		goto out;
>> +
>> +	if (sctx->cur_ino == 0 || sctx->cur_inode_deleted)
>> +		goto out;
>> +	if (!at_end&&  sctx->cmp_key->objectid == sctx->cur_ino)
>> +		goto out;
>> +
>> +	ret = get_inode_info(sctx->send_root, sctx->cur_ino, NULL, NULL,
>> +			&left_mode,&left_uid,&left_gid);
>> +	if (ret<  0)
>> +		goto out;
>> +
>> +	if (!S_ISLNK(sctx->cur_inode_mode)) {
>> +		if (!sctx->parent_root || sctx->cur_inode_new) {
>> +			need_chmod = 1;
>> +			need_chown = 1;
>> +		} else {
>> +			ret = get_inode_info(sctx->parent_root, sctx->cur_ino,
>> +					NULL, NULL,&right_mode,&right_uid,
>> +					&right_gid);
>> +			if (ret<  0)
>> +				goto out;
>> +
>> +			if (left_uid != right_uid || left_gid != right_gid)
>> +				need_chown = 1;
>> +			if (left_mode != right_mode)
>> +				need_chmod = 1;
>> +		}
>> +	}
>> +
>> +	if (S_ISREG(sctx->cur_inode_mode)) {
>> +		ret = send_truncate(sctx, sctx->cur_ino, sctx->cur_inode_gen,
>> +				sctx->cur_inode_size);
>> +		if (ret<  0)
>> +			goto out;
>> +	}
>> +
>> +	if (need_chown) {
>> +		ret = send_chown(sctx, sctx->cur_ino, sctx->cur_inode_gen,
>> +				left_uid, left_gid);
>> +		if (ret<  0)
>> +			goto out;
>> +	}
>> +	if (need_chmod) {
>> +		ret = send_chmod(sctx, sctx->cur_ino, sctx->cur_inode_gen,
>> +				left_mode);
>> +		if (ret<  0)
>> +			goto out;
>> +	}
>> +
>> +	/*
>> +	 * Need to send that every time, no matter if it actually changed
>> +	 * between the two trees as we have done changes to the inode before.
>> +	 */
>> +	ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen);
>> +	if (ret<  0)
>> +		goto out;
>> +
>> +out:
>> +	return ret;
>> +}
>> +
>> +static int changed_inode(struct send_ctx *sctx,
>> +			 enum btrfs_compare_tree_result result)
>> +{
>> +	int ret = 0;
>> +	struct btrfs_key *key = sctx->cmp_key;
>> +	struct btrfs_inode_item *left_ii = NULL;
>> +	struct btrfs_inode_item *right_ii = NULL;
>> +	u64 left_gen = 0;
>> +	u64 right_gen = 0;
>> +
>> +	ret = close_cur_inode_file(sctx);
>> +	if (ret<  0)
>> +		goto out;
>> +
>> +	sctx->cur_ino = key->objectid;
>> +	sctx->cur_inode_new_gen = 0;
>> +	sctx->send_progress = sctx->cur_ino;
>> +
>> +	if (result == BTRFS_COMPARE_TREE_NEW ||
>> +	    result == BTRFS_COMPARE_TREE_CHANGED) {
>> +		left_ii = btrfs_item_ptr(sctx->left_path->nodes[0],
>> +				sctx->left_path->slots[0],
>> +				struct btrfs_inode_item);
>> +		left_gen = btrfs_inode_generation(sctx->left_path->nodes[0],
>> +				left_ii);
>> +	} else {
>> +		right_ii = btrfs_item_ptr(sctx->right_path->nodes[0],
>> +				sctx->right_path->slots[0],
>> +				struct btrfs_inode_item);
>> +		right_gen = btrfs_inode_generation(sctx->right_path->nodes[0],
>> +				right_ii);
>> +	}
>> +	if (result == BTRFS_COMPARE_TREE_CHANGED) {
>> +		right_ii = btrfs_item_ptr(sctx->right_path->nodes[0],
>> +				sctx->right_path->slots[0],
>> +				struct btrfs_inode_item);
>> +
>> +		right_gen = btrfs_inode_generation(sctx->right_path->nodes[0],
>> +				right_ii);
>> +		if (left_gen != right_gen)
>> +			sctx->cur_inode_new_gen = 1;
>> +	}
>> +
>> +	if (result == BTRFS_COMPARE_TREE_NEW) {
>> +		sctx->cur_inode_gen = left_gen;
>> +		sctx->cur_inode_new = 1;
>> +		sctx->cur_inode_deleted = 0;
>> +		sctx->cur_inode_size = btrfs_inode_size(
>> +				sctx->left_path->nodes[0], left_ii);
>> +		sctx->cur_inode_mode = btrfs_inode_mode(
>> +				sctx->left_path->nodes[0], left_ii);
>> +		if (sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)
>> +			ret = send_create_inode(sctx, sctx->left_path,
>> +					sctx->cmp_key);
>> +	} else if (result == BTRFS_COMPARE_TREE_DELETED) {
>> +		sctx->cur_inode_gen = right_gen;
>> +		sctx->cur_inode_new = 0;
>> +		sctx->cur_inode_deleted = 1;
>> +		sctx->cur_inode_size = btrfs_inode_size(
>> +				sctx->right_path->nodes[0], right_ii);
>> +		sctx->cur_inode_mode = btrfs_inode_mode(
>> +				sctx->right_path->nodes[0], right_ii);
>> +	} else if (result == BTRFS_COMPARE_TREE_CHANGED) {
>> +		if (sctx->cur_inode_new_gen) {
>> +			sctx->cur_inode_gen = right_gen;
>> +			sctx->cur_inode_new = 0;
>> +			sctx->cur_inode_deleted = 1;
>> +			sctx->cur_inode_size = btrfs_inode_size(
>> +					sctx->right_path->nodes[0], right_ii);
>> +			sctx->cur_inode_mode = btrfs_inode_mode(
>> +					sctx->right_path->nodes[0], right_ii);
>> +			ret = process_all_refs(sctx,
>> +					BTRFS_COMPARE_TREE_DELETED);
>> +			if (ret<  0)
>> +				goto out;
>> +
>> +			sctx->cur_inode_gen = left_gen;
>> +			sctx->cur_inode_new = 1;
>> +			sctx->cur_inode_deleted = 0;
>> +			sctx->cur_inode_size = btrfs_inode_size(
>> +					sctx->left_path->nodes[0], left_ii);
>> +			sctx->cur_inode_mode = btrfs_inode_mode(
>> +					sctx->left_path->nodes[0], left_ii);
>> +			ret = send_create_inode(sctx, sctx->left_path,
>> +					sctx->cmp_key);
>> +			if (ret<  0)
>> +				goto out;
>> +
>> +			ret = process_all_refs(sctx, BTRFS_COMPARE_TREE_NEW);
>> +			if (ret<  0)
>> +				goto out;
>> +			ret = process_all_extents(sctx);
>> +			if (ret<  0)
>> +				goto out;
>> +			ret = process_all_new_xattrs(sctx);
>> +			if (ret<  0)
>> +				goto out;
>> +		} else {
>> +			sctx->cur_inode_gen = left_gen;
>> +			sctx->cur_inode_new = 0;
>> +			sctx->cur_inode_new_gen = 0;
>> +			sctx->cur_inode_deleted = 0;
>> +			sctx->cur_inode_size = btrfs_inode_size(
>> +					sctx->left_path->nodes[0], left_ii);
>> +			sctx->cur_inode_mode = btrfs_inode_mode(
>> +					sctx->left_path->nodes[0], left_ii);
>> +		}
>> +	}
>> +
>> +out:
>> +	return ret;
>> +}
>> +
>> +static int changed_ref(struct send_ctx *sctx,
>> +		       enum btrfs_compare_tree_result result)
>> +{
>> +	int ret = 0;
>> +
>> +	BUG_ON(sctx->cur_ino != sctx->cmp_key->objectid);
>> +
>> +	if (!sctx->cur_inode_new_gen&&
>> +	    sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) {
>> +		if (result == BTRFS_COMPARE_TREE_NEW)
>> +			ret = record_new_ref(sctx);
>> +		else if (result == BTRFS_COMPARE_TREE_DELETED)
>> +			ret = record_deleted_ref(sctx);
>> +		else if (result == BTRFS_COMPARE_TREE_CHANGED)
>> +			ret = record_changed_ref(sctx);
>> +	}
>> +
>> +	return ret;
>> +}
>> +
>> +static int changed_xattr(struct send_ctx *sctx,
>> +			 enum btrfs_compare_tree_result result)
>> +{
>> +	int ret = 0;
>> +
>> +	BUG_ON(sctx->cur_ino != sctx->cmp_key->objectid);
>> +
>> +	if (!sctx->cur_inode_new_gen&&  !sctx->cur_inode_deleted) {
>> +		if (result == BTRFS_COMPARE_TREE_NEW)
>> +			ret = process_new_xattr(sctx);
>> +		else if (result == BTRFS_COMPARE_TREE_DELETED)
>> +			ret = process_deleted_xattr(sctx);
>> +		else if (result == BTRFS_COMPARE_TREE_CHANGED)
>> +			ret = process_changed_xattr(sctx);
>> +	}
>> +
>> +	return ret;
>> +}
>> +
>> +static int changed_extent(struct send_ctx *sctx,
>> +			  enum btrfs_compare_tree_result result)
>> +{
>> +	int ret = 0;
>> +
>> +	BUG_ON(sctx->cur_ino != sctx->cmp_key->objectid);
>> +
>> +	if (!sctx->cur_inode_new_gen&&  !sctx->cur_inode_deleted) {
>> +		if (result != BTRFS_COMPARE_TREE_DELETED)
>> +			ret = process_extent(sctx, sctx->left_path,
>> +					sctx->cmp_key);
>> +	}
>> +
>> +	return ret;
>> +}
>> +
>> +
>> +static int changed_cb(struct btrfs_root *left_root,
>> +		      struct btrfs_root *right_root,
>> +		      struct btrfs_path *left_path,
>> +		      struct btrfs_path *right_path,
>> +		      struct btrfs_key *key,
>> +		      enum btrfs_compare_tree_result result,
>> +		      void *ctx)
>> +{
>> +	int ret = 0;
>> +	struct send_ctx *sctx = ctx;
>> +
>> +	sctx->left_path = left_path;
>> +	sctx->right_path = right_path;
>> +	sctx->cmp_key = key;
>> +
>> +	ret = finish_inode_if_needed(sctx, 0);
>> +	if (ret<  0)
>> +		goto out;
>> +
>> +	if (key->type == BTRFS_INODE_ITEM_KEY)
>> +		ret = changed_inode(sctx, result);
>> +	else if (key->type == BTRFS_INODE_REF_KEY)
>> +		ret = changed_ref(sctx, result);
>> +	else if (key->type == BTRFS_XATTR_ITEM_KEY)
>> +		ret = changed_xattr(sctx, result);
>> +	else if (key->type == BTRFS_EXTENT_DATA_KEY)
>> +		ret = changed_extent(sctx, result);
>> +
>> +out:
>> +	return ret;
>> +}
>> +
>> +static int full_send_tree(struct send_ctx *sctx)
>> +{
>> +	int ret;
>> +	struct btrfs_trans_handle *trans = NULL;
>> +	struct btrfs_root *send_root = sctx->send_root;
>> +	struct btrfs_key key;
>> +	struct btrfs_key found_key;
>> +	struct btrfs_path *path;
>> +	struct extent_buffer *eb;
>> +	int slot;
>> +	u64 start_ctransid;
>> +	u64 ctransid;
>> +
>> +	path = alloc_path_for_send();
>> +	if (!path)
>> +		return -ENOMEM;
>> +
>> +	spin_lock(&send_root->root_times_lock);
>> +	start_ctransid = btrfs_root_ctransid(&send_root->root_item);
>> +	spin_unlock(&send_root->root_times_lock);
>> +
>> +	key.objectid = BTRFS_FIRST_FREE_OBJECTID;
>> +	key.type = BTRFS_INODE_ITEM_KEY;
>> +	key.offset = 0;
>> +
>> +join_trans:
>> +	/*
>> +	 * We need to make sure the transaction does not get committed
>> +	 * while we do anything on commit roots. Join a transaction to prevent
>> +	 * this.
>> +	 */
>> +	trans = btrfs_join_transaction(send_root);
>> +	if (IS_ERR(trans)) {
>> +		ret = PTR_ERR(trans);
>> +		trans = NULL;
>> +		goto out;
>> +	}
>> +
>> +	/*
>> +	 * Make sure the tree has not changed
>> +	 */
>> +	spin_lock(&send_root->root_times_lock);
>> +	ctransid = btrfs_root_ctransid(&send_root->root_item);
>> +	spin_unlock(&send_root->root_times_lock);
>> +
>> +	if (ctransid != start_ctransid) {
>> +		WARN(1, KERN_WARNING "btrfs: the root that you're trying to "
>> +				     "send was modified in between. This is "
>> +				     "probably a bug.\n");
>
> What is the purpose of getting the ctransid outside the
> transaction anyway?
>
Hmm I don't understand the question...
>> +		ret = -EIO;
>> +		goto out;
>> +	}
>> +
>> +	ret = btrfs_search_slot_for_read(send_root,&key, path, 1, 0);
>> +	if (ret<  0)
>> +		goto out;
>> +	if (ret)
>> +		goto out_finish;
>> +
>> +	while (1) {
>> +		/*
>> +		 * When someone want to commit while we iterate, end the
>> +		 * joined transaction and rejoin.
>> +		 */
>> +		if (btrfs_should_end_transaction(trans, send_root)) {
>> +			ret = btrfs_end_transaction(trans, send_root);
>> +			trans = NULL;
>> +			if (ret<  0)
>> +				goto out;
>> +			btrfs_release_path(path);
>> +			goto join_trans;
>> +		}
>> +
>> +		eb = path->nodes[0];
>> +		slot = path->slots[0];
>> +		btrfs_item_key_to_cpu(eb,&found_key, slot);
>> +
>> +		ret = changed_cb(send_root, NULL, path, NULL,
>> +				&found_key, BTRFS_COMPARE_TREE_NEW, sctx);
>> +		if (ret<  0)
>> +			goto out;
>> +
>> +		key.objectid = found_key.objectid;
>> +		key.type = found_key.type;
>> +		key.offset = found_key.offset + 1;
>
> shouldn't this just be before the goto join_trans?
>
Hmm I don't think so. Am I missing something?
>> +
>> +		ret = btrfs_next_item(send_root, path);
>> +		if (ret<  0)
>> +			goto out;
>> +		if (ret) {
>> +			ret  = 0;
>> +			break;
>> +		}
>> +	}
>> +
>> +out_finish:
>> +	ret = finish_inode_if_needed(sctx, 1);
>> +
>> +out:
>> +	btrfs_free_path(path);
>> +	if (trans) {
>> +		if (!ret)
>> +			ret = btrfs_end_transaction(trans, send_root);
>> +		else
>> +			btrfs_end_transaction(trans, send_root);
>> +	}
>> +	return ret;
>> +}
>> +
>> +static int send_subvol(struct send_ctx *sctx)
>> +{
>> +	int ret;
>> +
>> +	ret = send_header(sctx);
>> +	if (ret<  0)
>> +		goto out;
>> +
>> +	ret = send_subvol_begin(sctx);
>> +	if (ret<  0)
>> +		goto out;
>> +
>> +	if (sctx->parent_root) {
>> +		ret = btrfs_compare_trees(sctx->send_root, sctx->parent_root,
>> +				changed_cb, sctx);
>> +		if (ret<  0)
>> +			goto out;
>> +		ret = finish_inode_if_needed(sctx, 1);
>> +		if (ret<  0)
>> +			goto out;
>> +	} else {
>> +		ret = full_send_tree(sctx);
>> +		if (ret<  0)
>> +			goto out;
>> +	}
>> +
>> +out:
>> +	if (!ret)
>> +		ret = close_cur_inode_file(sctx);
>> +	else
>> +		close_cur_inode_file(sctx);
>> +
>> +	free_recorded_refs(sctx);
>> +	return ret;
>> +}
>> +
>> +long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
>> +{
>> +	int ret = 0;
>> +	struct btrfs_root *send_root;
>> +	struct btrfs_root *clone_root;
>> +	struct btrfs_fs_info *fs_info;
>> +	struct btrfs_ioctl_send_args *arg = NULL;
>> +	struct btrfs_key key;
>> +	struct file *filp = NULL;
>> +	struct send_ctx *sctx = NULL;
>> +	u32 i;
>> +	u64 *clone_sources_tmp = NULL;
>> +
>> +	if (!capable(CAP_SYS_ADMIN))
>> +		return -EPERM;
>> +
>> +	send_root = BTRFS_I(fdentry(mnt_file)->d_inode)->root;
>> +	fs_info = send_root->fs_info;
>> +
>> +	arg = memdup_user(arg_, sizeof(*arg));
>> +	if (IS_ERR(arg)) {
>> +		ret = PTR_ERR(arg);
>> +		arg = NULL;
>> +		goto out;
>> +	}
>> +
>> +	if (!access_ok(VERIFY_READ, arg->clone_sources,
>> +			sizeof(*arg->clone_sources *
>> +			arg->clone_sources_count))) {
>> +		ret = -EFAULT;
>> +		goto out;
>> +	}
>> +
>> +	sctx = kzalloc(sizeof(struct send_ctx), GFP_NOFS);
>> +	if (!sctx) {
>> +		ret = -ENOMEM;
>> +		goto out;
>> +	}
>> +
>> +	INIT_LIST_HEAD(&sctx->new_refs);
>> +	INIT_LIST_HEAD(&sctx->deleted_refs);
>> +	INIT_RADIX_TREE(&sctx->name_cache, GFP_NOFS);
>> +	INIT_LIST_HEAD(&sctx->name_cache_list);
>> +
>> +	sctx->send_filp = fget(arg->send_fd);
>> +	if (IS_ERR(sctx->send_filp)) {
>> +		ret = PTR_ERR(sctx->send_filp);
>> +		goto out;
>> +	}
>> +
>> +	sctx->mnt = mnt_file->f_path.mnt;
>> +
>> +	sctx->send_root = send_root;
>> +	sctx->clone_roots_cnt = arg->clone_sources_count;
>> +
>> +	sctx->send_max_size = BTRFS_SEND_BUF_SIZE;
>> +	sctx->send_buf = vmalloc(sctx->send_max_size);
>> +	if (!sctx->send_buf) {
>> +		ret = -ENOMEM;
>> +		goto out;
>> +	}
>> +
>> +	sctx->read_buf = vmalloc(BTRFS_SEND_READ_SIZE);
>> +	if (!sctx->read_buf) {
>> +		ret = -ENOMEM;
>> +		goto out;
>> +	}
>> +
>> +	sctx->clone_roots = vzalloc(sizeof(struct clone_root) *
>> +			(arg->clone_sources_count + 1));
>> +	if (!sctx->clone_roots) {
>> +		ret = -ENOMEM;
>> +		goto out;
>> +	}
>> +
>> +	if (arg->clone_sources_count) {
>> +		clone_sources_tmp = vmalloc(arg->clone_sources_count *
>> +				sizeof(*arg->clone_sources));
>> +		if (!clone_sources_tmp) {
>> +			ret = -ENOMEM;
>> +			goto out;
>> +		}
>> +
>> +		ret = copy_from_user(clone_sources_tmp, arg->clone_sources,
>> +				arg->clone_sources_count *
>> +				sizeof(*arg->clone_sources));
>> +		if (ret) {
>> +			ret = -EFAULT;
>> +			goto out;
>> +		}
>> +
>> +		for (i = 0; i<  arg->clone_sources_count; i++) {
>> +			key.objectid = clone_sources_tmp[i];
>> +			key.type = BTRFS_ROOT_ITEM_KEY;
>> +			key.offset = (u64)-1;
>> +			clone_root = btrfs_read_fs_root_no_name(fs_info,&key);
>> +			if (!clone_root) {
>> +				ret = -EINVAL;
>> +				goto out;
>> +			}
>> +			if (IS_ERR(clone_root)) {
>> +				ret = PTR_ERR(clone_root);
>> +				goto out;
>> +			}
>> +			sctx->clone_roots[i].root = clone_root;
>> +		}
>> +		vfree(clone_sources_tmp);
>> +		clone_sources_tmp = NULL;
>> +	}
>> +
>> +	if (arg->parent_root) {
>> +		key.objectid = arg->parent_root;
>> +		key.type = BTRFS_ROOT_ITEM_KEY;
>> +		key.offset = (u64)-1;
>> +		sctx->parent_root = btrfs_read_fs_root_no_name(fs_info,&key);
>> +		if (!sctx->parent_root) {
>> +			ret = -EINVAL;
>> +			goto out;
>> +		}
>> +	}
>> +
>> +	/*
>> +	 * Clones from send_root are allowed, but only if the clone source
>> +	 * is behind the current send position. This is checked while searching
>> +	 * for possible clone sources.
>> +	 */
>> +	sctx->clone_roots[sctx->clone_roots_cnt++].root = sctx->send_root;
>> +
>> +	/* We do a bsearch later */
>> +	sort(sctx->clone_roots, sctx->clone_roots_cnt,
>> +			sizeof(*sctx->clone_roots), __clone_root_cmp_sort,
>> +			NULL);
>> +
>> +	ret = send_subvol(sctx);
>> +	if (ret<  0)
>> +		goto out;
>> +
>> +	ret = begin_cmd(sctx, BTRFS_SEND_C_END);
>> +	if (ret<  0)
>> +		goto out;
>> +	ret = send_cmd(sctx);
>> +	if (ret<  0)
>> +		goto out;
>> +
>> +out:
>> +	if (filp)
>> +		fput(filp);
>> +	kfree(arg);
>> +	vfree(clone_sources_tmp);
>> +
>> +	if (sctx) {
>> +		if (sctx->send_filp)
>> +			fput(sctx->send_filp);
>> +
>> +		vfree(sctx->clone_roots);
>> +		vfree(sctx->send_buf);
>> +		vfree(sctx->read_buf);
>> +
>> +		name_cache_free(sctx);
>> +
>> +		kfree(sctx);
>> +	}
>> +
>> +	return ret;
>> +}
>> diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h
>> index a4c23ee..53f8ee7 100644
>> --- a/fs/btrfs/send.h
>> +++ b/fs/btrfs/send.h
>> @@ -124,3 +124,7 @@ enum {
>>   	__BTRFS_SEND_A_MAX,
>>   };
>>   #define BTRFS_SEND_A_MAX (__BTRFS_SEND_A_MAX - 1)
>> +
>> +#ifdef __KERNEL__
>> +long btrfs_ioctl_send(struct file *mnt_file, void __user *arg);
>> +#endif
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexander Block Aug. 1, 2012, 12:54 p.m. UTC | #9
On Mon, Jul 23, 2012 at 5:17 PM, Alex Lyakas
<alex.bolshoy.btrfs@gmail.com> wrote:
> Hi Alexander,
> I did some testing of the case where same inode, but with a different
> generation, exists both in send_root and in parent_root.
> I know that this can happen primarily when "inode_cache" option is
> enabled. So first I just tested some differential sends, where parent
> and root are unrelated subvolumes. Here are some issues:
>
> 1) The top subvolume inode (ino=BTRFS_FIRST_FREE_OBJECTID) is treated
> also as deleted + recreated. So the code goes into process_all_refs()
> path and does several strange things, such as trying to orphanize the
> top inode. Also get_cur_path() always returns "" for the top subvolume
> (without checking whether it is an orphan).  Another complication for
> the top inode is that its parent dir is itself.
> I made the following fix:
> @@ -3782,7 +3972,13 @@ static int changed_inode(struct send_ctx *sctx,
>
>                 right_gen = btrfs_inode_generation(sctx->right_path->nodes[0],
>                                 right_ii);
> -               if (left_gen != right_gen)
> +               if (left_gen != right_gen && sctx->cur_ino !=
> BTRFS_FIRST_FREE_OBJECTID)
>                         sctx->cur_inode_new_gen = 1;
>
> So basically, don't try to delete and re-create it, but treat it like
> a change. Since the top subvolume inode is S_IFDIR, and dir can have
> only one hardlink (and hopefully it is always ".."), we will never
> need to change anything for this INODE_REF. I also added:
>
> @@ -2526,6 +2615,14 @@ static int process_recorded_refs(struct send_ctx *sctx)
>         int did_overwrite = 0;
>         int is_orphan = 0;
>
> +       BUG_ON(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID);
I applied both fixes to for-chris now.
>
> 2) After I fixed this, I hit another issue, where inodes under the top
> subvolume dir, attempt to rmdir() the top dir, while iterating over
> check_dirs in process_recorded_refs(), because (top_dir_ino,
> top_dir_gen) indicate that it was deleted. So I added:
>
> @@ -2714,10 +2857,19 @@ verbose_printk("btrfs: process_recorded_refs
> %llu\n", sctx->cur_ino);
>          */
>         ULIST_ITER_INIT(&uit);
>         while ((un = ulist_next(check_dirs, &uit))) {
> +               /* Do not attempt to rmdir the top subvolume dir */
> +               if (un->val == BTRFS_FIRST_FREE_OBJECTID)
> +                       continue;
> +
>                 if (un->val > sctx->cur_ino)
>                         continue;
I applied a similar fix by adding a check to can_rmdir. The way you
suggested would also skip utime updates for the top dir.
>
> 3) process_recorded_refs() always increments the send_progress:
>         /*
>          * Current inode is now at it's new position, so we must increase
>          * send_progress
>          */
>         sctx->send_progress = sctx->cur_ino + 1;
>
> However, in the changed_inode() path I am testing, process_all_refs()
> is called twice with the same inode (once for deleted inode, once for
> the recreated inode), so after the first call, send_progress is
> incremented and doesn't match the inode anymore. I don't think I hit
> any issues because of this, just that it's confusing.
I fixed this issue some days ago.
>
> 4)
>
>> +/*
>> + * Record and process all refs at once. Needed when an inode changes the
>> + * generation number, which means that it was deleted and recreated.
>> + */
>> +static int process_all_refs(struct send_ctx *sctx,
>> +                           enum btrfs_compare_tree_result cmd)
>> +{
>> +       int ret;
>> +       struct btrfs_root *root;
>> +       struct btrfs_path *path;
>> +       struct btrfs_key key;
>> +       struct btrfs_key found_key;
>> +       struct extent_buffer *eb;
>> +       int slot;
>> +       iterate_inode_ref_t cb;
>> +
>> +       path = alloc_path_for_send();
>> +       if (!path)
>> +               return -ENOMEM;
>> +
>> +       if (cmd == BTRFS_COMPARE_TREE_NEW) {
>> +               root = sctx->send_root;
>> +               cb = __record_new_ref;
>> +       } else if (cmd == BTRFS_COMPARE_TREE_DELETED) {
>> +               root = sctx->parent_root;
>> +               cb = __record_deleted_ref;
>> +       } else {
>> +               BUG();
>> +       }
>> +
>> +       key.objectid = sctx->cmp_key->objectid;
>> +       key.type = BTRFS_INODE_REF_KEY;
>> +       key.offset = 0;
>> +       while (1) {
>> +               ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
>> +               if (ret < 0) {
>> +                       btrfs_release_path(path);
>> +                       goto out;
>> +               }
>> +               if (ret) {
>> +                       btrfs_release_path(path);
>> +                       break;
>> +               }
>> +
>> +               eb = path->nodes[0];
>> +               slot = path->slots[0];
>> +               btrfs_item_key_to_cpu(eb, &found_key, slot);
>> +
>> +               if (found_key.objectid != key.objectid ||
>> +                   found_key.type != key.type) {
>> +                       btrfs_release_path(path);
>> +                       break;
>> +               }
>> +
>> +               ret = iterate_inode_ref(sctx, sctx->parent_root, path,
>> +                               &found_key, 0, cb, sctx);
>
> Shouldn't it be the root that you calculated eariler and not
> sctx->parent_root? I guess in this case it doesn't matter, because
> "resolve" is 0, and the passed root is only used for resolve. But
> still confusing.
You're right, atm it does not matter which root we use here. It is
more correct to pass 'root' instead of parent_root.
>
> 5) When I started testing with "inode_cache" enabled, I hit another
> issue. When this mount option is enabled, then FREE_INO and FREE_SPACE
> items now appear in the file tree. As a result, the code tries to
> create the FREE_INO item with an orphan name, then tries to find its
> INODE_REF, but fails because it has no INODE_REFs. So
>
> @@ -3923,6 +4127,13 @@ static int changed_cb(struct btrfs_root *left_root,
>         int ret = 0;
>         struct send_ctx *sctx = ctx;
>
> +       /* Ignore non-FS objects */
> +       if (key->objectid == BTRFS_FREE_INO_OBJECTID ||
> +               key->objectid == BTRFS_FREE_SPACE_OBJECTID)
> +               return 0;
>
> makes sense?
Yepp. I however added it after the finish_inode_if_needed call. The
call is still required to finish the previous inode.
>
> Thanks,
> Alex.

Thanks again. Pushed to for-chris.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 8d258cb..9173867 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -54,6 +54,7 @@ 
 #include "inode-map.h"
 #include "backref.h"
 #include "rcu-string.h"
+#include "send.h"
 
 /* Mask out flags that are inappropriate for the given type of inode. */
 static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
@@ -3567,6 +3568,8 @@  long btrfs_ioctl(struct file *file, unsigned int
 		return btrfs_ioctl_balance_progress(root, argp);
 	case BTRFS_IOC_SET_RECEIVED_SUBVOL:
 		return btrfs_ioctl_set_received_subvol(file, argp);
+	case BTRFS_IOC_SEND:
+		return btrfs_ioctl_send(file, argp);
 	case BTRFS_IOC_GET_DEV_STATS:
 		return btrfs_ioctl_get_dev_stats(root, argp, 0);
 	case BTRFS_IOC_GET_AND_RESET_DEV_STATS:
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 47a2557..4d3fcfc 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1007,3 +1007,3249 @@  out:
 	return ret;
 }
 
+struct backref_ctx {
+	struct send_ctx *sctx;
+
+	/* number of total found references */
+	u64 found;
+
+	/*
+	 * used for clones found in send_root. clones found behind cur_objectid
+	 * and cur_offset are not considered as allowed clones.
+	 */
+	u64 cur_objectid;
+	u64 cur_offset;
+
+	/* may be truncated in case it's the last extent in a file */
+	u64 extent_len;
+
+	/* Just to check for bugs in backref resolving */
+	int found_in_send_root;
+};
+
+static int __clone_root_cmp_bsearch(const void *key, const void *elt)
+{
+	u64 root = (u64)key;
+	struct clone_root *cr = (struct clone_root *)elt;
+
+	if (root < cr->root->objectid)
+		return -1;
+	if (root > cr->root->objectid)
+		return 1;
+	return 0;
+}
+
+static int __clone_root_cmp_sort(const void *e1, const void *e2)
+{
+	struct clone_root *cr1 = (struct clone_root *)e1;
+	struct clone_root *cr2 = (struct clone_root *)e2;
+
+	if (cr1->root->objectid < cr2->root->objectid)
+		return -1;
+	if (cr1->root->objectid > cr2->root->objectid)
+		return 1;
+	return 0;
+}
+
+/*
+ * Called for every backref that is found for the current extent.
+ */
+static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
+{
+	struct backref_ctx *bctx = ctx_;
+	struct clone_root *found;
+	int ret;
+	u64 i_size;
+
+	/* First check if the root is in the list of accepted clone sources */
+	found = bsearch((void *)root, bctx->sctx->clone_roots,
+			bctx->sctx->clone_roots_cnt,
+			sizeof(struct clone_root),
+			__clone_root_cmp_bsearch);
+	if (!found)
+		return 0;
+
+	if (found->root == bctx->sctx->send_root &&
+	    ino == bctx->cur_objectid &&
+	    offset == bctx->cur_offset) {
+		bctx->found_in_send_root = 1;
+	}
+
+	/*
+	 * There are inodes that have extents that lie behind its i_size. Don't
+	 * accept clones from these extents.
+	 */
+	ret = get_inode_info(found->root, ino, &i_size, NULL, NULL, NULL, NULL);
+	if (ret < 0)
+		return ret;
+
+	if (offset + bctx->extent_len > i_size)
+		return 0;
+
+	/*
+	 * Make sure we don't consider clones from send_root that are
+	 * behind the current inode/offset.
+	 */
+	if (found->root == bctx->sctx->send_root) {
+		/*
+		 * TODO for the moment we don't accept clones from the inode
+		 * that is currently being sent. We may change this when
+		 * BTRFS_IOC_CLONE_RANGE supports cloning from and to the same
+		 * file.
+		 */
+		if (ino >= bctx->cur_objectid)
+			return 0;
+		/*if (ino > ctx->cur_objectid)
+			return 0;
+		if (offset + ctx->extent_len > ctx->cur_offset)
+			return 0;*/
+
+		bctx->found++;
+		found->found_refs++;
+		found->ino = ino;
+		found->offset = offset;
+		return 0;
+	}
+
+	bctx->found++;
+	found->found_refs++;
+	if (ino < found->ino) {
+		found->ino = ino;
+		found->offset = offset;
+	} else if (found->ino == ino) {
+		/*
+		 * same extent found more than once in the same file.
+		 */
+		if (found->offset > offset + bctx->extent_len)
+			found->offset = offset;
+	}
+
+	return 0;
+}
+
+/*
+ * path must point to the extent item when called.
+ */
+static int find_extent_clone(struct send_ctx *sctx,
+			     struct btrfs_path *path,
+			     u64 ino, u64 data_offset,
+			     u64 ino_size,
+			     struct clone_root **found)
+{
+	int ret;
+	int extent_type;
+	u64 logical;
+	u64 num_bytes;
+	u64 extent_item_pos;
+	struct btrfs_file_extent_item *fi;
+	struct extent_buffer *eb = path->nodes[0];
+	struct backref_ctx backref_ctx;
+	struct clone_root *cur_clone_root;
+	struct btrfs_key found_key;
+	struct btrfs_path *tmp_path;
+	u32 i;
+
+	tmp_path = alloc_path_for_send();
+	if (!tmp_path)
+		return -ENOMEM;
+
+	if (data_offset >= ino_size) {
+		/*
+		 * There may be extents that lie behind the file's size.
+		 * I at least had this in combination with snapshotting while
+		 * writing large files.
+		 */
+		ret = 0;
+		goto out;
+	}
+
+	fi = btrfs_item_ptr(eb, path->slots[0],
+			struct btrfs_file_extent_item);
+	extent_type = btrfs_file_extent_type(eb, fi);
+	if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	num_bytes = btrfs_file_extent_num_bytes(eb, fi);
+	logical = btrfs_file_extent_disk_bytenr(eb, fi);
+	if (logical == 0) {
+		ret = -ENOENT;
+		goto out;
+	}
+	logical += btrfs_file_extent_offset(eb, fi);
+
+	ret = extent_from_logical(sctx->send_root->fs_info,
+			logical, tmp_path, &found_key);
+	btrfs_release_path(tmp_path);
+
+	if (ret < 0)
+		goto out;
+	if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+		ret = -EIO;
+		goto out;
+	}
+
+	/*
+	 * Setup the clone roots.
+	 */
+	for (i = 0; i < sctx->clone_roots_cnt; i++) {
+		cur_clone_root = sctx->clone_roots + i;
+		cur_clone_root->ino = (u64)-1;
+		cur_clone_root->offset = 0;
+		cur_clone_root->found_refs = 0;
+	}
+
+	backref_ctx.sctx = sctx;
+	backref_ctx.found = 0;
+	backref_ctx.cur_objectid = ino;
+	backref_ctx.cur_offset = data_offset;
+	backref_ctx.found_in_send_root = 0;
+	backref_ctx.extent_len = num_bytes;
+
+	/*
+	 * The last extent of a file may be too large due to page alignment.
+	 * We need to adjust extent_len in this case so that the checks in
+	 * __iterate_backrefs work.
+	 */
+	if (data_offset + num_bytes >= ino_size)
+		backref_ctx.extent_len = ino_size - data_offset;
+
+	/*
+	 * Now collect all backrefs.
+	 */
+	extent_item_pos = logical - found_key.objectid;
+	ret = iterate_extent_inodes(sctx->send_root->fs_info,
+					found_key.objectid, extent_item_pos, 1,
+					__iterate_backrefs, &backref_ctx);
+	if (ret < 0)
+		goto out;
+
+	if (!backref_ctx.found_in_send_root) {
+		/* found a bug in backref code? */
+		ret = -EIO;
+		printk(KERN_ERR "btrfs: ERROR did not find backref in "
+				"send_root. inode=%llu, offset=%llu, "
+				"logical=%llu\n",
+				ino, data_offset, logical);
+		goto out;
+	}
+
+verbose_printk(KERN_DEBUG "btrfs: find_extent_clone: data_offset=%llu, "
+		"ino=%llu, "
+		"num_bytes=%llu, logical=%llu\n",
+		data_offset, ino, num_bytes, logical);
+
+	if (!backref_ctx.found)
+		verbose_printk("btrfs:    no clones found\n");
+
+	cur_clone_root = NULL;
+	for (i = 0; i < sctx->clone_roots_cnt; i++) {
+		if (sctx->clone_roots[i].found_refs) {
+			if (!cur_clone_root)
+				cur_clone_root = sctx->clone_roots + i;
+			else if (sctx->clone_roots[i].root == sctx->send_root)
+				/* prefer clones from send_root over others */
+				cur_clone_root = sctx->clone_roots + i;
+			break;
+		}
+
+	}
+
+	if (cur_clone_root) {
+		*found = cur_clone_root;
+		ret = 0;
+	} else {
+		ret = -ENOENT;
+	}
+
+out:
+	btrfs_free_path(tmp_path);
+	return ret;
+}
+
+static int read_symlink(struct send_ctx *sctx,
+			struct btrfs_root *root,
+			u64 ino,
+			struct fs_path *dest)
+{
+	int ret;
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	struct btrfs_file_extent_item *ei;
+	u8 type;
+	u8 compression;
+	unsigned long off;
+	int len;
+
+	path = alloc_path_for_send();
+	if (!path)
+		return -ENOMEM;
+
+	key.objectid = ino;
+	key.type = BTRFS_EXTENT_DATA_KEY;
+	key.offset = 0;
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0)
+		goto out;
+	BUG_ON(ret);
+
+	ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
+			struct btrfs_file_extent_item);
+	type = btrfs_file_extent_type(path->nodes[0], ei);
+	compression = btrfs_file_extent_compression(path->nodes[0], ei);
+	BUG_ON(type != BTRFS_FILE_EXTENT_INLINE);
+	BUG_ON(compression);
+
+	off = btrfs_file_extent_inline_start(ei);
+	len = btrfs_file_extent_inline_len(path->nodes[0], ei);
+
+	ret = fs_path_add_from_extent_buffer(dest, path->nodes[0], off, len);
+	if (ret < 0)
+		goto out;
+
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * Helper function to generate a file name that is unique in the root of
+ * send_root and parent_root. This is used to generate names for orphan inodes.
+ */
+static int gen_unique_name(struct send_ctx *sctx,
+			   u64 ino, u64 gen,
+			   struct fs_path *dest)
+{
+	int ret = 0;
+	struct btrfs_path *path;
+	struct btrfs_dir_item *di;
+	char tmp[64];
+	int len;
+	u64 idx = 0;
+
+	path = alloc_path_for_send();
+	if (!path)
+		return -ENOMEM;
+
+	while (1) {
+		len = snprintf(tmp, sizeof(tmp) - 1, "o%llu-%llu-%llu",
+				ino, gen, idx);
+		if (len >= sizeof(tmp)) {
+			/* should really not happen */
+			ret = -EOVERFLOW;
+			goto out;
+		}
+
+		di = btrfs_lookup_dir_item(NULL, sctx->send_root,
+				path, BTRFS_FIRST_FREE_OBJECTID,
+				tmp, strlen(tmp), 0);
+		btrfs_release_path(path);
+		if (IS_ERR(di)) {
+			ret = PTR_ERR(di);
+			goto out;
+		}
+		if (di) {
+			/* not unique, try again */
+			idx++;
+			continue;
+		}
+
+		if (!sctx->parent_root) {
+			/* unique */
+			ret = 0;
+			break;
+		}
+
+		di = btrfs_lookup_dir_item(NULL, sctx->parent_root,
+				path, BTRFS_FIRST_FREE_OBJECTID,
+				tmp, strlen(tmp), 0);
+		btrfs_release_path(path);
+		if (IS_ERR(di)) {
+			ret = PTR_ERR(di);
+			goto out;
+		}
+		if (di) {
+			/* not unique, try again */
+			idx++;
+			continue;
+		}
+		/* unique */
+		break;
+	}
+
+	ret = fs_path_add(dest, tmp, strlen(tmp));
+
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+enum inode_state {
+	inode_state_no_change,
+	inode_state_will_create,
+	inode_state_did_create,
+	inode_state_will_delete,
+	inode_state_did_delete,
+};
+
+/*
+ * Determine the state of inode (ino, gen) relative to the current send
+ * progress: unchanged in both roots, created in send_root only, or deleted
+ * (present in parent_root only). The will_/did_ variants report whether
+ * sctx->send_progress has already passed the inode.
+ *
+ * Returns one of enum inode_state, or a negative errno. -ENOENT means the
+ * given generation matches neither root's inode.
+ */
+static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen)
+{
+	int ret;
+	int left_ret;
+	int right_ret;
+	u64 left_gen;
+	u64 right_gen;
+
+	/* look up the inode in the tree that is being sent */
+	ret = get_inode_info(sctx->send_root, ino, NULL, &left_gen, NULL, NULL,
+			NULL);
+	if (ret < 0 && ret != -ENOENT)
+		goto out;
+	left_ret = ret;
+
+	if (!sctx->parent_root) {
+		/* full send: there is no old tree to compare against */
+		right_ret = -ENOENT;
+	} else {
+		ret = get_inode_info(sctx->parent_root, ino, NULL, &right_gen,
+				NULL, NULL, NULL);
+		if (ret < 0 && ret != -ENOENT)
+			goto out;
+		right_ret = ret;
+	}
+
+	if (!left_ret && !right_ret) {
+		/* inode number exists on both sides */
+		if (left_gen == gen && right_gen == gen)
+			ret = inode_state_no_change;
+		else if (left_gen == gen) {
+			if (ino < sctx->send_progress)
+				ret = inode_state_did_create;
+			else
+				ret = inode_state_will_create;
+		} else if (right_gen == gen) {
+			if (ino < sctx->send_progress)
+				ret = inode_state_did_delete;
+			else
+				ret = inode_state_will_delete;
+		} else  {
+			ret = -ENOENT;
+		}
+	} else if (!left_ret) {
+		/* only in send_root -> creation */
+		if (left_gen == gen) {
+			if (ino < sctx->send_progress)
+				ret = inode_state_did_create;
+			else
+				ret = inode_state_will_create;
+		} else {
+			ret = -ENOENT;
+		}
+	} else if (!right_ret) {
+		/* only in parent_root -> deletion */
+		if (right_gen == gen) {
+			if (ino < sctx->send_progress)
+				ret = inode_state_did_delete;
+			else
+				ret = inode_state_will_delete;
+		} else {
+			ret = -ENOENT;
+		}
+	} else {
+		ret = -ENOENT;
+	}
+
+out:
+	return ret;
+}
+
+/*
+ * Return 1 if inode (ino, gen) exists at the current point of receiving,
+ * 0 if it does not, or a negative errno on failure.
+ */
+static int is_inode_existent(struct send_ctx *sctx, u64 ino, u64 gen)
+{
+	int state;
+
+	state = get_cur_inode_state(sctx, ino, gen);
+	if (state < 0)
+		return state;
+
+	return (state == inode_state_no_change ||
+		state == inode_state_did_create ||
+		state == inode_state_will_delete) ? 1 : 0;
+}
+
+/*
+ * Look up the directory entry (dir, name) and report the inode number and
+ * the dir item type it points to. Returns -ENOENT if there is no such
+ * entry, a negative errno on other failures.
+ */
+static int lookup_dir_item_inode(struct btrfs_root *root,
+				 u64 dir, const char *name, int name_len,
+				 u64 *found_inode,
+				 u8 *found_type)
+{
+	struct btrfs_path *path;
+	struct btrfs_dir_item *di;
+	struct btrfs_key key;
+	int ret = 0;
+
+	path = alloc_path_for_send();
+	if (!path)
+		return -ENOMEM;
+
+	di = btrfs_lookup_dir_item(NULL, root, path, dir, name, name_len, 0);
+	if (IS_ERR(di)) {
+		ret = PTR_ERR(di);
+		goto out;
+	}
+	if (!di) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
+	*found_inode = key.objectid;
+	*found_type = btrfs_dir_type(path->nodes[0], di);
+
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * Find the first (lowest key offset) INODE_REF of ino in the given root
+ * and return its name (appended to *name) and the parent directory's
+ * inode number (*dir) and generation (*dir_gen).
+ *
+ * Returns 0 on success, -ENOENT if the inode has no refs, or a negative
+ * errno on other failures.
+ */
+static int get_first_ref(struct send_ctx *sctx,
+			 struct btrfs_root *root, u64 ino,
+			 u64 *dir, u64 *dir_gen, struct fs_path *name)
+{
+	int ret;
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+	struct btrfs_path *path;
+	struct btrfs_inode_ref *iref;
+	int len;
+
+	path = alloc_path_for_send();
+	if (!path)
+		return -ENOMEM;
+
+	key.objectid = ino;
+	key.type = BTRFS_INODE_REF_KEY;
+	key.offset = 0;
+
+	ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
+	if (ret < 0)
+		goto out;
+	/* found_key is only valid when ret == 0; the || below short-circuits */
+	if (!ret)
+		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+				path->slots[0]);
+	if (ret || found_key.objectid != key.objectid ||
+	    found_key.type != key.type) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	iref = btrfs_item_ptr(path->nodes[0], path->slots[0],
+			struct btrfs_inode_ref);
+	len = btrfs_inode_ref_name_len(path->nodes[0], iref);
+	/* the ref name is stored inline right after the inode_ref item */
+	ret = fs_path_add_from_extent_buffer(name, path->nodes[0],
+			(unsigned long)(iref + 1), len);
+	if (ret < 0)
+		goto out;
+	btrfs_release_path(path);
+
+	/* for INODE_REF items, key.offset is the parent directory's inode */
+	ret = get_inode_info(root, found_key.offset, NULL, dir_gen, NULL, NULL,
+			NULL);
+	if (ret < 0)
+		goto out;
+
+	*dir = found_key.offset;
+
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * Check if (dir, name) is the first ref of inode ino in the given root.
+ * Returns 1 if it is, 0 if not, negative errno on error.
+ */
+static int is_first_ref(struct send_ctx *sctx,
+			struct btrfs_root *root,
+			u64 ino, u64 dir,
+			const char *name, int name_len)
+{
+	int ret;
+	struct fs_path *tmp_name;
+	u64 tmp_dir;
+	u64 tmp_dir_gen;
+
+	tmp_name = fs_path_alloc(sctx);
+	if (!tmp_name)
+		return -ENOMEM;
+
+	ret = get_first_ref(sctx, root, ino, &tmp_dir, &tmp_dir_gen, tmp_name);
+	if (ret < 0)
+		goto out;
+
+	/*
+	 * The first ref must match in the parent directory as well as in
+	 * the name. The previous code never used the dir parameter, so a
+	 * same-named ref in a different directory was wrongly reported as
+	 * the first ref.
+	 */
+	if (dir != tmp_dir || name_len != fs_path_len(tmp_name)) {
+		ret = 0;
+		goto out;
+	}
+
+	ret = !memcmp(tmp_name->start, name, name_len);
+
+out:
+	fs_path_free(sctx, tmp_name);
+	return ret;
+}
+
+/*
+ * Check whether creating the new entry (dir, name) would overwrite a ref
+ * of an inode that was not yet processed (its inode number is beyond
+ * send_progress). If so, return 1 and report the victim's inode number
+ * and generation in *who_ino/*who_gen so the caller can orphanize or
+ * unlink it first. Returns 0 if no overwrite happens, negative errno on
+ * error. Always 0 for a full send (no parent root).
+ */
+static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
+			      const char *name, int name_len,
+			      u64 *who_ino, u64 *who_gen)
+{
+	int ret = 0;
+	u64 other_inode = 0;
+	u8 other_type = 0;
+
+	if (!sctx->parent_root)
+		goto out;
+
+	ret = is_inode_existent(sctx, dir, dir_gen);
+	if (ret <= 0)
+		goto out;
+
+	ret = lookup_dir_item_inode(sctx->parent_root, dir, name, name_len,
+			&other_inode, &other_type);
+	if (ret < 0 && ret != -ENOENT)
+		goto out;
+	if (ret) {
+		/* no entry with that name in the parent -> nothing to overwrite */
+		ret = 0;
+		goto out;
+	}
+
+	/* inodes beyond send_progress were not processed yet */
+	if (other_inode > sctx->send_progress) {
+		ret = get_inode_info(sctx->parent_root, other_inode, NULL,
+				who_gen, NULL, NULL, NULL);
+		if (ret < 0)
+			goto out;
+
+		ret = 1;
+		*who_ino = other_inode;
+	} else {
+		ret = 0;
+	}
+
+out:
+	return ret;
+}
+
+/*
+ * Check whether the ref (dir, name) of inode (ino, ino_gen) from the
+ * parent snapshot was already overwritten on the receiving side, i.e.
+ * send_root has a different inode under that name and that inode was
+ * already processed. Returns 1/0, negative errno on error. Always 0 for
+ * a full send.
+ */
+static int did_overwrite_ref(struct send_ctx *sctx,
+			    u64 dir, u64 dir_gen,
+			    u64 ino, u64 ino_gen,
+			    const char *name, int name_len)
+{
+	int ret = 0;
+	u64 gen;
+	u64 ow_inode;
+	u8 other_type;
+
+	if (!sctx->parent_root)
+		goto out;
+
+	ret = is_inode_existent(sctx, dir, dir_gen);
+	if (ret <= 0)
+		goto out;
+
+	/* check if the ref was overwritten by another ref */
+	ret = lookup_dir_item_inode(sctx->send_root, dir, name, name_len,
+			&ow_inode, &other_type);
+	if (ret < 0 && ret != -ENOENT)
+		goto out;
+	if (ret) {
+		/* was never and will never be overwritten */
+		ret = 0;
+		goto out;
+	}
+
+	ret = get_inode_info(sctx->send_root, ow_inode, NULL, &gen, NULL, NULL,
+			NULL);
+	if (ret < 0)
+		goto out;
+
+	/* same inode and generation -> the ref survived, not overwritten */
+	if (ow_inode == ino && gen == ino_gen) {
+		ret = 0;
+		goto out;
+	}
+
+	/* we know that it is or will be overwritten. check this now */
+	if (ow_inode < sctx->send_progress)
+		ret = 1;
+	else
+		ret = 0;
+
+out:
+	return ret;
+}
+
+/*
+ * Same as did_overwrite_ref, but for the first ref of ino as found in the
+ * parent snapshot. Used to find out if the inode was already orphanized.
+ * Returns 1/0 (0 also for a full send), negative errno on error.
+ */
+static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen)
+{
+	int ret = 0;
+	struct fs_path *name = NULL;
+	u64 dir;
+	u64 dir_gen;
+
+	/* NOTE(review): this jumps to out with name == NULL; assumes
+	 * fs_path_free tolerates NULL — confirm. */
+	if (!sctx->parent_root)
+		goto out;
+
+	name = fs_path_alloc(sctx);
+	if (!name)
+		return -ENOMEM;
+
+	ret = get_first_ref(sctx, sctx->parent_root, ino, &dir, &dir_gen, name);
+	if (ret < 0)
+		goto out;
+
+	ret = did_overwrite_ref(sctx, dir, dir_gen, ino, gen,
+			name->start, fs_path_len(name));
+	if (ret < 0)
+		goto out;
+
+out:
+	fs_path_free(sctx, name);
+	return ret;
+}
+
+/*
+ * Insert nce into the name cache. The radix tree maps an inode number to
+ * an array of (at most) two entries, one per cached generation. The entry
+ * is also appended to the LRU list used by name_cache_clean_unused().
+ * Returns 0 on success, negative errno on failure (nce itself is then
+ * still owned by the caller).
+ */
+static int name_cache_insert(struct send_ctx *sctx,
+			     struct name_cache_entry *nce)
+{
+	int ret = 0;
+	struct name_cache_entry **ncea;
+
+	ncea = radix_tree_lookup(&sctx->name_cache, nce->ino);
+	if (ncea) {
+		if (!ncea[0])
+			ncea[0] = nce;
+		else if (!ncea[1])
+			ncea[1] = nce;
+		else
+			BUG();
+	} else {
+		ncea = kmalloc(sizeof(void *) * 2, GFP_NOFS);
+		if (!ncea)
+			return -ENOMEM;
+
+		ncea[0] = nce;
+		ncea[1] = NULL;
+		ret = radix_tree_insert(&sctx->name_cache, nce->ino, ncea);
+		if (ret < 0) {
+			/* don't leak the freshly allocated slot array */
+			kfree(ncea);
+			return ret;
+		}
+	}
+	list_add_tail(&nce->list, &sctx->name_cache_list);
+	sctx->name_cache_size++;
+
+	return ret;
+}
+
+/*
+ * Unlink nce from the cache: clear its slot in the per-inode two entry
+ * array (freeing the array when both slots are empty) and remove it from
+ * the LRU list. Does NOT free nce itself; that is the caller's job.
+ */
+static void name_cache_delete(struct send_ctx *sctx,
+			      struct name_cache_entry *nce)
+{
+	struct name_cache_entry **ncea;
+
+	ncea = radix_tree_lookup(&sctx->name_cache, nce->ino);
+	BUG_ON(!ncea);
+
+	if (ncea[0] == nce)
+		ncea[0] = NULL;
+	else if (ncea[1] == nce)
+		ncea[1] = NULL;
+	else
+		BUG();
+
+	if (!ncea[0] && !ncea[1]) {
+		radix_tree_delete(&sctx->name_cache, nce->ino);
+		kfree(ncea);
+	}
+
+	list_del(&nce->list);
+
+	sctx->name_cache_size--;
+}
+
+/*
+ * Look up the cache entry for (ino, gen). Returns NULL when the inode is
+ * not cached at all or neither cached generation matches.
+ */
+static struct name_cache_entry *name_cache_search(struct send_ctx *sctx,
+						    u64 ino, u64 gen)
+{
+	struct name_cache_entry **ncea;
+
+	ncea = radix_tree_lookup(&sctx->name_cache, ino);
+	if (!ncea)
+		return NULL;
+
+	if (ncea[0] && ncea[0]->gen == gen)
+		return ncea[0];
+	if (ncea[1] && ncea[1]->gen == gen)
+		return ncea[1];
+
+	return NULL;
+}
+
+/*
+ * Mark a cache entry as recently used by moving it to the tail of the LRU
+ * list (eviction in name_cache_clean_unused starts at the head).
+ * list_move_tail is the idiomatic form of list_del + list_add_tail.
+ */
+static void name_cache_used(struct send_ctx *sctx, struct name_cache_entry *nce)
+{
+	list_move_tail(&nce->list, &sctx->name_cache_list);
+}
+
+/*
+ * Once the cache has grown past SEND_CTX_NAME_CACHE_CLEAN_SIZE, evict
+ * least recently used entries until it is back down to
+ * SEND_CTX_MAX_NAME_CACHE_SIZE.
+ */
+static void name_cache_clean_unused(struct send_ctx *sctx)
+{
+	if (sctx->name_cache_size < SEND_CTX_NAME_CACHE_CLEAN_SIZE)
+		return;
+
+	while (sctx->name_cache_size > SEND_CTX_MAX_NAME_CACHE_SIZE) {
+		struct name_cache_entry *victim;
+
+		victim = list_first_entry(&sctx->name_cache_list,
+				struct name_cache_entry, list);
+		name_cache_delete(sctx, victim);
+		kfree(victim);
+	}
+}
+
+/*
+ * Free the whole name cache. name_cache_delete() only unlinks an entry
+ * from the radix tree and the LRU list, so each entry must be kfree'd
+ * here as well — the previous version leaked every entry.
+ */
+static void name_cache_free(struct send_ctx *sctx)
+{
+	struct name_cache_entry *nce;
+	struct name_cache_entry *tmp;
+
+	list_for_each_entry_safe(nce, tmp, &sctx->name_cache_list, list) {
+		name_cache_delete(sctx, nce);
+		kfree(nce);
+	}
+}
+
+/*
+ * Resolve the current name and parent directory of (ino, gen) as the
+ * receiver would see it at sctx->send_progress, consulting and filling
+ * the name cache. Returns 0 when a normal parent+name was found, 1 when
+ * an orphan name was generated (inode not existent yet/anymore, or its
+ * first ref already overwritten), negative errno on error.
+ */
+static int __get_cur_name_and_parent(struct send_ctx *sctx,
+				     u64 ino, u64 gen,
+				     u64 *parent_ino,
+				     u64 *parent_gen,
+				     struct fs_path *dest)
+{
+	int ret;
+	int nce_ret;
+	struct btrfs_path *path = NULL;
+	struct name_cache_entry *nce = NULL;
+
+	/* first check if the name is already cached */
+	nce = name_cache_search(sctx, ino, gen);
+	if (nce) {
+		if (ino < sctx->send_progress && nce->need_later_update) {
+			/* cached before the inode was processed -> stale now */
+			name_cache_delete(sctx, nce);
+			kfree(nce);
+			nce = NULL;
+		} else {
+			name_cache_used(sctx, nce);
+			*parent_ino = nce->parent_ino;
+			*parent_gen = nce->parent_gen;
+			ret = fs_path_add(dest, nce->name, nce->name_len);
+			if (ret < 0)
+				goto out;
+			/* replay the result code of the original resolution */
+			ret = nce->ret;
+			goto out;
+		}
+	}
+
+	path = alloc_path_for_send();
+	if (!path)
+		return -ENOMEM;
+
+	ret = is_inode_existent(sctx, ino, gen);
+	if (ret < 0)
+		goto out;
+
+	if (!ret) {
+		ret = gen_unique_name(sctx, ino, gen, dest);
+		if (ret < 0)
+			goto out;
+		ret = 1;
+		goto out_cache;
+	}
+
+	/*
+	 * Processed inodes already have their new ref in send_root;
+	 * unprocessed ones still sit at their old place in parent_root.
+	 */
+	if (ino < sctx->send_progress)
+		ret = get_first_ref(sctx, sctx->send_root, ino,
+				parent_ino, parent_gen, dest);
+	else
+		ret = get_first_ref(sctx, sctx->parent_root, ino,
+				parent_ino, parent_gen, dest);
+	if (ret < 0)
+		goto out;
+
+	ret = did_overwrite_ref(sctx, *parent_ino, *parent_gen, ino, gen,
+			dest->start, dest->end - dest->start);
+	if (ret < 0)
+		goto out;
+	if (ret) {
+		/* first ref got overwritten -> the inode was orphanized */
+		fs_path_reset(dest);
+		ret = gen_unique_name(sctx, ino, gen, dest);
+		if (ret < 0)
+			goto out;
+		ret = 1;
+	}
+
+out_cache:
+	/*
+	 * NOTE(review): on the non-existent path (ret == 1 via the goto
+	 * above), *parent_ino/*parent_gen were not set here and the cache
+	 * stores whatever the caller passed in — confirm callers initialize
+	 * them (get_cur_path does).
+	 */
+	nce = kmalloc(sizeof(*nce) + fs_path_len(dest) + 1, GFP_NOFS);
+	if (!nce) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	nce->ino = ino;
+	nce->gen = gen;
+	nce->parent_ino = *parent_ino;
+	nce->parent_gen = *parent_gen;
+	nce->name_len = fs_path_len(dest);
+	nce->ret = ret;
+	strcpy(nce->name, dest->start);
+	/*
+	 * NOTE(review): a list_head is normally set up with INIT_LIST_HEAD,
+	 * not memset — confirm the intended semantics of use_list.
+	 */
+	memset(&nce->use_list, 0, sizeof(nce->use_list));
+
+	if (ino < sctx->send_progress)
+		nce->need_later_update = 0;
+	else
+		nce->need_later_update = 1;
+
+	nce_ret = name_cache_insert(sctx, nce);
+	if (nce_ret < 0)
+		ret = nce_ret;
+	name_cache_clean_unused(sctx);
+
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * Magic happens here. This function returns the first ref to an inode as it
+ * would look like while receiving the stream at this point in time.
+ * We walk the path up to the root. For every inode in between, we check if it
+ * was already processed/sent. If yes, we continue with the parent as found
+ * in send_root. If not, we continue with the parent as found in parent_root.
+ * If we encounter an inode that was deleted at this point in time, we use the
+ * inodes "orphan" name instead of the real name and stop. Same with new inodes
+ * that were not created yet and overwritten inodes/refs.
+ *
+ * When do we have orphan inodes:
+ * 1. When an inode is freshly created and thus no valid refs are available yet
+ * 2. When a directory lost all its refs (deleted) but still has dir items
+ *    inside which were not processed yet (pending for move/delete). If anyone
+ *    tried to get the path to the dir items, it would get a path inside that
+ *    orphan directory.
+ * 3. When an inode is moved around or gets new links, it may overwrite the ref
+ *    of an unprocessed inode. If in that case the first ref would be
+ *    overwritten, the overwritten inode gets "orphanized". Later when we
+ *    process this overwritten inode, it is restored at a new place by moving
+ *    the orphan inode.
+ *
+ * sctx->send_progress tells this function at which point in time receiving
+ * would be.
+ */
+static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
+			struct fs_path *dest)
+{
+	int ret = 0;
+	struct fs_path *name = NULL;
+	u64 parent_inode = 0;
+	u64 parent_gen = 0;
+	int stop = 0;
+
+	name = fs_path_alloc(sctx);
+	if (!name) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	/* components are collected child-first, so build the path reversed
+	 * and flip it once at the end */
+	dest->reversed = 1;
+	fs_path_reset(dest);
+
+	while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) {
+		fs_path_reset(name);
+
+		ret = __get_cur_name_and_parent(sctx, ino, gen,
+				&parent_inode, &parent_gen, name);
+		if (ret < 0)
+			goto out;
+		/* ret == 1 means an orphan name was used -> stop walking up */
+		if (ret)
+			stop = 1;
+
+		ret = fs_path_add_path(dest, name);
+		if (ret < 0)
+			goto out;
+
+		ino = parent_inode;
+		gen = parent_gen;
+	}
+
+out:
+	fs_path_free(sctx, name);
+	if (!ret)
+		fs_path_unreverse(dest);
+	return ret;
+}
+
+/*
+ * Called for regular files when sending extents data. Opens a struct file
+ * to read from the file.
+ */
+static int open_cur_inode_file(struct send_ctx *sctx)
+{
+	int ret = 0;
+	struct btrfs_key key;
+	struct vfsmount *mnt;
+	struct inode *inode;
+	struct dentry *dentry;
+	struct file *filp;
+	int new = 0;
+
+	/* already open from a previous extent of the same inode */
+	if (sctx->cur_inode_filp)
+		goto out;
+
+	key.objectid = sctx->cur_ino;
+	key.type = BTRFS_INODE_ITEM_KEY;
+	key.offset = 0;
+
+	inode = btrfs_iget(sctx->send_root->fs_info->sb, &key, sctx->send_root,
+			&new);
+	if (IS_ERR(inode)) {
+		ret = PTR_ERR(inode);
+		goto out;
+	}
+
+	/* d_obtain_alias consumes the inode reference, even on failure */
+	dentry = d_obtain_alias(inode);
+	inode = NULL;
+	if (IS_ERR(dentry)) {
+		ret = PTR_ERR(dentry);
+		goto out;
+	}
+
+	/* dentry_open consumes both the dentry and the mnt reference */
+	mnt = mntget(sctx->mnt);
+	filp = dentry_open(dentry, mnt, O_RDONLY | O_LARGEFILE, current_cred());
+	dentry = NULL;
+	mnt = NULL;
+	if (IS_ERR(filp)) {
+		ret = PTR_ERR(filp);
+		goto out;
+	}
+	sctx->cur_inode_filp = filp;
+
+out:
+	/*
+	 * no xxxput required here as every vfs op
+	 * does it by itself on failure
+	 */
+	return ret;
+}
+
+/*
+ * Closes the struct file that was created in open_cur_inode_file.
+ * A no-op when no file is currently open.
+ */
+static int close_cur_inode_file(struct send_ctx *sctx)
+{
+	int ret = 0;
+
+	if (sctx->cur_inode_filp) {
+		ret = filp_close(sctx->cur_inode_filp, NULL);
+		sctx->cur_inode_filp = NULL;
+	}
+
+	return ret;
+}
+
+/*
+ * Sends a BTRFS_SEND_C_SUBVOL (full send) or BTRFS_SEND_C_SNAPSHOT
+ * (incremental send) command to userspace, announcing the subvolume the
+ * stream belongs to. The subvolume's name is taken from its first
+ * ROOT_BACKREF item in the tree of tree roots.
+ */
+static int send_subvol_begin(struct send_ctx *sctx)
+{
+	int ret;
+	struct btrfs_root *send_root = sctx->send_root;
+	struct btrfs_root *parent_root = sctx->parent_root;
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	struct btrfs_root_ref *ref;
+	struct extent_buffer *leaf;
+	char *name = NULL;
+	int namelen;
+
+	path = alloc_path_for_send();
+	if (!path)
+		return -ENOMEM;
+
+	name = kmalloc(BTRFS_PATH_NAME_MAX, GFP_NOFS);
+	if (!name) {
+		btrfs_free_path(path);
+		return -ENOMEM;
+	}
+
+	key.objectid = send_root->objectid;
+	key.type = BTRFS_ROOT_BACKREF_KEY;
+	key.offset = 0;
+
+	ret = btrfs_search_slot_for_read(send_root->fs_info->tree_root,
+				&key, path, 1, 0);
+	if (ret < 0)
+		goto out;
+	if (ret) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	leaf = path->nodes[0];
+	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+	if (key.type != BTRFS_ROOT_BACKREF_KEY ||
+	    key.objectid != send_root->objectid) {
+		ret = -ENOENT;
+		goto out;
+	}
+	ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
+	namelen = btrfs_root_ref_name_len(leaf, ref);
+	read_extent_buffer(leaf, name, (unsigned long)(ref + 1), namelen);
+	btrfs_release_path(path);
+
+	/*
+	 * (a dead "if (ret < 0)" check was removed here: ret is always 0
+	 * at this point and read_extent_buffer returns void)
+	 */
+	if (parent_root) {
+		ret = begin_cmd(sctx, BTRFS_SEND_C_SNAPSHOT);
+		if (ret < 0)
+			goto out;
+	} else {
+		ret = begin_cmd(sctx, BTRFS_SEND_C_SUBVOL);
+		if (ret < 0)
+			goto out;
+	}
+
+	TLV_PUT_STRING(sctx, BTRFS_SEND_A_PATH, name, namelen);
+	TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID,
+			sctx->send_root->root_item.uuid);
+	TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID,
+			sctx->send_root->root_item.ctransid);
+	if (parent_root) {
+		TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
+				sctx->parent_root->root_item.uuid);
+		TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
+				sctx->parent_root->root_item.ctransid);
+	}
+
+	ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+	btrfs_free_path(path);
+	kfree(name);
+	return ret;
+}
+
+/*
+ * Send a TRUNCATE command to set the receiver-side size of (ino, gen).
+ */
+static int send_truncate(struct send_ctx *sctx, u64 ino, u64 gen, u64 size)
+{
+	int ret = 0;
+	struct fs_path *p;
+
+verbose_printk("btrfs: send_truncate %llu size=%llu\n", ino, size);
+
+	p = fs_path_alloc(sctx);
+	if (!p)
+		return -ENOMEM;
+
+	ret = begin_cmd(sctx, BTRFS_SEND_C_TRUNCATE);
+	if (ret < 0)
+		goto out;
+
+	ret = get_cur_path(sctx, ino, gen, p);
+	if (ret < 0)
+		goto out;
+	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
+	TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, size);
+
+	ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+	fs_path_free(sctx, p);
+	return ret;
+}
+
+/*
+ * Send a CHMOD command carrying only the low 12 permission bits of mode.
+ */
+static int send_chmod(struct send_ctx *sctx, u64 ino, u64 gen, u64 mode)
+{
+	int ret = 0;
+	struct fs_path *p;
+
+verbose_printk("btrfs: send_chmod %llu mode=%llu\n", ino, mode);
+
+	p = fs_path_alloc(sctx);
+	if (!p)
+		return -ENOMEM;
+
+	ret = begin_cmd(sctx, BTRFS_SEND_C_CHMOD);
+	if (ret < 0)
+		goto out;
+
+	ret = get_cur_path(sctx, ino, gen, p);
+	if (ret < 0)
+		goto out;
+	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
+	TLV_PUT_U64(sctx, BTRFS_SEND_A_MODE, mode & 07777);
+
+	ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+	fs_path_free(sctx, p);
+	return ret;
+}
+
+/*
+ * Send a CHOWN command to set uid/gid of (ino, gen) on the receiver.
+ */
+static int send_chown(struct send_ctx *sctx, u64 ino, u64 gen, u64 uid, u64 gid)
+{
+	int ret = 0;
+	struct fs_path *p;
+
+verbose_printk("btrfs: send_chown %llu uid=%llu, gid=%llu\n", ino, uid, gid);
+
+	p = fs_path_alloc(sctx);
+	if (!p)
+		return -ENOMEM;
+
+	ret = begin_cmd(sctx, BTRFS_SEND_C_CHOWN);
+	if (ret < 0)
+		goto out;
+
+	ret = get_cur_path(sctx, ino, gen, p);
+	if (ret < 0)
+		goto out;
+	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
+	TLV_PUT_U64(sctx, BTRFS_SEND_A_UID, uid);
+	TLV_PUT_U64(sctx, BTRFS_SEND_A_GID, gid);
+
+	ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+	fs_path_free(sctx, p);
+	return ret;
+}
+
+/*
+ * Send a UTIMES command carrying atime/mtime/ctime of the inode as found
+ * in the send root.
+ */
+static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen)
+{
+	int ret = 0;
+	struct fs_path *p = NULL;
+	struct btrfs_inode_item *ii;
+	struct btrfs_path *path = NULL;
+	struct extent_buffer *eb;
+	struct btrfs_key key;
+	int slot;
+
+verbose_printk("btrfs: send_utimes %llu\n", ino);
+
+	p = fs_path_alloc(sctx);
+	if (!p)
+		return -ENOMEM;
+
+	path = alloc_path_for_send();
+	if (!path) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	key.objectid = ino;
+	key.type = BTRFS_INODE_ITEM_KEY;
+	key.offset = 0;
+	ret = btrfs_search_slot(NULL, sctx->send_root, &key, path, 0, 0);
+	/*
+	 * btrfs_search_slot returns 1 when the exact key was not found.
+	 * Treat a missing inode item as -ENOENT instead of silently reading
+	 * from a wrong slot.
+	 */
+	if (ret > 0)
+		ret = -ENOENT;
+	if (ret < 0)
+		goto out;
+
+	eb = path->nodes[0];
+	slot = path->slots[0];
+	ii = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
+
+	ret = begin_cmd(sctx, BTRFS_SEND_C_UTIMES);
+	if (ret < 0)
+		goto out;
+
+	ret = get_cur_path(sctx, ino, gen, p);
+	if (ret < 0)
+		goto out;
+	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
+	TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_ATIME, eb,
+			btrfs_inode_atime(ii));
+	TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_MTIME, eb,
+			btrfs_inode_mtime(ii));
+	TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb,
+			btrfs_inode_ctime(ii));
+	/* TODO otime? */
+
+	ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+	fs_path_free(sctx, p);
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * Sends a BTRFS_SEND_C_MKXXX or SYMLINK command to user space. We don't have
+ * a valid path yet because we did not process the refs yet. So, the inode
+ * is created as orphan.
+ */
+static int send_create_inode(struct send_ctx *sctx, struct btrfs_path *path,
+			     struct btrfs_key *key)
+{
+	int ret = 0;
+	struct extent_buffer *eb = path->nodes[0];
+	struct btrfs_inode_item *ii;
+	struct fs_path *p;
+	int slot = path->slots[0];
+	int cmd;
+	u64 mode;
+
+verbose_printk("btrfs: send_create_inode %llu\n", sctx->cur_ino);
+
+	p = fs_path_alloc(sctx);
+	if (!p)
+		return -ENOMEM;
+
+	ii = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
+	mode = btrfs_inode_mode(eb, ii);
+
+	/* pick the create command matching the inode's file type */
+	if (S_ISREG(mode))
+		cmd = BTRFS_SEND_C_MKFILE;
+	else if (S_ISDIR(mode))
+		cmd = BTRFS_SEND_C_MKDIR;
+	else if (S_ISLNK(mode))
+		cmd = BTRFS_SEND_C_SYMLINK;
+	else if (S_ISCHR(mode) || S_ISBLK(mode))
+		cmd = BTRFS_SEND_C_MKNOD;
+	else if (S_ISFIFO(mode))
+		cmd = BTRFS_SEND_C_MKFIFO;
+	else if (S_ISSOCK(mode))
+		cmd = BTRFS_SEND_C_MKSOCK;
+	else {
+		printk(KERN_WARNING "btrfs: unexpected inode type %o\n",
+				(int)(mode & S_IFMT));
+		/*
+		 * ENOTSUPP is kernel-internal (NFSv3 only) and must not be
+		 * returned to userspace; EOPNOTSUPP is the proper errno.
+		 */
+		ret = -EOPNOTSUPP;
+		goto out;
+	}
+
+	ret = begin_cmd(sctx, cmd);
+	if (ret < 0)
+		goto out;
+
+	/* the inode gets its orphan name; refs are processed later */
+	ret = gen_unique_name(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
+	if (ret < 0)
+		goto out;
+
+	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
+
+	if (S_ISLNK(mode)) {
+		fs_path_reset(p);
+		ret = read_symlink(sctx, sctx->send_root, sctx->cur_ino, p);
+		if (ret < 0)
+			goto out;
+		TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, p);
+	} else if (S_ISCHR(mode) || S_ISBLK(mode) ||
+		   S_ISFIFO(mode) || S_ISSOCK(mode)) {
+		TLV_PUT_U64(sctx, BTRFS_SEND_A_RDEV, btrfs_inode_rdev(eb, ii));
+	}
+
+	ret = send_cmd(sctx);
+	if (ret < 0)
+		goto out;
+
+
+tlv_put_failure:
+out:
+	fs_path_free(sctx, p);
+	return ret;
+}
+
+/* One recorded new/deleted ref of the inode currently being processed. */
+struct recorded_ref {
+	struct list_head list;
+	char *dir_path;		/* parent dir part, points into full_path */
+	char *name;		/* basename, points into full_path */
+	struct fs_path *full_path;	/* owned full path of the ref */
+	u64 dir;		/* inode number of the parent dir */
+	u64 dir_gen;		/* generation of the parent dir */
+	int dir_path_len;	/* length of dir_path, without trailing '/' */
+	int name_len;		/* length of name */
+};
+
+/*
+ * We need to process new refs before deleted refs, but compare_tree gives us
+ * everything mixed. So we first record all refs and later process them.
+ * This function is a helper to record one ref.
+ *
+ * On success the recorded ref takes ownership of @path (it is freed via
+ * __free_recorded_refs later); on failure the caller must free it.
+ */
+static int record_ref(struct list_head *head, u64 dir,
+		      u64 dir_gen, struct fs_path *path)
+{
+	struct recorded_ref *ref;
+	char *tmp;
+
+	ref = kmalloc(sizeof(*ref), GFP_NOFS);
+	if (!ref)
+		return -ENOMEM;
+
+	ref->dir = dir;
+	ref->dir_gen = dir_gen;
+	ref->full_path = path;
+
+	/* split full_path at the last '/' into dir_path and name */
+	tmp = strrchr(ref->full_path->start, '/');
+	if (!tmp) {
+		ref->name_len = ref->full_path->end - ref->full_path->start;
+		ref->name = ref->full_path->start;
+		ref->dir_path_len = 0;
+		ref->dir_path = ref->full_path->start;
+	} else {
+		tmp++;
+		ref->name_len = ref->full_path->end - tmp;
+		ref->name = tmp;
+		ref->dir_path = ref->full_path->start;
+		/* -1 accounts for the '/' separating dir_path and name */
+		ref->dir_path_len = ref->full_path->end -
+				ref->full_path->start - 1 - ref->name_len;
+	}
+
+	list_add_tail(&ref->list, head);
+	return 0;
+}
+
+/* Release all recorded refs on @head, including their owned full paths. */
+static void __free_recorded_refs(struct send_ctx *sctx, struct list_head *head)
+{
+	struct recorded_ref *ref;
+	struct recorded_ref *next;
+
+	list_for_each_entry_safe(ref, next, head, list) {
+		fs_path_free(sctx, ref->full_path);
+		kfree(ref);
+	}
+	INIT_LIST_HEAD(head);
+}
+
+/* Free both ref lists recorded for the current inode. */
+static void free_recorded_refs(struct send_ctx *sctx)
+{
+	__free_recorded_refs(sctx, &sctx->new_refs);
+	__free_recorded_refs(sctx, &sctx->deleted_refs);
+}
+
+/*
+ * Renames/moves a file/dir to its orphan name. Used when the first ref of
+ * an unprocessed inode gets overwritten and for all non empty directories.
+ */
+static int orphanize_inode(struct send_ctx *sctx, u64 ino, u64 gen,
+			  struct fs_path *path)
+{
+	struct fs_path *orphan_name;
+	int ret;
+
+	orphan_name = fs_path_alloc(sctx);
+	if (!orphan_name)
+		return -ENOMEM;
+
+	ret = gen_unique_name(sctx, ino, gen, orphan_name);
+	if (ret >= 0)
+		ret = send_rename(sctx, path, orphan_name);
+
+	fs_path_free(sctx, orphan_name);
+	return ret;
+}
+
+/*
+ * Returns 1 if a directory can be removed at this point in time.
+ * We check this by iterating all dir items and checking if the inode behind
+ * the dir item was already processed.
+ */
+static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 send_progress)
+{
+	int ret = 0;
+	struct btrfs_root *root = sctx->parent_root;
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+	struct btrfs_key loc;
+	struct btrfs_dir_item *di;
+
+	path = alloc_path_for_send();
+	if (!path)
+		return -ENOMEM;
+
+	key.objectid = dir;
+	key.type = BTRFS_DIR_INDEX_KEY;
+	key.offset = 0;
+
+	/* walk all DIR_INDEX items of the dir in the parent snapshot */
+	while (1) {
+		ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
+		if (ret < 0)
+			goto out;
+		/* found_key is only valid when ret == 0 (short-circuit below) */
+		if (!ret) {
+			btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+					path->slots[0]);
+		}
+		if (ret || found_key.objectid != key.objectid ||
+		    found_key.type != key.type) {
+			break;
+		}
+
+		di = btrfs_item_ptr(path->nodes[0], path->slots[0],
+				struct btrfs_dir_item);
+		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc);
+
+		/* an entry whose inode was not processed yet blocks the rmdir */
+		if (loc.objectid > send_progress) {
+			ret = 0;
+			goto out;
+		}
+
+		btrfs_release_path(path);
+		/* continue right after the last dir index we looked at */
+		key.offset = found_key.offset + 1;
+	}
+
+	ret = 1;
+
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * This does all the move/link/unlink/rmdir magic for the current inode:
+ * it replays the recorded new and deleted refs in an order that is safe
+ * for the receiver, orphanizing inodes whose names are still in the way
+ * and cleaning up directories that became removable.
+ */
+static int process_recorded_refs(struct send_ctx *sctx)
+{
+	int ret = 0;
+	struct recorded_ref *cur;
+	struct ulist *check_dirs = NULL;
+	struct ulist_iterator uit;
+	struct ulist_node *un;
+	struct fs_path *valid_path = NULL;
+	u64 ow_inode;
+	u64 ow_gen;
+	int did_overwrite = 0;
+	int is_orphan = 0;
+
+verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
+
+	valid_path = fs_path_alloc(sctx);
+	if (!valid_path) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	check_dirs = ulist_alloc(GFP_NOFS);
+	if (!check_dirs) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	/*
+	 * First, check if the first ref of the current inode was overwritten
+	 * before. If yes, we know that the current inode was already orphanized
+	 * and thus use the orphan name. If not, we can use get_cur_path to
+	 * get the path of the first ref as it would like while receiving at
+	 * this point in time.
+	 * New inodes are always orphan at the beginning, so force to use the
+	 * orphan name in this case.
+	 * The first ref is stored in valid_path and will be updated if it
+	 * gets moved around.
+	 */
+	if (!sctx->cur_inode_new) {
+		ret = did_overwrite_first_ref(sctx, sctx->cur_ino,
+				sctx->cur_inode_gen);
+		if (ret < 0)
+			goto out;
+		if (ret)
+			did_overwrite = 1;
+	}
+	if (sctx->cur_inode_new || did_overwrite) {
+		ret = gen_unique_name(sctx, sctx->cur_ino,
+				sctx->cur_inode_gen, valid_path);
+		if (ret < 0)
+			goto out;
+		is_orphan = 1;
+	} else {
+		ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen,
+				valid_path);
+		if (ret < 0)
+			goto out;
+	}
+
+	list_for_each_entry(cur, &sctx->new_refs, list) {
+		/*
+		 * Check if this new ref would overwrite the first ref of
+		 * another unprocessed inode. If yes, orphanize the
+		 * overwritten inode. If we find an overwritten ref that is
+		 * not the first ref, simply unlink it.
+		 */
+		ret = will_overwrite_ref(sctx, cur->dir, cur->dir_gen,
+				cur->name, cur->name_len,
+				&ow_inode, &ow_gen);
+		if (ret < 0)
+			goto out;
+		if (ret) {
+			ret = is_first_ref(sctx, sctx->parent_root,
+					ow_inode, cur->dir, cur->name,
+					cur->name_len);
+			if (ret < 0)
+				goto out;
+			if (ret) {
+				ret = orphanize_inode(sctx, ow_inode, ow_gen,
+						cur->full_path);
+				if (ret < 0)
+					goto out;
+			} else {
+				ret = send_unlink(sctx, cur->full_path);
+				if (ret < 0)
+					goto out;
+			}
+		}
+
+		/*
+		 * link/move the ref to the new place. If we have an orphan
+		 * inode, move it and update valid_path. If not, link or move
+		 * it depending on the inode mode.
+		 */
+		if (is_orphan) {
+			ret = send_rename(sctx, valid_path, cur->full_path);
+			if (ret < 0)
+				goto out;
+			is_orphan = 0;
+			ret = fs_path_copy(valid_path, cur->full_path);
+			if (ret < 0)
+				goto out;
+		} else {
+			if (S_ISDIR(sctx->cur_inode_mode)) {
+				/*
+				 * Dirs can't be linked, so move it. For moved
+				 * dirs, we always have one new and one deleted
+				 * ref. The deleted ref is ignored later.
+				 */
+				ret = send_rename(sctx, valid_path,
+						cur->full_path);
+				if (ret < 0)
+					goto out;
+				ret = fs_path_copy(valid_path, cur->full_path);
+				if (ret < 0)
+					goto out;
+			} else {
+				ret = send_link(sctx, valid_path,
+						cur->full_path);
+				if (ret < 0)
+					goto out;
+			}
+		}
+		/* remember the parent dir for the utimes/rmdir pass below */
+		ret = ulist_add(check_dirs, cur->dir, cur->dir_gen,
+				GFP_NOFS);
+		if (ret < 0)
+			goto out;
+	}
+
+	if (S_ISDIR(sctx->cur_inode_mode) && sctx->cur_inode_deleted) {
+		/*
+		 * Check if we can already rmdir the directory. If not,
+		 * orphanize it. For every dir item inside that gets deleted
+		 * later, we do this check again and rmdir it then if possible.
+		 * See the use of check_dirs for more details.
+		 */
+		ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_ino);
+		if (ret < 0)
+			goto out;
+		if (ret) {
+			ret = send_rmdir(sctx, valid_path);
+			if (ret < 0)
+				goto out;
+		} else if (!is_orphan) {
+			ret = orphanize_inode(sctx, sctx->cur_ino,
+					sctx->cur_inode_gen, valid_path);
+			if (ret < 0)
+				goto out;
+			is_orphan = 1;
+		}
+
+		list_for_each_entry(cur, &sctx->deleted_refs, list) {
+			ret = ulist_add(check_dirs, cur->dir, cur->dir_gen,
+					GFP_NOFS);
+			if (ret < 0)
+				goto out;
+		}
+	} else if (!S_ISDIR(sctx->cur_inode_mode)) {
+		/*
+		 * We have a non dir inode. Go through all deleted refs and
+		 * unlink them if they were not already overwritten by other
+		 * inodes.
+		 */
+		list_for_each_entry(cur, &sctx->deleted_refs, list) {
+			ret = did_overwrite_ref(sctx, cur->dir, cur->dir_gen,
+					sctx->cur_ino, sctx->cur_inode_gen,
+					cur->name, cur->name_len);
+			if (ret < 0)
+				goto out;
+			if (!ret) {
+				ret = send_unlink(sctx, cur->full_path);
+				if (ret < 0)
+					goto out;
+			}
+			ret = ulist_add(check_dirs, cur->dir, cur->dir_gen,
+					GFP_NOFS);
+			if (ret < 0)
+				goto out;
+		}
+
+		/*
+		 * If the inode is still orphan, unlink the orphan. This may
+		 * happen when a previous inode did overwrite the first ref
+		 * of this inode and no new refs were added for the current
+		 * inode.
+		 */
+		if (is_orphan) {
+			ret = send_unlink(sctx, valid_path);
+			if (ret < 0)
+				goto out;
+		}
+	}
+
+	/*
+	 * We did collect all parent dirs where cur_inode was once located. We
+	 * now go through all these dirs and check if they are pending for
+	 * deletion and if it's finally possible to perform the rmdir now.
+	 * We also update the inode stats of the parent dirs here.
+	 */
+	ULIST_ITER_INIT(&uit);
+	while ((un = ulist_next(check_dirs, &uit))) {
+		/* dirs beyond the current inode are handled when they are
+		 * processed themselves */
+		if (un->val > sctx->cur_ino)
+			continue;
+
+		ret = get_cur_inode_state(sctx, un->val, un->aux);
+		if (ret < 0)
+			goto out;
+
+		if (ret == inode_state_did_create ||
+		    ret == inode_state_no_change) {
+			/* TODO delayed utimes */
+			ret = send_utimes(sctx, un->val, un->aux);
+			if (ret < 0)
+				goto out;
+		} else if (ret == inode_state_did_delete) {
+			ret = can_rmdir(sctx, un->val, sctx->cur_ino);
+			if (ret < 0)
+				goto out;
+			if (ret) {
+				ret = get_cur_path(sctx, un->val, un->aux,
+						valid_path);
+				if (ret < 0)
+					goto out;
+				ret = send_rmdir(sctx, valid_path);
+				if (ret < 0)
+					goto out;
+			}
+		}
+	}
+
+	/*
+	 * Current inode is now at it's new position, so we must increase
+	 * send_progress
+	 */
+	sctx->send_progress = sctx->cur_ino + 1;
+
+	ret = 0;
+
+out:
+	free_recorded_refs(sctx);
+	ulist_free(check_dirs);
+	fs_path_free(sctx, valid_path);
+	return ret;
+}
+
+/*
+ * iterate_inode_ref callback: build the full current path of a new ref
+ * (current path of dir + name) and record it in sctx->new_refs. On
+ * success, ownership of the allocated fs_path moves to the recorded ref.
+ */
+static int __record_new_ref(int num, u64 dir, int index,
+			    struct fs_path *name,
+			    void *ctx)
+{
+	int ret = 0;
+	struct send_ctx *sctx = ctx;
+	struct fs_path *p;
+	u64 gen;
+
+	p = fs_path_alloc(sctx);
+	if (!p)
+		return -ENOMEM;
+
+	ret = get_inode_info(sctx->send_root, dir, NULL, &gen, NULL, NULL,
+			NULL);
+	if (ret < 0)
+		goto out;
+
+	ret = get_cur_path(sctx, dir, gen, p);
+	if (ret < 0)
+		goto out;
+	ret = fs_path_add_path(p, name);
+	if (ret < 0)
+		goto out;
+
+	ret = record_ref(&sctx->new_refs, dir, gen, p);
+
+out:
+	/* only free p on failure; on success record_ref took ownership */
+	if (ret)
+		fs_path_free(sctx, p);
+	return ret;
+}
+
+/*
+ * Same as __record_new_ref, but for refs that only exist in the parent
+ * root anymore; records into sctx->deleted_refs.
+ */
+static int __record_deleted_ref(int num, u64 dir, int index,
+				struct fs_path *name,
+				void *ctx)
+{
+	int ret = 0;
+	struct send_ctx *sctx = ctx;
+	struct fs_path *p;
+	u64 gen;
+
+	p = fs_path_alloc(sctx);
+	if (!p)
+		return -ENOMEM;
+
+	ret = get_inode_info(sctx->parent_root, dir, NULL, &gen, NULL, NULL,
+			NULL);
+	if (ret < 0)
+		goto out;
+
+	ret = get_cur_path(sctx, dir, gen, p);
+	if (ret < 0)
+		goto out;
+	ret = fs_path_add_path(p, name);
+	if (ret < 0)
+		goto out;
+
+	ret = record_ref(&sctx->deleted_refs, dir, gen, p);
+
+out:
+	/* only free p on failure; on success record_ref took ownership */
+	if (ret)
+		fs_path_free(sctx, p);
+	return ret;
+}
+
+/* Record every ref of the current INODE_REF item as newly created. */
+static int record_new_ref(struct send_ctx *sctx)
+{
+	return iterate_inode_ref(sctx, sctx->send_root, sctx->left_path,
+			sctx->cmp_key, 0, __record_new_ref, sctx);
+}
+
+/* Record every ref of the current INODE_REF item as deleted. */
+static int record_deleted_ref(struct send_ctx *sctx)
+{
+	return iterate_inode_ref(sctx, sctx->parent_root, sctx->right_path,
+			sctx->cmp_key, 0, __record_deleted_ref, sctx);
+}
+
+/* Search context passed to __find_iref via iterate_inode_ref. */
+struct find_ref_ctx {
+	u64 dir;		/* directory inode the ref must live in */
+	struct fs_path *name;	/* ref name to match */
+	int found_idx;		/* index of the match, -1 if none found */
+};
+
+/*
+ * iterate_inode_ref callback for find_iref: remember the index of the
+ * ref matching (dir, name) and return 1 to stop the iteration.
+ */
+static int __find_iref(int num, u64 dir, int index,
+		       struct fs_path *name,
+		       void *ctx_)
+{
+	struct find_ref_ctx *ctx = ctx_;
+
+	if (dir != ctx->dir)
+		return 0;
+	if (fs_path_len(name) != fs_path_len(ctx->name))
+		return 0;
+	if (strncmp(name->start, ctx->name->start, fs_path_len(name)) != 0)
+		return 0;
+
+	ctx->found_idx = num;
+	return 1;
+}
+
+/*
+ * Search the refs of the inode item at path/key for one matching
+ * (dir, name). Returns the index of the matching ref, -ENOENT if none
+ * matches, or a negative error from the iteration.
+ */
+static int find_iref(struct send_ctx *sctx,
+		     struct btrfs_root *root,
+		     struct btrfs_path *path,
+		     struct btrfs_key *key,
+		     u64 dir, struct fs_path *name)
+{
+	int ret;
+	struct find_ref_ctx ctx;
+
+	ctx.dir = dir;
+	ctx.name = name;
+	ctx.found_idx = -1;
+
+	ret = iterate_inode_ref(sctx, root, path, key, 0, __find_iref, &ctx);
+	if (ret < 0)
+		return ret;
+
+	if (ctx.found_idx == -1)
+		return -ENOENT;
+
+	return ctx.found_idx;
+}
+
+/*
+ * Record a ref as new only if the same (dir, name) ref does not already
+ * exist in the parent root; refs present in both trees need no command.
+ */
+static int __record_changed_new_ref(int num, u64 dir, int index,
+				    struct fs_path *name,
+				    void *ctx)
+{
+	int ret;
+	struct send_ctx *sctx = ctx;
+
+	ret = find_iref(sctx, sctx->parent_root, sctx->right_path,
+			sctx->cmp_key, dir, name);
+	if (ret == -ENOENT)
+		ret = __record_new_ref(num, dir, index, name, sctx);
+	else if (ret > 0)
+		ret = 0;
+
+	return ret;
+}
+
+/*
+ * Record a ref as deleted only if the same (dir, name) ref no longer
+ * exists in the send root.
+ */
+static int __record_changed_deleted_ref(int num, u64 dir, int index,
+					struct fs_path *name,
+					void *ctx)
+{
+	int ret;
+	struct send_ctx *sctx = ctx;
+
+	ret = find_iref(sctx, sctx->send_root, sctx->left_path, sctx->cmp_key,
+			dir, name);
+	if (ret == -ENOENT)
+		ret = __record_deleted_ref(num, dir, index, name, sctx);
+	else if (ret > 0)
+		ret = 0;
+
+	return ret;
+}
+
+/*
+ * For a changed INODE_REF item, record the refs that are only in the
+ * send root as new and the refs that are only in the parent root as
+ * deleted.
+ */
+static int record_changed_ref(struct send_ctx *sctx)
+{
+	int ret;
+
+	ret = iterate_inode_ref(sctx, sctx->send_root, sctx->left_path,
+			sctx->cmp_key, 0, __record_changed_new_ref, sctx);
+	if (ret < 0)
+		return ret;
+	return iterate_inode_ref(sctx, sctx->parent_root, sctx->right_path,
+			sctx->cmp_key, 0, __record_changed_deleted_ref, sctx);
+}
+
+/*
+ * Record and process all refs at once. Needed when an inode changes the
+ * generation number, which means that it was deleted and recreated.
+ */
+static int process_all_refs(struct send_ctx *sctx,
+			    enum btrfs_compare_tree_result cmd)
+{
+	int ret;
+	struct btrfs_root *root;
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+	struct extent_buffer *eb;
+	int slot;
+	iterate_inode_ref_t cb;
+
+	path = alloc_path_for_send();
+	if (!path)
+		return -ENOMEM;
+
+	/* Pick the tree that actually contains the refs we must record. */
+	if (cmd == BTRFS_COMPARE_TREE_NEW) {
+		root = sctx->send_root;
+		cb = __record_new_ref;
+	} else if (cmd == BTRFS_COMPARE_TREE_DELETED) {
+		root = sctx->parent_root;
+		cb = __record_deleted_ref;
+	} else {
+		BUG();
+	}
+
+	key.objectid = sctx->cmp_key->objectid;
+	key.type = BTRFS_INODE_REF_KEY;
+	key.offset = 0;
+	while (1) {
+		ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
+		if (ret < 0) {
+			btrfs_release_path(path);
+			goto out;
+		}
+		if (ret) {
+			btrfs_release_path(path);
+			break;
+		}
+
+		eb = path->nodes[0];
+		slot = path->slots[0];
+		btrfs_item_key_to_cpu(eb, &found_key, slot);
+
+		if (found_key.objectid != key.objectid ||
+		    found_key.type != key.type) {
+			btrfs_release_path(path);
+			break;
+		}
+
+		/*
+		 * Iterate the refs in the root selected above. The previous
+		 * code passed sctx->parent_root unconditionally, which is
+		 * the wrong tree for BTRFS_COMPARE_TREE_NEW, where the refs
+		 * live in the send root.
+		 */
+		ret = iterate_inode_ref(sctx, root, path, &found_key, 0,
+				cb, sctx);
+		btrfs_release_path(path);
+		if (ret < 0)
+			goto out;
+
+		key.offset = found_key.offset + 1;
+	}
+
+	ret = process_recorded_refs(sctx);
+
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+/* Emit a SET_XATTR command for the given path/name/data triple. */
+static int send_set_xattr(struct send_ctx *sctx,
+			  struct fs_path *path,
+			  const char *name, int name_len,
+			  const char *data, int data_len)
+{
+	int ret = 0;
+
+	ret = begin_cmd(sctx, BTRFS_SEND_C_SET_XATTR);
+	if (ret < 0)
+		goto out;
+
+	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
+	TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len);
+	TLV_PUT(sctx, BTRFS_SEND_A_XATTR_DATA, data, data_len);
+
+	ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+	return ret;
+}
+
+/* Emit a REMOVE_XATTR command for the given path/name pair. */
+static int send_remove_xattr(struct send_ctx *sctx,
+			  struct fs_path *path,
+			  const char *name, int name_len)
+{
+	int ret = 0;
+
+	ret = begin_cmd(sctx, BTRFS_SEND_C_REMOVE_XATTR);
+	if (ret < 0)
+		goto out;
+
+	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
+	TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len);
+
+	ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+	return ret;
+}
+
+/*
+ * iterate_dir_item callback: send a SET_XATTR command for one xattr of
+ * the current inode.
+ */
+static int __process_new_xattr(int num, const char *name, int name_len,
+			       const char *data, int data_len,
+			       u8 type, void *ctx)
+{
+	int ret;
+	struct send_ctx *sctx = ctx;
+	struct fs_path *p;
+	/* stack-local stand-in for empty acl data, see comment below */
+	posix_acl_xattr_header dummy_acl;
+
+	p = fs_path_alloc(sctx);
+	if (!p)
+		return -ENOMEM;
+
+	/*
+	 * This hack is needed because empty acl's are stored as zero byte
+	 * data in xattrs. Problem with that is, that receiving these zero byte
+	 * acl's will fail later. To fix this, we send a dummy acl list that
+	 * only contains the version number and no entries.
+	 */
+	if (!strncmp(name, XATTR_NAME_POSIX_ACL_ACCESS, name_len) ||
+	    !strncmp(name, XATTR_NAME_POSIX_ACL_DEFAULT, name_len)) {
+		if (data_len == 0) {
+			dummy_acl.a_version =
+					cpu_to_le32(POSIX_ACL_XATTR_VERSION);
+			data = (char *)&dummy_acl;
+			data_len = sizeof(dummy_acl);
+		}
+	}
+
+	ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
+	if (ret < 0)
+		goto out;
+
+	ret = send_set_xattr(sctx, p, name, name_len, data, data_len);
+
+out:
+	fs_path_free(sctx, p);
+	return ret;
+}
+
+/*
+ * iterate_dir_item callback: send a REMOVE_XATTR command for one xattr
+ * of the current inode.
+ */
+static int __process_deleted_xattr(int num, const char *name, int name_len,
+				   const char *data, int data_len,
+				   u8 type, void *ctx)
+{
+	int ret;
+	struct send_ctx *sctx = ctx;
+	struct fs_path *p;
+
+	p = fs_path_alloc(sctx);
+	if (!p)
+		return -ENOMEM;
+
+	ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
+	if (ret < 0)
+		goto out;
+
+	ret = send_remove_xattr(sctx, p, name, name_len);
+
+out:
+	fs_path_free(sctx, p);
+	return ret;
+}
+
+/* Send SET_XATTR for every xattr of the current XATTR_ITEM. */
+static int process_new_xattr(struct send_ctx *sctx)
+{
+	return iterate_dir_item(sctx, sctx->send_root, sctx->left_path,
+			sctx->cmp_key, __process_new_xattr, sctx);
+}
+
+/* Send REMOVE_XATTR for every xattr of the deleted XATTR_ITEM. */
+static int process_deleted_xattr(struct send_ctx *sctx)
+{
+	return iterate_dir_item(sctx, sctx->parent_root, sctx->right_path,
+			sctx->cmp_key, __process_deleted_xattr, sctx);
+}
+
+/* Search context passed to __find_xattr via iterate_dir_item. */
+struct find_xattr_ctx {
+	const char *name;	/* xattr name to match */
+	int name_len;
+	int found_idx;		/* index of the match, -1 if none found */
+	char *found_data;	/* kmalloc'd copy of the data on a match */
+	int found_data_len;
+};
+
+/*
+ * iterate_dir_item callback for find_xattr: on a name match, copy the
+ * xattr data into a kmalloc'd buffer (ownership passes to the caller of
+ * find_xattr) and return 1 to stop the iteration.
+ */
+static int __find_xattr(int num, const char *name, int name_len,
+			const char *data, int data_len,
+			u8 type, void *vctx)
+{
+	struct find_xattr_ctx *ctx = vctx;
+
+	if (name_len == ctx->name_len &&
+	    strncmp(name, ctx->name, name_len) == 0) {
+		ctx->found_idx = num;
+		ctx->found_data_len = data_len;
+		ctx->found_data = kmalloc(data_len, GFP_NOFS);
+		if (!ctx->found_data)
+			return -ENOMEM;
+		memcpy(ctx->found_data, data, data_len);
+		return 1;
+	}
+	return 0;
+}
+
+/*
+ * Look up an xattr by name in the dir item at path/key. Returns the
+ * index of the match, -ENOENT if not found, or a negative error. If
+ * data is non-NULL, *data/*data_len receive a kmalloc'd copy of the
+ * xattr data that the caller must kfree.
+ */
+static int find_xattr(struct send_ctx *sctx,
+		      struct btrfs_root *root,
+		      struct btrfs_path *path,
+		      struct btrfs_key *key,
+		      const char *name, int name_len,
+		      char **data, int *data_len)
+{
+	int ret;
+	struct find_xattr_ctx ctx;
+
+	ctx.name = name;
+	ctx.name_len = name_len;
+	ctx.found_idx = -1;
+	ctx.found_data = NULL;
+	ctx.found_data_len = 0;
+
+	ret = iterate_dir_item(sctx, root, path, key, __find_xattr, &ctx);
+	if (ret < 0)
+		return ret;
+
+	if (ctx.found_idx == -1)
+		return -ENOENT;
+	if (data) {
+		*data = ctx.found_data;
+		*data_len = ctx.found_data_len;
+	} else {
+		/* caller does not want the data, drop the copy */
+		kfree(ctx.found_data);
+	}
+	return ctx.found_idx;
+}
+
+
+/*
+ * iterate_dir_item callback for the send-root side of a changed
+ * XATTR_ITEM: send the xattr only if it is new or its data differs from
+ * the parent root. The unused fs_path local of the previous version was
+ * dropped (it was only ever NULL and freed).
+ */
+static int __process_changed_new_xattr(int num, const char *name, int name_len,
+				       const char *data, int data_len,
+				       u8 type, void *ctx)
+{
+	int ret;
+	struct send_ctx *sctx = ctx;
+	char *found_data = NULL;
+	int found_data_len = 0;
+
+	ret = find_xattr(sctx, sctx->parent_root, sctx->right_path,
+			sctx->cmp_key, name, name_len, &found_data,
+			&found_data_len);
+	if (ret == -ENOENT) {
+		/* xattr did not exist in the parent root, send it */
+		ret = __process_new_xattr(num, name, name_len, data, data_len,
+				type, ctx);
+	} else if (ret >= 0) {
+		/* exists in both trees, send only if the data changed */
+		if (data_len != found_data_len ||
+		    memcmp(data, found_data, data_len)) {
+			ret = __process_new_xattr(num, name, name_len, data,
+					data_len, type, ctx);
+		} else {
+			ret = 0;
+		}
+	}
+
+	kfree(found_data);
+	return ret;
+}
+
+/*
+ * iterate_dir_item callback for the parent-root side of a changed
+ * XATTR_ITEM: send a removal only if the xattr does not exist in the
+ * send root anymore (i.e. it was really deleted, not just changed).
+ */
+static int __process_changed_deleted_xattr(int num, const char *name,
+					   int name_len,
+					   const char *data, int data_len,
+					   u8 type, void *ctx)
+{
+	int ret;
+	struct send_ctx *sctx = ctx;
+
+	ret = find_xattr(sctx, sctx->send_root, sctx->left_path, sctx->cmp_key,
+			name, name_len, NULL, NULL);
+	if (ret == -ENOENT)
+		ret = __process_deleted_xattr(num, name, name_len, data,
+				data_len, type, ctx);
+	else if (ret >= 0)
+		ret = 0;
+
+	return ret;
+}
+
+/*
+ * For a changed XATTR_ITEM, send new/modified xattrs from the send root
+ * and removals for xattrs that only exist in the parent root.
+ */
+static int process_changed_xattr(struct send_ctx *sctx)
+{
+	int ret;
+
+	ret = iterate_dir_item(sctx, sctx->send_root, sctx->left_path,
+			sctx->cmp_key, __process_changed_new_xattr, sctx);
+	if (ret < 0)
+		return ret;
+	return iterate_dir_item(sctx, sctx->parent_root, sctx->right_path,
+			sctx->cmp_key, __process_changed_deleted_xattr, sctx);
+}
+
+/*
+ * Send SET_XATTR commands for every XATTR_ITEM of the current inode in
+ * the send root. Used when the whole inode is new (full send or a
+ * recreated inode).
+ */
+static int process_all_new_xattrs(struct send_ctx *sctx)
+{
+	int ret;
+	struct btrfs_root *root;
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+	struct extent_buffer *eb;
+	int slot;
+
+	path = alloc_path_for_send();
+	if (!path)
+		return -ENOMEM;
+
+	root = sctx->send_root;
+
+	key.objectid = sctx->cmp_key->objectid;
+	key.type = BTRFS_XATTR_ITEM_KEY;
+	key.offset = 0;
+	while (1) {
+		ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
+		if (ret < 0)
+			goto out;
+		if (ret) {
+			/* ran past the end of the tree */
+			ret = 0;
+			goto out;
+		}
+
+		eb = path->nodes[0];
+		slot = path->slots[0];
+		btrfs_item_key_to_cpu(eb, &found_key, slot);
+
+		if (found_key.objectid != key.objectid ||
+		    found_key.type != key.type) {
+			ret = 0;
+			goto out;
+		}
+
+		ret = iterate_dir_item(sctx, root, path, &found_key,
+				__process_new_xattr, sctx);
+		if (ret < 0)
+			goto out;
+
+		btrfs_release_path(path);
+		key.offset = found_key.offset + 1;
+	}
+
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * Read some bytes from the current inode/file and send a write command to
+ * user space.
+ *
+ * Returns the number of bytes read and sent (0 on EOF) or a negative
+ * error.
+ */
+static int send_write(struct send_ctx *sctx, u64 offset, u32 len)
+{
+	int ret = 0;
+	struct fs_path *p;
+	loff_t pos = offset;
+	int readed;		/* bytes actually returned by vfs_read */
+	mm_segment_t old_fs;
+
+	p = fs_path_alloc(sctx);
+	if (!p)
+		return -ENOMEM;
+
+	/*
+	 * vfs normally only accepts user space buffers for security reasons.
+	 * we only read from the file and also only provide the read_buf buffer
+	 * to vfs. As this buffer does not come from a user space call, it's
+	 * ok to temporary allow kernel space buffers.
+	 */
+	old_fs = get_fs();
+	set_fs(KERNEL_DS);
+
+verbose_printk("btrfs: send_write offset=%llu, len=%d\n", offset, len);
+
+	ret = open_cur_inode_file(sctx);
+	if (ret < 0)
+		goto out;
+
+	ret = vfs_read(sctx->cur_inode_filp, sctx->read_buf, len, &pos);
+	if (ret < 0)
+		goto out;
+	readed = ret;
+	if (!readed)
+		goto out;
+
+	ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
+	if (ret < 0)
+		goto out;
+
+	ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
+	if (ret < 0)
+		goto out;
+
+	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
+	TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
+	TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, readed);
+
+	ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+	fs_path_free(sctx, p);
+	/* always restore the original address limit */
+	set_fs(old_fs);
+	if (ret < 0)
+		return ret;
+	return readed;
+}
+
+/*
+ * Send a clone command to user space.
+ */
+static int send_clone(struct send_ctx *sctx,
+		      u64 offset, u32 len,
+		      struct clone_root *clone_root)
+{
+	int ret = 0;
+	struct btrfs_root *clone_root2 = clone_root->root;
+	struct fs_path *p;
+	u64 gen;
+
+verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, "
+	       "clone_inode=%llu, clone_offset=%llu\n", offset, len,
+		clone_root->root->objectid, clone_root->ino,
+		clone_root->offset);
+
+	p = fs_path_alloc(sctx);
+	if (!p)
+		return -ENOMEM;
+
+	ret = begin_cmd(sctx, BTRFS_SEND_C_CLONE);
+	if (ret < 0)
+		goto out;
+
+	/* p first holds the path of the clone target (current inode) */
+	ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
+	if (ret < 0)
+		goto out;
+
+	TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
+	TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_LEN, len);
+	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
+
+	/* p is then reused for the path of the clone source */
+	if (clone_root2 == sctx->send_root) {
+		ret = get_inode_info(sctx->send_root, clone_root->ino, NULL,
+				&gen, NULL, NULL, NULL);
+		if (ret < 0)
+			goto out;
+		ret = get_cur_path(sctx, clone_root->ino, gen, p);
+	} else {
+		ret = get_inode_path(sctx, clone_root2, clone_root->ino, p);
+	}
+	if (ret < 0)
+		goto out;
+
+	TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
+			clone_root2->root_item.uuid);
+	TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
+			clone_root2->root_item.ctransid);
+	TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p);
+	TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_OFFSET,
+			clone_root->offset);
+
+	ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+	fs_path_free(sctx, p);
+	return ret;
+}
+
+/*
+ * Send the data of one file extent item, either as a series of WRITE
+ * commands (chunked to BTRFS_SEND_READ_SIZE) or as a single CLONE
+ * command when a clone source was found. The length is clamped to the
+ * current inode size.
+ */
+static int send_write_or_clone(struct send_ctx *sctx,
+			       struct btrfs_path *path,
+			       struct btrfs_key *key,
+			       struct clone_root *clone_root)
+{
+	int ret = 0;
+	struct btrfs_file_extent_item *ei;
+	u64 offset = key->offset;
+	u64 pos = 0;
+	u64 len;
+	u32 l;
+	u8 type;
+
+	ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
+			struct btrfs_file_extent_item);
+	type = btrfs_file_extent_type(path->nodes[0], ei);
+	if (type == BTRFS_FILE_EXTENT_INLINE)
+		len = btrfs_file_extent_inline_len(path->nodes[0], ei);
+	else
+		len = btrfs_file_extent_num_bytes(path->nodes[0], ei);
+
+	/* never send data past i_size */
+	if (offset + len > sctx->cur_inode_size)
+		len = sctx->cur_inode_size - offset;
+	if (len == 0) {
+		ret = 0;
+		goto out;
+	}
+
+	if (!clone_root) {
+		/* chunked writes; send_write returns bytes sent, 0 on EOF */
+		while (pos < len) {
+			l = len - pos;
+			if (l > BTRFS_SEND_READ_SIZE)
+				l = BTRFS_SEND_READ_SIZE;
+			ret = send_write(sctx, pos + offset, l);
+			if (ret < 0)
+				goto out;
+			if (!ret)
+				break;
+			pos += ret;
+		}
+		ret = 0;
+	} else {
+		ret = send_clone(sctx, offset, len, clone_root);
+	}
+
+out:
+	return ret;
+}
+
+/*
+ * Decide whether the file extent item at ekey/left_path in the send root
+ * covers the same disk extents as the corresponding file range in the
+ * parent root. Returns 1 if unchanged (the receiver already has this
+ * data), 0 if it must be sent, <0 on error. Only regular (non-inline)
+ * extents can be reported as unchanged.
+ *
+ * NOTE(review): left_offset/right_offset are read but never compared, so
+ * two extents sharing a disk bytenr while pointing at different offsets
+ * inside that extent are treated as unchanged — verify this is intended.
+ */
+static int is_extent_unchanged(struct send_ctx *sctx,
+			       struct btrfs_path *left_path,
+			       struct btrfs_key *ekey)
+{
+	int ret = 0;
+	struct btrfs_key key;
+	struct btrfs_path *path = NULL;
+	struct extent_buffer *eb;
+	int slot;
+	struct btrfs_key found_key;
+	struct btrfs_file_extent_item *ei;
+	u64 left_disknr;
+	u64 right_disknr;
+	u64 left_offset;
+	u64 right_offset;
+	u64 left_len;
+	u64 right_len;
+	u8 left_type;
+	u8 right_type;
+
+	path = alloc_path_for_send();
+	if (!path)
+		return -ENOMEM;
+
+	eb = left_path->nodes[0];
+	slot = left_path->slots[0];
+
+	ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
+	left_type = btrfs_file_extent_type(eb, ei);
+	left_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
+	left_len = btrfs_file_extent_num_bytes(eb, ei);
+	left_offset = btrfs_file_extent_offset(eb, ei);
+
+	if (left_type != BTRFS_FILE_EXTENT_REG) {
+		ret = 0;
+		goto out;
+	}
+
+	key.objectid = ekey->objectid;
+	key.type = BTRFS_EXTENT_DATA_KEY;
+	key.offset = ekey->offset;
+
+	/*
+	 * Walk the parent root's extent items until the whole range of the
+	 * left extent is covered by extents with matching disk bytenr.
+	 */
+	while (1) {
+		ret = btrfs_search_slot_for_read(sctx->parent_root, &key, path,
+				0, 0);
+		if (ret < 0)
+			goto out;
+		if (ret) {
+			ret = 0;
+			goto out;
+		}
+		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+				path->slots[0]);
+		if (found_key.objectid != key.objectid ||
+		    found_key.type != key.type) {
+			ret = 0;
+			goto out;
+		}
+
+		eb = path->nodes[0];
+		slot = path->slots[0];
+
+		ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
+		right_type = btrfs_file_extent_type(eb, ei);
+		right_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
+		right_len = btrfs_file_extent_num_bytes(eb, ei);
+		right_offset = btrfs_file_extent_offset(eb, ei);
+		btrfs_release_path(path);
+
+		if (right_type != BTRFS_FILE_EXTENT_REG) {
+			ret = 0;
+			goto out;
+		}
+
+		if (left_disknr != right_disknr) {
+			ret = 0;
+			goto out;
+		}
+
+		key.offset = found_key.offset + right_len;
+		if (key.offset >= ekey->offset + left_len) {
+			/* whole left extent range is covered and matches */
+			ret = 1;
+			goto out;
+		}
+	}
+
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * Process one EXTENT_DATA item of the current inode: skip it if it is
+ * provably unchanged vs. the parent root, otherwise look for a clone
+ * source and send the data as clone or write commands. Symlinks are
+ * skipped entirely.
+ */
+static int process_extent(struct send_ctx *sctx,
+			  struct btrfs_path *path,
+			  struct btrfs_key *key)
+{
+	int ret = 0;
+	struct clone_root *found_clone = NULL;
+
+	if (S_ISLNK(sctx->cur_inode_mode))
+		return 0;
+
+	if (sctx->parent_root && !sctx->cur_inode_new) {
+		ret = is_extent_unchanged(sctx, path, key);
+		if (ret < 0)
+			goto out;
+		if (ret) {
+			/* receiver already has this data */
+			ret = 0;
+			goto out;
+		}
+	}
+
+	/* -ENOENT just means no clone source; fall back to writes */
+	ret = find_extent_clone(sctx, path, key->objectid, key->offset,
+			sctx->cur_inode_size, &found_clone);
+	if (ret != -ENOENT && ret < 0)
+		goto out;
+
+	ret = send_write_or_clone(sctx, path, key, found_clone);
+
+out:
+	return ret;
+}
+
+/*
+ * Send the data of every EXTENT_DATA item of the current inode in the
+ * send root. Used when the whole inode is new.
+ */
+static int process_all_extents(struct send_ctx *sctx)
+{
+	int ret;
+	struct btrfs_root *root;
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+	struct extent_buffer *eb;
+	int slot;
+
+	root = sctx->send_root;
+	path = alloc_path_for_send();
+	if (!path)
+		return -ENOMEM;
+
+	key.objectid = sctx->cmp_key->objectid;
+	key.type = BTRFS_EXTENT_DATA_KEY;
+	key.offset = 0;
+	while (1) {
+		ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
+		if (ret < 0)
+			goto out;
+		if (ret) {
+			/* ran past the end of the tree */
+			ret = 0;
+			goto out;
+		}
+
+		eb = path->nodes[0];
+		slot = path->slots[0];
+		btrfs_item_key_to_cpu(eb, &found_key, slot);
+
+		if (found_key.objectid != key.objectid ||
+		    found_key.type != key.type) {
+			ret = 0;
+			goto out;
+		}
+
+		ret = process_extent(sctx, path, &found_key);
+		if (ret < 0)
+			goto out;
+
+		btrfs_release_path(path);
+		key.offset = found_key.offset + 1;
+	}
+
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * Flush the recorded refs of the current inode, but only once all of its
+ * INODE_REF items have been seen (or at the very end of the inode).
+ */
+static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end)
+{
+	/* no inode processed yet, nothing recorded */
+	if (sctx->cur_ino == 0)
+		return 0;
+	/* still collecting refs for this inode, wait for more */
+	if (!at_end && sctx->cur_ino == sctx->cmp_key->objectid &&
+	    sctx->cmp_key->type <= BTRFS_INODE_REF_KEY)
+		return 0;
+	/* nothing was recorded at all */
+	if (list_empty(&sctx->new_refs) && list_empty(&sctx->deleted_refs))
+		return 0;
+
+	return process_recorded_refs(sctx);
+}
+
+/*
+ * Called when the tree compare moved past the current inode (or at the
+ * very end): flush pending refs and send the final truncate/chown/
+ * chmod/utimes commands for it.
+ */
+static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
+{
+	int ret = 0;
+	u64 left_mode;
+	u64 left_uid;
+	u64 left_gid;
+	/* right_* are only valid when read in the parent_root branch below */
+	u64 right_mode;
+	u64 right_uid;
+	u64 right_gid;
+	int need_chmod = 0;
+	int need_chown = 0;
+
+	ret = process_recorded_refs_if_needed(sctx, at_end);
+	if (ret < 0)
+		goto out;
+
+	/* nothing to finish for no inode or a deleted one */
+	if (sctx->cur_ino == 0 || sctx->cur_inode_deleted)
+		goto out;
+	/* still in the middle of this inode's items */
+	if (!at_end && sctx->cmp_key->objectid == sctx->cur_ino)
+		goto out;
+
+	ret = get_inode_info(sctx->send_root, sctx->cur_ino, NULL, NULL,
+			&left_mode, &left_uid, &left_gid);
+	if (ret < 0)
+		goto out;
+
+	if (!S_ISLNK(sctx->cur_inode_mode)) {
+		if (!sctx->parent_root || sctx->cur_inode_new) {
+			/* brand new inode, always set mode and owner */
+			need_chmod = 1;
+			need_chown = 1;
+		} else {
+			ret = get_inode_info(sctx->parent_root, sctx->cur_ino,
+					NULL, NULL, &right_mode, &right_uid,
+					&right_gid);
+			if (ret < 0)
+				goto out;
+
+			if (left_uid != right_uid || left_gid != right_gid)
+				need_chown = 1;
+			if (left_mode != right_mode)
+				need_chmod = 1;
+		}
+	}
+
+	if (S_ISREG(sctx->cur_inode_mode)) {
+		ret = send_truncate(sctx, sctx->cur_ino, sctx->cur_inode_gen,
+				sctx->cur_inode_size);
+		if (ret < 0)
+			goto out;
+	}
+
+	if (need_chown) {
+		ret = send_chown(sctx, sctx->cur_ino, sctx->cur_inode_gen,
+				left_uid, left_gid);
+		if (ret < 0)
+			goto out;
+	}
+	if (need_chmod) {
+		ret = send_chmod(sctx, sctx->cur_ino, sctx->cur_inode_gen,
+				left_mode);
+		if (ret < 0)
+			goto out;
+	}
+
+	/*
+	 * Need to send that every time, no matter if it actually changed
+	 * between the two trees as we have done changes to the inode before.
+	 */
+	ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen);
+	if (ret < 0)
+		goto out;
+
+out:
+	return ret;
+}
+
+/*
+ * Handle a changed INODE_ITEM from the tree compare. Records the new
+ * current-inode state in sctx. When the generation number changed, the
+ * inode was deleted and recreated under the same number; in that case
+ * all refs/extents/xattrs of the old and new inode are processed here
+ * in one go instead of via the per-item callbacks.
+ */
+static int changed_inode(struct send_ctx *sctx,
+			 enum btrfs_compare_tree_result result)
+{
+	int ret = 0;
+	struct btrfs_key *key = sctx->cmp_key;
+	struct btrfs_inode_item *left_ii = NULL;
+	struct btrfs_inode_item *right_ii = NULL;
+	u64 left_gen = 0;
+	u64 right_gen = 0;
+
+	/* previous inode's file handle is no longer needed */
+	ret = close_cur_inode_file(sctx);
+	if (ret < 0)
+		goto out;
+
+	sctx->cur_ino = key->objectid;
+	sctx->cur_inode_new_gen = 0;
+	sctx->send_progress = sctx->cur_ino;
+
+	if (result == BTRFS_COMPARE_TREE_NEW ||
+	    result == BTRFS_COMPARE_TREE_CHANGED) {
+		left_ii = btrfs_item_ptr(sctx->left_path->nodes[0],
+				sctx->left_path->slots[0],
+				struct btrfs_inode_item);
+		left_gen = btrfs_inode_generation(sctx->left_path->nodes[0],
+				left_ii);
+	} else {
+		right_ii = btrfs_item_ptr(sctx->right_path->nodes[0],
+				sctx->right_path->slots[0],
+				struct btrfs_inode_item);
+		right_gen = btrfs_inode_generation(sctx->right_path->nodes[0],
+				right_ii);
+	}
+	if (result == BTRFS_COMPARE_TREE_CHANGED) {
+		right_ii = btrfs_item_ptr(sctx->right_path->nodes[0],
+				sctx->right_path->slots[0],
+				struct btrfs_inode_item);
+
+		right_gen = btrfs_inode_generation(sctx->right_path->nodes[0],
+				right_ii);
+		/* generation changed: inode was deleted and recreated */
+		if (left_gen != right_gen)
+			sctx->cur_inode_new_gen = 1;
+	}
+
+	if (result == BTRFS_COMPARE_TREE_NEW) {
+		sctx->cur_inode_gen = left_gen;
+		sctx->cur_inode_new = 1;
+		sctx->cur_inode_deleted = 0;
+		sctx->cur_inode_size = btrfs_inode_size(
+				sctx->left_path->nodes[0], left_ii);
+		sctx->cur_inode_mode = btrfs_inode_mode(
+				sctx->left_path->nodes[0], left_ii);
+		/* the root dir always exists at the receiver */
+		if (sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)
+			ret = send_create_inode(sctx, sctx->left_path,
+					sctx->cmp_key);
+	} else if (result == BTRFS_COMPARE_TREE_DELETED) {
+		sctx->cur_inode_gen = right_gen;
+		sctx->cur_inode_new = 0;
+		sctx->cur_inode_deleted = 1;
+		sctx->cur_inode_size = btrfs_inode_size(
+				sctx->right_path->nodes[0], right_ii);
+		sctx->cur_inode_mode = btrfs_inode_mode(
+				sctx->right_path->nodes[0], right_ii);
+	} else if (result == BTRFS_COMPARE_TREE_CHANGED) {
+		if (sctx->cur_inode_new_gen) {
+			/* first process the deletion of the old inode ... */
+			sctx->cur_inode_gen = right_gen;
+			sctx->cur_inode_new = 0;
+			sctx->cur_inode_deleted = 1;
+			sctx->cur_inode_size = btrfs_inode_size(
+					sctx->right_path->nodes[0], right_ii);
+			sctx->cur_inode_mode = btrfs_inode_mode(
+					sctx->right_path->nodes[0], right_ii);
+			ret = process_all_refs(sctx,
+					BTRFS_COMPARE_TREE_DELETED);
+			if (ret < 0)
+				goto out;
+
+			/* ... then the creation of the new one */
+			sctx->cur_inode_gen = left_gen;
+			sctx->cur_inode_new = 1;
+			sctx->cur_inode_deleted = 0;
+			sctx->cur_inode_size = btrfs_inode_size(
+					sctx->left_path->nodes[0], left_ii);
+			sctx->cur_inode_mode = btrfs_inode_mode(
+					sctx->left_path->nodes[0], left_ii);
+			ret = send_create_inode(sctx, sctx->left_path,
+					sctx->cmp_key);
+			if (ret < 0)
+				goto out;
+
+			ret = process_all_refs(sctx, BTRFS_COMPARE_TREE_NEW);
+			if (ret < 0)
+				goto out;
+			ret = process_all_extents(sctx);
+			if (ret < 0)
+				goto out;
+			ret = process_all_new_xattrs(sctx);
+			if (ret < 0)
+				goto out;
+		} else {
+			sctx->cur_inode_gen = left_gen;
+			sctx->cur_inode_new = 0;
+			sctx->cur_inode_new_gen = 0;
+			sctx->cur_inode_deleted = 0;
+			sctx->cur_inode_size = btrfs_inode_size(
+					sctx->left_path->nodes[0], left_ii);
+			sctx->cur_inode_mode = btrfs_inode_mode(
+					sctx->left_path->nodes[0], left_ii);
+		}
+	}
+
+out:
+	return ret;
+}
+
+/*
+ * Handle a changed INODE_REF. Refs are only recorded here; processing
+ * happens later in process_recorded_refs. Nothing to do when the inode
+ * got a new generation (process_all_refs handled its refs already) or
+ * for the root directory.
+ */
+static int changed_ref(struct send_ctx *sctx,
+		       enum btrfs_compare_tree_result result)
+{
+	int ret = 0;
+
+	BUG_ON(sctx->cur_ino != sctx->cmp_key->objectid);
+
+	if (!sctx->cur_inode_new_gen &&
+	    sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) {
+		if (result == BTRFS_COMPARE_TREE_NEW)
+			ret = record_new_ref(sctx);
+		else if (result == BTRFS_COMPARE_TREE_DELETED)
+			ret = record_deleted_ref(sctx);
+		else if (result == BTRFS_COMPARE_TREE_CHANGED)
+			ret = record_changed_ref(sctx);
+	}
+
+	return ret;
+}
+
+/*
+ * Handle a changed XATTR_ITEM. Skipped for deleted or recreated inodes,
+ * whose xattrs are handled by changed_inode/process_all_new_xattrs.
+ */
+static int changed_xattr(struct send_ctx *sctx,
+			 enum btrfs_compare_tree_result result)
+{
+	int ret = 0;
+
+	BUG_ON(sctx->cur_ino != sctx->cmp_key->objectid);
+
+	if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
+		if (result == BTRFS_COMPARE_TREE_NEW)
+			ret = process_new_xattr(sctx);
+		else if (result == BTRFS_COMPARE_TREE_DELETED)
+			ret = process_deleted_xattr(sctx);
+		else if (result == BTRFS_COMPARE_TREE_CHANGED)
+			ret = process_changed_xattr(sctx);
+	}
+
+	return ret;
+}
+
+/*
+ * Handle a changed EXTENT_DATA item. Deleted extents need no command
+ * (the final truncate covers them); deleted/recreated inodes are handled
+ * by changed_inode instead.
+ */
+static int changed_extent(struct send_ctx *sctx,
+			  enum btrfs_compare_tree_result result)
+{
+	int ret = 0;
+
+	BUG_ON(sctx->cur_ino != sctx->cmp_key->objectid);
+
+	if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
+		if (result != BTRFS_COMPARE_TREE_DELETED)
+			ret = process_extent(sctx, sctx->left_path,
+					sctx->cmp_key);
+	}
+
+	return ret;
+}
+
+
+/*
+ * Callback for btrfs_compare_trees and the full send walk: dispatch one
+ * changed key to the matching handler. Also gives finish_inode_if_needed
+ * a chance to wrap up the previous inode once the compare moved past it.
+ */
+static int changed_cb(struct btrfs_root *left_root,
+		      struct btrfs_root *right_root,
+		      struct btrfs_path *left_path,
+		      struct btrfs_path *right_path,
+		      struct btrfs_key *key,
+		      enum btrfs_compare_tree_result result,
+		      void *ctx)
+{
+	int ret = 0;
+	struct send_ctx *sctx = ctx;
+
+	sctx->left_path = left_path;
+	sctx->right_path = right_path;
+	sctx->cmp_key = key;
+
+	ret = finish_inode_if_needed(sctx, 0);
+	if (ret < 0)
+		goto out;
+
+	if (key->type == BTRFS_INODE_ITEM_KEY)
+		ret = changed_inode(sctx, result);
+	else if (key->type == BTRFS_INODE_REF_KEY)
+		ret = changed_ref(sctx, result);
+	else if (key->type == BTRFS_XATTR_ITEM_KEY)
+		ret = changed_xattr(sctx, result);
+	else if (key->type == BTRFS_EXTENT_DATA_KEY)
+		ret = changed_extent(sctx, result);
+
+out:
+	return ret;
+}
+
+/*
+ * Full send (no parent root): walk every item of the send root and feed
+ * it to changed_cb as BTRFS_COMPARE_TREE_NEW. A transaction is joined to
+ * keep the tree stable while iterating; it is dropped and re-joined
+ * whenever a commit wants to happen. If the root's ctransid changes in
+ * between, the send is aborted.
+ */
+static int full_send_tree(struct send_ctx *sctx)
+{
+	int ret;
+	struct btrfs_trans_handle *trans = NULL;
+	struct btrfs_root *send_root = sctx->send_root;
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+	struct btrfs_path *path;
+	struct extent_buffer *eb;
+	int slot;
+	u64 start_ctransid;
+	u64 ctransid;
+
+	path = alloc_path_for_send();
+	if (!path)
+		return -ENOMEM;
+
+	/* remember the root's ctransid to detect concurrent modification */
+	spin_lock(&send_root->root_times_lock);
+	start_ctransid = btrfs_root_ctransid(&send_root->root_item);
+	spin_unlock(&send_root->root_times_lock);
+
+	key.objectid = BTRFS_FIRST_FREE_OBJECTID;
+	key.type = BTRFS_INODE_ITEM_KEY;
+	key.offset = 0;
+
+join_trans:
+	/*
+	 * We need to make sure the transaction does not get committed
+	 * while we do anything on commit roots. Join a transaction to prevent
+	 * this.
+	 */
+	trans = btrfs_join_transaction(send_root);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		trans = NULL;
+		goto out;
+	}
+
+	/*
+	 * Make sure the tree has not changed
+	 */
+	spin_lock(&send_root->root_times_lock);
+	ctransid = btrfs_root_ctransid(&send_root->root_item);
+	spin_unlock(&send_root->root_times_lock);
+
+	if (ctransid != start_ctransid) {
+		WARN(1, KERN_WARNING "btrfs: the root that you're trying to "
+				     "send was modified in between. This is "
+				     "probably a bug.\n");
+		ret = -EIO;
+		goto out;
+	}
+
+	ret = btrfs_search_slot_for_read(send_root, &key, path, 1, 0);
+	if (ret < 0)
+		goto out;
+	if (ret)
+		goto out_finish;
+
+	while (1) {
+		/*
+		 * When someone want to commit while we iterate, end the
+		 * joined transaction and rejoin.
+		 */
+		if (btrfs_should_end_transaction(trans, send_root)) {
+			ret = btrfs_end_transaction(trans, send_root);
+			trans = NULL;
+			if (ret < 0)
+				goto out;
+			btrfs_release_path(path);
+			goto join_trans;
+		}
+
+		eb = path->nodes[0];
+		slot = path->slots[0];
+		btrfs_item_key_to_cpu(eb, &found_key, slot);
+
+		/* every item is "new" in a full send */
+		ret = changed_cb(send_root, NULL, path, NULL,
+				&found_key, BTRFS_COMPARE_TREE_NEW, sctx);
+		if (ret < 0)
+			goto out;
+
+		key.objectid = found_key.objectid;
+		key.type = found_key.type;
+		key.offset = found_key.offset + 1;
+
+		ret = btrfs_next_item(send_root, path);
+		if (ret < 0)
+			goto out;
+		if (ret) {
+			ret  = 0;
+			break;
+		}
+	}
+
+out_finish:
+	/* flush the last inode's pending state */
+	ret = finish_inode_if_needed(sctx, 1);
+
+out:
+	btrfs_free_path(path);
+	if (trans) {
+		if (!ret)
+			ret = btrfs_end_transaction(trans, send_root);
+		else
+			btrfs_end_transaction(trans, send_root);
+	}
+	return ret;
+}
+
+/*
+ * Produce the whole send stream: stream header, subvol begin command,
+ * then either an incremental diff against parent_root or a full send of
+ * the tree.
+ */
+static int send_subvol(struct send_ctx *sctx)
+{
+	int ret;
+
+	ret = send_header(sctx);
+	if (ret < 0)
+		goto out;
+
+	ret = send_subvol_begin(sctx);
+	if (ret < 0)
+		goto out;
+
+	if (sctx->parent_root) {
+		ret = btrfs_compare_trees(sctx->send_root, sctx->parent_root,
+				changed_cb, sctx);
+		if (ret < 0)
+			goto out;
+		/* flush the last inode's pending state */
+		ret = finish_inode_if_needed(sctx, 1);
+		if (ret < 0)
+			goto out;
+	} else {
+		ret = full_send_tree(sctx);
+		if (ret < 0)
+			goto out;
+	}
+
+out:
+	/* close the inode file in any case, but keep the first error */
+	if (!ret)
+		ret = close_cur_inode_file(sctx);
+	else
+		close_cur_inode_file(sctx);
+
+	free_recorded_refs(sctx);
+	return ret;
+}
+
+/*
+ * Entry point of the BTRFS_IOC_SEND ioctl.
+ *
+ * Reads the userspace arguments, sets up a send context (buffers, clone
+ * roots, optional parent root), streams the subvolume via send_subvol()
+ * and terminates the stream with an END command.
+ */
+long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
+{
+	int ret = 0;
+	struct btrfs_root *send_root;
+	struct btrfs_root *clone_root;
+	struct btrfs_fs_info *fs_info;
+	struct btrfs_ioctl_send_args *arg = NULL;
+	struct btrfs_key key;
+	struct send_ctx *sctx = NULL;
+	u32 i;
+	u64 *clone_sources_tmp = NULL;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	send_root = BTRFS_I(fdentry(mnt_file)->d_inode)->root;
+	fs_info = send_root->fs_info;
+
+	arg = memdup_user(arg_, sizeof(*arg));
+	if (IS_ERR(arg)) {
+		ret = PTR_ERR(arg);
+		arg = NULL;
+		goto out;
+	}
+
+	/*
+	 * Verify the whole clone sources array is readable. The
+	 * multiplication must be outside of sizeof(); the previous code
+	 * computed sizeof(*arg->clone_sources * count), i.e. the size of
+	 * a single element, and thus only checked the first entry.
+	 */
+	if (!access_ok(VERIFY_READ, arg->clone_sources,
+			sizeof(*arg->clone_sources) *
+			arg->clone_sources_count)) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	sctx = kzalloc(sizeof(struct send_ctx), GFP_NOFS);
+	if (!sctx) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	INIT_LIST_HEAD(&sctx->new_refs);
+	INIT_LIST_HEAD(&sctx->deleted_refs);
+	INIT_RADIX_TREE(&sctx->name_cache, GFP_NOFS);
+	INIT_LIST_HEAD(&sctx->name_cache_list);
+
+	/* fget returns NULL on a bad fd, never an ERR_PTR */
+	sctx->send_filp = fget(arg->send_fd);
+	if (!sctx->send_filp) {
+		ret = -EBADF;
+		goto out;
+	}
+
+	sctx->mnt = mnt_file->f_path.mnt;
+
+	sctx->send_root = send_root;
+	sctx->clone_roots_cnt = arg->clone_sources_count;
+
+	sctx->send_max_size = BTRFS_SEND_BUF_SIZE;
+	sctx->send_buf = vmalloc(sctx->send_max_size);
+	if (!sctx->send_buf) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	sctx->read_buf = vmalloc(BTRFS_SEND_READ_SIZE);
+	if (!sctx->read_buf) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	/* +1 for the implicit send_root clone source added below */
+	sctx->clone_roots = vzalloc(sizeof(struct clone_root) *
+			(arg->clone_sources_count + 1));
+	if (!sctx->clone_roots) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	if (arg->clone_sources_count) {
+		clone_sources_tmp = vmalloc(arg->clone_sources_count *
+				sizeof(*arg->clone_sources));
+		if (!clone_sources_tmp) {
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		ret = copy_from_user(clone_sources_tmp, arg->clone_sources,
+				arg->clone_sources_count *
+				sizeof(*arg->clone_sources));
+		if (ret) {
+			ret = -EFAULT;
+			goto out;
+		}
+
+		for (i = 0; i < arg->clone_sources_count; i++) {
+			key.objectid = clone_sources_tmp[i];
+			key.type = BTRFS_ROOT_ITEM_KEY;
+			key.offset = (u64)-1;
+			clone_root = btrfs_read_fs_root_no_name(fs_info, &key);
+			if (!clone_root) {
+				ret = -EINVAL;
+				goto out;
+			}
+			if (IS_ERR(clone_root)) {
+				ret = PTR_ERR(clone_root);
+				goto out;
+			}
+			sctx->clone_roots[i].root = clone_root;
+		}
+		vfree(clone_sources_tmp);
+		clone_sources_tmp = NULL;
+	}
+
+	if (arg->parent_root) {
+		key.objectid = arg->parent_root;
+		key.type = BTRFS_ROOT_ITEM_KEY;
+		key.offset = (u64)-1;
+		sctx->parent_root = btrfs_read_fs_root_no_name(fs_info, &key);
+		/* handle both NULL and ERR_PTR, like the clone roots above */
+		if (!sctx->parent_root) {
+			ret = -EINVAL;
+			goto out;
+		}
+		if (IS_ERR(sctx->parent_root)) {
+			ret = PTR_ERR(sctx->parent_root);
+			sctx->parent_root = NULL;
+			goto out;
+		}
+	}
+
+	/*
+	 * Clones from send_root are allowed, but only if the clone source
+	 * is behind the current send position. This is checked while searching
+	 * for possible clone sources.
+	 */
+	sctx->clone_roots[sctx->clone_roots_cnt++].root = sctx->send_root;
+
+	/* We do a bsearch later */
+	sort(sctx->clone_roots, sctx->clone_roots_cnt,
+			sizeof(*sctx->clone_roots), __clone_root_cmp_sort,
+			NULL);
+
+	ret = send_subvol(sctx);
+	if (ret < 0)
+		goto out;
+
+	/* terminate the stream */
+	ret = begin_cmd(sctx, BTRFS_SEND_C_END);
+	if (ret < 0)
+		goto out;
+	ret = send_cmd(sctx);
+	if (ret < 0)
+		goto out;
+
+out:
+	kfree(arg);
+	vfree(clone_sources_tmp);
+
+	if (sctx) {
+		if (sctx->send_filp)
+			fput(sctx->send_filp);
+
+		vfree(sctx->clone_roots);
+		vfree(sctx->send_buf);
+		vfree(sctx->read_buf);
+
+		name_cache_free(sctx);
+
+		kfree(sctx);
+	}
+
+	return ret;
+}
diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h
index a4c23ee..53f8ee7 100644
--- a/fs/btrfs/send.h
+++ b/fs/btrfs/send.h
@@ -124,3 +124,7 @@  enum {
 	__BTRFS_SEND_A_MAX,
 };
 #define BTRFS_SEND_A_MAX (__BTRFS_SEND_A_MAX - 1)
+
+#ifdef __KERNEL__
+long btrfs_ioctl_send(struct file *mnt_file, void __user *arg);
+#endif