diff mbox

[15/26] Btrfs: add a new source file with device replace code

Message ID 4520a248e2bcf0493b35673ea4cac2d4e0757e08.1352217243.git.sbehrens@giantdisaster.de (mailing list archive)
State New, archived
Headers show

Commit Message

Stefan Behrens Nov. 6, 2012, 4:38 p.m. UTC
This adds a new file to the sources together with the header file
and the changes to ioctl.h that are required by the new C source
file.

Signed-off-by: Stefan Behrens <sbehrens@giantdisaster.de>
---
 fs/btrfs/dev-replace.c | 843 +++++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/dev-replace.h |  44 +++
 fs/btrfs/ioctl.h       |  45 +++
 3 files changed, 932 insertions(+)

Comments

Liu Bo Nov. 8, 2012, 2:50 p.m. UTC | #1
On Tue, Nov 06, 2012 at 05:38:33PM +0100, Stefan Behrens wrote:
> This adds a new file to the sources together with the header file
> and the changes to ioctl.h that are required by the new C source
> file.
> 
> Signed-off-by: Stefan Behrens <sbehrens@giantdisaster.de>
> ---
>  fs/btrfs/dev-replace.c | 843 +++++++++++++++++++++++++++++++++++++++++++++++++
>  fs/btrfs/dev-replace.h |  44 +++
>  fs/btrfs/ioctl.h       |  45 +++
>  3 files changed, 932 insertions(+)
> 
> diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
> new file mode 100644
> index 0000000..1d56163
> --- /dev/null
> +++ b/fs/btrfs/dev-replace.c
> @@ -0,0 +1,843 @@
> +/*
> + * Copyright (C) STRATO AG 2012.  All rights reserved.
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public
> + * License v2 as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public
> + * License along with this program; if not, write to the
> + * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
> + * Boston, MA 021110-1307, USA.
> + */
> +#include <linux/sched.h>
> +#include <linux/bio.h>
> +#include <linux/slab.h>
> +#include <linux/buffer_head.h>
> +#include <linux/blkdev.h>
> +#include <linux/random.h>
> +#include <linux/iocontext.h>
> +#include <linux/capability.h>
> +#include <linux/kthread.h>
> +#include <linux/math64.h>
> +#include <asm/div64.h>
> +#include "compat.h"
> +#include "ctree.h"
> +#include "extent_map.h"
> +#include "disk-io.h"
> +#include "transaction.h"
> +#include "print-tree.h"
> +#include "volumes.h"
> +#include "async-thread.h"
> +#include "check-integrity.h"
> +#include "rcu-string.h"
> +#include "dev-replace.h"
> +
> +static u64 btrfs_get_seconds_since_1970(void);
> +static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
> +				       int scrub_ret);
> +static void btrfs_dev_replace_update_device_in_mapping_tree(
> +						struct btrfs_fs_info *fs_info,
> +						struct btrfs_device *srcdev,
> +						struct btrfs_device *tgtdev);
> +static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid,
> +					 char *srcdev_name,
> +					 struct btrfs_device **device);
> +static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info);
> +static int btrfs_dev_replace_kthread(void *data);
> +static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info);
> +
> +
> +int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
> +{
> +	struct btrfs_key key;
> +	struct btrfs_root *dev_root = fs_info->dev_root;
> +	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
> +	struct extent_buffer *eb;
> +	int slot;
> +	int ret = 0;
> +	struct btrfs_path *path = NULL;
> +	int item_size;
> +	struct btrfs_dev_replace_item *ptr;
> +	u64 src_devid;
> +
> +	path = btrfs_alloc_path();
> +	if (!path) {
> +		ret = -ENOMEM;
> +		goto out;
> +	}
> +
> +	key.objectid = 0;
> +	key.type = BTRFS_DEV_REPLACE_KEY;
> +	key.offset = 0;
> +	ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
> +	if (ret) {
> +no_valid_dev_replace_entry_found:
> +		ret = 0;
> +		dev_replace->replace_state =
> +			BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED;
> +		dev_replace->cont_reading_from_srcdev_mode =
> +		    BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS;
> +		dev_replace->replace_state = 0;
> +		dev_replace->time_started = 0;
> +		dev_replace->time_stopped = 0;
> +		atomic64_set(&dev_replace->num_write_errors, 0);
> +		atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
> +		dev_replace->cursor_left = 0;
> +		dev_replace->committed_cursor_left = 0;
> +		dev_replace->cursor_left_last_write_of_item = 0;
> +		dev_replace->cursor_right = 0;
> +		dev_replace->srcdev = NULL;
> +		dev_replace->tgtdev = NULL;
> +		dev_replace->is_valid = 0;
> +		dev_replace->item_needs_writeback = 0;
> +		goto out;
> +	}
> +	slot = path->slots[0];
> +	eb = path->nodes[0];
> +	item_size = btrfs_item_size_nr(eb, slot);
> +	ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item);
> +
> +	if (item_size != sizeof(struct btrfs_dev_replace_item)) {
> +		pr_warn("btrfs: dev_replace entry found has unexpected size, ignore entry\n");
> +		goto no_valid_dev_replace_entry_found;
> +	}
> +
> +	src_devid = btrfs_dev_replace_src_devid(eb, ptr);
> +	dev_replace->cont_reading_from_srcdev_mode =
> +		btrfs_dev_replace_cont_reading_from_srcdev_mode(eb, ptr);
> +	dev_replace->replace_state = btrfs_dev_replace_replace_state(eb, ptr);
> +	dev_replace->time_started = btrfs_dev_replace_time_started(eb, ptr);
> +	dev_replace->time_stopped =
> +		btrfs_dev_replace_time_stopped(eb, ptr);
> +	atomic64_set(&dev_replace->num_write_errors,
> +		     btrfs_dev_replace_num_write_errors(eb, ptr));
> +	atomic64_set(&dev_replace->num_uncorrectable_read_errors,
> +		     btrfs_dev_replace_num_uncorrectable_read_errors(eb, ptr));
> +	dev_replace->cursor_left = btrfs_dev_replace_cursor_left(eb, ptr);
> +	dev_replace->committed_cursor_left = dev_replace->cursor_left;
> +	dev_replace->cursor_left_last_write_of_item = dev_replace->cursor_left;
> +	dev_replace->cursor_right = btrfs_dev_replace_cursor_right(eb, ptr);
> +	dev_replace->is_valid = 1;
> +
> +	dev_replace->item_needs_writeback = 0;
> +	switch (dev_replace->replace_state) {
> +	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
> +	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
> +	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
> +		dev_replace->srcdev = NULL;
> +		dev_replace->tgtdev = NULL;
> +		break;
> +	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
> +	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
> +		dev_replace->srcdev = btrfs_find_device(fs_info, src_devid,
> +							NULL, NULL);
> +		dev_replace->tgtdev = btrfs_find_device(fs_info,
> +							BTRFS_DEV_REPLACE_DEVID,
> +							NULL, NULL);
> +		/*
> +		 * allow 'btrfs dev replace_cancel' if src/tgt device is
> +		 * missing
> +		 */
> +		if (!dev_replace->srcdev &&
> +		    !btrfs_test_opt(dev_root, DEGRADED)) {
> +			ret = -EIO;
> +			pr_warn("btrfs: cannot mount because device replace operation is ongoing and\n" "srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?\n",
> +				(unsigned long long)src_devid);
> +		}
> +		if (!dev_replace->tgtdev &&
> +		    !btrfs_test_opt(dev_root, DEGRADED)) {
> +			ret = -EIO;
> +			pr_warn("btrfs: cannot mount because device replace operation is ongoing and\n" "tgtdev (devid %llu) is missing, need to run btrfs dev scan?\n",
> +				(unsigned long long)BTRFS_DEV_REPLACE_DEVID);
> +		}
> +		if (dev_replace->tgtdev) {
> +			if (dev_replace->srcdev) {
> +				dev_replace->tgtdev->total_bytes =
> +					dev_replace->srcdev->total_bytes;
> +				dev_replace->tgtdev->disk_total_bytes =
> +					dev_replace->srcdev->disk_total_bytes;
> +				dev_replace->tgtdev->bytes_used =
> +					dev_replace->srcdev->bytes_used;
> +			}
> +			dev_replace->tgtdev->is_tgtdev_for_dev_replace = 1;
> +			btrfs_init_dev_replace_tgtdev_for_resume(fs_info,
> +				dev_replace->tgtdev);
> +		}
> +		break;
> +	}
> +
> +out:
> +	if (path) {
> +		btrfs_release_path(path);
> +		btrfs_free_path(path);

btrfs_free_path(path) already releases the path for you, so the explicit btrfs_release_path(path) call just above it is redundant :)

> +	}
> +	return ret;
> +}
> +
> +/*
> + * called from commit_transaction. Writes changed device replace state to
> + * disk.
> + */
> +int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
> +			  struct btrfs_fs_info *fs_info)
> +{
> +	int ret;
> +	struct btrfs_root *dev_root = fs_info->dev_root;
> +	struct btrfs_path *path;
> +	struct btrfs_key key;
> +	struct extent_buffer *eb;
> +	struct btrfs_dev_replace_item *ptr;
> +	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
> +
> +	btrfs_dev_replace_lock(dev_replace);
> +	if (!dev_replace->is_valid ||
> +	    !dev_replace->item_needs_writeback) {
> +		btrfs_dev_replace_unlock(dev_replace);
> +		return 0;
> +	}
> +	btrfs_dev_replace_unlock(dev_replace);
> +
> +	key.objectid = 0;
> +	key.type = BTRFS_DEV_REPLACE_KEY;
> +	key.offset = 0;
> +
> +	path = btrfs_alloc_path();
> +	if (!path) {
> +		ret = -ENOMEM;
> +		goto out;
> +	}
> +	ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
> +	if (ret < 0) {
> +		pr_warn("btrfs: error %d while searching for dev_replace item!\n",
> +			ret);
> +		goto out;
> +	}
> +
> +	if (ret == 0 &&
> +	    btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
> +		/*
> +		 * need to delete old one and insert a new one.
> +		 * Since no attempt is made to recover any old state, if the
> +		 * dev_replace state is 'running', the data on the target
> +		 * drive is lost.
> +		 * It would be possible to recover the state: just make sure
> +		 * that the beginning of the item is never changed and always
> +		 * contains all the essential information. Then read this
> +		 * minimal set of information and use it as a base for the
> +		 * new state.
> +		 */
> +		ret = btrfs_del_item(trans, dev_root, path);
> +		if (ret != 0) {
> +			pr_warn("btrfs: delete too small dev_replace item failed %d!\n",
> +				ret);
> +			goto out;
> +		}
> +		ret = 1;
> +	}
> +
> +	if (ret == 1) {
> +		/* need to insert a new item */
> +		btrfs_release_path(path);
> +		ret = btrfs_insert_empty_item(trans, dev_root, path,
> +					      &key, sizeof(*ptr));
> +		if (ret < 0) {
> +			pr_warn("btrfs: insert dev_replace item failed %d!\n",
> +				ret);
> +			goto out;
> +		}
> +	}
> +
> +	eb = path->nodes[0];
> +	ptr = btrfs_item_ptr(eb, path->slots[0],
> +			     struct btrfs_dev_replace_item);
> +
> +	btrfs_dev_replace_lock(dev_replace);
> +	if (dev_replace->srcdev)
> +		btrfs_set_dev_replace_src_devid(eb, ptr,
> +			dev_replace->srcdev->devid);
> +	else
> +		btrfs_set_dev_replace_src_devid(eb, ptr, (u64)-1);
> +	btrfs_set_dev_replace_cont_reading_from_srcdev_mode(eb, ptr,
> +		dev_replace->cont_reading_from_srcdev_mode);
> +	btrfs_set_dev_replace_replace_state(eb, ptr,
> +		dev_replace->replace_state);
> +	btrfs_set_dev_replace_time_started(eb, ptr, dev_replace->time_started);
> +	btrfs_set_dev_replace_time_stopped(eb, ptr, dev_replace->time_stopped);
> +	btrfs_set_dev_replace_num_write_errors(eb, ptr,
> +		atomic64_read(&dev_replace->num_write_errors));
> +	btrfs_set_dev_replace_num_uncorrectable_read_errors(eb, ptr,
> +		atomic64_read(&dev_replace->num_uncorrectable_read_errors));
> +	dev_replace->cursor_left_last_write_of_item =
> +		dev_replace->cursor_left;
> +	btrfs_set_dev_replace_cursor_left(eb, ptr,
> +		dev_replace->cursor_left_last_write_of_item);
> +	btrfs_set_dev_replace_cursor_right(eb, ptr,
> +		dev_replace->cursor_right);
> +	dev_replace->item_needs_writeback = 0;
> +	btrfs_dev_replace_unlock(dev_replace);
> +
> +	btrfs_mark_buffer_dirty(eb);
> +
> +out:
> +	btrfs_free_path(path);
> +
> +	return ret;
> +}
> +
> +void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info)
> +{
> +	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
> +
> +	dev_replace->committed_cursor_left =
> +		dev_replace->cursor_left_last_write_of_item;
> +}
> +
> +static u64 btrfs_get_seconds_since_1970(void)
> +{
> +	struct timespec t = CURRENT_TIME_SEC;
> +
> +	return t.tv_sec;
> +}
> +
> +int btrfs_dev_replace_start(struct btrfs_root *root,
> +			    struct btrfs_ioctl_dev_replace_args *args)
> +{
> +	struct btrfs_trans_handle *trans;
> +	struct btrfs_fs_info *fs_info = root->fs_info;
> +	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
> +	int ret;
> +	struct btrfs_device *tgt_device = NULL;
> +	struct btrfs_device *src_device = NULL;
> +
> +	switch (args->start.cont_reading_from_srcdev_mode) {
> +	case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS:
> +	case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID:
> +		break;
> +	default:
> +		return -EINVAL;
> +	}
> +
> +	if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') ||
> +	    args->start.tgtdev_name[0] == '\0')
> +		return -EINVAL;
> +
> +	mutex_lock(&fs_info->volume_mutex);
> +	ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name,
> +					    &tgt_device);
> +	if (ret) {
> +		pr_err("btrfs: target device %s is invalid!\n",
> +		       args->start.tgtdev_name);
> +		mutex_unlock(&fs_info->volume_mutex);
> +		return -EINVAL;
> +	}
> +
> +	ret = btrfs_dev_replace_find_srcdev(root, args->start.srcdevid,
> +					    args->start.srcdev_name,
> +					    &src_device);
> +	mutex_unlock(&fs_info->volume_mutex);
> +	if (ret) {
> +		ret = -EINVAL;
> +		goto leave_no_lock;
> +	}
> +
> +	if (tgt_device->total_bytes < src_device->total_bytes) {
> +		pr_err("btrfs: target device is smaller than source device!\n");
> +		ret = -EINVAL;
> +		goto leave_no_lock;
> +	}
> +
> +	btrfs_dev_replace_lock(dev_replace);
> +	switch (dev_replace->replace_state) {
> +	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
> +	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
> +	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
> +		break;
> +	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
> +	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
> +		args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED;
> +		goto leave;
> +	}
> +
> +	dev_replace->cont_reading_from_srcdev_mode =
> +		args->start.cont_reading_from_srcdev_mode;
> +	WARN_ON(!src_device);
> +	dev_replace->srcdev = src_device;
> +	WARN_ON(!tgt_device);
> +	dev_replace->tgtdev = tgt_device;
> +
> +	tgt_device->total_bytes = src_device->total_bytes;
> +	tgt_device->disk_total_bytes = src_device->disk_total_bytes;
> +	tgt_device->bytes_used = src_device->bytes_used;
> +
> +	/*
> +	 * from now on, the writes to the srcdev are all duplicated to
> +	 * go to the tgtdev as well (refer to btrfs_map_block()).
> +	 */
> +	dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
> +	dev_replace->time_started = btrfs_get_seconds_since_1970();
> +	dev_replace->cursor_left = 0;
> +	dev_replace->committed_cursor_left = 0;
> +	dev_replace->cursor_left_last_write_of_item = 0;
> +	dev_replace->cursor_right = 0;
> +	dev_replace->is_valid = 1;
> +	dev_replace->item_needs_writeback = 1;
> +	args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
> +	btrfs_dev_replace_unlock(dev_replace);
> +
> +	btrfs_wait_ordered_extents(root, 0);
> +
> +	/* force writing the updated state information to disk */
> +	trans = btrfs_start_transaction(root, 0);

Why is a btrfs_start_transaction() needed here?  Is there a reason for it?
(The same question applies to some other places as well.)

thanks,
liubo

> +	if (IS_ERR(trans)) {
> +		ret = PTR_ERR(trans);
> +		btrfs_dev_replace_lock(dev_replace);
> +		goto leave;
> +	}
> +
> +	ret = btrfs_commit_transaction(trans, root);
> +	WARN_ON(ret);
> +
> +	/* the disk copy procedure reuses the scrub code */
> +	ret = btrfs_scrub_dev(fs_info, src_device->devid, 0,
> +			      src_device->total_bytes,
> +			      &dev_replace->scrub_progress, 0, 1);
> +
> +	ret = btrfs_dev_replace_finishing(root->fs_info, ret);
> +	WARN_ON(ret);
> +
> +	return 0;
> +
> +leave:
> +	dev_replace->srcdev = NULL;
> +	dev_replace->tgtdev = NULL;
> +	btrfs_dev_replace_unlock(dev_replace);
> +leave_no_lock:
> +	if (tgt_device)
> +		btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
> +	return ret;
> +}
> +
> +static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
> +				       int scrub_ret)
> +{
> +	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
> +	struct btrfs_device *tgt_device;
> +	struct btrfs_device *src_device;
> +	struct btrfs_root *root = fs_info->tree_root;
> +	u8 uuid_tmp[BTRFS_UUID_SIZE];
> +	struct btrfs_trans_handle *trans;
> +	int ret = 0;
> +
> +	/* don't allow cancel or unmount to disturb the finishing procedure */
> +	mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
> +
> +	btrfs_dev_replace_lock(dev_replace);
> +	/* was the operation canceled, or is it finished? */
> +	if (dev_replace->replace_state !=
> +	    BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) {
> +		btrfs_dev_replace_unlock(dev_replace);
> +		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
> +		return 0;
> +	}
> +
> +	tgt_device = dev_replace->tgtdev;
> +	src_device = dev_replace->srcdev;
> +	btrfs_dev_replace_unlock(dev_replace);
> +
> +	/* replace old device with new one in mapping tree */
> +	if (!scrub_ret)
> +		btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
> +								src_device,
> +								tgt_device);
> +
> +	/*
> +	 * flush all outstanding I/O and inode extent mappings before the
> +	 * copy operation is declared as being finished
> +	 */
> +	btrfs_start_delalloc_inodes(root, 0);
> +	btrfs_wait_ordered_extents(root, 0);
> +
> +	trans = btrfs_start_transaction(root, 0);
> +	if (IS_ERR(trans)) {
> +		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
> +		return PTR_ERR(trans);
> +	}
> +	ret = btrfs_commit_transaction(trans, root);
> +	WARN_ON(ret);
> +
> +	/* keep away write_all_supers() during the finishing procedure */
> +	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
> +	btrfs_dev_replace_lock(dev_replace);
> +	dev_replace->replace_state =
> +		scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
> +			  : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED;
> +	dev_replace->tgtdev = NULL;
> +	dev_replace->srcdev = NULL;
> +	dev_replace->time_stopped = btrfs_get_seconds_since_1970();
> +	dev_replace->item_needs_writeback = 1;
> +
> +	if (scrub_ret) {
> +		printk_in_rcu(KERN_ERR
> +			      "btrfs: btrfs_scrub_dev(%s, %llu, %s) failed %d\n",
> +			      rcu_str_deref(src_device->name),
> +			      src_device->devid,
> +			      rcu_str_deref(tgt_device->name), scrub_ret);
> +		btrfs_dev_replace_unlock(dev_replace);
> +		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
> +		if (tgt_device)
> +			btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
> +		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
> +
> +		return 0;
> +	}
> +
> +	tgt_device->is_tgtdev_for_dev_replace = 0;
> +	tgt_device->devid = src_device->devid;
> +	src_device->devid = BTRFS_DEV_REPLACE_DEVID;
> +	tgt_device->bytes_used = src_device->bytes_used;
> +	memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp));
> +	memcpy(tgt_device->uuid, src_device->uuid, sizeof(tgt_device->uuid));
> +	memcpy(src_device->uuid, uuid_tmp, sizeof(src_device->uuid));
> +	tgt_device->total_bytes = src_device->total_bytes;
> +	tgt_device->disk_total_bytes = src_device->disk_total_bytes;
> +	tgt_device->bytes_used = src_device->bytes_used;
> +	if (fs_info->sb->s_bdev == src_device->bdev)
> +		fs_info->sb->s_bdev = tgt_device->bdev;
> +	if (fs_info->fs_devices->latest_bdev == src_device->bdev)
> +		fs_info->fs_devices->latest_bdev = tgt_device->bdev;
> +	list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
> +
> +	btrfs_rm_dev_replace_srcdev(fs_info, src_device);
> +	if (src_device->bdev) {
> +		/* zero out the old super */
> +		btrfs_scratch_superblock(src_device);
> +	}
> +	/*
> +	 * this is again a consistent state where no dev_replace procedure
> +	 * is running, the target device is part of the filesystem, the
> +	 * source device is not part of the filesystem anymore and its 1st
> +	 * superblock is scratched out so that it is no longer marked to
> +	 * belong to this filesystem.
> +	 */
> +	btrfs_dev_replace_unlock(dev_replace);
> +	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
> +
> +	/* write back the superblocks */
> +	trans = btrfs_start_transaction(root, 0);
> +	if (!IS_ERR(trans))
> +		btrfs_commit_transaction(trans, root);
> +
> +	mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
> +
> +	return 0;
> +}
> +
> +static void btrfs_dev_replace_update_device_in_mapping_tree(
> +						struct btrfs_fs_info *fs_info,
> +						struct btrfs_device *srcdev,
> +						struct btrfs_device *tgtdev)
> +{
> +	struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
> +	struct extent_map *em;
> +	struct map_lookup *map;
> +	u64 start = 0;
> +	int i;
> +
> +	write_lock(&em_tree->lock);
> +	do {
> +		em = lookup_extent_mapping(em_tree, start, (u64)-1);
> +		if (!em)
> +			break;
> +		map = (struct map_lookup *)em->bdev;
> +		for (i = 0; i < map->num_stripes; i++)
> +			if (srcdev == map->stripes[i].dev)
> +				map->stripes[i].dev = tgtdev;
> +		start = em->start + em->len;
> +		free_extent_map(em);
> +	} while (start);
> +	write_unlock(&em_tree->lock);
> +}
> +
> +static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid,
> +					 char *srcdev_name,
> +					 struct btrfs_device **device)
> +{
> +	int ret;
> +
> +	if (srcdevid) {
> +		*device = btrfs_find_device(root->fs_info, srcdevid, NULL,
> +					    NULL);
> +		if (!*device)
> +			ret = -ENOENT;
> +	} else {
> +		ret = btrfs_find_device_missing_or_by_path(root, srcdev_name,
> +							   device);
> +	}
> +	return ret;
> +}
> +
> +void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
> +			      struct btrfs_ioctl_dev_replace_args *args)
> +{
> +	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
> +
> +	btrfs_dev_replace_lock(dev_replace);
> +	/* even if !dev_replace_is_valid, the values are good enough for
> +	 * the replace_status ioctl */
> +	args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
> +	args->status.replace_state = dev_replace->replace_state;
> +	args->status.time_started = dev_replace->time_started;
> +	args->status.time_stopped = dev_replace->time_stopped;
> +	args->status.num_write_errors =
> +		atomic64_read(&dev_replace->num_write_errors);
> +	args->status.num_uncorrectable_read_errors =
> +		atomic64_read(&dev_replace->num_uncorrectable_read_errors);
> +	switch (dev_replace->replace_state) {
> +	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
> +	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
> +		args->status.progress_1000 = 0;
> +		break;
> +	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
> +		args->status.progress_1000 = 1000;
> +		break;
> +	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
> +	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
> +		args->status.progress_1000 = div64_u64(dev_replace->cursor_left,
> +			div64_u64(dev_replace->srcdev->total_bytes, 1000));
> +		break;
> +	}
> +	btrfs_dev_replace_unlock(dev_replace);
> +}
> +
> +int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info,
> +			     struct btrfs_ioctl_dev_replace_args *args)
> +{
> +	args->result = __btrfs_dev_replace_cancel(fs_info);
> +	return 0;
> +}
> +
> +static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
> +{
> +	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
> +	struct btrfs_device *tgt_device = NULL;
> +	struct btrfs_trans_handle *trans;
> +	struct btrfs_root *root = fs_info->tree_root;
> +	u64 result;
> +	int ret;
> +
> +	mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
> +	btrfs_dev_replace_lock(dev_replace);
> +	switch (dev_replace->replace_state) {
> +	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
> +	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
> +	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
> +		result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
> +		btrfs_dev_replace_unlock(dev_replace);
> +		goto leave;
> +	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
> +	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
> +		result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
> +		tgt_device = dev_replace->tgtdev;
> +		dev_replace->tgtdev = NULL;
> +		dev_replace->srcdev = NULL;
> +		break;
> +	}
> +	dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED;
> +	dev_replace->time_stopped = btrfs_get_seconds_since_1970();
> +	dev_replace->item_needs_writeback = 1;
> +	btrfs_dev_replace_unlock(dev_replace);
> +	btrfs_scrub_cancel(fs_info);
> +
> +	trans = btrfs_start_transaction(root, 0);
> +	if (IS_ERR(trans)) {
> +		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
> +		return PTR_ERR(trans);
> +	}
> +	ret = btrfs_commit_transaction(trans, root);
> +	WARN_ON(ret);
> +	if (tgt_device)
> +		btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
> +
> +leave:
> +	mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
> +	return result;
> +}
> +
> +void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info)
> +{
> +	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
> +
> +	mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
> +	btrfs_dev_replace_lock(dev_replace);
> +	switch (dev_replace->replace_state) {
> +	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
> +	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
> +	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
> +	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
> +		break;
> +	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
> +		dev_replace->replace_state =
> +			BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
> +		dev_replace->time_stopped = btrfs_get_seconds_since_1970();
> +		dev_replace->item_needs_writeback = 1;
> +		pr_info("btrfs: suspending dev_replace for unmount\n");
> +		break;
> +	}
> +
> +	btrfs_dev_replace_unlock(dev_replace);
> +	mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
> +}
> +
> +/* resume dev_replace procedure that was interrupted by unmount */
> +int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
> +{
> +	struct task_struct *task;
> +	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
> +
> +	btrfs_dev_replace_lock(dev_replace);
> +	switch (dev_replace->replace_state) {
> +	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
> +	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
> +	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
> +		btrfs_dev_replace_unlock(dev_replace);
> +		return 0;
> +	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
> +		break;
> +	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
> +		dev_replace->replace_state =
> +			BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
> +		break;
> +	}
> +	if (!dev_replace->tgtdev || !dev_replace->tgtdev->bdev) {
> +		pr_info("btrfs: cannot continue dev_replace, tgtdev is missing\n"
> +			"btrfs: you may cancel the operation after 'mount -o degraded'\n");
> +		btrfs_dev_replace_unlock(dev_replace);
> +		return 0;
> +	}
> +	btrfs_dev_replace_unlock(dev_replace);
> +
> +	WARN_ON(atomic_xchg(
> +		&fs_info->mutually_exclusive_operation_running, 1));
> +	task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl");
> +	return PTR_RET(task);
> +}
> +
> +static int btrfs_dev_replace_kthread(void *data)
> +{
> +	struct btrfs_fs_info *fs_info = data;
> +	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
> +	struct btrfs_ioctl_dev_replace_args *status_args;
> +	u64 progress;
> +
> +	status_args = kzalloc(sizeof(*status_args), GFP_NOFS);
> +	if (status_args) {
> +		btrfs_dev_replace_status(fs_info, status_args);
> +		progress = status_args->status.progress_1000;
> +		kfree(status_args);
> +		do_div(progress, 10);
> +		printk_in_rcu(KERN_INFO
> +			      "btrfs: continuing dev_replace from %s (devid %llu) to %s @%u%%\n",
> +			      dev_replace->srcdev->missing ? "<missing disk>" :
> +				rcu_str_deref(dev_replace->srcdev->name),
> +			      dev_replace->srcdev->devid,
> +			      dev_replace->tgtdev ?
> +				rcu_str_deref(dev_replace->tgtdev->name) :
> +				"<missing target disk>",
> +			      (unsigned int)progress);
> +	}
> +	btrfs_dev_replace_continue_on_mount(fs_info);
> +	atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
> +
> +	return 0;
> +}
> +
> +static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info)
> +{
> +	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
> +	int ret;
> +
> +	ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid,
> +			      dev_replace->committed_cursor_left,
> +			      dev_replace->srcdev->total_bytes,
> +			      &dev_replace->scrub_progress, 0, 1);
> +	ret = btrfs_dev_replace_finishing(fs_info, ret);
> +	WARN_ON(ret);
> +	return 0;
> +}
> +
> +int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
> +{
> +	if (!dev_replace->is_valid)
> +		return 0;
> +
> +	switch (dev_replace->replace_state) {
> +	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
> +	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
> +	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
> +		return 0;
> +	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
> +	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
> +		/*
> +		 * return true even if tgtdev is missing (this is
> +		 * something that can happen if the dev_replace
> +		 * procedure is suspended by an umount and then
> +		 * the tgtdev is missing (or "btrfs dev scan") was
> +		 * not called and the the filesystem is remounted
> +		 * in degraded state. This does not stop the
> +		 * dev_replace procedure. It needs to be canceled
> +		 * manually if the cancelation is wanted.
> +		 */
> +		break;
> +	}
> +	return 1;
> +}
> +
> +void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace)
> +{
> +	/* the beginning is just an optimization for the typical case */
> +	if (atomic_read(&dev_replace->nesting_level) == 0) {
> +acquire_lock:
> +		/* this is not a nested case where the same thread
> +		 * is trying to acqurire the same lock twice */
> +		mutex_lock(&dev_replace->lock);
> +		mutex_lock(&dev_replace->lock_management_lock);
> +		dev_replace->lock_owner = current->pid;
> +		atomic_inc(&dev_replace->nesting_level);
> +		mutex_unlock(&dev_replace->lock_management_lock);
> +		return;
> +	}
> +
> +	mutex_lock(&dev_replace->lock_management_lock);
> +	if (atomic_read(&dev_replace->nesting_level) > 0 &&
> +	    dev_replace->lock_owner == current->pid) {
> +		WARN_ON(!mutex_is_locked(&dev_replace->lock));
> +		atomic_inc(&dev_replace->nesting_level);
> +		mutex_unlock(&dev_replace->lock_management_lock);
> +		return;
> +	}
> +
> +	mutex_unlock(&dev_replace->lock_management_lock);
> +	goto acquire_lock;
> +}
> +
> +void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace)
> +{
> +	WARN_ON(!mutex_is_locked(&dev_replace->lock));
> +	mutex_lock(&dev_replace->lock_management_lock);
> +	WARN_ON(atomic_read(&dev_replace->nesting_level) < 1);
> +	WARN_ON(dev_replace->lock_owner != current->pid);
> +	atomic_dec(&dev_replace->nesting_level);
> +	if (atomic_read(&dev_replace->nesting_level) == 0) {
> +		dev_replace->lock_owner = 0;
> +		mutex_unlock(&dev_replace->lock_management_lock);
> +		mutex_unlock(&dev_replace->lock);
> +	} else {
> +		mutex_unlock(&dev_replace->lock_management_lock);
> +	}
> +}
> diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h
> new file mode 100644
> index 0000000..20035cb
> --- /dev/null
> +++ b/fs/btrfs/dev-replace.h
> @@ -0,0 +1,44 @@
> +/*
> + * Copyright (C) STRATO AG 2012.  All rights reserved.
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public
> + * License v2 as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public
> + * License along with this program; if not, write to the
> + * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
> + * Boston, MA 02111-1307, USA.
> + */
> +
> +#ifndef __BTRFS_DEV_REPLACE__
> +#define __BTRFS_DEV_REPLACE__
> +
> +struct btrfs_ioctl_dev_replace_args;
> +
> +/* nesting lock that is taken around accesses to struct btrfs_dev_replace */
> +void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace);
> +void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace);
> +
> +/* reads the dev_replace item of the device tree into fs_info->dev_replace */
> +int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info);
> +/* called from commit_transaction, writes changed replace state to disk */
> +int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
> +			  struct btrfs_fs_info *fs_info);
> +void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info);
> +
> +/* operations that work on the ioctl argument block */
> +int btrfs_dev_replace_start(struct btrfs_root *root,
> +			    struct btrfs_ioctl_dev_replace_args *args);
> +void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
> +			      struct btrfs_ioctl_dev_replace_args *args);
> +int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info,
> +			     struct btrfs_ioctl_dev_replace_args *args);
> +
> +void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info);
> +int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info);
> +int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace);
> +
> +/* atomically adds one to a 64 bit statistics counter */
> +static inline void btrfs_dev_replace_stats_inc(atomic64_t *stat_value)
> +{
> +	atomic64_inc(stat_value);
> +}
> +#endif
> diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
> index 731e287..62006ba 100644
> --- a/fs/btrfs/ioctl.h
> +++ b/fs/btrfs/ioctl.h
> @@ -123,6 +123,48 @@ struct btrfs_ioctl_scrub_args {
>  	__u64 unused[(1024-32-sizeof(struct btrfs_scrub_progress))/8];
>  };
>  
> +/* values of btrfs_ioctl_dev_replace_start_params.cont_reading_from_srcdev_mode */
> +#define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS	0
> +#define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID	1
> +/* input of the start command: source device, target device and read mode */
> +struct btrfs_ioctl_dev_replace_start_params {
> +	__u64 srcdevid;	/* in, if 0, use srcdev_name instead */
> +	__u8 srcdev_name[BTRFS_PATH_NAME_MAX + 1];	/* in */
> +	__u8 tgtdev_name[BTRFS_PATH_NAME_MAX + 1];	/* in */
> +	__u64 cont_reading_from_srcdev_mode;	/* in, see #define
> +						 * above */
> +};
> +
> +/* values of btrfs_ioctl_dev_replace_status_params.replace_state */
> +#define BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED	0
> +#define BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED		1
> +#define BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED		2
> +#define BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED		3
> +#define BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED		4
> +/* output only: state, progress, timestamps and error counters */
> +struct btrfs_ioctl_dev_replace_status_params {
> +	__u64 replace_state;	/* out, see #define above */
> +	__u64 progress_1000;	/* out, 0 <= x <= 1000 */
> +	__u64 time_started;	/* out, seconds since 1-Jan-1970 */
> +	__u64 time_stopped;	/* out, seconds since 1-Jan-1970 */
> +	__u64 num_write_errors;	/* out */
> +	__u64 num_uncorrectable_read_errors;	/* out */
> +};
> +
> +/*
> + * The RESULT_* values are what the kernel stores in
> + * btrfs_ioctl_dev_replace_args.result. The CMD_* values are presumably
> + * the valid values of btrfs_ioctl_dev_replace_args.cmd (TODO confirm
> + * against the ioctl handler).
> + */
> +#define BTRFS_IOCTL_DEV_REPLACE_CMD_START			0
> +#define BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS			1
> +#define BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL			2
> +#define BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR			0
> +#define BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED		1
> +#define BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED		2
> +/* argument block of BTRFS_IOC_DEV_REPLACE */
> +struct btrfs_ioctl_dev_replace_args {
> +	__u64 cmd;	/* in */
> +	__u64 result;	/* out */
> +
> +	union {
> +		struct btrfs_ioctl_dev_replace_start_params start;
> +		struct btrfs_ioctl_dev_replace_status_params status;
> +	};	/* in/out */
> +
> +	__u64 spare[64];
> +};
> +
>  #define BTRFS_DEVICE_PATH_NAME_MAX 1024
>  struct btrfs_ioctl_dev_info_args {
>  	__u64 devid;				/* in/out */
> @@ -453,4 +495,7 @@ struct btrfs_ioctl_send_args {
>  			       struct btrfs_ioctl_qgroup_limit_args)
>  #define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \
>  				      struct btrfs_ioctl_get_dev_stats)
> +#define BTRFS_IOC_DEV_REPLACE _IOWR(BTRFS_IOCTL_MAGIC, 53, \
> +				    struct btrfs_ioctl_dev_replace_args)
> +
>  #endif
> -- 
> 1.8.0
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Stefan Behrens Nov. 8, 2012, 5:24 p.m. UTC | #2
On Thu, 8 Nov 2012 22:50:47 +0800, Liu Bo wrote:
> On Tue, Nov 06, 2012 at 05:38:33PM +0100, Stefan Behrens wrote:

>> +out:
>> +	if (path) {
>> +		btrfs_release_path(path);
>> +		btrfs_free_path(path);
> 
> btrfs_free_path(path) will do release for you :)
> 

Right :) Thanks.


>> +/*
>> + * Start replacing a source device by a target device.
>> + *
>> + * The arguments are validated, the target device is set up and the
>> + * source device is looked up under the volume_mutex. Then the state is
>> + * switched to STARTED, a transaction is committed, the copy is done by
>> + * btrfs_scrub_dev() and btrfs_dev_replace_finishing() completes the
>> + * operation.
>> + *
>> + * args->result is set to BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED
>> + * if a replace operation is already started or suspended (0 is returned
>> + * in this case), and to BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR when the
>> + * state was switched to STARTED.
>> + *
>> + * Returns -EINVAL for invalid arguments or devices, the error of
>> + * btrfs_start_transaction() if that fails, otherwise 0. A failing commit
>> + * only triggers a WARN_ON(). The scrub result is handed over to
>> + * btrfs_dev_replace_finishing(), whose own failure triggers a WARN_ON().
>> + */
>> +int btrfs_dev_replace_start(struct btrfs_root *root,
>> +			    struct btrfs_ioctl_dev_replace_args *args)
>> +{
>> +	struct btrfs_trans_handle *trans;
>> +	struct btrfs_fs_info *fs_info = root->fs_info;
>> +	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
>> +	int ret;
>> +	struct btrfs_device *tgt_device = NULL;
>> +	struct btrfs_device *src_device = NULL;
>> +
>> +	switch (args->start.cont_reading_from_srcdev_mode) {
>> +	case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS:
>> +	case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID:
>> +		break;
>> +	default:
>> +		return -EINVAL;
>> +	}
>> +
>> +	if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') ||
>> +	    args->start.tgtdev_name[0] == '\0')
>> +		return -EINVAL;
>> +
>> +	mutex_lock(&fs_info->volume_mutex);
>> +	ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name,
>> +					    &tgt_device);
>> +	if (ret) {
>> +		pr_err("btrfs: target device %s is invalid!\n",
>> +		       args->start.tgtdev_name);
>> +		mutex_unlock(&fs_info->volume_mutex);
>> +		return -EINVAL;
>> +	}
>> +
>> +	ret = btrfs_dev_replace_find_srcdev(root, args->start.srcdevid,
>> +					    args->start.srcdev_name,
>> +					    &src_device);
>> +	mutex_unlock(&fs_info->volume_mutex);
>> +	if (ret) {
>> +		ret = -EINVAL;
>> +		goto leave_no_lock;
>> +	}
>> +
>> +	if (tgt_device->total_bytes < src_device->total_bytes) {
>> +		pr_err("btrfs: target device is smaller than source device!\n");
>> +		ret = -EINVAL;
>> +		goto leave_no_lock;
>> +	}
>> +
>> +	btrfs_dev_replace_lock(dev_replace);
>> +	switch (dev_replace->replace_state) {
>> +	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
>> +	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
>> +	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
>> +		break;
>> +	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
>> +	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
>> +		args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED;
>> +		/*
>> +		 * another replace operation owns dev_replace->srcdev and
>> +		 * dev_replace->tgtdev. Don't go through the leave label,
>> +		 * it would reset these pointers. Only drop the lock and
>> +		 * destroy the target device that was set up above.
>> +		 */
>> +		btrfs_dev_replace_unlock(dev_replace);
>> +		goto leave_no_lock;
>> +	}
>> +
>> +	dev_replace->cont_reading_from_srcdev_mode =
>> +		args->start.cont_reading_from_srcdev_mode;
>> +	WARN_ON(!src_device);
>> +	dev_replace->srcdev = src_device;
>> +	WARN_ON(!tgt_device);
>> +	dev_replace->tgtdev = tgt_device;
>> +
>> +	tgt_device->total_bytes = src_device->total_bytes;
>> +	tgt_device->disk_total_bytes = src_device->disk_total_bytes;
>> +	tgt_device->bytes_used = src_device->bytes_used;
>> +
>> +	/*
>> +	 * from now on, the writes to the srcdev are all duplicated to
>> +	 * go to the tgtdev as well (refer to btrfs_map_block()).
>> +	 */
>> +	dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
>> +	dev_replace->time_started = btrfs_get_seconds_since_1970();
>> +	dev_replace->cursor_left = 0;
>> +	dev_replace->committed_cursor_left = 0;
>> +	dev_replace->cursor_left_last_write_of_item = 0;
>> +	dev_replace->cursor_right = 0;
>> +	dev_replace->is_valid = 1;
>> +	dev_replace->item_needs_writeback = 1;
>> +	args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
>> +	btrfs_dev_replace_unlock(dev_replace);
>> +
>> +	btrfs_wait_ordered_extents(root, 0);
>> +
>> +	/* force writing the updated state information to disk */
>> +	trans = btrfs_start_transaction(root, 0);
> 
> why a start_transaction here?  Any reasons?
> (same question also for some other places)
> 

Without this transaction, there is outstanding I/O which is not flushed.
Pending writes that go only to the old disk need to be flushed before
the mode is switched to write all live data to the source disk and to
the target disk as well. The copy operation that is part of the scrub
code works on the commit root for performance reasons. Every write
request that is performed after the commit root is established needs to
go to both disks. Those requests that already have the bdev assigned
(i.e., btrfs_map_bio() was already called) cannot be duplicated anymore
to write to the new disk as well.

btrfs_dev_replace_finishing() looks similar and goes through a
transaction commit between the steps where the bdev in the mapping tree
is swapped and the step when the old bdev is freed. Otherwise the bdev
would be accessed after being freed.


>> +	if (IS_ERR(trans)) {
>> +		/*
>> +		 * NOTE(review): the leave label resets srcdev/tgtdev, but
>> +		 * replace_state was already set to STARTED above and is
>> +		 * not reverted here - confirm that this is intended.
>> +		 */
>> +		ret = PTR_ERR(trans);
>> +		btrfs_dev_replace_lock(dev_replace);
>> +		goto leave;
>> +	}
>> +
>> +	ret = btrfs_commit_transaction(trans, root);
>> +	WARN_ON(ret);
>> +
>> +	/* the disk copy procedure reuses the scrub code */
>> +	ret = btrfs_scrub_dev(fs_info, src_device->devid, 0,
>> +			      src_device->total_bytes,
>> +			      &dev_replace->scrub_progress, 0, 1);
>> +
>> +	ret = btrfs_dev_replace_finishing(root->fs_info, ret);
>> +	WARN_ON(ret);
>> +
>> +	return 0;
>> +
>> +leave:
>> +	dev_replace->srcdev = NULL;
>> +	dev_replace->tgtdev = NULL;
>> +	btrfs_dev_replace_unlock(dev_replace);
>> +leave_no_lock:
>> +	if (tgt_device)
>> +		btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
>> +	return ret;
>> +}
>> +
>> +/*
>> + * Finish the replace procedure after the copy (scrub) has returned.
>> + *
>> + * @scrub_ret is the result of btrfs_scrub_dev(). If it is 0, the target
>> + * device takes over devid, uuid and sizes of the source device and the
>> + * source device is removed. Otherwise the state is set to CANCELED and
>> + * the target device is destroyed.
>> + *
>> + * Nothing is done if the state is not STARTED anymore.
>> + *
>> + * Returns 0, or the error of the first btrfs_start_transaction() call.
>> + */
>> +static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
>> +				       int scrub_ret)
>> +{
>> +	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
>> +	struct btrfs_device *tgt_device;
>> +	struct btrfs_device *src_device;
>> +	struct btrfs_root *root = fs_info->tree_root;
>> +	u8 uuid_tmp[BTRFS_UUID_SIZE];
>> +	struct btrfs_trans_handle *trans;
>> +	int ret = 0;
>> +
>> +	/* don't allow cancel or unmount to disturb the finishing procedure */
>> +	mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
>> +
>> +	btrfs_dev_replace_lock(dev_replace);
>> +	/* was the operation canceled, or is it finished? */
>> +	if (dev_replace->replace_state !=
>> +	    BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) {
>> +		btrfs_dev_replace_unlock(dev_replace);
>> +		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
>> +		return 0;
>> +	}
>> +
>> +	tgt_device = dev_replace->tgtdev;
>> +	src_device = dev_replace->srcdev;
>> +	btrfs_dev_replace_unlock(dev_replace);
>> +
>> +	/* replace old device with new one in mapping tree */
>> +	if (!scrub_ret)
>> +		btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
>> +								src_device,
>> +								tgt_device);
>> +
>> +	/*
>> +	 * flush all outstanding I/O and inode extent mappings before the
>> +	 * copy operation is declared as being finished
>> +	 */
>> +	btrfs_start_delalloc_inodes(root, 0);
>> +	btrfs_wait_ordered_extents(root, 0);
>> +
>> +	trans = btrfs_start_transaction(root, 0);
>> +	if (IS_ERR(trans)) {
>> +		/*
>> +		 * NOTE(review): on this path replace_state stays STARTED
>> +		 * although the mapping tree may already have been updated
>> +		 * above - confirm that this is intended.
>> +		 */
>> +		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
>> +		return PTR_ERR(trans);
>> +	}
>> +	ret = btrfs_commit_transaction(trans, root);
>> +	WARN_ON(ret);
>> +
>> +	/* keep away write_all_supers() during the finishing procedure */
>> +	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
>> +	btrfs_dev_replace_lock(dev_replace);
>> +	dev_replace->replace_state =
>> +		scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
>> +			  : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED;
>> +	dev_replace->tgtdev = NULL;
>> +	dev_replace->srcdev = NULL;
>> +	dev_replace->time_stopped = btrfs_get_seconds_since_1970();
>> +	dev_replace->item_needs_writeback = 1;
>> +
>> +	if (scrub_ret) {
>> +		/*
>> +		 * NOTE(review): src_device and tgt_device are dereferenced
>> +		 * for the message, but tgt_device is checked for NULL only
>> +		 * a few lines below. Either that check is redundant or
>> +		 * this can oops - confirm whether tgtdev can be NULL while
>> +		 * the state is STARTED.
>> +		 */
>> +		printk_in_rcu(KERN_ERR
>> +			      "btrfs: btrfs_scrub_dev(%s, %llu, %s) failed %d\n",
>> +			      rcu_str_deref(src_device->name),
>> +			      src_device->devid,
>> +			      rcu_str_deref(tgt_device->name), scrub_ret);
>> +		btrfs_dev_replace_unlock(dev_replace);
>> +		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
>> +		if (tgt_device)
>> +			btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
>> +		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
>> +
>> +		/* the scrub failure is logged above, it is not returned */
>> +		return 0;
>> +	}
>> +
>> +	/*
>> +	 * the target device takes over the identity of the source device:
>> +	 * the devid is moved, the uuids are swapped and the sizes are
>> +	 * copied. The source device gets BTRFS_DEV_REPLACE_DEVID.
>> +	 */
>> +	tgt_device->is_tgtdev_for_dev_replace = 0;
>> +	tgt_device->devid = src_device->devid;
>> +	src_device->devid = BTRFS_DEV_REPLACE_DEVID;
>> +	tgt_device->bytes_used = src_device->bytes_used;
>> +	memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp));
>> +	memcpy(tgt_device->uuid, src_device->uuid, sizeof(tgt_device->uuid));
>> +	memcpy(src_device->uuid, uuid_tmp, sizeof(src_device->uuid));
>> +	tgt_device->total_bytes = src_device->total_bytes;
>> +	tgt_device->disk_total_bytes = src_device->disk_total_bytes;
>> +	/* bytes_used was already copied above, this repeats the assignment */
>> +	tgt_device->bytes_used = src_device->bytes_used;
>> +	/* redirect references to the bdev of the source device */
>> +	if (fs_info->sb->s_bdev == src_device->bdev)
>> +		fs_info->sb->s_bdev = tgt_device->bdev;
>> +	if (fs_info->fs_devices->latest_bdev == src_device->bdev)
>> +		fs_info->fs_devices->latest_bdev = tgt_device->bdev;
>> +	list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
>> +
>> +	btrfs_rm_dev_replace_srcdev(fs_info, src_device);
>> +	if (src_device->bdev) {
>> +		/* zero out the old super */
>> +		btrfs_scratch_superblock(src_device);
>> +	}
>> +	/*
>> +	 * this is again a consistent state where no dev_replace procedure
>> +	 * is running, the target device is part of the filesystem, the
>> +	 * source device is not part of the filesystem anymore and its 1st
>> +	 * superblock is scratched out so that it is no longer marked to
>> +	 * belong to this filesystem.
>> +	 */
>> +	btrfs_dev_replace_unlock(dev_replace);
>> +	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
>> +
>> +	/* write back the superblocks */
>> +	trans = btrfs_start_transaction(root, 0);
>> +	if (!IS_ERR(trans))
>> +		btrfs_commit_transaction(trans, root);
>> +
>> +	mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
>> +
>> +	return 0;
>> +}

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Liu Bo Nov. 9, 2012, 12:44 a.m. UTC | #3
On Thu, Nov 08, 2012 at 06:24:36PM +0100, Stefan Behrens wrote:
> On Thu, 8 Nov 2012 22:50:47 +0800, Liu Bo wrote:
> > On Tue, Nov 06, 2012 at 05:38:33PM +0100, Stefan Behrens wrote:
[...]
> >> +	btrfs_dev_replace_unlock(dev_replace);
> >> +
> >> +	btrfs_wait_ordered_extents(root, 0);
> >> +
> >> +	/* force writing the updated state information to disk */
> >> +	trans = btrfs_start_transaction(root, 0);
> > 
> > why a start_transaction here?  Any reasons?
> > (same question also for some other places)
> > 
> 
> Without this transaction, there is outstanding I/O which is not flushed.
> Pending writes that go only to the old disk need to be flushed before
> the mode is switched to write all live data to the source disk and to
> the target disk as well. The copy operation that is part of the scrub
> code works on the commit root for performance reasons. Every write
> request that is performed after the commit root is established needs to
> go to both disks. Those requests that already have the bdev assigned
> (i.e., btrfs_map_bio() was already called) cannot be duplicated anymore
> to write to the new disk as well.
> 
> btrfs_dev_replace_finishing() looks similar and goes through a
> transaction commit between the steps where the bdev in the mapping tree
> is swapped and the step when the old bdev is freed. Otherwise the bdev
> would be accessed after being freed.
> 

I see, if you're only about to flush metadata, why not join a transaction?

thanks,
liubo
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Stefan Behrens Nov. 9, 2012, 10:19 a.m. UTC | #4
On Fri, 9 Nov 2012 08:44:01 +0800, Liu Bo wrote:
> On Thu, Nov 08, 2012 at 06:24:36PM +0100, Stefan Behrens wrote:
>> On Thu, 8 Nov 2012 22:50:47 +0800, Liu Bo wrote:
>>> On Tue, Nov 06, 2012 at 05:38:33PM +0100, Stefan Behrens wrote:
>>>> +	trans = btrfs_start_transaction(root, 0);
>>>
>>> why a start_transaction here?  Any reasons?
>>> (same question also for some other places)
>>>
>>
>> Without this transaction, there is outstanding I/O which is not flushed.
>> Pending writes that go only to the old disk need to be flushed before
>> the mode is switched to write all live data to the source disk and to
>> the target disk as well. The copy operation that is part of the scrub
>> code works on the commit root for performance reasons. Every write
>> request that is performed after the commit root is established needs to
>> go to both disks. Those requests that already have the bdev assigned
>> (i.e., btrfs_map_bio() was already called) cannot be duplicated anymore
>> to write to the new disk as well.
>>
>> btrfs_dev_replace_finishing() looks similar and goes through a
>> transaction commit between the steps where the bdev in the mapping tree
>> is swapped and the step when the old bdev is freed. Otherwise the bdev
>> would be accessed after being freed.
>>
> 
> I see, if you're only about to flush metadata, why not join a transaction?

btrfs_join_transaction() would delay the current transaction and enforce
that the current transaction is used and not a new one.
btrfs_start_transaction() would use either the current transaction, or a
new one. It is less interfering.

Since in dev-replace.c it is not required to enforce that a current
transaction is joined, btrfs_start_transaction() is the one to choose
here, as I understood it.

But that's an interesting topic and I would appreciate to get a definite
rule which one to choose when.

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Liu Bo Nov. 9, 2012, 2:45 p.m. UTC | #5
On Fri, Nov 09, 2012 at 11:19:17AM +0100, Stefan Behrens wrote:
> On Fri, 9 Nov 2012 08:44:01 +0800, Liu Bo wrote:
> > On Thu, Nov 08, 2012 at 06:24:36PM +0100, Stefan Behrens wrote:
> >> On Thu, 8 Nov 2012 22:50:47 +0800, Liu Bo wrote:
> >>> On Tue, Nov 06, 2012 at 05:38:33PM +0100, Stefan Behrens wrote:
> >>>> +	trans = btrfs_start_transaction(root, 0);
> >>>
> >>> why a start_transaction here?  Any reasons?
> >>> (same question also for some other places)
> >>>
> >>
> >> Without this transaction, there is outstanding I/O which is not flushed.
> >> Pending writes that go only to the old disk need to be flushed before
> >> the mode is switched to write all live data to the source disk and to
> >> the target disk as well. The copy operation that is part of the scrub
> >> code works on the commit root for performance reasons. Every write
> >> request that is performed after the commit root is established needs to
> >> go to both disks. Those requests that already have the bdev assigned
> >> (i.e., btrfs_map_bio() was already called) cannot be duplicated anymore
> >> to write to the new disk as well.
> >>
> >> btrfs_dev_replace_finishing() looks similar and goes through a
> >> transaction commit between the steps where the bdev in the mapping tree
> >> is swapped and the step when the old bdev is freed. Otherwise the bdev
> >> would be accessed after being freed.
> >>
> > 
> > I see, if you're only about to flush metadata, why not join a transaction?
> 
> btrfs_join_transaction() would delay the current transaction and enforce
> that the current transaction is used and not a new one.
> btrfs_start_transaction() would use either the current transaction, or a
> new one. It is less interfering.

hmm...btrfs_start_transaction() would not use the current transaction unless
you're still in the same task, ie. current->journal_info remains unchanged,
otherwise it will be blocked by the current transaction(wait_current_trans()).

If there are several btrfs_start_transaction() being blocked, after the current
one's commit, one of them will allocate a new transaction, and the rest can join it.

But btrfs_join_transaction will join the current as much as possible.

And since we don't do any reservation here and seem to just update the chunk/device
tree (which will use the global block rsv directly), I prefer btrfs_join_transaction().

thanks,
liubo

> 
> Since in dev-replace.c it is not required to enforce that a current
> transaction is joined, btrfs_start_transaction() is the one to choose
> here, as I understood it.
> 
> But that's an interesting topic and I would appreciate to get a definite
> rule which one to choose when.
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Stefan Behrens Nov. 12, 2012, 5:21 p.m. UTC | #6
On Fri, 9 Nov 2012 22:45:16 +0800, Liu Bo wrote:
> On Fri, Nov 09, 2012 at 11:19:17AM +0100, Stefan Behrens wrote:
>> On Fri, 9 Nov 2012 08:44:01 +0800, Liu Bo wrote:
>>> On Thu, Nov 08, 2012 at 06:24:36PM +0100, Stefan Behrens wrote:
>>>> On Thu, 8 Nov 2012 22:50:47 +0800, Liu Bo wrote:
>>>>> On Tue, Nov 06, 2012 at 05:38:33PM +0100, Stefan Behrens wrote:
>>>>>> +	trans = btrfs_start_transaction(root, 0);
>>>>>
>>>>> why a start_transaction here?  Any reasons?
>>>>> (same question also for some other places)
>>>>>
>>>>
>>>> Without this transaction, there is outstanding I/O which is not flushed.
>>>> Pending writes that go only to the old disk need to be flushed before
>>>> the mode is switched to write all live data to the source disk and to
>>>> the target disk as well. The copy operation that is part of the scrub
>>>> code works on the commit root for performance reasons. Every write
>>>> request that is performed after the commit root is established needs to
>>>> go to both disks. Those requests that already have the bdev assigned
>>>> (i.e., btrfs_map_bio() was already called) cannot be duplicated anymore
>>>> to write to the new disk as well.
>>>>
>>>> btrfs_dev_replace_finishing() looks similar and goes through a
>>>> transaction commit between the steps where the bdev in the mapping tree
>>>> is swapped and the step when the old bdev is freed. Otherwise the bdev
>>>> would be accessed after being freed.
>>>>
>>>
>>> I see, if you're only about to flush metadata, why not join a transaction?
>>
>> btrfs_join_transaction() would delay the current transaction and enforce
>> that the current transaction is used and not a new one.
>> btrfs_start_transaction() would use either the current transaction, or a
>> new one. It is less interfering.
> 
> hmm...btrfs_start_transaction() would not use the current transaction unless
> you're still in the same task, ie. current->journal_info remains unchanged,
> otherwise it will be blocked by the current transaction(wait_current_trans()).
> 
> If there are several btrfs_start_transaction() being blocked, after the current
> one's commit, one of them will allocate a new transaction, and the rest can join it.
> 
> But btrfs_join_transaction will join the current as much as possible.
> 
> And since we don't do any reservation here and seem to just update the chunk/device
> tree (which will use the global block rsv directly), I prefer btrfs_join_transaction().
> 

I am still not sure, which one is worse or better:
a) to delay a commit by calling btrfs_join_transaction() which joins and thereby delays a transaction, or
b) to go through one additional transaction.

Here is the log message of the commit that added btrfs_join_transaction(). For me, it sounds like one should use btrfs_join_transaction() only when it is _required_ to join a transaction, e.g. when a low level function is required to join the transaction that some higher level function has started:

commit f9295749388f82c8d2f485e99c72cd7c7876a99b
Author: Chris Mason <chris.mason@oracle.com>
Date:   Thu Jul 17 12:54:14 2008 -0400

    btrfs_start_transaction: wait for commits in progress to finish

    btrfs_commit_transaction has to loop waiting for any writers in the
    transaction to finish before it can proceed.  btrfs_start_transaction
    should be polite and not join a transaction that is in the process
    of being finished off.

    There are a few places that can't wait, basically the ones doing IO that
    might be needed to finish the transaction.  For them, btrfs_join_transaction
    is added.



>>
>> Since in dev-replace.c it is not required to enforce that a current
>> transaction is joined, btrfs_start_transaction() is the one to choose
>> here, as I understood it.
>>
>> But that's an interesting topic and I would appreciate to get a definite
>> rule which one to choose when.


--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
new file mode 100644
index 0000000..1d56163
--- /dev/null
+++ b/fs/btrfs/dev-replace.c
@@ -0,0 +1,843 @@ 
+/*
+ * Copyright (C) STRATO AG 2012.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+#include <linux/sched.h>
+#include <linux/bio.h>
+#include <linux/slab.h>
+#include <linux/buffer_head.h>
+#include <linux/blkdev.h>
+#include <linux/random.h>
+#include <linux/iocontext.h>
+#include <linux/capability.h>
+#include <linux/kthread.h>
+#include <linux/math64.h>
+#include <asm/div64.h>
+#include "compat.h"
+#include "ctree.h"
+#include "extent_map.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "print-tree.h"
+#include "volumes.h"
+#include "async-thread.h"
+#include "check-integrity.h"
+#include "rcu-string.h"
+#include "dev-replace.h"
+
+static u64 btrfs_get_seconds_since_1970(void);
+static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
+				       int scrub_ret);
+static void btrfs_dev_replace_update_device_in_mapping_tree(
+						struct btrfs_fs_info *fs_info,
+						struct btrfs_device *srcdev,
+						struct btrfs_device *tgtdev);
+static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid,
+					 char *srcdev_name,
+					 struct btrfs_device **device);
+static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info);
+static int btrfs_dev_replace_kthread(void *data);
+static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info);
+
+
+/*
+ * Initialize fs_info->dev_replace from the dev_replace item that is
+ * stored in the device tree.
+ *
+ * If the search does not return an exact match (this includes search
+ * errors), or if the size of the item found is not the size of
+ * struct btrfs_dev_replace_item, the in-memory state is reset to "never
+ * started" and 0 is returned.
+ *
+ * Returns 0 on success, -ENOMEM if no path could be allocated and -EIO
+ * if a replace operation is started or suspended, the source or the
+ * target device was not found and the DEGRADED mount option is not set.
+ */
+int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_key key;
+	struct btrfs_root *dev_root = fs_info->dev_root;
+	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+	struct extent_buffer *eb;
+	int slot;
+	int ret = 0;
+	struct btrfs_path *path = NULL;
+	int item_size;
+	struct btrfs_dev_replace_item *ptr;
+	u64 src_devid;
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	key.objectid = 0;
+	key.type = BTRFS_DEV_REPLACE_KEY;
+	key.offset = 0;
+	ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
+	if (ret) {
+no_valid_dev_replace_entry_found:
+		/*
+		 * no usable item: start from the "never started" state.
+		 * The same constant is used as in the switch statement
+		 * below, the state is assigned only once.
+		 */
+		ret = 0;
+		dev_replace->replace_state =
+			BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED;
+		dev_replace->cont_reading_from_srcdev_mode =
+		    BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS;
+		dev_replace->time_started = 0;
+		dev_replace->time_stopped = 0;
+		atomic64_set(&dev_replace->num_write_errors, 0);
+		atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
+		dev_replace->cursor_left = 0;
+		dev_replace->committed_cursor_left = 0;
+		dev_replace->cursor_left_last_write_of_item = 0;
+		dev_replace->cursor_right = 0;
+		dev_replace->srcdev = NULL;
+		dev_replace->tgtdev = NULL;
+		dev_replace->is_valid = 0;
+		dev_replace->item_needs_writeback = 0;
+		goto out;
+	}
+	slot = path->slots[0];
+	eb = path->nodes[0];
+	item_size = btrfs_item_size_nr(eb, slot);
+	ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item);
+
+	if (item_size != sizeof(struct btrfs_dev_replace_item)) {
+		pr_warn("btrfs: dev_replace entry found has unexpected size, ignore entry\n");
+		goto no_valid_dev_replace_entry_found;
+	}
+
+	/* copy the on-disk item into the in-memory state */
+	src_devid = btrfs_dev_replace_src_devid(eb, ptr);
+	dev_replace->cont_reading_from_srcdev_mode =
+		btrfs_dev_replace_cont_reading_from_srcdev_mode(eb, ptr);
+	dev_replace->replace_state = btrfs_dev_replace_replace_state(eb, ptr);
+	dev_replace->time_started = btrfs_dev_replace_time_started(eb, ptr);
+	dev_replace->time_stopped =
+		btrfs_dev_replace_time_stopped(eb, ptr);
+	atomic64_set(&dev_replace->num_write_errors,
+		     btrfs_dev_replace_num_write_errors(eb, ptr));
+	atomic64_set(&dev_replace->num_uncorrectable_read_errors,
+		     btrfs_dev_replace_num_uncorrectable_read_errors(eb, ptr));
+	dev_replace->cursor_left = btrfs_dev_replace_cursor_left(eb, ptr);
+	dev_replace->committed_cursor_left = dev_replace->cursor_left;
+	dev_replace->cursor_left_last_write_of_item = dev_replace->cursor_left;
+	dev_replace->cursor_right = btrfs_dev_replace_cursor_right(eb, ptr);
+	dev_replace->is_valid = 1;
+
+	dev_replace->item_needs_writeback = 0;
+	switch (dev_replace->replace_state) {
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
+		dev_replace->srcdev = NULL;
+		dev_replace->tgtdev = NULL;
+		break;
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+		dev_replace->srcdev = btrfs_find_device(fs_info, src_devid,
+							NULL, NULL);
+		dev_replace->tgtdev = btrfs_find_device(fs_info,
+							BTRFS_DEV_REPLACE_DEVID,
+							NULL, NULL);
+		/*
+		 * allow 'btrfs dev replace_cancel' if src/tgt device is
+		 * missing
+		 */
+		if (!dev_replace->srcdev &&
+		    !btrfs_test_opt(dev_root, DEGRADED)) {
+			ret = -EIO;
+			pr_warn("btrfs: cannot mount because device replace operation is ongoing and\n" "srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?\n",
+				(unsigned long long)src_devid);
+		}
+		if (!dev_replace->tgtdev &&
+		    !btrfs_test_opt(dev_root, DEGRADED)) {
+			ret = -EIO;
+			pr_warn("btrfs: cannot mount because device replace operation is ongoing and\n" "tgtdev (devid %llu) is missing, need to run btrfs dev scan?\n",
+				(unsigned long long)BTRFS_DEV_REPLACE_DEVID);
+		}
+		if (dev_replace->tgtdev) {
+			/* the target device mirrors the sizes of the source */
+			if (dev_replace->srcdev) {
+				dev_replace->tgtdev->total_bytes =
+					dev_replace->srcdev->total_bytes;
+				dev_replace->tgtdev->disk_total_bytes =
+					dev_replace->srcdev->disk_total_bytes;
+				dev_replace->tgtdev->bytes_used =
+					dev_replace->srcdev->bytes_used;
+			}
+			dev_replace->tgtdev->is_tgtdev_for_dev_replace = 1;
+			btrfs_init_dev_replace_tgtdev_for_resume(fs_info,
+				dev_replace->tgtdev);
+		}
+		break;
+	}
+
+out:
+	/* btrfs_free_path() also releases the path */
+	if (path)
+		btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * called from commit_transaction. Writes changed device replace state to
+ * disk.
+ *
+ * Nothing is done (and 0 is returned) unless the in-memory state is valid
+ * and flagged item_needs_writeback. Otherwise the item with key
+ * (0, BTRFS_DEV_REPLACE_KEY, 0) in fs_info->dev_root is looked up, inserted
+ * if absent, or deleted and re-inserted if it is smaller than
+ * struct btrfs_dev_replace_item, and then filled from fs_info->dev_replace.
+ *
+ * Returns -ENOMEM if no path could be allocated, the error of a failing
+ * search/delete/insert, and otherwise the value left in ret by the last
+ * search/insert call.
+ */
+int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
+			  struct btrfs_fs_info *fs_info)
+{
+	int ret;
+	struct btrfs_root *dev_root = fs_info->dev_root;
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	struct extent_buffer *eb;
+	struct btrfs_dev_replace_item *ptr;
+	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+
+	/* only peek at the flags here; the lock is retaken for the copy below */
+	btrfs_dev_replace_lock(dev_replace);
+	if (!dev_replace->is_valid ||
+	    !dev_replace->item_needs_writeback) {
+		btrfs_dev_replace_unlock(dev_replace);
+		return 0;
+	}
+	btrfs_dev_replace_unlock(dev_replace);
+
+	key.objectid = 0;
+	key.type = BTRFS_DEV_REPLACE_KEY;
+	key.offset = 0;
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	/*
+	 * NOTE(review): ins_len -1 with cow 1 presumably prepares the path for
+	 * the possible deletion below -- confirm against btrfs_search_slot().
+	 */
+	ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
+	if (ret < 0) {
+		pr_warn("btrfs: error %d while searching for dev_replace item!\n",
+			ret);
+		goto out;
+	}
+
+	if (ret == 0 &&
+	    btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
+		/*
+		 * need to delete old one and insert a new one.
+		 * Since no attempt is made to recover any old state, if the
+		 * dev_replace state is 'running', the data on the target
+		 * drive is lost.
+		 * It would be possible to recover the state: just make sure
+		 * that the beginning of the item is never changed and always
+		 * contains all the essential information. Then read this
+		 * minimal set of information and use it as a base for the
+		 * new state.
+		 */
+		ret = btrfs_del_item(trans, dev_root, path);
+		if (ret != 0) {
+			pr_warn("btrfs: delete too small dev_replace item failed %d!\n",
+				ret);
+			goto out;
+		}
+		/* pretend the item was not found so the insert branch runs */
+		ret = 1;
+	}
+
+	if (ret == 1) {
+		/* need to insert a new item */
+		btrfs_release_path(path);
+		ret = btrfs_insert_empty_item(trans, dev_root, path,
+					      &key, sizeof(*ptr));
+		if (ret < 0) {
+			pr_warn("btrfs: insert dev_replace item failed %d!\n",
+				ret);
+			goto out;
+		}
+	}
+
+	/* path now points at an item of sufficient size */
+	eb = path->nodes[0];
+	ptr = btrfs_item_ptr(eb, path->slots[0],
+			     struct btrfs_dev_replace_item);
+
+	/* copy the in-memory state into the item while holding the lock */
+	btrfs_dev_replace_lock(dev_replace);
+	if (dev_replace->srcdev)
+		btrfs_set_dev_replace_src_devid(eb, ptr,
+			dev_replace->srcdev->devid);
+	else
+		/* no source device pointer: (u64)-1 is stored as its devid */
+		btrfs_set_dev_replace_src_devid(eb, ptr, (u64)-1);
+	btrfs_set_dev_replace_cont_reading_from_srcdev_mode(eb, ptr,
+		dev_replace->cont_reading_from_srcdev_mode);
+	btrfs_set_dev_replace_replace_state(eb, ptr,
+		dev_replace->replace_state);
+	btrfs_set_dev_replace_time_started(eb, ptr, dev_replace->time_started);
+	btrfs_set_dev_replace_time_stopped(eb, ptr, dev_replace->time_stopped);
+	btrfs_set_dev_replace_num_write_errors(eb, ptr,
+		atomic64_read(&dev_replace->num_write_errors));
+	btrfs_set_dev_replace_num_uncorrectable_read_errors(eb, ptr,
+		atomic64_read(&dev_replace->num_uncorrectable_read_errors));
+	/* remember which cursor_left value went into this item write */
+	dev_replace->cursor_left_last_write_of_item =
+		dev_replace->cursor_left;
+	btrfs_set_dev_replace_cursor_left(eb, ptr,
+		dev_replace->cursor_left_last_write_of_item);
+	btrfs_set_dev_replace_cursor_right(eb, ptr,
+		dev_replace->cursor_right);
+	dev_replace->item_needs_writeback = 0;
+	btrfs_dev_replace_unlock(dev_replace);
+
+	btrfs_mark_buffer_dirty(eb);
+
+out:
+	/*
+	 * NOTE(review): also reached with path == NULL after the allocation
+	 * failure above -- relies on btrfs_free_path() accepting NULL.
+	 */
+	btrfs_free_path(path);
+
+	return ret;
+}
+
+/*
+ * Copy the cursor_left value that was recorded at the last write of the
+ * dev_replace item into committed_cursor_left.
+ */
+void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info)
+{
+	fs_info->dev_replace.committed_cursor_left =
+		fs_info->dev_replace.cursor_left_last_write_of_item;
+}
+
+/* Return the tv_sec field of CURRENT_TIME_SEC. */
+static u64 btrfs_get_seconds_since_1970(void)
+{
+	const struct timespec now = CURRENT_TIME_SEC;
+
+	return now.tv_sec;
+}
+
+/*
+ * Start a device replace operation on behalf of the ioctl.
+ *
+ * Validates the arguments, sets up the target device and looks up the
+ * source device, marks the operation as started, forces a transaction
+ * commit so the new state is written out, runs the copy through
+ * btrfs_scrub_dev() and finally calls btrfs_dev_replace_finishing().
+ *
+ * Returns -EINVAL for bad arguments, an unusable target, an unknown source
+ * or a target smaller than the source; the error of
+ * btrfs_start_transaction() if that fails; 0 otherwise. When 0 is returned
+ * args->result is BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR or
+ * BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED.
+ */
+int btrfs_dev_replace_start(struct btrfs_root *root,
+			    struct btrfs_ioctl_dev_replace_args *args)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+	int ret;
+	struct btrfs_device *tgt_device = NULL;
+	struct btrfs_device *src_device = NULL;
+
+	switch (args->start.cont_reading_from_srcdev_mode) {
+	case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS:
+	case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID:
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	/* a source (devid or name) and a target name are both mandatory */
+	if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') ||
+	    args->start.tgtdev_name[0] == '\0')
+		return -EINVAL;
+
+	mutex_lock(&fs_info->volume_mutex);
+	ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name,
+					    &tgt_device);
+	if (ret) {
+		pr_err("btrfs: target device %s is invalid!\n",
+		       args->start.tgtdev_name);
+		mutex_unlock(&fs_info->volume_mutex);
+		return -EINVAL;
+	}
+
+	ret = btrfs_dev_replace_find_srcdev(root, args->start.srcdevid,
+					    args->start.srcdev_name,
+					    &src_device);
+	mutex_unlock(&fs_info->volume_mutex);
+	if (ret) {
+		ret = -EINVAL;
+		goto leave_no_lock;
+	}
+
+	if (tgt_device->total_bytes < src_device->total_bytes) {
+		pr_err("btrfs: target device is smaller than source device!\n");
+		ret = -EINVAL;
+		goto leave_no_lock;
+	}
+
+	btrfs_dev_replace_lock(dev_replace);
+	switch (dev_replace->replace_state) {
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
+		break;
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+		/*
+		 * dev_replace->srcdev/tgtdev belong to the operation that is
+		 * already in progress. Only report the condition and drop
+		 * the target set up above; the running operation's pointers
+		 * must not be cleared.
+		 */
+		args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED;
+		btrfs_dev_replace_unlock(dev_replace);
+		goto leave_no_lock;
+	}
+
+	dev_replace->cont_reading_from_srcdev_mode =
+		args->start.cont_reading_from_srcdev_mode;
+	WARN_ON(!src_device);
+	dev_replace->srcdev = src_device;
+	WARN_ON(!tgt_device);
+	dev_replace->tgtdev = tgt_device;
+
+	tgt_device->total_bytes = src_device->total_bytes;
+	tgt_device->disk_total_bytes = src_device->disk_total_bytes;
+	tgt_device->bytes_used = src_device->bytes_used;
+
+	/*
+	 * from now on, the writes to the srcdev are all duplicated to
+	 * go to the tgtdev as well (refer to btrfs_map_block()).
+	 */
+	dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
+	dev_replace->time_started = btrfs_get_seconds_since_1970();
+	dev_replace->cursor_left = 0;
+	dev_replace->committed_cursor_left = 0;
+	dev_replace->cursor_left_last_write_of_item = 0;
+	dev_replace->cursor_right = 0;
+	dev_replace->is_valid = 1;
+	dev_replace->item_needs_writeback = 1;
+	args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
+	btrfs_dev_replace_unlock(dev_replace);
+
+	btrfs_wait_ordered_extents(root, 0);
+
+	/* force writing the updated state information to disk */
+	trans = btrfs_start_transaction(root, 0);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		/*
+		 * NOTE(review): replace_state stays STARTED on this path
+		 * while the device pointers are cleared below -- confirm
+		 * whether the state should be rolled back as well.
+		 */
+		btrfs_dev_replace_lock(dev_replace);
+		goto leave;
+	}
+
+	ret = btrfs_commit_transaction(trans, root);
+	WARN_ON(ret);
+
+	/* the disk copy procedure reuses the scrub code */
+	ret = btrfs_scrub_dev(fs_info, src_device->devid, 0,
+			      src_device->total_bytes,
+			      &dev_replace->scrub_progress, 0, 1);
+
+	ret = btrfs_dev_replace_finishing(root->fs_info, ret);
+	WARN_ON(ret);
+
+	return 0;
+
+leave:
+	/* undo the pointer setup done by this call; lock is held here */
+	dev_replace->srcdev = NULL;
+	dev_replace->tgtdev = NULL;
+	btrfs_dev_replace_unlock(dev_replace);
+leave_no_lock:
+	if (tgt_device)
+		btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
+	return ret;
+}
+
+/*
+ * Conclude a replace operation after the scrub based copy returned
+ * scrub_ret.
+ *
+ * Does nothing if the state is no longer STARTED. With scrub_ret == 0 the
+ * mapping tree is switched to the target device, the state becomes FINISHED
+ * and the target takes over devid, uuid and size fields of the source.
+ * With scrub_ret != 0 the state becomes CANCELED and the target device is
+ * destroyed.
+ *
+ * Returns 0, or the error of the first btrfs_start_transaction() call.
+ */
+static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
+				       int scrub_ret)
+{
+	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+	struct btrfs_device *tgt_device;
+	struct btrfs_device *src_device;
+	struct btrfs_root *root = fs_info->tree_root;
+	u8 uuid_tmp[BTRFS_UUID_SIZE];
+	struct btrfs_trans_handle *trans;
+	int ret = 0;
+
+	/* don't allow cancel or unmount to disturb the finishing procedure */
+	mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
+
+	btrfs_dev_replace_lock(dev_replace);
+	/* was the operation canceled, or is it finished? */
+	if (dev_replace->replace_state !=
+	    BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) {
+		btrfs_dev_replace_unlock(dev_replace);
+		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+		return 0;
+	}
+
+	/* keep local copies: the dev_replace pointers are cleared below */
+	tgt_device = dev_replace->tgtdev;
+	src_device = dev_replace->srcdev;
+	btrfs_dev_replace_unlock(dev_replace);
+
+	/* replace old device with new one in mapping tree */
+	if (!scrub_ret)
+		btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
+								src_device,
+								tgt_device);
+
+	/*
+	 * flush all outstanding I/O and inode extent mappings before the
+	 * copy operation is declared as being finished
+	 */
+	btrfs_start_delalloc_inodes(root, 0);
+	btrfs_wait_ordered_extents(root, 0);
+
+	trans = btrfs_start_transaction(root, 0);
+	if (IS_ERR(trans)) {
+		/*
+		 * NOTE(review): the mapping tree may already point at the
+		 * target here while the state is still STARTED -- confirm
+		 * that this early return is acceptable.
+		 */
+		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+		return PTR_ERR(trans);
+	}
+	ret = btrfs_commit_transaction(trans, root);
+	WARN_ON(ret);
+
+	/* keep away write_all_supers() during the finishing procedure */
+	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+	btrfs_dev_replace_lock(dev_replace);
+	dev_replace->replace_state =
+		scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
+			  : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED;
+	dev_replace->tgtdev = NULL;
+	dev_replace->srcdev = NULL;
+	dev_replace->time_stopped = btrfs_get_seconds_since_1970();
+	dev_replace->item_needs_writeback = 1;
+
+	if (scrub_ret) {
+		/*
+		 * NOTE(review): tgt_device and src_device are dereferenced
+		 * for the message before the NULL check of tgt_device below.
+		 */
+		printk_in_rcu(KERN_ERR
+			      "btrfs: btrfs_scrub_dev(%s, %llu, %s) failed %d\n",
+			      rcu_str_deref(src_device->name),
+			      src_device->devid,
+			      rcu_str_deref(tgt_device->name), scrub_ret);
+		btrfs_dev_replace_unlock(dev_replace);
+		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+		if (tgt_device)
+			btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
+		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+
+		return 0;
+	}
+
+	/* success: the target takes over the identity of the source */
+	tgt_device->is_tgtdev_for_dev_replace = 0;
+	tgt_device->devid = src_device->devid;
+	src_device->devid = BTRFS_DEV_REPLACE_DEVID;
+	tgt_device->bytes_used = src_device->bytes_used;
+	/* swap the uuids of source and target via uuid_tmp */
+	memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp));
+	memcpy(tgt_device->uuid, src_device->uuid, sizeof(tgt_device->uuid));
+	memcpy(src_device->uuid, uuid_tmp, sizeof(src_device->uuid));
+	tgt_device->total_bytes = src_device->total_bytes;
+	tgt_device->disk_total_bytes = src_device->disk_total_bytes;
+	/* NOTE(review): bytes_used was already copied a few lines above */
+	tgt_device->bytes_used = src_device->bytes_used;
+	/* redirect cached block device pointers away from the source */
+	if (fs_info->sb->s_bdev == src_device->bdev)
+		fs_info->sb->s_bdev = tgt_device->bdev;
+	if (fs_info->fs_devices->latest_bdev == src_device->bdev)
+		fs_info->fs_devices->latest_bdev = tgt_device->bdev;
+	list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
+
+	btrfs_rm_dev_replace_srcdev(fs_info, src_device);
+	if (src_device->bdev) {
+		/* zero out the old super */
+		btrfs_scratch_superblock(src_device);
+	}
+	/*
+	 * this is again a consistent state where no dev_replace procedure
+	 * is running, the target device is part of the filesystem, the
+	 * source device is not part of the filesystem anymore and its 1st
+	 * superblock is scratched out so that it is no longer marked to
+	 * belong to this filesystem.
+	 */
+	btrfs_dev_replace_unlock(dev_replace);
+	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+
+	/* write back the superblocks */
+	/* a failure to start or commit this transaction is not reported */
+	trans = btrfs_start_transaction(root, 0);
+	if (!IS_ERR(trans))
+		btrfs_commit_transaction(trans, root);
+
+	mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+
+	return 0;
+}
+
+/*
+ * Walk all extent mappings of fs_info->mapping_tree and make every stripe
+ * that refers to srcdev refer to tgtdev instead. The tree's lock is held
+ * for writing during the whole walk.
+ */
+static void btrfs_dev_replace_update_device_in_mapping_tree(
+						struct btrfs_fs_info *fs_info,
+						struct btrfs_device *srcdev,
+						struct btrfs_device *tgtdev)
+{
+	struct extent_map_tree *tree = &fs_info->mapping_tree.map_tree;
+	u64 next = 0;
+
+	write_lock(&tree->lock);
+	for (;;) {
+		struct extent_map *em;
+		struct map_lookup *map;
+		int idx;
+
+		em = lookup_extent_mapping(tree, next, (u64)-1);
+		if (!em)
+			break;
+
+		/* the stripe description is kept in the em->bdev field */
+		map = (struct map_lookup *)em->bdev;
+		for (idx = 0; idx < map->num_stripes; idx++) {
+			if (map->stripes[idx].dev == srcdev)
+				map->stripes[idx].dev = tgtdev;
+		}
+
+		/* continue behind this mapping; stop once the offset is 0 */
+		next = em->start + em->len;
+		free_extent_map(em);
+		if (!next)
+			break;
+	}
+	write_unlock(&tree->lock);
+}
+
+/*
+ * Look up the source device of a replace operation.
+ *
+ * A non-zero srcdevid is resolved with btrfs_find_device(); otherwise
+ * srcdev_name is handed to btrfs_find_device_missing_or_by_path(). The
+ * result is stored in *device.
+ *
+ * Returns 0 on success, -ENOENT if no device with srcdevid exists, or the
+ * return value of btrfs_find_device_missing_or_by_path().
+ */
+static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid,
+					 char *srcdev_name,
+					 struct btrfs_device **device)
+{
+	/* must start at 0: the devid branch only assigns ret on failure */
+	int ret = 0;
+
+	if (srcdevid) {
+		*device = btrfs_find_device(root->fs_info, srcdevid, NULL,
+					    NULL);
+		if (!*device)
+			ret = -ENOENT;
+	} else {
+		ret = btrfs_find_device_missing_or_by_path(root, srcdev_name,
+							   device);
+	}
+	return ret;
+}
+
+/*
+ * Fill in the status part of args for the replace_status ioctl.
+ *
+ * args->result is always BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR. State,
+ * timestamps and error counters are copied from fs_info->dev_replace while
+ * the dev_replace lock is held. progress_1000 is 0 for never started and
+ * canceled operations, 1000 for finished ones and otherwise cursor_left
+ * divided by one thousandth of the source device's total_bytes.
+ */
+void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
+			      struct btrfs_ioctl_dev_replace_args *args)
+{
+	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+	u64 bytes_per_mille;
+
+	btrfs_dev_replace_lock(dev_replace);
+	/* even if !dev_replace_is_valid, the values are good enough for
+	 * the replace_status ioctl */
+	args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
+	args->status.replace_state = dev_replace->replace_state;
+	args->status.time_started = dev_replace->time_started;
+	args->status.time_stopped = dev_replace->time_stopped;
+	args->status.num_write_errors =
+		atomic64_read(&dev_replace->num_write_errors);
+	args->status.num_uncorrectable_read_errors =
+		atomic64_read(&dev_replace->num_uncorrectable_read_errors);
+	switch (dev_replace->replace_state) {
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
+		args->status.progress_1000 = 0;
+		break;
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
+		args->status.progress_1000 = 1000;
+		break;
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+		/*
+		 * The mount-time setup tolerates a missing source device on
+		 * degraded mounts, so srcdev may be NULL here, and a source
+		 * smaller than 1000 bytes would make the divisor 0. Report
+		 * no progress in both cases instead of crashing.
+		 */
+		args->status.progress_1000 = 0;
+		if (!dev_replace->srcdev)
+			break;
+		bytes_per_mille =
+			div64_u64(dev_replace->srcdev->total_bytes, 1000);
+		if (bytes_per_mille)
+			args->status.progress_1000 =
+				div64_u64(dev_replace->cursor_left,
+					  bytes_per_mille);
+		break;
+	}
+	btrfs_dev_replace_unlock(dev_replace);
+}
+
+/*
+ * ioctl entry point for canceling a replace operation: the outcome of
+ * __btrfs_dev_replace_cancel() is handed back in args->result and the
+ * function itself always returns 0.
+ */
+int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info,
+			     struct btrfs_ioctl_dev_replace_args *args)
+{
+	u64 cancel_result = __btrfs_dev_replace_cancel(fs_info);
+
+	args->result = cancel_result;
+	return 0;
+}
+
+/*
+ * Cancel a started or suspended replace operation.
+ *
+ * Returns BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED if there is nothing to
+ * cancel. Otherwise the state is set to CANCELED, the scrub is canceled, a
+ * transaction is committed, the target device is destroyed and
+ * BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR is returned.
+ */
+static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+	struct btrfs_device *tgt_device = NULL;
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *root = fs_info->tree_root;
+	u64 result;
+	int ret;
+
+	/* serialize against the finishing procedure and unmount */
+	mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
+	btrfs_dev_replace_lock(dev_replace);
+	switch (dev_replace->replace_state) {
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
+		result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
+		btrfs_dev_replace_unlock(dev_replace);
+		goto leave;
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+		result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
+		/* keep the target for destruction after the commit below */
+		tgt_device = dev_replace->tgtdev;
+		dev_replace->tgtdev = NULL;
+		dev_replace->srcdev = NULL;
+		break;
+	}
+	dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED;
+	dev_replace->time_stopped = btrfs_get_seconds_since_1970();
+	dev_replace->item_needs_writeback = 1;
+	btrfs_dev_replace_unlock(dev_replace);
+	btrfs_scrub_cancel(fs_info);
+
+	trans = btrfs_start_transaction(root, 0);
+	if (IS_ERR(trans)) {
+		/*
+		 * NOTE(review): a negative errno is returned through the u64
+		 * result here and tgt_device is not destroyed on this path
+		 * -- confirm that both are intended.
+		 */
+		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+		return PTR_ERR(trans);
+	}
+	ret = btrfs_commit_transaction(trans, root);
+	WARN_ON(ret);
+	if (tgt_device)
+		btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
+
+leave:
+	mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+	return result;
+}
+
+/*
+ * Turn a started replace operation into a suspended one and flag the
+ * state for writeback. Every other state is left as it is.
+ */
+void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_dev_replace *repl = &fs_info->dev_replace;
+
+	mutex_lock(&repl->lock_finishing_cancel_unmount);
+	btrfs_dev_replace_lock(repl);
+
+	if (repl->replace_state == BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) {
+		repl->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
+		repl->time_stopped = btrfs_get_seconds_since_1970();
+		repl->item_needs_writeback = 1;
+		pr_info("btrfs: suspending dev_replace for unmount\n");
+	}
+
+	btrfs_dev_replace_unlock(repl);
+	mutex_unlock(&repl->lock_finishing_cancel_unmount);
+}
+
+/*
+ * resume dev_replace procedure that was interrupted by unmount
+ *
+ * A suspended operation is switched back to started. If target and source
+ * device are both available, mutually_exclusive_operation_running is set
+ * and a "btrfs-devrepl" kernel thread is started to continue the copy.
+ *
+ * Returns 0 if there is nothing to resume, if a device is unavailable or
+ * if the thread was started; the kthread_run() error otherwise.
+ */
+int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
+{
+	struct task_struct *task;
+	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+
+	btrfs_dev_replace_lock(dev_replace);
+	switch (dev_replace->replace_state) {
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
+		btrfs_dev_replace_unlock(dev_replace);
+		return 0;
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
+		break;
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+		dev_replace->replace_state =
+			BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
+		break;
+	}
+	if (!dev_replace->tgtdev || !dev_replace->tgtdev->bdev) {
+		pr_info("btrfs: cannot continue dev_replace, tgtdev is missing\n"
+			"btrfs: you may cancel the operation after 'mount -o degraded'\n");
+		btrfs_dev_replace_unlock(dev_replace);
+		return 0;
+	}
+	/*
+	 * The worker dereferences srcdev unconditionally, and the mount-time
+	 * setup tolerates a NULL srcdev on degraded mounts. Do not start
+	 * the worker in that case.
+	 */
+	if (!dev_replace->srcdev) {
+		pr_info("btrfs: cannot continue dev_replace, srcdev is missing\n");
+		btrfs_dev_replace_unlock(dev_replace);
+		return 0;
+	}
+	btrfs_dev_replace_unlock(dev_replace);
+
+	WARN_ON(atomic_xchg(
+		&fs_info->mutually_exclusive_operation_running, 1));
+	task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl");
+	if (IS_ERR(task)) {
+		/* the worker never ran, so nobody else would clear the flag */
+		atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
+		return PTR_ERR(task);
+	}
+	return 0;
+}
+
+/*
+ * Thread function started by btrfs_resume_dev_replace_async(); data is the
+ * struct btrfs_fs_info. Logs the point at which the replace operation
+ * continues, runs btrfs_dev_replace_continue_on_mount() and afterwards
+ * clears mutually_exclusive_operation_running. Always returns 0.
+ */
+static int btrfs_dev_replace_kthread(void *data)
+{
+	struct btrfs_fs_info *fs_info = data;
+	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+	struct btrfs_ioctl_dev_replace_args *status_args;
+	u64 progress;
+
+	/* if this allocation fails only the log message is skipped */
+	status_args = kzalloc(sizeof(*status_args), GFP_NOFS);
+	if (status_args) {
+		btrfs_dev_replace_status(fs_info, status_args);
+		progress = status_args->status.progress_1000;
+		kfree(status_args);
+		/* progress_1000 counts thousandths; print a percentage */
+		do_div(progress, 10);
+		/*
+		 * NOTE(review): srcdev is dereferenced without a NULL check
+		 * and without holding the dev_replace lock -- confirm the
+		 * caller guarantees a valid pointer.
+		 */
+		printk_in_rcu(KERN_INFO
+			      "btrfs: continuing dev_replace from %s (devid %llu) to %s @%u%%\n",
+			      dev_replace->srcdev->missing ? "<missing disk>" :
+				rcu_str_deref(dev_replace->srcdev->name),
+			      dev_replace->srcdev->devid,
+			      dev_replace->tgtdev ?
+				rcu_str_deref(dev_replace->tgtdev->name) :
+				"<missing target disk>",
+			      (unsigned int)progress);
+	}
+	btrfs_dev_replace_continue_on_mount(fs_info);
+	atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
+
+	return 0;
+}
+
+/*
+ * Continue the copy of an interrupted replace operation: scrub the source
+ * device from committed_cursor_left up to its total_bytes and hand the
+ * scrub result to btrfs_dev_replace_finishing(). Always returns 0.
+ */
+static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_dev_replace *repl = &fs_info->dev_replace;
+	int scrub_ret;
+	int finish_ret;
+
+	scrub_ret = btrfs_scrub_dev(fs_info, repl->srcdev->devid,
+				    repl->committed_cursor_left,
+				    repl->srcdev->total_bytes,
+				    &repl->scrub_progress, 0, 1);
+
+	finish_ret = btrfs_dev_replace_finishing(fs_info, scrub_ret);
+	WARN_ON(finish_ret);
+
+	return 0;
+}
+
+/*
+ * Returns 0 if the in-memory state is not valid or the state is never
+ * started, finished or canceled; returns 1 for every other state.
+ */
+int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
+{
+	int ongoing = 1;
+
+	if (!dev_replace->is_valid)
+		return 0;
+
+	switch (dev_replace->replace_state) {
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
+		ongoing = 0;
+		break;
+	default:
+		/*
+		 * Started or suspended. This is reported as ongoing even if
+		 * the tgtdev is missing, which can happen when the procedure
+		 * was suspended by an umount and the filesystem is then
+		 * remounted in degraded state without the tgtdev (or without
+		 * "btrfs dev scan" having been called). That does not stop
+		 * the dev_replace procedure; it needs to be canceled
+		 * manually if the cancelation is wanted.
+		 */
+		break;
+	}
+	return ongoing;
+}
+
+/*
+ * Take the dev_replace lock. The task that already holds it may call this
+ * function again: nesting_level counts the acquisitions and lock_owner
+ * stores the pid of the holder, both updated under lock_management_lock.
+ * Every call must be paired with btrfs_dev_replace_unlock().
+ */
+void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace)
+{
+	/* the beginning is just an optimization for the typical case */
+	if (atomic_read(&dev_replace->nesting_level) == 0) {
+acquire_lock:
+		/* this is not a nested case where the same thread
+		 * is trying to acquire the same lock twice */
+		mutex_lock(&dev_replace->lock);
+		mutex_lock(&dev_replace->lock_management_lock);
+		dev_replace->lock_owner = current->pid;
+		atomic_inc(&dev_replace->nesting_level);
+		mutex_unlock(&dev_replace->lock_management_lock);
+		return;
+	}
+
+	/* the lock looks held: find out whether the holder is this task */
+	mutex_lock(&dev_replace->lock_management_lock);
+	if (atomic_read(&dev_replace->nesting_level) > 0 &&
+	    dev_replace->lock_owner == current->pid) {
+		/* nested acquisition: only bump the counter */
+		WARN_ON(!mutex_is_locked(&dev_replace->lock));
+		atomic_inc(&dev_replace->nesting_level);
+		mutex_unlock(&dev_replace->lock_management_lock);
+		return;
+	}
+
+	/* held by another task (or released meanwhile): take the real lock */
+	mutex_unlock(&dev_replace->lock_management_lock);
+	goto acquire_lock;
+}
+
+/*
+ * Undo one btrfs_dev_replace_lock() call. nesting_level is decremented
+ * under lock_management_lock; the underlying mutex is only released when
+ * the counter reaches 0. Warns if the lock is not held, if the counter is
+ * below 1 or if the caller is not the recorded owner.
+ */
+void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace)
+{
+	WARN_ON(!mutex_is_locked(&dev_replace->lock));
+	mutex_lock(&dev_replace->lock_management_lock);
+	WARN_ON(atomic_read(&dev_replace->nesting_level) < 1);
+	WARN_ON(dev_replace->lock_owner != current->pid);
+	atomic_dec(&dev_replace->nesting_level);
+	if (atomic_read(&dev_replace->nesting_level) == 0) {
+		/* outermost unlock: forget the owner and drop the real lock */
+		dev_replace->lock_owner = 0;
+		mutex_unlock(&dev_replace->lock_management_lock);
+		mutex_unlock(&dev_replace->lock);
+	} else {
+		/* still nested: the owner keeps holding dev_replace->lock */
+		mutex_unlock(&dev_replace->lock_management_lock);
+	}
+}
diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h
new file mode 100644
index 0000000..20035cb
--- /dev/null
+++ b/fs/btrfs/dev-replace.h
@@ -0,0 +1,44 @@ 
+/*
+ * Copyright (C) STRATO AG 2012.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#if !defined(__BTRFS_DEV_REPLACE__)
+#define __BTRFS_DEV_REPLACE__
+
+struct btrfs_ioctl_dev_replace_args;
+
+int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info);
+int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
+			  struct btrfs_fs_info *fs_info);
+void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info);
+int btrfs_dev_replace_start(struct btrfs_root *root,
+			    struct btrfs_ioctl_dev_replace_args *args);
+void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
+			      struct btrfs_ioctl_dev_replace_args *args);
+int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info,
+			     struct btrfs_ioctl_dev_replace_args *args);
+void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info);
+int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info);
+int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace);
+void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace);
+void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace);
+
+static inline void btrfs_dev_replace_stats_inc(atomic64_t *stat_value)
+{
+	atomic64_inc(stat_value);	/* atomically add 1 to the counter */
+}
+#endif
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 731e287..62006ba 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -123,6 +123,48 @@  struct btrfs_ioctl_scrub_args {
 	__u64 unused[(1024-32-sizeof(struct btrfs_scrub_progress))/8];
 };
 
+#define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS	0
+#define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID	1
+struct btrfs_ioctl_dev_replace_start_params {
+	__u64 srcdevid;	/* in, if 0, use srcdev_name instead */
+	__u8 srcdev_name[BTRFS_PATH_NAME_MAX + 1];	/* in */
+	__u8 tgtdev_name[BTRFS_PATH_NAME_MAX + 1];	/* in, not empty */
+	__u64 cont_reading_from_srcdev_mode;	/* in, ALWAYS or AVOID,
+						 * see #defines above */
+};
+
+#define BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED	0
+#define BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED		1
+#define BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED		2
+#define BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED		3
+#define BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED		4
+struct btrfs_ioctl_dev_replace_status_params {
+	__u64 replace_state;	/* out, see STATE #defines above */
+	__u64 progress_1000;	/* out, 0 <= x <= 1000, per mille */
+	__u64 time_started;	/* out, seconds since 1-Jan-1970 */
+	__u64 time_stopped;	/* out, seconds since 1-Jan-1970 */
+	__u64 num_write_errors;	/* out */
+	__u64 num_uncorrectable_read_errors;	/* out */
+};
+
+#define BTRFS_IOCTL_DEV_REPLACE_CMD_START			0
+#define BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS			1
+#define BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL			2
+#define BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR			0
+#define BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED		1
+#define BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED		2
+struct btrfs_ioctl_dev_replace_args {
+	__u64 cmd;	/* in, see CMD #defines above */
+	__u64 result;	/* out, see RESULT #defines above */
+
+	union {
+		struct btrfs_ioctl_dev_replace_start_params start;
+		struct btrfs_ioctl_dev_replace_status_params status;
+	};	/* in/out */
+
+	__u64 spare[64];
+};
+
 #define BTRFS_DEVICE_PATH_NAME_MAX 1024
 struct btrfs_ioctl_dev_info_args {
 	__u64 devid;				/* in/out */
@@ -453,4 +495,7 @@  struct btrfs_ioctl_send_args {
 			       struct btrfs_ioctl_qgroup_limit_args)
 #define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \
 				      struct btrfs_ioctl_get_dev_stats)
+#define BTRFS_IOC_DEV_REPLACE _IOWR(BTRFS_IOCTL_MAGIC, 53, \
+				    struct btrfs_ioctl_dev_replace_args)
+
 #endif