@@ -7,4 +7,5 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
export.o tree-log.o acl.o free-space-cache.o zlib.o \
- compression.o delayed-ref.o relocation.o
+ compression.o delayed-ref.o relocation.o debugfs.o hotdata_map.o \
+ hotdata_hash.o hotdata_relocate.o
@@ -31,6 +31,8 @@
#include "extent_io.h"
#include "extent_map.h"
#include "async-thread.h"
+#include "hotdata_map.h"
+#include "hotdata_hash.h"
struct btrfs_trans_handle;
struct btrfs_transaction;
@@ -664,6 +666,17 @@ struct btrfs_csum_item {
#define BTRFS_BLOCK_GROUP_RAID1 (1 << 4)
#define BTRFS_BLOCK_GROUP_DUP (1 << 5)
#define BTRFS_BLOCK_GROUP_RAID10 (1 << 6)
+/*
+ * New block groups for use with the hot data relocation feature. When hot
+ * data relocation is on, *_SSD block groups are forced to nonrotating
+ * drives and the plain DATA and METADATA block groups are forced to
+ * rotating drives.
+ *
+ * This should be further optimized, e.g. by forcing metadata to SSD, or by
+ * relocating an inode's metadata to SSD when any of its subfile ranges are
+ * relocated to SSD, so that reads and writes aren't delayed by HDD seeks.
+ */
+#define BTRFS_BLOCK_GROUP_DATA_SSD (1 << 7)
+#define BTRFS_BLOCK_GROUP_METADATA_SSD (1 << 8)
#define BTRFS_NR_RAID_TYPES 5
struct btrfs_block_group_item {
@@ -877,6 +890,22 @@ struct btrfs_fs_info {
struct mutex cleaner_mutex;
struct mutex chunk_mutex;
struct mutex volume_mutex;
+
+ /* protects hot data items while being iterated and updated */
+ struct mutex hot_data_update_kthread_mutex;
+
+ /*
+ * protects heat hash list while iterating through it for hot data
+ * relocation operations
+ */
+ struct mutex hot_data_relocate_kthread_mutex;
+
+ /*
+ * will eventually protect ssd scan operations that bring previously
+ * hot inode and range items into memory after a mount
+ */
+ struct mutex ssd_scan_kthread_mutex;
+
/*
* this protects the ordered operations list only while we are
* processing all of the entries on it. This way we make
@@ -950,6 +979,13 @@ struct btrfs_fs_info {
struct btrfs_workers endio_meta_write_workers;
struct btrfs_workers endio_write_workers;
struct btrfs_workers submit_workers;
+
+ /*
+ * Workers to update hot_data_hash and relocate data
+ */
+ struct btrfs_workers hot_data_update_workers;
+ struct btrfs_workers hot_data_relocate_workers;
+
/*
* fixup workers take dirty pages that didn't properly go through
* the cow mechanism and make them safe to write. It happens
@@ -958,6 +994,10 @@ struct btrfs_fs_info {
struct btrfs_workers fixup_workers;
struct task_struct *transaction_kthread;
struct task_struct *cleaner_kthread;
+ struct task_struct *hot_data_update_kthread;
+ struct task_struct *hot_data_relocate_kthread;
+ struct task_struct *ssd_scan_kthread;
+
int thread_pool_size;
struct kobject super_kobj;
@@ -1009,6 +1049,9 @@ struct btrfs_fs_info {
unsigned data_chunk_allocations;
unsigned metadata_ratio;
+ unsigned data_ssd_chunk_allocations;
+ unsigned metadata_ssd_ratio;
+
void *bdev_holder;
};
@@ -1092,6 +1135,20 @@ struct btrfs_root {
/* red-black tree that keeps track of in-memory inodes */
struct rb_root inode_tree;
+ /* red-black tree that keeps track of fs-wide hot data */
+ struct hot_inode_tree hot_inode_tree;
+
+ /* hash map of inode temperature */
+ struct heat_hashlist_entry heat_inode_hl[HEAT_HASH_SIZE];
+
+ /* hash map of range temperature */
+ struct heat_hashlist_entry heat_range_hl[HEAT_HASH_SIZE];
+
+ int heat_threshold;
+
+ struct btrfs_work work_inode;
+
+ struct btrfs_work work_range;
/*
* right now this just gets used so that a root has its own devid
* for stat. It may be used for more later
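For orientation, the bucket type behind heat_inode_hl/heat_range_hl is assumed to look roughly as follows; this sketch is inferred from how __setup_root() initializes the buckets later in this patch, and the authoritative definition lives in hotdata_hash.h.

#include <linux/list.h>		/* struct hlist_head */
#include <linux/spinlock.h>	/* rwlock_t */

/* Sketch only -- inferred from usage, not the hotdata_hash.h definition. */
struct heat_hashlist_entry {
	struct hlist_head hashhead;	/* items hashed to this temperature */
	rwlock_t rwlock;		/* guards hashhead during iteration */
	int temperature;		/* bucket index, 0..HEAT_HASH_SIZE-1 */
};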
@@ -1192,6 +1249,12 @@ struct btrfs_root {
#define BTRFS_MOUNT_NOSSD (1 << 9)
#define BTRFS_MOUNT_DISCARD (1 << 10)
#define BTRFS_MOUNT_FORCE_COMPRESS (1 << 11)
+/*
+ * Mount options that activate hot data tracking and relocation.
+ * HOTDATA_MOVE must always imply HOTDATA_TRACK.
+ */
+#define BTRFS_MOUNT_HOTDATA_TRACK (1 << 12)
+#define BTRFS_MOUNT_HOTDATA_MOVE (1 << 13)
#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
@@ -1211,6 +1274,28 @@ struct btrfs_root {
#define BTRFS_INODE_NODUMP (1 << 8)
#define BTRFS_INODE_NOATIME (1 << 9)
#define BTRFS_INODE_DIRSYNC (1 << 10)
+/*
+ * Per-inode counterparts of the mount flags above, except that setting
+ * one of these disables tracking/relocation for the inode. (Not yet
+ * implemented.)
+ */
+#define BTRFS_INODE_NO_HOTDATA_TRACK (1 << 11)
+#define BTRFS_INODE_NO_HOTDATA_MOVE (1 << 12)
+
+/* Hot data tracking and relocation -- guard macros */
+#define BTRFS_TRACKING_HOT_DATA(btrfs_root) \
+(btrfs_test_opt(btrfs_root, HOTDATA_TRACK))
+
+#define BTRFS_MOVING_HOT_DATA(btrfs_root) \
+(btrfs_test_opt(btrfs_root, HOTDATA_MOVE) && \
+!((btrfs_root)->fs_info->sb->s_flags & MS_RDONLY))
+
+#define BTRFS_TRACK_THIS_INODE(btrfs_inode) \
+(BTRFS_TRACKING_HOT_DATA((btrfs_inode)->root) && \
+!((btrfs_inode)->flags & BTRFS_INODE_NO_HOTDATA_TRACK))
+
+#define BTRFS_MOVE_THIS_INODE(btrfs_inode) \
+(BTRFS_MOVING_HOT_DATA((btrfs_inode)->root) && \
+!((btrfs_inode)->flags & BTRFS_INODE_NO_HOTDATA_MOVE))
/* some macros to generate set/get funcs for the struct fields. This
* assumes there is a lefoo_to_cpu for every type, so lets make a simple
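A minimal usage sketch of the guard macros, with a hypothetical call site (btrfs_update_freqs() is the tracking hook added by this series in hotdata_map.c, called with the same signature elsewhere in this patch):

/* Hypothetical call site showing the intended guard pattern. */
static void track_write_example(struct inode *inode, u64 start, u64 len)
{
	/* Tests the HOTDATA_TRACK mount option and the per-inode
	 * opt-out flag before doing any tracking work. */
	if (BTRFS_TRACK_THIS_INODE(BTRFS_I(inode)))
		btrfs_update_freqs(inode, start, len, 1 /* write */);
}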
@@ -2376,6 +2461,10 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput);
int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
struct extent_state **cached_state);
+int btrfs_set_extent_prefer_nonrotating(struct inode *inode, u64 start, u64 end,
+ struct extent_state **cached_state);
+int btrfs_set_extent_prefer_rotating(struct inode *inode, u64 start, u64 end,
+ struct extent_state **cached_state);
int btrfs_writepages(struct address_space *mapping,
struct writeback_control *wbc);
int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
@@ -2457,6 +2546,13 @@ int btrfs_sysfs_add_root(struct btrfs_root *root);
void btrfs_sysfs_del_root(struct btrfs_root *root);
void btrfs_sysfs_del_super(struct btrfs_fs_info *root);
+
+/* debugfs.c */
+int btrfs_init_debugfs(void);
+void btrfs_exit_debugfs(void);
+int btrfs_init_debugfs_volume(const char *, struct super_block *);
+void btrfs_exit_debugfs_volume(struct super_block *);
+
/* xattr.c */
ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
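The debugfs entry points are only declared here; a plausible skeleton for the module-wide pair, assuming a single top-level "btrfs" directory (the per-volume variants in debugfs.c are not sketched), could be:

#include <linux/debugfs.h>

static struct dentry *btrfs_debugfs_root;	/* assumed module-wide dir */

int btrfs_init_debugfs(void)
{
	btrfs_debugfs_root = debugfs_create_dir("btrfs", NULL);
	if (!btrfs_debugfs_root)
		return -ENOMEM;
	return 0;
}

void btrfs_exit_debugfs(void)
{
	debugfs_remove_recursive(btrfs_debugfs_root);
	btrfs_debugfs_root = NULL;
}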
@@ -39,6 +39,7 @@
#include "locking.h"
#include "tree-log.h"
#include "free-space-cache.h"
+#include "hotdata_hash.h"
static struct extent_io_ops btree_extent_io_ops;
static void end_workqueue_fn(struct btrfs_work *work);
@@ -898,6 +899,8 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
struct btrfs_fs_info *fs_info,
u64 objectid)
{
+ int i;
+
root->node = NULL;
root->commit_root = NULL;
root->sectorsize = sectorsize;
@@ -917,6 +920,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
root->name = NULL;
root->in_sysfs = 0;
root->inode_tree = RB_ROOT;
+ hot_inode_tree_init(&root->hot_inode_tree);
root->block_rsv = NULL;
root->orphan_block_rsv = NULL;
@@ -938,6 +942,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
root->log_batch = 0;
root->log_transid = 0;
root->last_log_commit = 0;
+ root->heat_threshold = HEAT_INITIAL_THRESH;
extent_io_tree_init(&root->dirty_log_pages,
fs_info->btree_inode->i_mapping, GFP_NOFS);
@@ -945,6 +950,19 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
memset(&root->root_item, 0, sizeof(root->root_item));
memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
memset(&root->root_kobj, 0, sizeof(root->root_kobj));
+ memset(&root->heat_inode_hl, 0, sizeof(root->heat_inode_hl));
+ memset(&root->heat_range_hl, 0, sizeof(root->heat_range_hl));
+ for (i = 0; i < HEAT_HASH_SIZE; i++) {
+ INIT_HLIST_HEAD(&root->heat_inode_hl[i].hashhead);
+ INIT_HLIST_HEAD(&root->heat_range_hl[i].hashhead);
+
+ rwlock_init(&root->heat_inode_hl[i].rwlock);
+ rwlock_init(&root->heat_range_hl[i].rwlock);
+
+ root->heat_inode_hl[i].temperature = i;
+ root->heat_range_hl[i].temperature = i;
+ }
+
root->defrag_trans_start = fs_info->generation;
init_completion(&root->kobj_unregister);
root->defrag_running = 0;
@@ -1671,6 +1689,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
mutex_init(&fs_info->transaction_kthread_mutex);
mutex_init(&fs_info->cleaner_mutex);
mutex_init(&fs_info->volume_mutex);
+ mutex_init(&fs_info->hot_data_update_kthread_mutex);
+ mutex_init(&fs_info->hot_data_relocate_kthread_mutex);
+ mutex_init(&fs_info->ssd_scan_kthread_mutex);
init_rwsem(&fs_info->extent_commit_sem);
init_rwsem(&fs_info->cleanup_work_sem);
init_rwsem(&fs_info->subvol_sem);
@@ -2324,6 +2345,9 @@ static void free_fs_root(struct btrfs_root *root)
down_write(&root->anon_super.s_umount);
kill_anon_super(&root->anon_super);
}
+
+ free_heat_hashlists(root);
+ free_hot_inode_tree(root);
free_extent_buffer(root->node);
free_extent_buffer(root->commit_root);
kfree(root->name);
@@ -2429,6 +2453,10 @@ int close_ctree(struct btrfs_root *root)
kthread_stop(root->fs_info->transaction_kthread);
kthread_stop(root->fs_info->cleaner_kthread);
+ if (btrfs_test_opt(root, HOTDATA_TRACK)) {
+ kthread_stop(root->fs_info->hot_data_update_kthread);
+ kthread_stop(root->fs_info->hot_data_relocate_kthread);
+ }
fs_info->closing = 2;
smp_mb();
@@ -505,7 +505,8 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
struct btrfs_space_info *found;
flags &= BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_SYSTEM |
- BTRFS_BLOCK_GROUP_METADATA;
+ BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA_SSD |
+ BTRFS_BLOCK_GROUP_METADATA_SSD;
rcu_read_lock();
list_for_each_entry_rcu(found, head, list) {
@@ -2780,7 +2781,9 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
spin_lock_init(&found->lock);
found->flags = flags & (BTRFS_BLOCK_GROUP_DATA |
BTRFS_BLOCK_GROUP_SYSTEM |
- BTRFS_BLOCK_GROUP_METADATA);
+ BTRFS_BLOCK_GROUP_METADATA |
+ BTRFS_BLOCK_GROUP_DATA_SSD |
+ BTRFS_BLOCK_GROUP_METADATA_SSD);
found->total_bytes = total_bytes;
found->bytes_used = bytes_used;
found->disk_used = bytes_used * factor;
@@ -2854,12 +2857,21 @@ static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
return btrfs_reduce_alloc_profile(root, flags);
}
+/*
+ * Turns a chunk_type integer into a set of block group flags (a profile).
+ * The hot data relocation code adds chunk_types 2 and 3 for the hot data
+ * specific block group types.
+ */
static u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
{
u64 flags;
- if (data)
+ if (data == 1)
flags = BTRFS_BLOCK_GROUP_DATA;
+ else if (data == 2)
+ flags = BTRFS_BLOCK_GROUP_DATA_SSD;
+ else if (data == 3)
+ flags = BTRFS_BLOCK_GROUP_METADATA_SSD;
else if (root == root->fs_info->chunk_root)
flags = BTRFS_BLOCK_GROUP_SYSTEM;
else
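Callers select a profile by passing chunk_type through the existing "data" parameter of btrfs_reserve_extent(); for instance, the cow_file_range() change later in this patch requests SSD-backed data roughly like this (sketch of the call that appears below):

ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
			   root->sectorsize, 0, alloc_hint,
			   (u64)-1, &ins, 2 /* BTRFS_BLOCK_GROUP_DATA_SSD */);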
@@ -2998,6 +3010,19 @@ static void force_metadata_allocation(struct btrfs_fs_info *info)
rcu_read_unlock();
}
+static void force_metadata_ssd_allocation(struct btrfs_fs_info *info)
+{
+ struct list_head *head = &info->space_info;
+ struct btrfs_space_info *found;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(found, head, list) {
+ if (found->flags & BTRFS_BLOCK_GROUP_METADATA_SSD)
+ found->force_alloc = 1;
+ }
+ rcu_read_unlock();
+}
+
static int should_alloc_chunk(struct btrfs_space_info *sinfo,
u64 alloc_bytes)
{
@@ -3060,6 +3085,14 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
force_metadata_allocation(fs_info);
}
+ if (flags & BTRFS_BLOCK_GROUP_DATA_SSD &&
+ fs_info->metadata_ssd_ratio) {
+ fs_info->data_ssd_chunk_allocations++;
+ if (!(fs_info->data_ssd_chunk_allocations %
+ fs_info->metadata_ssd_ratio))
+ force_metadata_ssd_allocation(fs_info);
+ }
+
ret = btrfs_alloc_chunk(trans, extent_root, flags);
spin_lock(&space_info->lock);
if (ret)
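As with the existing metadata_ratio logic, the ratio is applied by counting chunk allocations: with metadata_ssd_ratio == 4, every fourth BTRFS_BLOCK_GROUP_DATA_SSD chunk allocation (the 4th, 8th, 12th, ...) also forces an SSD metadata chunk allocation.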
@@ -3503,6 +3536,20 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
meta_used = sinfo->bytes_used;
spin_unlock(&sinfo->lock);
+ sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA_SSD);
+ if (sinfo) {
+ spin_lock(&sinfo->lock);
+ data_used += sinfo->bytes_used;
+ spin_unlock(&sinfo->lock);
+ }
+
+ sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA_SSD);
+ if (sinfo) {
+ spin_lock(&sinfo->lock);
+ meta_used += sinfo->bytes_used;
+ spin_unlock(&sinfo->lock);
+ }
+
num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) *
csum_size * 2;
num_bytes += div64_u64(data_used + meta_used, 50);
@@ -3518,7 +3565,6 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
struct btrfs_space_info *sinfo = block_rsv->space_info;
u64 num_bytes;
-
num_bytes = calc_global_metadata_size(fs_info);
spin_lock(&block_rsv->lock);
@@ -4831,7 +4877,8 @@ checks:
BUG_ON(offset > search_start);
ret = update_reserved_bytes(block_group, num_bytes, 1,
- (data & BTRFS_BLOCK_GROUP_DATA));
+ (data & BTRFS_BLOCK_GROUP_DATA) ||
+ (data & BTRFS_BLOCK_GROUP_DATA_SSD));
if (ret == -EAGAIN) {
btrfs_add_free_space(block_group, offset, num_bytes);
goto loop;
@@ -4939,7 +4986,8 @@ loop:
/* we found what we needed */
if (ins->objectid) {
- if (!(data & BTRFS_BLOCK_GROUP_DATA))
+ if (!(data & BTRFS_BLOCK_GROUP_DATA) &&
+ !(data & BTRFS_BLOCK_GROUP_DATA_SSD))
trans->block_group = block_group->key.objectid;
btrfs_put_block_group(block_group);
@@ -961,6 +961,22 @@ int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
0, NULL, cached_state, mask);
}
+int set_extent_prefer_nonrotating(struct extent_io_tree *tree, u64 start,
+ u64 end, struct extent_state **cached_state,
+ gfp_t mask)
+{
+ return set_extent_bit(tree, start, end, EXTENT_PREFER_NONROTATING,
+ 0, NULL, cached_state, mask);
+}
+
+int set_extent_prefer_rotating(struct extent_io_tree *tree, u64 start,
+ u64 end, struct extent_state **cached_state,
+ gfp_t mask)
+{
+ return set_extent_bit(tree, start, end, EXTENT_PREFER_ROTATING,
+ 0, NULL, cached_state, mask);
+}
+
int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask)
{
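The two new bits are mutually exclusive by convention, so switching a range's preference clears the opposite bit first; a sketch mirroring what cow_file_range() does later in this patch:

/* Sketch: mark an inclusive byte range as preferring SSD placement. */
static void prefer_ssd_example(struct inode *inode, u64 start, u64 end)
{
	clear_extent_bits(&BTRFS_I(inode)->io_tree, start, end,
			  EXTENT_PREFER_ROTATING, GFP_NOFS);
	set_extent_prefer_nonrotating(&BTRFS_I(inode)->io_tree, start, end,
				      NULL, GFP_NOFS);
}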
@@ -2468,8 +2484,10 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
int ret = 0;
int done = 0;
int nr_to_write_done = 0;
+ int nr_written = 0;
struct pagevec pvec;
int nr_pages;
+ u64 start;
pgoff_t index;
pgoff_t end; /* Inclusive */
int scanned = 0;
@@ -2486,6 +2504,7 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
range_whole = 1;
scanned = 1;
}
+ start = (u64) index << PAGE_CACHE_SHIFT;
retry:
while (!done && !nr_to_write_done && (index <= end) &&
(nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
@@ -2547,10 +2566,13 @@ retry:
* at any time
*/
nr_to_write_done = wbc->nr_to_write <= 0;
+ nr_written += 1;
}
+
pagevec_release(&pvec);
cond_resched();
}
+
if (!scanned && !done) {
/*
* We hit the last page and there is more work to be done: wrap
@@ -2560,6 +2582,18 @@ retry:
index = 0;
goto retry;
}
+
+ /*
+ * Update access frequency statistics.
+ * i_ino 1 is the btree inode used for metadata operations, so
+ * ignore writes to it.
+ */
+ if (BTRFS_TRACK_THIS_INODE(BTRFS_I(mapping->host)) &&
+ mapping->host->i_ino > 1 && nr_written > 0) {
+ btrfs_update_freqs(mapping->host, start,
+ nr_written * PAGE_CACHE_SIZE, 1);
+ }
+
return ret;
}
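For concreteness, with 4 KiB pages: wbc->range_start == 12288 gives index 3 and start 12288; if five dirty pages are then written, btrfs_update_freqs() is told 5 * 4096 = 20480 bytes starting at 12288 were written. This is an approximation, since the pages found by the tag lookup need not be contiguous, so nr_written * PAGE_CACHE_SIZE can overstate the span actually touched.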
@@ -17,6 +17,8 @@
#define EXTENT_NODATASUM (1 << 10)
#define EXTENT_DO_ACCOUNTING (1 << 11)
#define EXTENT_FIRST_DELALLOC (1 << 12)
+#define EXTENT_PREFER_NONROTATING (1 << 13)
+#define EXTENT_PREFER_ROTATING (1 << 14)
#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
@@ -205,6 +207,11 @@ int clear_extent_ordered_metadata(struct extent_io_tree *tree, u64 start,
u64 end, gfp_t mask);
int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state **cached_state, gfp_t mask);
+int set_extent_prefer_nonrotating(struct extent_io_tree *tree, u64 start,
+ u64 end, struct extent_state **cached_state,
+ gfp_t mask);
+int set_extent_prefer_rotating(struct extent_io_tree *tree, u64 start, u64 end,
+ struct extent_state **cached_state, gfp_t mask);
int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask);
int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
@@ -37,6 +37,7 @@
#include <linux/posix_acl.h>
#include <linux/falloc.h>
#include <linux/slab.h>
+#include <linux/pagevec.h>
#include "compat.h"
#include "ctree.h"
#include "disk-io.h"
@@ -50,6 +51,8 @@
#include "tree-log.h"
#include "compression.h"
#include "locking.h"
+#include "hotdata_map.h"
+#include "hotdata_relocate.h"
struct btrfs_iget_args {
u64 ino;
@@ -763,6 +766,9 @@ static noinline int cow_file_range(struct inode *inode,
struct extent_map *em;
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
int ret = 0;
+ int prefer_nonrot;
+ int prefer_rot;
+ int chunk_type = 1;
trans = btrfs_join_transaction(root, 1);
BUG_ON(!trans);
@@ -776,6 +782,79 @@ static noinline int cow_file_range(struct inode *inode,
disk_num_bytes = num_bytes;
ret = 0;
+ /*
+ * Use COW operations to move hot data to SSD and cold data back to
+ * rotating disk. chunk_type is set to 1 to allocate from
+ * BTRFS_BLOCK_GROUP_DATA, or to 2 to allocate from
+ * BTRFS_BLOCK_GROUP_DATA_SSD.
+ */
+ if (BTRFS_MOVE_THIS_INODE(BTRFS_I(inode))) {
+ prefer_nonrot = test_range_bit(&BTRFS_I(inode)->io_tree,
+ start, end, EXTENT_PREFER_NONROTATING, 1, NULL);
+ prefer_rot = test_range_bit(&BTRFS_I(inode)->io_tree,
+ start, end, EXTENT_PREFER_ROTATING, 1, NULL);
+ WARN_ON(prefer_nonrot && prefer_rot);
+
+ if (prefer_nonrot)
+ chunk_type = 2;
+ if (prefer_rot)
+ chunk_type = 1;
+
+ /*
+ * Although the async thread has not chosen this range for
+ * relocation to SSD, we're COWing the data anyway, so test the
+ * range now. Note that the COW range here need not align with
+ * the RANGE_SIZE-aligned ranges tracked by the hot data code.
+ */
+ if (!(prefer_rot || prefer_nonrot)) {
+ int temperature = 0;
+ struct hot_inode_item *he;
+ struct hot_range_item *hr;
+
+ /* Test just the first proper hotdata range */
+ he = lookup_hot_inode_item(
+ &root->hot_inode_tree, inode->i_ino);
+ if (!he)
+ goto skip_cow_reloc;
+ hr = lookup_hot_range_item(&he->hot_range_tree,
+ start & RANGE_SIZE_MASK);
+ if (!hr) {
+ free_hot_inode_item(he);
+ goto skip_cow_reloc;
+ }
+
+ spin_lock(&hr->lock);
+ temperature = btrfs_get_temp(&hr->freq_data);
+ spin_unlock(&hr->lock);
+
+ if (temperature >=
+ root->fs_info->fs_root->heat_threshold) {
+ /* This range is hot */
+ chunk_type = 2;
+
+ /*
+ * Set extent flags and location so future
+ * operations keep the range on SSD
+ */
+ btrfs_set_extent_prefer_nonrotating(inode,
+ start, end, NULL);
+ clear_extent_bits(&BTRFS_I(inode)->io_tree,
+ start, end, EXTENT_PREFER_ROTATING,
+ GFP_NOFS);
+ spin_lock(&hr->lock);
+ spin_lock(&hr->heat_node->location_lock);
+ hr->heat_node->location = BTRFS_ON_NONROTATING;
+ spin_unlock(&hr->heat_node->location_lock);
+ spin_unlock(&hr->lock);
+ } else {
+ chunk_type = 1;
+ }
+
+ free_hot_range_item(hr);
+ free_hot_inode_item(he);
+ }
+ }
+
+skip_cow_reloc:
if (start == 0) {
/* lets try to make an inline extent */
ret = cow_file_range_inline(trans, root, inode,
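Distilled, the placement decision above is a threshold compare; a hypothetical helper capturing it (heat_threshold starts at HEAT_INITIAL_THRESH in __setup_root() and is presumably tuned at runtime by the update kthread):

/* Sketch of the placement decision made in cow_file_range() above. */
static int pick_chunk_type(struct btrfs_root *fs_root, int temperature)
{
	return temperature >= fs_root->heat_threshold ?
		2 /* BTRFS_BLOCK_GROUP_DATA_SSD */ :
		1 /* BTRFS_BLOCK_GROUP_DATA */;
}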
@@ -811,7 +890,10 @@ static noinline int cow_file_range(struct inode *inode,
cur_alloc_size = disk_num_bytes;
ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
root->sectorsize, 0, alloc_hint,
- (u64)-1, &ins, 1);
+ (u64)-1, &ins, chunk_type);
+ if (ret)
+ printk(KERN_INFO "btrfs: cow_file_range: btrfs_reserve_extent "
+ "returned %d\n", ret);
BUG_ON(ret);
em = alloc_extent_map(GFP_NOFS);
@@ -1225,9 +1307,25 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
unsigned long *nr_written)
{
int ret;
+ int prefer_rot = 0;
+ int prefer_nonrot = 0;
+
struct btrfs_root *root = BTRFS_I(inode)->root;
- if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)
+ /*
+ * Force COW for hot data relocation
+ */
+ if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW &&
+ BTRFS_MOVE_THIS_INODE(BTRFS_I(inode))) {
+ prefer_nonrot = test_range_bit(&BTRFS_I(inode)->io_tree,
+ start, end, EXTENT_PREFER_NONROTATING, 1, NULL);
+ prefer_rot = test_range_bit(&BTRFS_I(inode)->io_tree,
+ start, end, EXTENT_PREFER_ROTATING, 1, NULL);
+ WARN_ON(prefer_nonrot && prefer_rot);
+ }
+
+ if ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
+ !(prefer_rot || prefer_nonrot))
ret = run_delalloc_nocow(inode, locked_page, start, end,
page_started, 1, nr_written);
else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)
@@ -1480,6 +1578,26 @@ int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
cached_state, GFP_NOFS);
}
+int btrfs_set_extent_prefer_nonrotating(struct inode *inode, u64 start,
+ u64 end,
+ struct extent_state **cached_state)
+{
+ WARN_ON((end & (PAGE_CACHE_SIZE - 1)) == 0);
+ return set_extent_prefer_nonrotating(&BTRFS_I(inode)->io_tree, start,
+ end, cached_state, GFP_NOFS);
+}
+
+int btrfs_set_extent_prefer_rotating(struct inode *inode, u64 start,
+ u64 end,
+ struct extent_state **cached_state)
+{
+ WARN_ON((end & (PAGE_CACHE_SIZE - 1)) == 0);
+ return set_extent_prefer_rotating(&BTRFS_I(inode)->io_tree, start,
+ end, cached_state, GFP_NOFS);
+}
+
/* see btrfs_writepage_start_hook for details on why this is required */
struct btrfs_writepage_fixup {
struct page *page;
@@ -2870,6 +2988,18 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
dentry->d_name.name, dentry->d_name.len);
BUG_ON(ret);
+ if (BTRFS_TRACKING_HOT_DATA(root)) {
+ struct hot_inode_item *he;
+
+ he = lookup_hot_inode_item(
+ &root->hot_inode_tree, inode->i_ino);
+
+ if (he) {
+ btrfs_remove_inode_from_heat_index(he, root);
+ free_hot_inode_item(he);
+ }
+ }
+
if (inode->i_nlink == 0) {
ret = btrfs_orphan_add(trans, inode);
BUG_ON(ret);
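A note on the lookup/free pairing used here and in cow_file_range(): judging by the call pattern, lookup_hot_inode_item()/lookup_hot_range_item() return referenced items and free_hot_inode_item()/free_hot_range_item() are the matching puts (dropping a reference rather than freeing outright), which is why every successful lookup is followed by a free even when the item stays in the tree. The actual refcounting lives in hotdata_map.c.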
@@ -5781,6 +5911,11 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
lockstart = offset;
lockend = offset + count - 1;
+ /* Update access frequency statistics */
+ if (BTRFS_TRACK_THIS_INODE(BTRFS_I(inode)) && count > 0)
+ btrfs_update_freqs(inode, lockstart, (u64) count,
+ writing);
+
if (writing) {
ret = btrfs_delalloc_reserve_space(inode, count);
if (ret)
@@ -5860,7 +5995,16 @@ static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
int btrfs_readpage(struct file *file, struct page *page)
{
struct extent_io_tree *tree;
+ u64 start;
+
tree = &BTRFS_I(page->mapping->host)->io_tree;
+ start = (u64) page->index << PAGE_CACHE_SHIFT;
+
+ /* Update access frequency statistics */
+ if (BTRFS_TRACK_THIS_INODE(BTRFS_I(page->mapping->host)))
+ btrfs_update_freqs(page->mapping->host, start,
+ PAGE_CACHE_SIZE, 0);
+
return extent_read_full_page(tree, page, btrfs_get_extent);
}
@@ -5868,13 +6012,14 @@ static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
{
struct extent_io_tree *tree;
-
if (current->flags & PF_MEMALLOC) {
redirty_page_for_writepage(wbc, page);
unlock_page(page);
return 0;
}
+
tree = &BTRFS_I(page->mapping->host)->io_tree;
+
return extent_write_full_page(tree, page, btrfs_get_extent, wbc);
}
@@ -5884,6 +6029,7 @@ int btrfs_writepages(struct address_space *mapping,
struct extent_io_tree *tree;
tree = &BTRFS_I(mapping->host)->io_tree;
+
return extent_writepages(tree, mapping, btrfs_get_extent, wbc);
}
@@ -5892,7 +6038,17 @@ btrfs_readpages(struct file *file, struct address_space *mapping,
struct list_head *pages, unsigned nr_pages)
{
struct extent_io_tree *tree;
+ u64 start, len;
+
tree = &BTRFS_I(mapping->host)->io_tree;
+ start = (u64) (list_entry(pages->prev, struct page, lru)->index)
+ << PAGE_CACHE_SHIFT;
+ len = nr_pages * PAGE_CACHE_SIZE;
+
+ /* Update access frequency statistics */
+ if (len > 0 && BTRFS_TRACK_THIS_INODE(BTRFS_I(mapping->host)))
+ btrfs_update_freqs(mapping->host, start, len, 0);
+
return extent_readpages(tree, mapping, pages, nr_pages,
btrfs_get_extent);
}
@@ -51,6 +51,9 @@
#include "version.h"
#include "export.h"
#include "compression.h"
+#include "hotdata_map.h"
+#include "hotdata_hash.h"
+#include "hotdata_relocate.h"
static const struct super_operations btrfs_super_ops;
@@ -59,6 +62,11 @@ static void btrfs_put_super(struct super_block *sb)
struct btrfs_root *root = btrfs_sb(sb);
int ret;
+ root->heat_threshold = 0;
+
+ if (btrfs_test_opt(root, HOTDATA_TRACK))
+ btrfs_exit_debugfs_volume(sb);
+
ret = close_ctree(root);
sb->s_fs_info = NULL;
}
@@ -68,7 +76,7 @@ enum {
Opt_nodatacow, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd,
Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress,
Opt_compress_force, Opt_notreelog, Opt_ratio, Opt_flushoncommit,
- Opt_discard, Opt_err,
+ Opt_discard, Opt_hotdatatrack, Opt_hotdatamove, Opt_err,
};
static match_table_t tokens = {
@@ -92,6 +100,8 @@ static match_table_t tokens = {
{Opt_flushoncommit, "flushoncommit"},
{Opt_ratio, "metadata_ratio=%d"},
{Opt_discard, "discard"},
+ {Opt_hotdatatrack, "hotdatatrack"},
+ {Opt_hotdatamove, "hotdatamove"},
{Opt_err, NULL},
};
@@ -235,6 +245,18 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
case Opt_discard:
btrfs_set_opt(info->mount_opt, DISCARD);
break;
+ case Opt_hotdatamove:
+ printk(KERN_INFO "btrfs: turning on hot data "
+ "migration\n");
+ printk(KERN_INFO " (implies hotdatatrack, "
+ "no ssd_spread)\n");
+ btrfs_set_opt(info->mount_opt, HOTDATA_MOVE);
+ btrfs_clear_opt(info->mount_opt, SSD_SPREAD);
+ /* fall through: hotdatamove implies hotdatatrack */
+ case Opt_hotdatatrack:
+ printk(KERN_INFO "btrfs: turning on hot data"
+ " tracking\n");
+ btrfs_set_opt(info->mount_opt, HOTDATA_TRACK);
+ break;
case Opt_err:
printk(KERN_INFO "btrfs: unrecognized mount option "
"'%s'\n", p);
@@ -457,6 +479,17 @@ static int btrfs_fill_super(struct super_block *sb,
printk("btrfs: open_ctree failed\n");
return PTR_ERR(tree_root);
}
+
+ /*
+ * Initialize the hot data kthreads whenever HOTDATA_TRACK is set,
+ * so that a later remount can enable HOTDATA_MOVE seamlessly
+ */
+ if (btrfs_test_opt(tree_root, HOTDATA_TRACK)) {
+ init_hash_list_kthread(tree_root);
+ init_hot_data_relocate_kthread(tree_root);
+ init_ssd_scan_kthread(tree_root);
+ }
+
sb->s_fs_info = tree_root;
disk_super = &tree_root->fs_info->super_copy;
@@ -658,6 +691,8 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
mnt->mnt_sb = s;
mnt->mnt_root = root;
+ if (btrfs_test_opt(btrfs_sb(s), HOTDATA_TRACK))
+ btrfs_init_debugfs_volume(dev_name, s);
kfree(subvol_name);
return 0;
@@ -846,18 +881,30 @@ static int __init init_btrfs_fs(void)
if (err)
goto free_sysfs;
- err = extent_io_init();
+ err = btrfs_init_debugfs();
if (err)
goto free_cachep;
+ err = extent_io_init();
+ if (err)
+ goto free_debugfs;
+
err = extent_map_init();
if (err)
goto free_extent_io;
- err = btrfs_interface_init();
+ err = hot_inode_item_init();
if (err)
goto free_extent_map;
+ err = hot_range_item_init();
+ if (err)
+ goto free_hot_inode_item;
+
+ err = btrfs_interface_init();
+ if (err)
+ goto free_hot_range_item;
+
err = register_filesystem(&btrfs_fs_type);
if (err)
goto unregister_ioctl;
@@ -867,10 +914,16 @@ static int __init init_btrfs_fs(void)
unregister_ioctl:
btrfs_interface_exit();
+free_hot_range_item:
+ hot_range_item_exit();
+free_hot_inode_item:
+ hot_inode_item_exit();
free_extent_map:
extent_map_exit();
free_extent_io:
extent_io_exit();
+free_debugfs:
+ btrfs_exit_debugfs();
free_cachep:
btrfs_destroy_cachep();
free_sysfs:
@@ -882,10 +935,13 @@ static void __exit exit_btrfs_fs(void)
{
btrfs_destroy_cachep();
extent_map_exit();
+ hot_inode_item_exit();
+ hot_range_item_exit();
extent_io_exit();
btrfs_interface_exit();
unregister_filesystem(&btrfs_fs_type);
btrfs_exit_sysfs();
+ btrfs_exit_debugfs();
btrfs_cleanup_fs_uuids();
btrfs_zlib_exit();
}
@@ -2210,10 +2210,12 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
min_stripes = 4;
}
- if (type & BTRFS_BLOCK_GROUP_DATA) {
+ if (type & BTRFS_BLOCK_GROUP_DATA ||
+ type & BTRFS_BLOCK_GROUP_DATA_SSD) {
max_chunk_size = 10 * calc_size;
min_stripe_size = 64 * 1024 * 1024;
- } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
+ } else if (type & BTRFS_BLOCK_GROUP_METADATA ||
+ type & BTRFS_BLOCK_GROUP_METADATA_SSD) {
max_chunk_size = 256 * 1024 * 1024;
min_stripe_size = 32 * 1024 * 1024;
} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
@@ -2274,15 +2276,43 @@ again:
INIT_LIST_HEAD(&private_devs);
while (index < num_stripes) {
+ int dev_rotating;
+ int skip_device = 0;
device = list_entry(cur, struct btrfs_device, dev_alloc_list);
BUG_ON(!device->writeable);
+ dev_rotating = !blk_queue_nonrot(bdev_get_queue(device->bdev));
+
+ /*
+ * If HOTDATA_MOVE is set, the chunk type being allocated
+ * determines which disks the data may be allocated on.
+ * This can cause problems if, for example, the data alloc
+ * profile is RAID0 and there are only two devices, 1 SSD +
+ * 1 HDD. All allocations to BTRFS_BLOCK_GROUP_DATA_SSD
+ * in this config will return -ENOSPC as the allocation code
+ * can't find allowable space for the second stripe.
+ */
+ if (btrfs_test_opt(extent_root, HOTDATA_MOVE)) {
+ if (type & BTRFS_BLOCK_GROUP_DATA &&
+ !dev_rotating)
+ skip_device = 1;
+ if (type & BTRFS_BLOCK_GROUP_METADATA &&
+ !dev_rotating)
+ skip_device = 1;
+ if (type & BTRFS_BLOCK_GROUP_DATA_SSD &&
+ dev_rotating)
+ skip_device = 1;
+ if (type & BTRFS_BLOCK_GROUP_METADATA_SSD &&
+ dev_rotating)
+ skip_device = 1;
+ }
if (device->total_bytes > device->bytes_used)
avail = device->total_bytes - device->bytes_used;
else
avail = 0;
- cur = cur->next;
- if (device->in_fs_metadata && avail >= min_free) {
+ cur = cur->next;
+ if (!skip_device &&
+ device->in_fs_metadata && avail >= min_free) {
ret = find_free_dev_extent(trans, device,
min_free, &dev_offset,
&max_avail);
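The classification driving skip_device is the block layer's rotational flag; a hypothetical helper making the predicate explicit (the same test used inline above, and by the existing ssd mount-option autodetection):

#include <linux/blkdev.h>

/* Hypothetical helper: true if the device sits on non-rotational media. */
static int btrfs_device_is_ssd(struct btrfs_device *device)
{
	return blk_queue_nonrot(bdev_get_queue(device->bdev));
}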