@@ -259,3 +259,475 @@ void nova_init_file_write_item(struct super_block *sb,
entry->size = file_size;
}
+
+/*
+ * Check if there is an existing entry or hole for target page offset.
+ * Used for inplace write, DAX-mmap and fallocate.
+ */
+unsigned long nova_check_existing_entry(struct super_block *sb,
+	struct inode *inode, unsigned long num_blocks, unsigned long start_blk,
+	struct nova_file_write_entry **ret_entry,
+	int check_next, u64 epoch_id,
+	int *inplace)
+{
+	struct nova_inode_info *si = NOVA_I(inode);
+	struct nova_inode_info_header *sih = &si->header;
+	struct nova_file_write_entry *entry;
+	unsigned long next_pgoff;
+	unsigned long ent_blks = 0;
+	timing_t check_time;
+
+	NOVA_START_TIMING(check_entry_t, check_time);
+
+	/*
+	 * Returns the number of contiguous blocks (capped at num_blocks)
+	 * that either belong to the entry covering start_blk, or form the
+	 * hole before the next entry.  *ret_entry is the covering entry
+	 * (NULL for a hole); *inplace is set only when the entry's epoch
+	 * matches epoch_id, i.e. its blocks may be overwritten in place.
+	 */
+	*ret_entry = NULL;
+	*inplace = 0;
+	entry = nova_get_write_entry(sb, sih, start_blk);
+
+	if (entry) {
+		*ret_entry = entry;
+
+		/* We can do inplace write. Find contiguous blocks */
+		if (entry->reassigned == 0)
+			ent_blks = entry->num_pages -
+					(start_blk - entry->pgoff);
+		else
+			/* Reassigned entry: only count one block */
+			ent_blks = 1;
+
+		if (ent_blks > num_blocks)
+			ent_blks = num_blocks;
+
+		if (entry->epoch_id == epoch_id)
+			*inplace = 1;
+
+	} else if (check_next) {
+		/* Possible Hole */
+		entry = nova_find_next_entry(sb, sih, start_blk);
+		if (entry) {
+			next_pgoff = entry->pgoff;
+			if (next_pgoff <= start_blk) {
+				/* Tree inconsistency: the next entry should
+				 * lie strictly after start_blk.  Log and
+				 * return the full range as a hole.
+				 */
+				nova_err(sb, "iblock %lu, entry pgoff %lu, num pages %lu\n",
+					start_blk, next_pgoff, entry->num_pages);
+				nova_print_inode_log(sb, inode);
+				dump_stack();
+				ent_blks = num_blocks;
+				goto out;
+			}
+			/* Hole extends up to the next entry */
+			ent_blks = next_pgoff - start_blk;
+			if (ent_blks > num_blocks)
+				ent_blks = num_blocks;
+		} else {
+			/* File grow */
+			ent_blks = num_blocks;
+		}
+	}
+
+	if (entry && ent_blks == 0) {
+		/* Should not happen: a covering entry spans >= 1 block */
+		nova_dbg("%s: %d\n", __func__, check_next);
+		dump_stack();
+	}
+
+out:
+	NOVA_END_TIMING(check_entry_t, check_time);
+	return ent_blks;
+}
+
+/*
+ * Memcpy from newly allocated data blocks to existing data blocks.
+ *
+ * Copies the part of the byte range [pos, pos + len) that overlaps the
+ * num_blocks pages starting at from->pgoff, from the blocks backing
+ * 'from' into the blocks backing 'to', then updates 'to' in place so
+ * the log entry carries 'from's metadata (epoch, trans id, mtime, size).
+ */
+static int nova_inplace_memcpy(struct super_block *sb, struct inode *inode,
+	struct nova_file_write_entry *from, struct nova_file_write_entry *to,
+	unsigned long num_blocks, loff_t pos, size_t len)
+{
+	struct nova_inode_info *si = NOVA_I(inode);
+	struct nova_inode_info_header *sih = &si->header;
+	struct nova_log_entry_info entry_info;
+	unsigned long pgoff;
+	unsigned long from_nvmm, to_nvmm;
+	void *from_addr, *to_addr = NULL;
+	loff_t base, start, end, offset;
+
+	pgoff = le64_to_cpu(from->pgoff);
+	/* Byte range spanned by the source entry's blocks */
+	base = start = pgoff << PAGE_SHIFT;
+	end = (pgoff + num_blocks) << PAGE_SHIFT;
+
+	/* Clamp to the byte range of the write itself */
+	if (start < pos)
+		start = pos;
+
+	if (end > pos + len)
+		end = pos + len;
+
+	len = end - start;
+	offset = start - base;
+
+	/* Resolve NVMM addresses of source and destination blocks */
+	from_nvmm = get_nvmm(sb, sih, from, pgoff);
+	from_addr = nova_get_block(sb, (from_nvmm << PAGE_SHIFT));
+	to_nvmm = get_nvmm(sb, sih, to, pgoff);
+	to_addr = nova_get_block(sb, (to_nvmm << PAGE_SHIFT));
+
+	memcpy_to_pmem_nocache(to_addr + offset, from_addr + offset, len);
+
+	/* Update entry: stamp 'to' with the new write's metadata */
+	entry_info.type = FILE_WRITE;
+	entry_info.epoch_id = from->epoch_id;
+	entry_info.trans_id = from->trans_id;
+	entry_info.time = from->mtime;
+	entry_info.file_size = from->size;
+	entry_info.inplace = 1;
+
+	nova_inplace_update_write_entry(sb, inode, to, &entry_info);
+	return 0;
+}
+
+/*
+ * Due to concurrent DAX fault, we may have overlapped entries in the list.
+ * We copy the data to the existing data pages and update the entry.
+ * Must be called with sih write lock held.
+ */
+static int nova_commit_inplace_writes_to_log(struct super_block *sb,
+	struct nova_inode *pi, struct inode *inode,
+	struct list_head *head, unsigned long new_blocks,
+	loff_t pos, size_t len)
+{
+	struct nova_inode_info *si = NOVA_I(inode);
+	struct nova_inode_info_header *sih = &si->header;
+	struct nova_file_write_item *entry_item, *temp;
+	struct nova_file_write_item *new_item;
+	struct nova_file_write_entry *curr, *entry;
+	struct list_head new_head;
+	unsigned long start_blk, ent_blks;
+	unsigned long num_blocks;
+	unsigned long blocknr;
+	u64 epoch_id;
+	int inplace;
+	int ret = 0;
+
+	if (list_empty(head))
+		return 0;
+
+	sih_lock(sih);
+	INIT_LIST_HEAD(&new_head);
+
+	list_for_each_entry_safe(entry_item, temp, head, list) {
+		list_del(&entry_item->list);
+		curr = &entry_item->entry;
+		epoch_id = le64_to_cpu(curr->epoch_id);
+again:
+		num_blocks = le32_to_cpu(curr->num_pages);
+		start_blk = le64_to_cpu(curr->pgoff);
+
+		/* Re-check the tree under the write lock: a concurrent DAX
+		 * fault may have installed overlapping entries after we
+		 * allocated our data blocks.
+		 */
+		ent_blks = nova_check_existing_entry(sb, inode, num_blocks,
+						start_blk, &entry,
+						1, epoch_id, &inplace);
+
+		if (!entry && ent_blks == num_blocks) {
+			/* Hole */
+			list_add_tail(&entry_item->list, &new_head);
+			continue;
+		}
+
+		blocknr = nova_get_blocknr(sb, curr->block,
+					sih->i_blk_type);
+		/* Overlap with head. Memcpy */
+		if (entry) {
+			new_blocks -= ent_blks;
+			nova_inplace_memcpy(sb, inode, curr, entry, ent_blks,
+						pos, len);
+			if (ent_blks == num_blocks) {
+				/* Full copy */
+				nova_free_data_blocks(sb, sih, blocknr,
+							ent_blks);
+				nova_free_file_write_item(entry_item);
+				continue;
+			} else {
+				/* Partial copy */
+				curr->num_pages -= ent_blks;
+				curr->pgoff += ent_blks;
+				curr->block += ent_blks << PAGE_SHIFT;
+				nova_free_data_blocks(sb, sih, blocknr,
+							ent_blks);
+				goto again;
+			}
+		}
+
+		/* Overlap with middle or tail: the range starts with a hole
+		 * of ent_blks blocks.  'entry' is NULL on this path (the
+		 * covering-entry case was fully handled above), so the new
+		 * log item must take its metadata from 'curr' — the pending
+		 * write itself — never from 'entry'.
+		 */
+		new_item = nova_alloc_file_write_item(sb);
+		if (!new_item) {
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		nova_init_file_write_item(sb, sih, new_item,
+					epoch_id, start_blk, ent_blks,
+					blocknr, curr->mtime, curr->size);
+
+		list_add_tail(&new_item->list, &new_head);
+
+		curr->num_pages -= ent_blks;
+		curr->pgoff += ent_blks;
+		curr->block += ent_blks << PAGE_SHIFT;
+		goto again;
+	}
+
+	ret = nova_commit_writes_to_log(sb, pi, inode,
+					&new_head, new_blocks, 1);
+	if (ret < 0) {
+		nova_err(sb, "commit to log failed\n");
+		goto out;
+	}
+
+out:
+	if (ret < 0)
+		nova_cleanup_incomplete_write(sb, sih, &new_head, 1);
+
+	sih_unlock(sih);
+	return ret;
+}
+
+/*
+ * Do an inplace write. This function assumes that the lock on the inode is
+ * already held.
+ *
+ * We do this in three steps:
+ * 1. Check the tree, protected by sih read lock.
+ * 2. Allocate blocks for hole, copy from user buffer.
+ * 3. Take sih write lock and commit the writes.
+ *
+ * This is necessary because DAX fault can occur when we do the copy.
+ * We cannot hold sih lock when performing the data copy,
+ * and DAX fault may allocate data pages during step 2.
+ * In this case we overwrite with our data and free the data pages we allocated.
+ */
+ssize_t do_nova_inplace_file_write(struct file *filp,
+	const char __user *buf, size_t len, loff_t *ppos)
+{
+	struct address_space *mapping = filp->f_mapping;
+	struct inode *inode = mapping->host;
+	struct nova_inode_info *si = NOVA_I(inode);
+	struct nova_inode_info_header *sih = &si->header;
+	struct super_block *sb = inode->i_sb;
+	struct nova_inode *pi;
+	struct nova_file_write_entry *entry;
+	struct nova_file_write_item *entry_item;
+	struct list_head item_head;
+	struct nova_inode_update update;
+	ssize_t written = 0;
+	loff_t pos, original_pos;
+	size_t count, offset, copied;
+	unsigned long start_blk, num_blocks, ent_blks = 0;
+	unsigned long total_blocks;
+	unsigned long new_blocks = 0;
+	unsigned long blocknr = 0;
+	int allocated = 0;
+	int inplace = 0;
+	bool hole_fill = false;
+	void *kmem;
+	u64 blk_off;
+	size_t bytes;
+	long status = 0;
+	timing_t inplace_write_time, memcpy_time;
+	unsigned long step = 0;
+	u64 epoch_id;
+	u64 file_size;
+	u32 time;
+	ssize_t ret;
+
+	if (len == 0)
+		return 0;
+
+	NOVA_START_TIMING(inplace_write_t, inplace_write_time);
+	INIT_LIST_HEAD(&item_head);
+
+	if (!access_ok(VERIFY_READ, buf, len)) {
+		ret = -EFAULT;
+		goto out;
+	}
+	pos = original_pos = *ppos;
+
+	if (filp->f_flags & O_APPEND)
+		pos = i_size_read(inode);
+
+	count = len;
+
+	pi = nova_get_block(sb, sih->pi_addr);
+
+	offset = pos & (sb->s_blocksize - 1);
+	num_blocks = ((count + offset - 1) >> sb->s_blocksize_bits) + 1;
+	total_blocks = num_blocks;
+
+	/* offset in the actual block size block */
+
+	ret = file_remove_privs(filp);
+	if (ret)
+		goto out;
+
+	inode->i_ctime = inode->i_mtime = current_time(inode);
+	time = current_time(inode).tv_sec;
+
+	epoch_id = nova_get_epoch_id(sb);
+
+	nova_dbgv("%s: epoch_id %llu, inode %lu, offset %lld, count %lu\n",
+		__func__, epoch_id, inode->i_ino, pos, count);
+	update.tail = sih->log_tail;
+	while (num_blocks > 0) {
+		hole_fill = false;
+		offset = pos & (nova_inode_blk_size(sih) - 1);
+		start_blk = pos >> sb->s_blocksize_bits;
+
+		/* Step 1: probe the tree under the shared (read) lock only;
+		 * the lock is dropped before any allocation or copy.
+		 */
+		sih_lock_shared(sih);
+		ent_blks = nova_check_existing_entry(sb, inode, num_blocks,
+						start_blk, &entry,
+						1, epoch_id, &inplace);
+		sih_unlock_shared(sih);
+
+		if (entry && inplace) {
+			/* We can do inplace write. Find contiguous blocks */
+			blocknr = get_nvmm(sb, sih, entry, start_blk);
+			blk_off = blocknr << PAGE_SHIFT;
+			allocated = ent_blks;
+		} else {
+			/* Allocate blocks to fill hole */
+			allocated = nova_new_data_blocks(sb, sih, &blocknr,
+					start_blk, ent_blks, ALLOC_NO_INIT,
+					ANY_CPU, ALLOC_FROM_HEAD);
+
+			nova_dbg_verbose("%s: alloc %d blocks @ %lu\n",
+						__func__, allocated, blocknr);
+
+			if (allocated <= 0) {
+				nova_dbg("%s alloc blocks failed!, %d\n",
+							__func__, allocated);
+				ret = allocated;
+				goto out;
+			}
+
+			hole_fill = true;
+			new_blocks += allocated;
+			blk_off = nova_get_block_off(sb, blocknr,
+							sih->i_blk_type);
+
+			/* NOTE(review): presumably invalidates stale page
+			 * cache aliases over the newly allocated range so
+			 * mmap users fault in the new blocks — confirm.
+			 */
+			invalidate_inode_pages2_range(inode->i_mapping,
+					start_blk, start_blk + allocated - 1);
+		}
+
+		step++;
+		bytes = sb->s_blocksize * allocated - offset;
+		if (bytes > count)
+			bytes = count;
+
+		kmem = nova_get_block(inode->i_sb, blk_off);
+
+		/* When filling a hole with a partial head or tail block,
+		 * preserve the surrounding file data first.
+		 */
+		if (hole_fill &&
+		    (offset || ((offset + bytes) & (PAGE_SIZE - 1)) != 0)) {
+			ret = nova_handle_head_tail_blocks(sb, inode,
+							pos, bytes, kmem);
+			if (ret)
+				goto out;
+
+		}
+
+		/* Now copy from user buf */
+//		nova_dbg("Write: %p\n", kmem);
+		NOVA_START_TIMING(memcpy_w_nvmm_t, memcpy_time);
+		copied = bytes - memcpy_to_pmem_nocache(kmem + offset,
+						buf, bytes);
+		NOVA_END_TIMING(memcpy_w_nvmm_t, memcpy_time);
+
+		if (pos + copied > inode->i_size)
+			file_size = cpu_to_le64(pos + copied);
+		else
+			file_size = cpu_to_le64(inode->i_size);
+
+		/* Handle hole fill write */
+		if (hole_fill) {
+			/* Queue a write item; committed to the log later
+			 * under the sih write lock (step 3).
+			 */
+			entry_item = nova_alloc_file_write_item(sb);
+			if (!entry_item) {
+				ret = -ENOMEM;
+				goto out;
+			}
+
+			nova_init_file_write_item(sb, sih, entry_item,
+						epoch_id, start_blk, allocated,
+						blocknr, time, file_size);
+
+			list_add_tail(&entry_item->list, &item_head);
+		} else {
+			/* Update existing entry */
+			struct nova_log_entry_info entry_info;
+
+			entry_info.type = FILE_WRITE;
+			entry_info.epoch_id = epoch_id;
+			entry_info.trans_id = sih->trans_id;
+			entry_info.time = time;
+			entry_info.file_size = file_size;
+			entry_info.inplace = 1;
+
+			nova_inplace_update_write_entry(sb, inode, entry,
+						&entry_info);
+		}
+
+		nova_dbgv("Write: %p, %lu\n", kmem, copied);
+		if (copied > 0) {
+			status = copied;
+			written += copied;
+			pos += copied;
+			buf += copied;
+			count -= copied;
+			num_blocks -= allocated;
+		}
+		if (unlikely(copied != bytes)) {
+			nova_dbg("%s ERROR!: %p, bytes %lu, copied %lu\n",
+				__func__, kmem, bytes, copied);
+			if (status >= 0)
+				status = -EFAULT;
+		}
+		if (status < 0)
+			break;
+	}
+
+	/* Step 3: take the sih write lock and commit the queued hole-fill
+	 * writes, resolving any overlap with entries installed by
+	 * concurrent DAX faults.
+	 */
+	ret = nova_commit_inplace_writes_to_log(sb, pi, inode, &item_head,
+					new_blocks, original_pos, len);
+	if (ret < 0) {
+		nova_err(sb, "commit to log failed\n");
+		goto out;
+	}
+
+	ret = written;
+	NOVA_STATS_ADD(inplace_write_breaks, step);
+	nova_dbgv("blocks: %lu, %lu\n", inode->i_blocks, sih->i_blocks);
+
+	*ppos = pos;
+	if (pos > inode->i_size) {
+		i_size_write(inode, pos);
+		sih->i_size = pos;
+	}
+
+out:
+	if (ret < 0)
+		nova_cleanup_incomplete_write(sb, sih, &item_head, 1);
+
+	NOVA_END_TIMING(inplace_write_t, inplace_write_time);
+	NOVA_STATS_ADD(inplace_write_bytes, written);
+	return ret;
+}
+
+/*
+ * Acquire locks and perform an inplace update.
+ *
+ * Takes superblock freeze protection and the inode lock, then delegates
+ * to do_nova_inplace_file_write().  Returns bytes written or -errno.
+ */
+ssize_t nova_inplace_file_write(struct file *filp,
+	const char __user *buf, size_t len, loff_t *ppos)
+{
+	struct address_space *mapping = filp->f_mapping;
+	struct inode *inode = mapping->host;
+	/* ssize_t, not int: do_nova_inplace_file_write() returns ssize_t
+	 * and an int would truncate/sign-corrupt large return values.
+	 */
+	ssize_t ret;
+
+	/* Nothing to do for a zero-length write */
+	if (len == 0)
+		return 0;
+
+	sb_start_write(inode->i_sb);
+	inode_lock(inode);
+
+	ret = do_nova_inplace_file_write(filp, buf, len, ppos);
+
+	inode_unlock(inode);
+	sb_end_write(inode->i_sb);
+
+	return ret;
+}
@@ -448,7 +448,10 @@ ssize_t nova_cow_file_write(struct file *filp,
sb_start_write(inode->i_sb);
inode_lock(inode);
- ret = do_nova_cow_file_write(filp, buf, len, ppos);
+ if (mapping_mapped(mapping))
+ ret = do_nova_inplace_file_write(filp, buf, len, ppos);
+ else
+ ret = do_nova_cow_file_write(filp, buf, len, ppos);
inode_unlock(inode);
sb_end_write(inode->i_sb);
@@ -460,7 +463,10 @@ ssize_t nova_cow_file_write(struct file *filp,
static ssize_t nova_dax_file_write(struct file *filp, const char __user *buf,
	size_t len, loff_t *ppos)
{
-	return nova_cow_file_write(filp, buf, len, ppos);
+	/* Select the write path at runtime: in-place updates when the
+	 * inplace_data_updates option is enabled, copy-on-write otherwise.
+	 */
+	if (inplace_data_updates)
+		return nova_inplace_file_write(filp, buf, len, ppos);
+	else
+		return nova_cow_file_write(filp, buf, len, ppos);
}
@@ -477,6 +477,10 @@ void nova_init_file_write_item(struct super_block *sb,
struct nova_inode_info_header *sih, struct nova_file_write_item *item,
u64 epoch_id, u64 pgoff, int num_pages, u64 blocknr, u32 time,
u64 file_size);
+ssize_t nova_inplace_file_write(struct file *filp, const char __user *buf,
+ size_t len, loff_t *ppos);
+ssize_t do_nova_inplace_file_write(struct file *filp, const char __user *buf,
+ size_t len, loff_t *ppos);
/* dir.c */
extern const struct file_operations nova_dir_operations;