From patchwork Fri May 21 17:03:41 2010 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Josef Bacik X-Patchwork-Id: 101526 Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by demeter.kernel.org (8.14.3/8.14.3) with ESMTP id o4LH981P031649 for ; Fri, 21 May 2010 17:09:08 GMT Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S934422Ab0EURGI (ORCPT ); Fri, 21 May 2010 13:06:08 -0400 Received: from mx1.redhat.com ([209.132.183.28]:25196 "EHLO mx1.redhat.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S934067Ab0EURGF (ORCPT ); Fri, 21 May 2010 13:06:05 -0400 Received: from int-mx03.intmail.prod.int.phx2.redhat.com (int-mx03.intmail.prod.int.phx2.redhat.com [10.5.11.16]) by mx1.redhat.com (8.13.8/8.13.8) with ESMTP id o4LH5pY8001180 (version=TLSv1/SSLv3 cipher=DHE-RSA-AES256-SHA bits=256 verify=OK); Fri, 21 May 2010 13:05:51 -0400 Received: from localhost.localdomain (test1244.test.redhat.com [10.10.10.244]) by int-mx03.intmail.prod.int.phx2.redhat.com (8.13.8/8.13.8) with ESMTP id o4LH5luC001618; Fri, 21 May 2010 13:05:51 -0400 From: Josef Bacik To: linux-btrfs@vger.kernel.org, linux-fsdevel@vger.kernel.org, linux-kernel@vger.kernel.org, hch@infradead.org, akpm@linux-foundation.org Subject: [PATCH 5/6] Btrfs: add basic DIO read/write support Date: Fri, 21 May 2010 13:03:41 -0400 Message-Id: <1274461422-18433-5-git-send-email-josef@redhat.com> In-Reply-To: <1274461422-18433-1-git-send-email-josef@redhat.com> References: <1274461422-18433-1-git-send-email-josef@redhat.com> X-Scanned-By: MIMEDefang 2.67 on 10.5.11.16 Sender: linux-btrfs-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: linux-btrfs@vger.kernel.org X-Greylist: IP, sender and recipient auto-whitelisted, not delayed by milter-greylist-4.2.3 (demeter.kernel.org [140.211.167.41]); Fri, 21 May 2010 17:09:08 +0000 (UTC) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 746a724..36994e8 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2257,6 +2257,8 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 len); int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, struct bio *bio, u32 *dst); +int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode, + struct bio *bio, u64 logical_offset, u32 *dst); int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 objectid, u64 pos, diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 54a2550..34ea718 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -149,13 +149,14 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, } -int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, - struct bio *bio, u32 *dst) +static int __btrfs_lookup_bio_sums(struct btrfs_root *root, + struct inode *inode, struct bio *bio, + u64 logical_offset, u32 *dst, int dio) { u32 sum; struct bio_vec *bvec = bio->bi_io_vec; int bio_index = 0; - u64 offset; + u64 offset = 0; u64 item_start_offset = 0; u64 item_last_offset = 0; u64 disk_bytenr; @@ -174,8 +175,11 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, WARN_ON(bio->bi_vcnt <= 0); disk_bytenr = (u64)bio->bi_sector << 9; + if (dio) + offset = logical_offset; while (bio_index < bio->bi_vcnt) { - offset = page_offset(bvec->bv_page) + bvec->bv_offset; + if (!dio) + offset = page_offset(bvec->bv_page) + bvec->bv_offset; ret = btrfs_find_ordered_sum(inode, offset, disk_bytenr, &sum); if (ret == 0) goto found; @@ -238,6 +242,7 @@ found: else set_state_private(io_tree, offset, sum); disk_bytenr += bvec->bv_len; + offset += bvec->bv_len; bio_index++; bvec++; } @@ -245,6 +250,18 @@ found: return 0; } +int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, + struct bio *bio, u32 *dst) +{ + return __btrfs_lookup_bio_sums(root, inode, bio, 0, dst, 0); +} + +int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode, + struct bio *bio, u64 offset, u32 *dst) +{ + return __btrfs_lookup_bio_sums(root, inode, bio, offset, dst, 1); +} + int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct list_head *list) { diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 29ff749..dace07b 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -823,6 +823,47 @@ again: return 0; } +/* Copied from read-write.c */ +static void wait_on_retry_sync_kiocb(struct kiocb *iocb) +{ + set_current_state(TASK_UNINTERRUPTIBLE); + if (!kiocbIsKicked(iocb)) + schedule(); + else + kiocbClearKicked(iocb); + __set_current_state(TASK_RUNNING); +} + +/* + * Just a copy of what do_sync_write does. + */ +static ssize_t __btrfs_direct_write(struct file *file, const char __user *buf, + size_t count, loff_t pos, loff_t *ppos) +{ + struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = count }; + unsigned long nr_segs = 1; + struct kiocb kiocb; + ssize_t ret; + + init_sync_kiocb(&kiocb, file); + kiocb.ki_pos = pos; + kiocb.ki_left = count; + kiocb.ki_nbytes = count; + + while (1) { + ret = generic_file_direct_write(&kiocb, &iov, &nr_segs, pos, + ppos, count, count); + if (ret != -EIOCBRETRY) + break; + wait_on_retry_sync_kiocb(&kiocb); + } + + if (ret == -EIOCBQUEUED) + ret = wait_on_sync_kiocb(&kiocb); + *ppos = kiocb.ki_pos; + return ret; +} + static ssize_t btrfs_file_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { @@ -839,12 +880,11 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, unsigned long first_index; unsigned long last_index; int will_write; + int buffered = 0; will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) || (file->f_flags & O_DIRECT)); - nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE, - PAGE_CACHE_SIZE / (sizeof(struct page *))); pinned[0] = NULL; pinned[1] = NULL; @@ -875,13 +915,34 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, goto out; file_update_time(file); + BTRFS_I(inode)->sequence++; + + if (unlikely(file->f_flags & O_DIRECT)) { + num_written = __btrfs_direct_write(file, buf, count, pos, + ppos); + pos += num_written; + count -= num_written; + + /* We've written everything we wanted to, exit */ + if (num_written < 0 || !count) + goto out; + /* + * We are going to do buffered for the rest of the range, so we + * need to make sure to invalidate the buffered pages when we're + * done. + */ + buffered = 1; + buf += num_written; + } + + nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE, + PAGE_CACHE_SIZE / (sizeof(struct page *))); pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); /* generic_write_checks can change our pos */ start_pos = pos; - BTRFS_I(inode)->sequence++; first_index = pos >> PAGE_CACHE_SHIFT; last_index = (pos + count) >> PAGE_CACHE_SHIFT; @@ -1023,7 +1084,7 @@ out_nolock: btrfs_end_transaction(trans, root); } } - if (file->f_flags & O_DIRECT) { + if (file->f_flags & O_DIRECT && buffered) { invalidate_mapping_pages(inode->i_mapping, start_pos >> PAGE_CACHE_SHIFT, (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 2bfdc64..89a9455 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -697,6 +697,38 @@ retry: return 0; } +static u64 get_extent_allocation_hint(struct inode *inode, u64 start, + u64 num_bytes) +{ + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_map *em; + u64 alloc_hint = 0; + + read_lock(&em_tree->lock); + em = search_extent_mapping(em_tree, start, num_bytes); + if (em) { + /* + * if block start isn't an actual block number then find the + * first block in this inode and use that as a hint. If that + * block is also bogus then just don't worry about it. + */ + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + free_extent_map(em); + em = search_extent_mapping(em_tree, 0, 0); + if (em && em->block_start < EXTENT_MAP_LAST_BYTE) + alloc_hint = em->block_start; + if (em) + free_extent_map(em); + } else { + alloc_hint = em->block_start; + free_extent_map(em); + } + } + read_unlock(&em_tree->lock); + + return alloc_hint; +} + /* * when extent_io.c finds a delayed allocation range in the file, * the call backs end up in this code. The basic idea is to @@ -769,29 +801,7 @@ static noinline int cow_file_range(struct inode *inode, BUG_ON(disk_num_bytes > btrfs_super_total_bytes(&root->fs_info->super_copy)); - - read_lock(&BTRFS_I(inode)->extent_tree.lock); - em = search_extent_mapping(&BTRFS_I(inode)->extent_tree, - start, num_bytes); - if (em) { - /* - * if block start isn't an actual block number then find the - * first block in this inode and use that as a hint. If that - * block is also bogus then just don't worry about it. - */ - if (em->block_start >= EXTENT_MAP_LAST_BYTE) { - free_extent_map(em); - em = search_extent_mapping(em_tree, 0, 0); - if (em && em->block_start < EXTENT_MAP_LAST_BYTE) - alloc_hint = em->block_start; - if (em) - free_extent_map(em); - } else { - alloc_hint = em->block_start; - free_extent_map(em); - } - } - read_unlock(&BTRFS_I(inode)->extent_tree.lock); + alloc_hint = get_extent_allocation_hint(inode, start, num_bytes); btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); while (disk_num_bytes > 0) { @@ -4875,11 +4885,439 @@ out: return em; } +static struct extent_map *btrfs_new_extent_direct(struct inode *inode, + u64 start, u64 len) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + struct extent_map *em; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct btrfs_key ins; + u64 alloc_hint; + int ret; + + btrfs_drop_extent_cache(inode, start, start + len - 1, 0); + + trans = btrfs_start_transaction(root, 1); + if (!trans) + return ERR_PTR(-ENOMEM); + + alloc_hint = get_extent_allocation_hint(inode, start, len); + ret = btrfs_reserve_extent(trans, root, len, root->sectorsize, 0, + alloc_hint, (u64)-1, &ins, 1); + if (ret) { + em = ERR_PTR(ret); + goto out; + } + + em = alloc_extent_map(GFP_NOFS); + if (!em) { + em = ERR_PTR(-ENOMEM); + goto out; + } + + em->start = start; + em->orig_start = em->start; + em->len = ins.offset; + + em->block_start = ins.objectid; + em->block_len = ins.offset; + em->bdev = root->fs_info->fs_devices->latest_bdev; + set_bit(EXTENT_FLAG_PINNED, &em->flags); + + while (1) { + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + write_unlock(&em_tree->lock); + if (ret != -EEXIST) + break; + btrfs_drop_extent_cache(inode, start, start + em->len - 1, 0); + } + + ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid, + ins.offset, ins.offset, 0); + if (ret) { + btrfs_free_reserved_extent(root, ins.objectid, ins.offset); + em = ERR_PTR(ret); + } +out: + btrfs_end_transaction(trans, root); + return em; +} + +static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + struct extent_map *em; + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 start = iblock << inode->i_blkbits; + u64 len = bh_result->b_size; + + em = btrfs_get_extent(inode, NULL, 0, start, len, 0); + if (IS_ERR(em)) + return PTR_ERR(em); + + /* + * Ok for INLINE and COMPRESSED extents we need to fallback on buffered + * io. INLINE is special, and we could probably kludge it in here, but + * it's still buffered so for safety lets just fall back to the generic + * buffered path. + * + * For COMPRESSED we _have_ to read the entire extent in so we can + * decompress it, so there will be buffering required no matter what we + * do, so go ahead and fallback to buffered. + * + * We return -ENOTBLK because thats what makes DIO go ahead and go back + * to buffered IO. Don't blame me, this is the price we pay for using + * the generic code. + */ + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) || + em->block_start == EXTENT_MAP_INLINE) { + free_extent_map(em); + return -ENOTBLK; + } + + /* Just a good old fashioned hole, return */ + if (!create && (em->block_start == EXTENT_MAP_HOLE || + test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { + free_extent_map(em); + /* DIO will do one hole at a time, so just unlock a sector */ + unlock_extent(&BTRFS_I(inode)->io_tree, start, + start + root->sectorsize - 1, GFP_NOFS); + return 0; + } + + /* + * We don't allocate a new extent in the following cases + * + * 1) The inode is marked as NODATACOW. In this case we'll just use the + * existing extent. + * 2) The extent is marked as PREALLOC. We're good to go here and can + * just use the extent. + * + */ + if (!create) + goto map; + + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || + ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && + em->block_start != EXTENT_MAP_HOLE)) { + u64 block_start; + int type; + int ret; + + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + type = BTRFS_ORDERED_PREALLOC; + else + type = BTRFS_ORDERED_NOCOW; + len = min(len, em->block_len - (start - em->start)); + block_start = em->block_start + (start - em->start); + ret = btrfs_add_ordered_extent_dio(inode, start, + start, len, len, type); + if (ret) { + free_extent_map(em); + return ret; + } + } else { + free_extent_map(em); + em = btrfs_new_extent_direct(inode, start, len); + if (IS_ERR(em)) + return PTR_ERR(em); + len = min(len, em->block_len); + } + /* We always just use one extent worth of information */ + spin_lock(&BTRFS_I(inode)->accounting_lock); + BTRFS_I(inode)->outstanding_extents++; + spin_unlock(&BTRFS_I(inode)->accounting_lock); + unlock_extent(&BTRFS_I(inode)->io_tree, start, start + len - 1, + GFP_NOFS); +map: + bh_result->b_blocknr = (em->block_start + (start - em->start)) >> + inode->i_blkbits; + bh_result->b_size = em->len - (start - em->start); + bh_result->b_bdev = em->bdev; + set_buffer_mapped(bh_result); + if (create && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + set_buffer_new(bh_result); + + free_extent_map(em); + + return 0; +} + +struct btrfs_dio_private { + struct inode *inode; + u64 logical_offset; + u64 disk_bytenr; + u64 bytes; + u32 *csums; + void *private; +}; + +static void btrfs_endio_direct_read(struct bio *bio, int err) +{ + struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1; + struct bio_vec *bvec = bio->bi_io_vec; + struct btrfs_dio_private *dip = bio->bi_private; + struct inode *inode = dip->inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 start; + u32 *private = dip->csums; + + start = dip->logical_offset; + do { + if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { + struct page *page = bvec->bv_page; + char *kaddr; + u32 csum = ~(u32)0; + unsigned long flags; + + local_irq_save(flags); + kaddr = kmap_atomic(page, KM_IRQ0); + csum = btrfs_csum_data(root, kaddr + bvec->bv_offset, + csum, bvec->bv_len); + btrfs_csum_final(csum, (char *)&csum); + kunmap_atomic(kaddr, KM_IRQ0); + local_irq_restore(flags); + + flush_dcache_page(bvec->bv_page); + if (csum != *private) { + printk(KERN_ERR "btrfs csum failed ino %lu off" + " %llu csum %u private %u\n", + inode->i_ino, (unsigned long long)start, + csum, *private); + err = -EIO; + } + } + + start += bvec->bv_len; + private++; + bvec++; + } while (bvec <= bvec_end); + + unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset, + dip->logical_offset + dip->bytes - 1, GFP_NOFS); + bio->bi_private = dip->private; + + kfree(dip->csums); + kfree(dip); + dio_end_io(bio, err); +} + +static void btrfs_endio_direct_write(struct bio *bio, int err) +{ + struct btrfs_dio_private *dip = bio->bi_private; + struct inode *inode = dip->inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + struct btrfs_ordered_extent *ordered = NULL; + struct extent_state *cached_state = NULL; + int ret; + + if (err) + goto out_done; + + ret = btrfs_dec_test_ordered_pending(inode, &ordered, + dip->logical_offset, dip->bytes); + if (!ret) + goto out_done; + BUG_ON(!ordered); + + trans = btrfs_join_transaction(root, 1); + if (!trans) { + err = -ENOMEM; + goto out; + } + + if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) { + ret = btrfs_ordered_update_i_size(inode, 0, ordered); + if (!ret) + ret = btrfs_update_inode(trans, root, inode); + err = ret; + goto out; + } + + lock_extent_bits(&BTRFS_I(inode)->io_tree, ordered->file_offset, + ordered->file_offset + ordered->len - 1, 0, + &cached_state, GFP_NOFS); + + if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) { + ret = btrfs_mark_extent_written(trans, inode, + ordered->file_offset, + ordered->file_offset + + ordered->len); + if (ret) { + err = ret; + goto out_unlock; + } + } else { + ret = insert_reserved_file_extent(trans, inode, + ordered->file_offset, + ordered->start, + ordered->disk_len, + ordered->len, + ordered->len, + 0, 0, 0, + BTRFS_FILE_EXTENT_REG); + unpin_extent_cache(&BTRFS_I(inode)->extent_tree, + ordered->file_offset, ordered->len); + if (ret) { + err = ret; + WARN_ON(1); + goto out_unlock; + } + } + + add_pending_csums(trans, inode, ordered->file_offset, &ordered->list); + btrfs_ordered_update_i_size(inode, 0, ordered); + btrfs_update_inode(trans, root, inode); +out_unlock: + unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset, + ordered->file_offset + ordered->len - 1, + &cached_state, GFP_NOFS); +out: + btrfs_end_transaction(trans, root); + btrfs_put_ordered_extent(ordered); + btrfs_put_ordered_extent(ordered); +out_done: + bio->bi_private = dip->private; + + kfree(dip->csums); + kfree(dip); + dio_end_io(bio, err); +} + +static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode, + loff_t file_offset) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_dio_private *dip; + struct bio_vec *bvec = bio->bi_io_vec; + u64 start; + int skip_sum; + int write = rw & (1 << BIO_RW); + int ret = 0; + + skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; + + dip = kmalloc(sizeof(*dip), GFP_NOFS); + if (!dip) { + ret = -ENOMEM; + goto free_ordered; + } + dip->csums = NULL; + + if (!skip_sum) { + dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS); + if (!dip->csums) { + ret = -ENOMEM; + goto free_ordered; + } + } + + dip->private = bio->bi_private; + dip->inode = inode; + dip->logical_offset = file_offset; + + start = dip->logical_offset; + dip->bytes = 0; + do { + dip->bytes += bvec->bv_len; + bvec++; + } while (bvec <= (bio->bi_io_vec + bio->bi_vcnt - 1)); + + dip->disk_bytenr = bio->bi_sector << 9; + bio->bi_private = dip; + + if (write) + bio->bi_end_io = btrfs_endio_direct_write; + else + bio->bi_end_io = btrfs_endio_direct_read; + + ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); + if (ret) + goto out_err; + + if (write && !skip_sum) + btrfs_csum_one_bio(root, inode, bio, dip->logical_offset, 1); + else if (!skip_sum) + btrfs_lookup_bio_sums_dio(root, inode, bio, + dip->logical_offset, dip->csums); + + ret = btrfs_map_bio(root, rw, bio, 0, 0); + if (ret) + goto out_err; + return; +out_err: + kfree(dip->csums); + kfree(dip); +free_ordered: + /* + * If this is a write, we need to clean up the reserved space and kill + * the ordered extent. + */ + if (write) { + struct btrfs_ordered_extent *ordered; + ordered = btrfs_lookup_ordered_extent(inode, + dip->logical_offset); + if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) && + !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) + btrfs_free_reserved_extent(root, ordered->start, + ordered->disk_len); + btrfs_put_ordered_extent(ordered); + btrfs_put_ordered_extent(ordered); + } + bio_endio(bio, ret); +} + static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs) { - return -EINVAL; + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + struct btrfs_ordered_extent *ordered; + u64 lockstart, lockend; + ssize_t ret; + + lockstart = offset; + lockend = offset + iov_length(iov, nr_segs) - 1; + while (1) { + lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, + GFP_NOFS); + /* + * We're concerned with the entire range that we're going to be + * doing DIO to, so we need to make sure theres no ordered + * extents in this range. + */ + ordered = btrfs_lookup_ordered_range(inode, lockstart, + lockend - lockstart + 1); + if (!ordered) + break; + unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, + GFP_NOFS); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + cond_resched(); + } + + ret = __blockdev_direct_IO(rw, iocb, inode, NULL, iov, offset, nr_segs, + btrfs_get_blocks_direct, NULL, + btrfs_submit_direct, 0); + + if (ret < 0 && ret != -EIOCBQUEUED) { + unlock_extent(&BTRFS_I(inode)->io_tree, offset, + offset + iov_length(iov, nr_segs) - 1, GFP_NOFS); + } else if (ret >= 0 && ret < iov_length(iov, nr_segs)) { + /* + * We're falling back to buffered, unlock the section we didn't + * do IO on. + */ + unlock_extent(&BTRFS_I(inode)->io_tree, offset + ret, + offset + iov_length(iov, nr_segs) - 1, GFP_NOFS); + } + + return ret; } static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index a127c0e..7a06e97 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -124,6 +124,15 @@ static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset) return 1; } +static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset, + u64 len) +{ + if (file_offset + len <= entry->file_offset || + entry->file_offset + entry->len <= file_offset) + return 0; + return 1; +} + /* * look find the first ordered struct that has this offset, otherwise * the first one less than this offset @@ -161,8 +170,9 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, * The tree is given a single reference on the ordered extent that was * inserted. */ -int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, - u64 start, u64 len, u64 disk_len, int type) +static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, + u64 start, u64 len, u64 disk_len, + int type, int dio) { struct btrfs_ordered_inode_tree *tree; struct rb_node *node; @@ -182,6 +192,9 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE) set_bit(type, &entry->flags); + if (dio) + set_bit(BTRFS_ORDERED_DIRECT, &entry->flags); + /* one ref for the tree */ atomic_set(&entry->refs, 1); init_waitqueue_head(&entry->wait); @@ -203,6 +216,20 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, return 0; } +int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, + u64 start, u64 len, u64 disk_len, int type) +{ + return __btrfs_add_ordered_extent(inode, file_offset, start, len, + disk_len, type, 0); +} + +int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, + u64 start, u64 len, u64 disk_len, int type) +{ + return __btrfs_add_ordered_extent(inode, file_offset, start, len, + disk_len, type, 1); +} + /* * Add a struct btrfs_ordered_sum into the list of checksums to be inserted * when an ordered extent is finished. If the list covers more than one @@ -491,7 +518,8 @@ void btrfs_start_ordered_extent(struct inode *inode, * start IO on any dirty ones so the wait doesn't stall waiting * for pdflush to find them */ - filemap_fdatawrite_range(inode->i_mapping, start, end); + if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags)) + filemap_fdatawrite_range(inode->i_mapping, start, end); if (wait) { wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, &entry->flags)); @@ -588,6 +616,47 @@ out: return entry; } +/* Since the DIO code tries to lock a wide area we need to look for any ordered + * extents that exist in the range, rather than just the start of the range. + */ +struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode, + u64 file_offset, + u64 len) +{ + struct btrfs_ordered_inode_tree *tree; + struct rb_node *node; + struct btrfs_ordered_extent *entry = NULL; + + tree = &BTRFS_I(inode)->ordered_tree; + spin_lock(&tree->lock); + node = tree_search(tree, file_offset); + if (!node) { + node = tree_search(tree, file_offset + len); + if (!node) + goto out; + } + + while (1) { + entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); + if (range_overlaps(entry, file_offset, len)) + break; + + if (entry->file_offset >= file_offset + len) { + entry = NULL; + break; + } + entry = NULL; + node = rb_next(node); + if (!node) + break; + } +out: + if (entry) + atomic_inc(&entry->refs); + spin_unlock(&tree->lock); + return entry; +} + /* * lookup and return any extent before 'file_offset'. NULL is returned * if none is found diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index c82f76a..8ac3654 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -72,6 +72,8 @@ struct btrfs_ordered_sum { #define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */ +#define BTRFS_ORDERED_DIRECT 5 /* set when we're doing DIO with this extent */ + struct btrfs_ordered_extent { /* logical offset in the file */ u64 file_offset; @@ -140,7 +142,9 @@ int btrfs_dec_test_ordered_pending(struct inode *inode, struct btrfs_ordered_extent **cached, u64 file_offset, u64 io_size); int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, - u64 start, u64 len, u64 disk_len, int tyep); + u64 start, u64 len, u64 disk_len, int type); +int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, + u64 start, u64 len, u64 disk_len, int type); int btrfs_add_ordered_sum(struct inode *inode, struct btrfs_ordered_extent *entry, struct btrfs_ordered_sum *sum); @@ -151,6 +155,9 @@ void btrfs_start_ordered_extent(struct inode *inode, int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len); struct btrfs_ordered_extent * btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset); +struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode, + u64 file_offset, + u64 len); int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, struct btrfs_ordered_extent *ordered); int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);