@@ -435,14 +435,81 @@ static void btrfs_dio_write(struct btrfs_diocb *diocb)
{
}
+/* verify that we have locked everything we need to do the read and
+ * have pushed the ordered data into the btree so the extent is valid
+ */
+static void btrfs_dio_safe_to_read(struct btrfs_diocb *diocb,
+	struct extent_map *em, u64 *lockend,
+	u64 *data_len, int *safe_to_read)
+{
+	struct extent_io_tree *io_tree = &BTRFS_I(diocb->inode)->io_tree;
+	struct btrfs_ordered_extent *ordered;
+	u64 stop;
+
+	/* must ensure the whole compressed extent is valid on each loop
+	 * as we don't know the final extent size until we look it up.
+	 * the extent's last byte is em->start + em->len - 1, so only
+	 * relock (expanding, never shrinking) when the current range
+	 * does not already cover it, or we would retry forever.
+	 */
+	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) &&
+	    (diocb->lockstart > em->start ||
+	     *lockend < em->start + em->len - 1)) {
+		unlock_extent(io_tree, diocb->lockstart, *lockend, GFP_NOFS);
+		diocb->lockstart = em->start;
+		*lockend = max(*lockend, em->start + em->len - 1);
+		*safe_to_read = 0;
+		return;
+	}
+
+	/* one test on first loop covers all extents if no concurrent writes */
+	if (*safe_to_read)
+		return;
+
+	ordered = btrfs_lookup_first_ordered_extent(diocb->inode,
+		diocb->lockstart, *lockend + 1 - diocb->lockstart);
+	if (!ordered) {
+		*safe_to_read = 1;
+		return;
+	}
+
+	/* we checked everything to lockend which might cover multiple extents
+	 * in the hope that we could do the whole read with one locking. that
+	 * won't happen now, but we can read the first extent (or part of it
+	 * for uncompressed data) if what we need is before this ordered data.
+	 * we must have the whole extent valid to read any compressed data,
+	 * while we can read a single block of valid uncompressed data.
+	 */
+	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
+		stop = em->start + em->len;
+	else
+		stop = diocb->lockstart +
+			BTRFS_I(diocb->inode)->root->sectorsize;
+
+	if (ordered->file_offset < stop) {
+		/* ordered data overlaps what we must read next: drop the
+		 * lock, wait for it to flush, then retry from getlock
+		 */
+		unlock_extent(io_tree, diocb->lockstart, *lockend, GFP_NOFS);
+		btrfs_start_ordered_extent(diocb->inode, ordered, 1);
+		btrfs_put_ordered_extent(ordered);
+		*safe_to_read = 0;
+		return;
+	}
+
+	/* do the part of the data that is valid to read now with the
+	 * remainder unlocked so that ordered data can flush in parallel
+	 */
+	unlock_extent(io_tree, ordered->file_offset, *lockend, GFP_NOFS);
+	*lockend = ordered->file_offset - 1;
+	*data_len = ordered->file_offset - diocb->start;
+	btrfs_put_ordered_extent(ordered);
+
+	*safe_to_read = 1;
+	return;
+}
+
static void btrfs_dio_read(struct btrfs_diocb *diocb)
{
struct extent_io_tree *io_tree = &BTRFS_I(diocb->inode)->io_tree;
u64 end = diocb->terminate; /* copy because reaper changes it */
u64 lockend;
u64 data_len;
+ int safe_to_read;
int err = 0;
- int loop = 0;
u32 blocksize = BTRFS_I(diocb->inode)->root->sectorsize;
/* expand lock region to include what we read to validate checksum */
@@ -450,42 +517,25 @@ static void btrfs_dio_read(struct btrfs_diocb *diocb)
lockend = ALIGN(end, blocksize) - 1;
getlock:
- mutex_lock(&diocb->inode->i_mutex);
+ /* writeout everything we read for checksum or compressed extents */
+ filemap_write_and_wait_range(diocb->inode->i_mapping,
+ diocb->lockstart, lockend);
+ lock_extent(io_tree, diocb->lockstart, lockend, GFP_NOFS);
- /* ensure writeout and btree update on everything
- * we might read for checksum or compressed extents
- */
- data_len = lockend + 1 - diocb->lockstart;
- err = btrfs_wait_ordered_range(diocb->inode,
- diocb->lockstart, data_len);
- if (err) {
- diocb->error = err;
- mutex_unlock(&diocb->inode->i_mutex);
- return;
- }
- data_len = i_size_read(diocb->inode);
- if (data_len < end)
- end = data_len;
- if (end <= diocb->start) {
- mutex_unlock(&diocb->inode->i_mutex);
- return; /* 0 is returned past EOF */
- }
- if (!loop) {
- loop++;
- diocb->terminate = end;
- lockend = ALIGN(end, blocksize) - 1;
+ data_len = min_t(u64, end, i_size_read(diocb->inode));
+ if (data_len <= diocb->start) {
+ /* whatever we finished (or 0) is returned past EOF */
+ goto fail;
}
+ data_len -= diocb->start;
- lock_extent(io_tree, diocb->lockstart, lockend, GFP_NOFS);
- mutex_unlock(&diocb->inode->i_mutex);
-
- data_len = end - diocb->start;
+ safe_to_read = 0;
while (data_len && !diocb->error) { /* error in reaper stops submit */
struct extent_map *em;
- u64 len = data_len;
+ u64 len;
em = btrfs_get_extent(diocb->inode, NULL, 0,
- diocb->start, len, 0);
+ diocb->start, data_len, 0);
if (IS_ERR(em)) {
err = PTR_ERR(em);
printk(KERN_ERR
@@ -496,6 +546,18 @@ getlock:
goto fail;
}
+ /* verify extent was locked and ordered data was flushed,
+ * may change data_len and lockend whether true or false.
+ */
+ btrfs_dio_safe_to_read(diocb, em, &lockend, &data_len,
+ &safe_to_read);
+ if (!safe_to_read) {
+ free_extent_map(em);
+ goto getlock;
+ }
+
+ len = data_len;
+
/* problem flushing ordered data with btree not updated */
if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) {
printk(KERN_ERR
@@ -520,25 +582,12 @@ getlock:
} else {
len = min(len, em->len - (diocb->start - em->start));
if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
- em->block_start == EXTENT_MAP_HOLE) {
+ em->block_start == EXTENT_MAP_HOLE)
err = btrfs_dio_hole_read(diocb, len);
- } else if (test_bit(EXTENT_FLAG_COMPRESSED,
- &em->flags)) {
- if (diocb->lockstart > em->start ||
- lockend < em->start + em->len - 1) {
- /* lock everything we read to inflate */
- unlock_extent(io_tree, diocb->lockstart,
- lockend, GFP_NOFS);
- diocb->lockstart = em->start;
- lockend = max(lockend,
- em->start + em->len - 1);
- free_extent_map(em);
- goto getlock;
- }
+ else if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
err = btrfs_dio_compressed_read(diocb, em, len);
- } else {
+ else
err = btrfs_dio_extent_read(diocb, em, len);
- }
}
free_extent_map(em);
@@ -547,6 +596,15 @@ getlock:
goto fail;
cond_resched();
}
+
+ /* we might have shortened data_len because of uncommitted
+ * ordered data, we want to try again to read the remainder
+ */
+ if (diocb->start < end && !err && !diocb->error) {
+ lockend = ALIGN(end, blocksize) - 1;
+ goto getlock;
+ }
+
fail:
if (err)
diocb->error = err;