From patchwork Fri Mar 5 19:42:25 2010 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: jim owens X-Patchwork-Id: 83829 Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by demeter.kernel.org (8.14.3/8.14.3) with ESMTP id o25JgVd4024110 for ; Fri, 5 Mar 2010 19:42:31 GMT Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1755522Ab0CETma (ORCPT ); Fri, 5 Mar 2010 14:42:30 -0500 Received: from mail-fx0-f219.google.com ([209.85.220.219]:59674 "EHLO mail-fx0-f219.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1755511Ab0CETm3 (ORCPT ); Fri, 5 Mar 2010 14:42:29 -0500 Received: by mail-fx0-f219.google.com with SMTP id 19so4515087fxm.21 for ; Fri, 05 Mar 2010 11:42:28 -0800 (PST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=gamma; h=domainkey-signature:received:received:message-id:date:from :user-agent:mime-version:to:subject:content-type :content-transfer-encoding; bh=DGpppgzQS/ECErM2H6uMnmN+F0OPttORf8a4ut4sorg=; b=XHv4qlesv9jUH4bCrfvecz2LlKxxJ4rXY7C6/luJDmkn80fOfF8tUQreo8D2o4oLKv v/4kee7l0XSi2ORAxpXh0ZRaGYzJz7jmA8IAevD4K2LeChDg1XQiODFXEZE/Y6JJD9Lb VyV5MQYvivxmE++KvuM/zF1V6p568vWt/uAes= DomainKey-Signature: a=rsa-sha1; c=nofws; d=gmail.com; s=gamma; h=message-id:date:from:user-agent:mime-version:to:subject :content-type:content-transfer-encoding; b=OFY/mDUn/IeEBKVzjkdYMQ9MG+9d50ZT3m9PxWms5urE8eTjAAX+fKKmsGKSPjW6FZ 2EPMwspukNK9u0CXZOIIiL/Eeg69BTjScfiIvJZgw7qLZOWRxXOJdcuUExmSvlUZzVkB nq7ffaTUr8UAdqh55iWX/500JVadOl3eHnu9k= Received: by 10.87.62.28 with SMTP id p28mr1839247fgk.55.1267818148732; Fri, 05 Mar 2010 11:42:28 -0800 (PST) Received: from [192.168.0.99] (c-24-147-40-65.hsd1.nh.comcast.net [24.147.40.65]) by mx.google.com with ESMTPS id 15sm1456002fxm.12.2010.03.05.11.42.26 (version=TLSv1/SSLv3 cipher=RC4-MD5); Fri, 05 Mar 2010 11:42:27 -0800 (PST) Message-ID: <4B915EA1.1030900@gmail.com> Date: Fri, 05 Mar 2010 14:42:25 -0500 From: jim owens User-Agent: Thunderbird 2.0.0.23 (X11/20090817) MIME-Version: 1.0 To: linux-btrfs , Chris Mason , Josef Bacik Subject: [PATCH 2/2] Btrfs: change dio.c to use dio_min_blocksize instead of 512. Sender: linux-btrfs-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: linux-btrfs@vger.kernel.org X-Greylist: IP, sender and recipient auto-whitelisted, not delayed by milter-greylist-4.2.3 (demeter.kernel.org [140.211.167.41]); Fri, 05 Mar 2010 19:42:32 +0000 (UTC) diff --git a/fs/btrfs/dio.c b/fs/btrfs/dio.c index b1beafc..b76b227 100644 --- a/fs/btrfs/dio.c +++ b/fs/btrfs/dio.c @@ -134,6 +134,7 @@ struct btrfs_diocb { struct workspace *workspace; char *csum_buf; + u32 alignment; int rw; int error; int sleeping; @@ -160,12 +161,10 @@ static void btrfs_dio_write(struct btrfs_diocb *diocb); static void btrfs_dio_read(struct btrfs_diocb *diocb); static int btrfs_dio_new_extcb(struct btrfs_dio_extcb **alloc_extcb, struct btrfs_diocb *diocb, struct extent_map *em); -static void btrfs_dio_eof_tail(u32 *filetail, int eof, - struct btrfs_diocb *diocb); static int btrfs_dio_compressed_read(struct btrfs_diocb *diocb, struct extent_map *lem, u64 data_len); static int btrfs_dio_extent_read(struct btrfs_diocb *diocb, - struct extent_map *lem, u64 data_len, int eof); + struct extent_map *lem, u64 data_len); static void btfrs_dio_unplug(struct btrfs_dio_extcb *extcb); static int btrfs_dio_read_stripes(struct btrfs_dio_extcb *extcb, u64 *rd_start, u64 *rd_len, int temp_pages); @@ -180,8 +179,6 @@ static int btrfs_dio_inline_next_in(struct bio_vec *ivec, struct btrfs_inflate *icb); static int btrfs_dio_get_user_bvec(struct bio_vec *uv, struct btrfs_dio_user_mem_control *umc); -static int btrfs_dio_not_aligned(unsigned long iomask, u32 testlen, - struct btrfs_dio_user_mem_control *umc); static void btrfs_dio_put_user_bvec(struct bio_vec *uv, struct btrfs_dio_user_mem_control *umc); static void btrfs_dio_release_unused_pages( @@ -221,29 +218,33 @@ ssize_t btrfs_direct_IO(int rw, struct kiocb *kiocb, ssize_t done = 0; struct btrfs_diocb *diocb; struct inode *inode = kiocb->ki_filp->f_mapping->host; + u32 alignment = BTRFS_I(inode)->root->sectorsize; - /* traditional 512-byte device sector alignment is the - * minimum required. if they have a larger sector disk - * (possibly multiple sizes in the filesystem) and need - * a larger alignment for this I/O, we just fail later. - */ - if (offset & 511) - return -EINVAL; - - /* check memory alignment, blocks cannot straddle pages. + /* check memory alignment, device blocks cannot straddle pages + * because special hardware (e.g. iommu) is needed for split dma. * allow 0-length vectors which are questionable but seem legal. + * limit I/O to smaller of request size or available memory. */ - for (seg = 0; seg < nr_segs; seg++) { - if (iov[seg].iov_len && - ((unsigned long)iov[seg].iov_base & 511)) - return -EINVAL; - if (iov[seg].iov_len & 511) - return -EINVAL; - done += iov[seg].iov_len; - } + alignment |= offset; + for (seg = 0; seg < nr_segs && done < kiocb->ki_left; seg++) + if (iov[seg].iov_len) { + /* alignment only needed through size of I/O */ + done += iov[seg].iov_len; + done = min_t(ssize_t, done, kiocb->ki_left); + alignment |= done | (unsigned long)iov[seg].iov_base; + } - /* limit request size to available memory */ - done = min_t(ssize_t, done, kiocb->ki_left); + /* minimum alignment is smallest logical_block_size of all devices in + * this fs. this check is not enough if there are larger blocksizes + * in the filesystem and we need a larger alignment for this I/O, so + * we retest alignment as we build the bio and fail it at that point. + * aligning here on largest blocksize would be simpler, but it would + * mean applications that were working might fail if the user added a + * larger blocksize device even though none of their file was on it. + */ + if (alignment & + (BTRFS_I(inode)->root->fs_info->fs_devices->dio_min_blocksize - 1)) + return -EINVAL; /* no write code here so fall back to buffered writes */ if (rw == WRITE) @@ -253,6 +254,14 @@ ssize_t btrfs_direct_IO(int rw, struct kiocb *kiocb, if (!diocb) return -ENOMEM; + /* determine minimum user alignment block size across entire I/O + * so we can use it for eof tail handling and testing each device + */ + diocb->alignment = + BTRFS_I(inode)->root->fs_info->fs_devices->dio_min_blocksize; + while (!(alignment & diocb->alignment)) + diocb->alignment *= 2; + diocb->rw = rw; diocb->kiocb = kiocb; diocb->start = offset; @@ -523,8 +532,7 @@ getlock: } err = btrfs_dio_compressed_read(diocb, em, len); } else { - err = btrfs_dio_extent_read(diocb, em, len, - len == data_len); + err = btrfs_dio_extent_read(diocb, em, len); } } @@ -650,28 +658,13 @@ static int btrfs_dio_compressed_read(struct btrfs_diocb *diocb, return err; } -/* for consistent eof processing between inline/compressed/normal - * extents, an unaligned eof gets special treatment, read into temp - * and memcpy to user on completion the part that does not match - * the users I/O alignment (for now always 511) - */ -static void btrfs_dio_eof_tail(u32 *filetail, int eof, - struct btrfs_diocb *diocb) -{ - if (eof) - *filetail &= 511; - else - *filetail = 0; /* aligned direct to user memory */ -} - /* called with a hard-sector bounded file byte data start/len * which covers areas of disk data. it might not... be contiguous, * be on the same device(s), have the same redundancy property. * get the extent map per contiguous chunk and submit bios. */ - static int btrfs_dio_extent_read(struct btrfs_diocb *diocb, - struct extent_map *lem, u64 data_len, int eof) + struct extent_map *lem, u64 data_len) { struct extent_map_tree *em_tree = &BTRFS_I(diocb->inode)-> root->fs_info->mapping_tree.map_tree; @@ -690,9 +683,11 @@ static int btrfs_dio_extent_read(struct btrfs_diocb *diocb, csum_after = blocksize - filetail; } - /* make post-eof consistent between inline/compressed/normal extents */ - if (filetail) - btrfs_dio_eof_tail(&filetail, eof, diocb); + /* to make eof consistent between inline/compressed/normal extents, + * any unaligned bytes at eof get special treatment. those bytes are + * read into a kernel temp page and copied to user memory. + */ + filetail &= diocb->alignment - 1; data_start -= csum_before; data_len += csum_before + csum_after; @@ -781,9 +776,7 @@ static int btrfs_dio_extent_read(struct btrfs_diocb *diocb, filetail; else csum_after = 0; - if (filetail) - btrfs_dio_eof_tail(&filetail, - eof, diocb); + filetail &= diocb->alignment - 1; } extcb->csum_pg2 = extcb->csum_pg1; @@ -811,7 +804,7 @@ static int btrfs_dio_extent_read(struct btrfs_diocb *diocb, */ extcb->csum_pg2 = extcb->csum_pg1; csum_after += filetail; - csum_after = ALIGN(csum_after, 512); /* for no csum */ + csum_after = ALIGN(csum_after, diocb->alignment); err = btrfs_dio_read_stripes(extcb, &data_start, &csum_after, 1); if (err) @@ -867,7 +860,6 @@ static int btrfs_dio_read_stripes(struct btrfs_dio_extcb *extcb, while (*rd_len) { u64 dev_left = *rd_len; struct btrfs_stripe_info stripe_info; - unsigned long iomask; int mirror = 0; int dvn; @@ -880,18 +872,16 @@ retry: btrfs_map_stripe_physical(extcb->em, stripe_info.stripe_index); - /* device start and length may not be sector aligned or - * user memory address/length vectors may not be aligned - * on a device sector because device sector size is > 512. - * we might have different size devices in the filesystem, - * so retry all copies to see if any meet the alignment. + /* we can have devices with different logical blocksizes + * in the filesystem. the user I/O start and length or + * memory address and length may not be sector aligned + * on a device with blocksize > dio_min_blocksize. + * if the user alignment is not correct for this device, + * try other copies to see if any meet their alignment. */ - iomask = bdev_logical_block_size( - btrfs_map_stripe_bdev(extcb->em, dvn)) - 1; - if ((extcb->diodev[dvn].physical & iomask) || - (dev_left & iomask) || (!temp_pages && - btrfs_dio_not_aligned(iomask, (u32)dev_left, - &extcb->diocb->umc))) { + if (!temp_pages && extcb->diocb->alignment < + bdev_logical_block_size(btrfs_map_stripe_bdev( + extcb->em, dvn))) { if (mirror < btrfs_map_num_copies(extcb->em)) { mirror++; goto retry; @@ -1056,38 +1046,6 @@ static int btrfs_dio_get_user_bvec(struct bio_vec *uv, return 0; } -static int btrfs_dio_not_aligned(unsigned long iomask, u32 testlen, - struct btrfs_dio_user_mem_control *umc) -{ - const struct iovec *nuv; - - if (!umc) /* temp pages are always good */ - return 0; - - if ((unsigned long)umc->work_iov.iov_base & iomask) - return 1; - if (testlen <= umc->work_iov.iov_len) - return 0; - if (umc->work_iov.iov_len & iomask) - return 1; - - testlen -= umc->work_iov.iov_len; - nuv = umc->user_iov; - while (testlen) { - nuv++; - while (nuv->iov_len == 0) - nuv++; - if ((unsigned long)nuv->iov_base & iomask) - return 1; - if (testlen <= nuv->iov_len) - return 0; - if (nuv->iov_len & iomask) - return 1; - testlen -= nuv->iov_len; - } - return 0; -} - /* error processing only, put back the user bvec we could not process * so we can get it again later or release it properly */