From patchwork Tue Jun 14 02:33:06 2011
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Jim Rees <rees@umich.edu>
X-Patchwork-Id: 877572
Received: from vger.kernel.org (vger.kernel.org [209.132.180.67])
	by demeter2.kernel.org (8.14.4/8.14.4) with ESMTP id p5E2W3N1029514
	for <patchwork-linux-nfs@patchwork.kernel.org>;
	Tue, 14 Jun 2011 02:33:11 GMT
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
	id S1754260Ab1FNCdK (ORCPT
	<rfc822;patchwork-linux-nfs@patchwork.kernel.org>);
	Mon, 13 Jun 2011 22:33:10 -0400
Received: from merit-proxy01.merit.edu ([207.75.116.193]:48197 "EHLO
	merit-proxy01.merit.edu" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
	with ESMTP id S1752349Ab1FNCdJ (ORCPT
	<rfc822; linux-nfs@vger.kernel.org>); Mon, 13 Jun 2011 22:33:09 -0400
Received: from localhost (localhost.localdomain [127.0.0.1])
	by merit-proxy01.merit.edu (Postfix) with ESMTP id 035E12039D47;
	Mon, 13 Jun 2011 22:33:09 -0400 (EDT)
X-Virus-Scanned: amavisd-new at merit-proxy01.merit.edu
Received: from merit-proxy01.merit.edu ([127.0.0.1])
	by localhost (merit-proxy01.merit.edu [127.0.0.1]) (amavisd-new,
	port 10024)
	with ESMTP id aklOgRpTKgLX; Mon, 13 Jun 2011 22:33:08 -0400 (EDT)
Received: from merit.edu (74-126-0-171.static.123.net [74.126.0.171])
	by merit-proxy01.merit.edu (Postfix) with ESMTPSA id F2C402039CE1;
	Mon, 13 Jun 2011 22:33:07 -0400 (EDT)
X-Mailbox-Line: From db22e7ddaae99181d46fa42233381595eaf243ec Mon Sep 17
	00:00:00 2001
Message-Id: 
 <db22e7ddaae99181d46fa42233381595eaf243ec.1308017749.git.rees@umich.edu>
In-Reply-To: <cover.1308017749.git.rees@umich.edu>
References: <cover.1308017749.git.rees@umich.edu>
Subject: [PATCH 26/33] pnfsblock: write_begin
To: Benny Halevy <bhalevy@panasas.com>
Cc: linux-nfs@vger.kernel.org, peter honeyman <honey@citi.umich.edu>
Date: Mon, 13 Jun 2011 22:33:06 -0400
From: Jim Rees <rees@umich.edu>
Sender: linux-nfs-owner@vger.kernel.org
Precedence: bulk
List-ID: <linux-nfs.vger.kernel.org>
X-Mailing-List: linux-nfs@vger.kernel.org
X-Greylist: IP, sender and recipient auto-whitelisted, not delayed by
	milter-greylist-4.2.6 (demeter2.kernel.org [140.211.167.43]);
	Tue, 14 Jun 2011 02:33:11 +0000 (UTC)

From: Fred Isaman <iisaman@citi.umich.edu>

Implements bl_write_begin and bl_do_flush, allowing the block driver to read
in the parts of the page "around" the data that is about to be copied to it.

[pnfsblock: fix 64-bit compiler warnings for write_begin]
[pnfsblock: write_begin adjust for removed fields]
Signed-off-by: Fred Isaman <iisaman@citi.umich.edu>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
---
 fs/nfs/blocklayout/blocklayout.c |  178 +++++++++++++++++++++++++++++++++++++-
 1 files changed, 177 insertions(+), 1 deletions(-)

diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index d9bcb13..b9b961f 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -31,6 +31,8 @@
  */
 #include <linux/module.h>
 #include <linux/init.h>
+
+#include <linux/buffer_head.h> /* various write calls */
 #include <linux/bio.h> /* struct bio */
 #include <linux/vmalloc.h>
 #include "blocklayout.h"
@@ -589,11 +591,185 @@ bl_clear_layoutdriver(struct nfs_server *server)
 	return 0;
 }
 
+/* STUB - should mark the intersection of layout and page as bad, so that
+ * it is not used again.  Currently does nothing.
+ */
+static void mark_bad_read(void)
+{
+	/* Intentionally empty until the marking is implemented */
+}
+
+/* Copied from buffer.c: record the read result on bh, then unlock it. */
+static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
+{
+	if (!uptodate) {
+		/* Read failed; this happens due to failed READA attempts. */
+		clear_buffer_uptodate(bh);
+	} else {
+		set_buffer_uptodate(bh);
+	}
+	unlock_buffer(bh);
+}
+
+/* Copied from buffer.c: read-completion hook, see __end_buffer_read_notouch */
+static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
+{
+	__end_buffer_read_notouch(bh, uptodate);
+}
+
+/*
+ * map_block: map a requested I/O sector (isect, in 512-byte units) onto the
+ * LVM meta block_device behind extent be, storing the result in bh.
+ * Debug output prints sector values at full 64-bit width.
+ */
+static void
+map_block(sector_t isect, struct pnfs_block_extent *be, struct buffer_head *bh)
+{
+	dprintk("%s enter be=%p\n", __func__, be);
+
+	set_buffer_mapped(bh);
+	bh->b_bdev = be->be_mdev;
+	/* Rebase isect into the volume, then convert sectors to blocks */
+	bh->b_blocknr = (isect - be->be_f_offset + be->be_v_offset) >>
+		(be->be_mdev->bd_inode->i_blkbits - 9);
+
+	dprintk("%s isect %llu, bh->b_blocknr %llu, using bsize %zu\n",
+		__func__, (unsigned long long)isect,
+		(unsigned long long)bh->b_blocknr, bh->b_size);
+}
+
+/*
+ * Prepare an unmapped page for a write that will fill [from, to): zero the
+ * rest of the page (hole with no COW source) or read the page in from disk,
+ * then set the matching flags/markings.  Loosely based on nobh_write_begin.
+ * Returns 0 on success, -ENOMEM if no buffer head could be allocated, -EIO
+ * if no usable extent is found or the read fails, or the error returned by
+ * mark_initialized_sectors(), which is also what receives pages_to_mark.
+ */
+static int
+init_page_for_write(struct pnfs_block_layout *bl, struct page *page,
+		    unsigned from, unsigned to, sector_t **pages_to_mark)
+{
+	struct buffer_head *bh;
+	int inval, ret = -EIO;
+	struct pnfs_block_extent *be = NULL, *cow_read = NULL;
+	sector_t isect;
+
+	dprintk("%s enter, %p\n", __func__, page);
+	bh = alloc_page_buffers(page, PAGE_CACHE_SIZE, 0);
+	if (!bh) {
+		ret = -ENOMEM;
+		goto cleanup;
+	}
+
+	isect = (sector_t)page->index << (PAGE_CACHE_SHIFT - 9);
+	be = find_get_extent(bl, isect, &cow_read);
+	if (!be)
+		goto cleanup;
+	inval = is_hole(be, isect);
+	dprintk("%s inval=%i, from=%u, to=%u\n", __func__, inval, from, to);
+	if (inval) {
+		if (be->be_state == PNFS_BLOCK_NONE_DATA) {
+			dprintk("%s PANIC - got NONE_DATA extent %p\n",
+				__func__, be);
+			goto cleanup;
+		}
+		map_block(isect, be, bh);
+		unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
+	}
+	if (PageUptodate(page)) {
+		/* Do nothing */
+	} else if (inval && !cow_read) {
+		zero_user_segments(page, 0, from, to, PAGE_CACHE_SIZE);
+	} else if (0 < from || PAGE_CACHE_SIZE > to) {
+		/* Partial write: read old data, from the COW extent if set */
+		map_block(isect, (inval && cow_read) ? cow_read : be, bh);
+		lock_buffer(bh);
+		bh->b_end_io = end_buffer_read_nobh;
+		submit_bh(READ, bh);
+		dprintk("%s: Waiting for buffer read\n", __func__);
+		/* XXX Don't really want to hold layout lock here */
+		wait_on_buffer(bh);
+		if (!buffer_uptodate(bh))
+			goto cleanup;
+	}
+	if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
+		/* There is a BUG here if is a short copy after write_begin,
+		 * but I think this is a generic fs bug.  The problem is that
+		 * we have marked the page as initialized, but it is possible
+		 * that the section not copied may never get copied.
+		 */
+		ret = mark_initialized_sectors(be->be_inval, isect,
+				PAGE_CACHE_SECTORS, pages_to_mark);
+		/* Want to preallocate mem so above can't fail */
+		if (ret)
+			goto cleanup;
+	}
+	SetPageMappedToDisk(page);
+	ret = 0;
+
+cleanup:
+	/* bh is NULL if alloc_page_buffers() failed; don't free it then */
+	if (bh)
+		free_buffer_head(bh);
+	put_extent(be);
+	put_extent(cow_read);
+	if (ret) {
+		/* TODO: mark layout bad; should now use nfs4 for all I/O */
+		mark_bad_read();
+	}
+	return ret;
+}
+
 static int
 bl_write_begin(struct pnfs_layout_segment *lseg, struct page *page, loff_t pos,
 	       unsigned count, struct pnfs_fsdata *fsdata)
 {
-	return 0;
+	unsigned from, to;
+	int ret;
+	sector_t *pages_to_mark = NULL;
+	struct pnfs_block_layout *bl = BLK_LSEG2EXT(lseg);
+
+	/* Always returns 0; failures drop the lseg to fall back to plain NFS */
+	dprintk("%s enter, %u@%lld\n", __func__, count, pos);
+	print_page(page);
+	/* The following code assumes blocksize >= PAGE_CACHE_SIZE */
+	if (bl->bl_blocksize < (PAGE_CACHE_SIZE >> 9)) {
+		dprintk("%s Can't handle blocksize %llu\n", __func__,
+			(u64)bl->bl_blocksize);
+		put_lseg(fsdata->lseg);
+		fsdata->lseg = NULL;
+		return 0;
+	}
+	if (PageMappedToDisk(page)) {
+		/* Basically, this is a flag that says we have successfully
+		 * called write_begin already on this page.  NOTE - there are
+		 * cache consistency issues here.  For example, what if the
+		 * layout is recalled, then regained?  If the file is closed
+		 * and reopened, will the page flags be reset?  If not, we'll
+		 * have to use layout info instead of the page flag.
+		 */
+		return 0;
+	}
+	/* Byte range [from, to) within the page that is about to be copied */
+	from = pos & (PAGE_CACHE_SIZE - 1);
+	to = from + count;
+	ret = init_page_for_write(bl, page, from, to, &pages_to_mark);
+	if (ret) {
+		dprintk("%s init page failed with %i\n", __func__, ret);
+		/* Revert back to plain NFS and just continue on with
+		 * write.  This assumes there is no request attached, which
+		 * should be true if we get here.
+		 */
+		BUG_ON(PagePrivate(page));
+		put_lseg(fsdata->lseg);
+		fsdata->lseg = NULL;
+		kfree(pages_to_mark);
+		ret = 0;
+	} else {
+		fsdata->private = pages_to_mark;
+	}
+	return ret;
 }
 
 static int