
[RFC,4/5] NFSD: Defer copying

Message ID 1374267830-30154-5-git-send-email-bjschuma@netapp.com (mailing list archive)
State New, archived

Commit Message

Bryan Schumaker July 19, 2013, 9:03 p.m. UTC
From: Bryan Schumaker <bjschuma@netapp.com>

Rather than performing the copy right away, schedule it to run later and
reply to the client.  Later, send a callback to notify the client that
the copy has finished.

Signed-off-by: Bryan Schumaker <bjschuma@netapp.com>
---
 fs/nfsd/nfs4callback.c | 136 +++++++++++++++++++++++++++++++++++++++++++++++++
 fs/nfsd/nfs4proc.c     |  59 +++++++++++++++------
 fs/nfsd/nfs4state.c    |  11 ++++
 fs/nfsd/state.h        |  21 ++++++++
 fs/nfsd/xdr4cb.h       |   9 ++++
 5 files changed, 221 insertions(+), 15 deletions(-)

Comments

J. Bruce Fields July 22, 2013, 6:50 p.m. UTC | #1
On Fri, Jul 19, 2013 at 05:03:49PM -0400, bjschuma@netapp.com wrote:
> From: Bryan Schumaker <bjschuma@netapp.com>
> 
> Rather than performing the copy right away, schedule it to run later and
> reply to the client.  Later, send a callback to notify the client that
> the copy has finished.

I believe you need to implement the referring triple support described
in http://tools.ietf.org/html/rfc5661#section-2.10.6.3 to fix the race
described in
http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion2-19#section-15.1.3
.
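
As a rough illustration (not from the patch), the referring triple could be
carried in the callback's CB_SEQUENCE roughly as below, in place of the empty
csa_referring_call_lists that encode_cb_sequence4args() sends now, I believe.
The co_referring_* fields are hypothetical and would have to be captured from
the COPY request's session slot when the copy is queued:

static void encode_cb_referring_call_list4(struct xdr_stream *xdr,
					   const struct nfs4_cb_offload *offload)
{
	__be32 *p;

	p = xdr_reserve_space(xdr, 4);
	*p = cpu_to_be32(1);			/* one referring_call_list4 */

	/* rcl_sessionid: the session the COPY arrived on */
	p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN);
	xdr_encode_opaque_fixed(p, offload->co_referring_sessionid.data,
				NFS4_MAX_SESSIONID_LEN);

	p = xdr_reserve_space(xdr, 4 + 4 + 4);
	*p++ = cpu_to_be32(1);			/* one referring_call4 */
	*p++ = cpu_to_be32(offload->co_referring_seqid);  /* rc_sequenceid */
	*p   = cpu_to_be32(offload->co_referring_slotid); /* rc_slotid */
}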

I see cb_delay initialized below, but not otherwise used.  Am I missing
anything?

What about OFFLOAD_STATUS and OFFLOAD_ABORT?

In some common cases the reply will be very quick, and we might be
better off handling it synchronously.  Could we implement a heuristic
like "copy synchronously if the filesystem has special support or the
range is less than the maximum iosize, otherwise copy asynchronously"?
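
A minimal sketch of that heuristic, only to make the idea concrete
(svc_max_payload() is the existing helper for the maximum RPC payload;
nfsd4_fs_supports_clone() is a stand-in for whatever "special support"
check gets used):

static bool nfsd4_copy_is_quick(struct svc_rqst *rqstp,
				const struct nfsd4_copy *copy)
{
	/* A range no larger than one ordinary WRITE should finish quickly. */
	if (copy->cp_count <= svc_max_payload(rqstp))
		return true;

	/* Filesystems that clone by reference are cheap at any size. */
	return nfsd4_fs_supports_clone(copy);	/* hypothetical helper */
}

nfsd4_copy() would then take the synchronous path when this returns true and
only queue co_work otherwise.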

--b.


> Signed-off-by: Bryan Schumaker <bjschuma@netapp.com>
> ---
>  fs/nfsd/nfs4callback.c | 136 +++++++++++++++++++++++++++++++++++++++++++++++++
>  fs/nfsd/nfs4proc.c     |  59 +++++++++++++++------
>  fs/nfsd/nfs4state.c    |  11 ++++
>  fs/nfsd/state.h        |  21 ++++++++
>  fs/nfsd/xdr4cb.h       |   9 ++++
>  5 files changed, 221 insertions(+), 15 deletions(-)
> 
> diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
> index 7f05cd1..8f797e1 100644
> --- a/fs/nfsd/nfs4callback.c
> +++ b/fs/nfsd/nfs4callback.c
> @@ -52,6 +52,9 @@ enum {
>  	NFSPROC4_CLNT_CB_NULL = 0,
>  	NFSPROC4_CLNT_CB_RECALL,
>  	NFSPROC4_CLNT_CB_SEQUENCE,
> +
> +	/* NFS v4.2 callback */
> +	NFSPROC4_CLNT_CB_OFFLOAD,
>  };
>  
>  struct nfs4_cb_compound_hdr {
> @@ -110,6 +113,7 @@ enum nfs_cb_opnum4 {
>  	OP_CB_WANTS_CANCELLED		= 12,
>  	OP_CB_NOTIFY_LOCK		= 13,
>  	OP_CB_NOTIFY_DEVICEID		= 14,
> +	OP_CB_OFFLOAD			= 15,
>  	OP_CB_ILLEGAL			= 10044
>  };
>  
> @@ -469,6 +473,31 @@ out_default:
>  	return nfs_cb_stat_to_errno(nfserr);
>  }
>  
> +static void encode_cb_offload4args(struct xdr_stream *xdr,
> +				   const struct nfs4_cb_offload *offload,
> +				   struct nfs4_cb_compound_hdr *hdr)
> +{
> +	__be32 *p;
> +
> +	if (hdr->minorversion < 2)
> +		return;
> +
> +	encode_nfs_cb_opnum4(xdr, OP_CB_OFFLOAD);
> +	encode_nfs_fh4(xdr, &offload->co_dst_fh);
> +	encode_stateid4(xdr, &offload->co_stid->sc_stateid);
> +
> +	p = xdr_reserve_space(xdr, 4);
> +	*p = cpu_to_be32(1);
> +	encode_stateid4(xdr, &offload->co_stid->sc_stateid);
> +
> +	p = xdr_reserve_space(xdr, 12 + NFS4_VERIFIER_SIZE);
> +	p = xdr_encode_hyper(p, offload->co_count);
> +	*p++ = cpu_to_be32(offload->co_stable_how);
> +	xdr_encode_opaque_fixed(p, offload->co_verifier.data, NFS4_VERIFIER_SIZE);
> +
> +	hdr->nops++;
> +}
> +
>  /*
>   * NFSv4.0 and NFSv4.1 XDR encode functions
>   *
> @@ -505,6 +534,23 @@ static void nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, struct xdr_stream *xdr,
>  	encode_cb_nops(&hdr);
>  }
>  
> +/*
> + * CB_OFFLOAD
> + */
> +static void nfs4_xdr_enc_cb_offload(struct rpc_rqst *req, struct xdr_stream *xdr,
> +				    const struct nfsd4_callback *cb)
> +{
> +	const struct nfs4_cb_offload *args = cb->cb_op;
> +	struct nfs4_cb_compound_hdr hdr = {
> +		.ident = cb->cb_clp->cl_cb_ident,
> +		.minorversion = cb->cb_minorversion,
> +	};
> +
> +	encode_cb_compound4args(xdr, &hdr);
> +	encode_cb_sequence4args(xdr, cb, &hdr);
> +	encode_cb_offload4args(xdr, args, &hdr);
> +	encode_cb_nops(&hdr);
> +}
>  
>  /*
>   * NFSv4.0 and NFSv4.1 XDR decode functions
> @@ -552,6 +598,36 @@ out:
>  }
>  
>  /*
> + * CB_OFFLOAD
> + */
> +static int nfs4_xdr_dec_cb_offload(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
> +				   struct nfsd4_callback *cb)
> +{
> +	struct nfs4_cb_compound_hdr hdr;
> +	enum nfsstat4 nfserr;
> +	int status;
> +
> +	status = decode_cb_compound4res(xdr, &hdr);
> +	if (unlikely(status))
> +		goto out;
> +
> +	if (cb != NULL) {
> +		status = decode_cb_sequence4res(xdr, cb);
> +		if (unlikely(status))
> +			goto out;
> +	}
> +
> +	status = decode_cb_op_status(xdr, OP_CB_OFFLOAD, &nfserr);
> +	if (unlikely(status))
> +		goto out;
> +	if (unlikely(nfserr != NFS4_OK))
> +		status = nfs_cb_stat_to_errno(nfserr);
> +
> +out:
> +	return status;
> +}
> +
> +/*
>   * RPC procedure tables
>   */
>  #define PROC(proc, call, argtype, restype)				\
> @@ -568,6 +644,7 @@ out:
>  static struct rpc_procinfo nfs4_cb_procedures[] = {
>  	PROC(CB_NULL,	NULL,		cb_null,	cb_null),
>  	PROC(CB_RECALL,	COMPOUND,	cb_recall,	cb_recall),
> +	PROC(CB_OFFLOAD, COMPOUND,	cb_offload,	cb_offload),
>  };
>  
>  static struct rpc_version nfs_cb_version4 = {
> @@ -1017,6 +1094,11 @@ void nfsd4_init_callback(struct nfsd4_callback *cb)
>  	INIT_WORK(&cb->cb_work, nfsd4_do_callback_rpc);
>  }
>  
> +void nfsd4_init_delayed_callback(struct nfsd4_callback *cb)
> +{
> +	INIT_DELAYED_WORK(&cb->cb_delay, nfsd4_do_callback_rpc);
> +}
> +
>  void nfsd4_cb_recall(struct nfs4_delegation *dp)
>  {
>  	struct nfsd4_callback *cb = &dp->dl_recall;
> @@ -1036,3 +1118,57 @@ void nfsd4_cb_recall(struct nfs4_delegation *dp)
>  
>  	run_nfsd4_cb(&dp->dl_recall);
>  }
> +
> +static void nfsd4_cb_offload_done(struct rpc_task *task, void *calldata)
> +{
> +	struct nfsd4_callback *cb = calldata;
> +	struct nfs4_client *clp = cb->cb_clp;
> +	struct rpc_clnt *current_rpc_client = clp->cl_cb_client;
> +
> +	nfsd4_cb_done(task, calldata);
> +
> +	if (current_rpc_client != task->tk_client)
> +		return;
> +
> +	if (cb->cb_done)
> +		return;
> +
> +	if (task->tk_status != 0)
> +		nfsd4_mark_cb_down(clp, task->tk_status);
> +	cb->cb_done = true;
> +}
> +
> +static void nfsd4_cb_offload_release(void *calldata)
> +{
> +	struct nfsd4_callback *cb = calldata;
> +	struct nfs4_cb_offload *offload = container_of(cb, struct nfs4_cb_offload, co_callback);
> +
> +	if (cb->cb_done) {
> +		nfs4_free_offload_stateid(offload->co_stid);
> +		kfree(offload);
> +	}
> +}
> +
> +static const struct rpc_call_ops nfsd4_cb_offload_ops = {
> +	.rpc_call_prepare = nfsd4_cb_prepare,
> +	.rpc_call_done    = nfsd4_cb_offload_done,
> +	.rpc_release      = nfsd4_cb_offload_release,
> +};
> +
> +void nfsd4_cb_offload(struct nfs4_cb_offload *offload)
> +{
> +	struct nfsd4_callback *cb = &offload->co_callback;
> +
> +	cb->cb_op = offload;
> +	cb->cb_clp = offload->co_stid->sc_client;
> +	cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_OFFLOAD];
> +	cb->cb_msg.rpc_argp = cb;
> +	cb->cb_msg.rpc_resp = cb;
> +
> +	cb->cb_ops = &nfsd4_cb_offload_ops;
> +
> +	INIT_LIST_HEAD(&cb->cb_per_client);
> +	cb->cb_done = true;
> +
> +	run_nfsd4_cb(cb);
> +}
> diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
> index d4584ea..66a787f 100644
> --- a/fs/nfsd/nfs4proc.c
> +++ b/fs/nfsd/nfs4proc.c
> @@ -35,6 +35,7 @@
>  #include <linux/file.h>
>  #include <linux/slab.h>
>  
> +#include "state.h"
>  #include "idmap.h"
>  #include "cache.h"
>  #include "xdr4.h"
> @@ -1062,29 +1063,57 @@ out:
>  	return status;
>  }
>  
> +static void
> +nfsd4_copy_async(struct work_struct *w)
> +{
> +	__be32 status;
> +	struct nfs4_cb_offload *offload;
> +
> +	offload = container_of(w, struct nfs4_cb_offload, co_work);
> +	status  = nfsd_copy_range(offload->co_src_file, offload->co_src_pos,
> +				  offload->co_dst_file, offload->co_dst_pos,
> +				  offload->co_count);
> +
> +	if (status == nfs_ok) {
> +		offload->co_stable_how = NFS_FILE_SYNC;
> +		gen_boot_verifier(&offload->co_verifier, offload->co_net);
> +		fput(offload->co_src_file);
> +		fput(offload->co_dst_file);
> +	}
> +	nfsd4_cb_offload(offload);
> +}
> +
>  static __be32
>  nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
>  		struct nfsd4_copy *copy)
>  {
> -	__be32 status;
>  	struct file *src = NULL, *dst = NULL;
> +	struct nfs4_cb_offload *offload;
>  
> -	status = nfsd4_verify_copy(rqstp, cstate, copy, &src, &dst);
> -	if (status)
> -		return status;
> -
> -	status = nfsd_copy_range(src, copy->cp_src_pos,
> -				 dst, copy->cp_dst_pos,
> -				 copy->cp_count);
> +	if (nfsd4_verify_copy(rqstp, cstate, copy, &src, &dst))
> +		return nfserr_jukebox;
>  
> -	if (status == nfs_ok) {
> -		copy->cp_res.wr_stateid = NULL;
> -		copy->cp_res.wr_bytes_written = copy->cp_count;
> -		copy->cp_res.wr_stable_how = NFS_FILE_SYNC;
> -		gen_boot_verifier(&copy->cp_res.wr_verifier, SVC_NET(rqstp));
> -	}
> +	offload = kmalloc(sizeof(struct nfs4_cb_offload), GFP_KERNEL);
> +	if (!offload)
> +		return nfserr_jukebox;
>  
> -	return status;
> +	offload->co_src_file = get_file(src);
> +	offload->co_dst_file = get_file(dst);
> +	offload->co_src_pos  = copy->cp_src_pos;
> +	offload->co_dst_pos  = copy->cp_dst_pos;
> +	offload->co_count    = copy->cp_count;
> +	offload->co_stid     = nfs4_alloc_offload_stateid(cstate->session->se_client);
> +	offload->co_net      = SVC_NET(rqstp);
> +	INIT_WORK(&offload->co_work, nfsd4_copy_async);
> +	nfsd4_init_callback(&offload->co_callback);
> +	memcpy(&offload->co_dst_fh, &cstate->current_fh, sizeof(struct knfsd_fh));
> +
> +	copy->cp_res.wr_stateid = &offload->co_stid->sc_stateid;
> +	copy->cp_res.wr_bytes_written = 0;
> +	copy->cp_res.wr_stable_how = NFS_UNSTABLE;
> +
> +	schedule_work(&offload->co_work);
> +	return nfs_ok;
>  }
>  
>  /* This routine never returns NFS_OK!  If there are no other errors, it
> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> index c4e270e..582edb5 100644
> --- a/fs/nfsd/nfs4state.c
> +++ b/fs/nfsd/nfs4state.c
> @@ -364,6 +364,11 @@ static struct nfs4_ol_stateid * nfs4_alloc_stateid(struct nfs4_client *clp)
>  	return openlockstateid(nfs4_alloc_stid(clp, stateid_slab));
>  }
>  
> +struct nfs4_stid *nfs4_alloc_offload_stateid(struct nfs4_client *clp)
> +{
> +	return nfs4_alloc_stid(clp, stateid_slab);
> +}
> +
>  static struct nfs4_delegation *
>  alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct svc_fh *current_fh)
>  {
> @@ -617,6 +622,12 @@ static void free_generic_stateid(struct nfs4_ol_stateid *stp)
>  	kmem_cache_free(stateid_slab, stp);
>  }
>  
> +void nfs4_free_offload_stateid(struct nfs4_stid *stid)
> +{
> +	remove_stid(stid);
> +	kmem_cache_free(stateid_slab, stid);
> +}
> +
>  static void release_lock_stateid(struct nfs4_ol_stateid *stp)
>  {
>  	struct file *file;
> diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
> index 2478805..56682fb 100644
> --- a/fs/nfsd/state.h
> +++ b/fs/nfsd/state.h
> @@ -70,6 +70,7 @@ struct nfsd4_callback {
>  	struct rpc_message cb_msg;
>  	const struct rpc_call_ops *cb_ops;
>  	struct work_struct cb_work;
> +	struct delayed_work cb_delay;
>  	bool cb_done;
>  };
>  
> @@ -101,6 +102,22 @@ struct nfs4_delegation {
>  	struct nfsd4_callback	dl_recall;
>  };
>  
> +struct nfs4_cb_offload {
> +	struct file	*co_src_file;
> +	struct file	*co_dst_file;
> +	u64		co_src_pos;
> +	u64		co_dst_pos;
> +	u64		co_count;
> +	u32		co_stable_how;
> +	struct knfsd_fh	co_dst_fh;
> +	nfs4_verifier	co_verifier;
> +	struct net	*co_net;
> +
> +	struct nfs4_stid		*co_stid;
> +	struct work_struct		co_work;
> +	struct nfsd4_callback		co_callback;
> +};
> +
>  /* client delegation callback info */
>  struct nfs4_cb_conn {
>  	/* SETCLIENTID info */
> @@ -468,10 +485,12 @@ extern void nfs4_free_openowner(struct nfs4_openowner *);
>  extern void nfs4_free_lockowner(struct nfs4_lockowner *);
>  extern int set_callback_cred(void);
>  extern void nfsd4_init_callback(struct nfsd4_callback *);
> +extern void nfsd4_init_delayed_callback(struct nfsd4_callback *);
>  extern void nfsd4_probe_callback(struct nfs4_client *clp);
>  extern void nfsd4_probe_callback_sync(struct nfs4_client *clp);
>  extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *);
>  extern void nfsd4_cb_recall(struct nfs4_delegation *dp);
> +extern void nfsd4_cb_offload(struct nfs4_cb_offload *);
>  extern int nfsd4_create_callback_queue(void);
>  extern void nfsd4_destroy_callback_queue(void);
>  extern void nfsd4_shutdown_callback(struct nfs4_client *);
> @@ -480,6 +499,8 @@ extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(const char *name,
>  							struct nfsd_net *nn);
>  extern bool nfs4_has_reclaimed_state(const char *name, struct nfsd_net *nn);
>  extern void put_client_renew(struct nfs4_client *clp);
> +extern struct nfs4_stid *nfs4_alloc_offload_stateid(struct nfs4_client *);
> +extern void nfs4_free_offload_stateid(struct nfs4_stid *);
>  
>  /* nfs4recover operations */
>  extern int nfsd4_client_tracking_init(struct net *net);
> diff --git a/fs/nfsd/xdr4cb.h b/fs/nfsd/xdr4cb.h
> index c5c55df..75b0ef7 100644
> --- a/fs/nfsd/xdr4cb.h
> +++ b/fs/nfsd/xdr4cb.h
> @@ -21,3 +21,12 @@
>  #define NFS4_dec_cb_recall_sz		(cb_compound_dec_hdr_sz  +      \
>  					cb_sequence_dec_sz +            \
>  					op_dec_sz)
> +
> +#define NFS4_enc_cb_offload_sz		(cb_compound_enc_hdr_sz +      \
> +					 cb_sequence_enc_sz +          \
> +					 1 + enc_stateid_sz + 2 + 1 +  \
> +					 XDR_QUADLEN(NFS4_VERIFIER_SIZE))
> +
> +#define NFS4_dec_cb_offload_sz		(cb_compound_dec_hdr_sz +  \
> +					 cb_sequence_dec_sz +      \
> +					 op_dec_sz)
> -- 
> 1.8.3.3
> 
Bryan Schumaker July 22, 2013, 7:17 p.m. UTC | #2
On 07/22/2013 02:50 PM, J. Bruce Fields wrote:
> On Fri, Jul 19, 2013 at 05:03:49PM -0400, bjschuma@netapp.com wrote:
>> From: Bryan Schumaker <bjschuma@netapp.com>
>>
>> Rather than performing the copy right away, schedule it to run later and
>> reply to the client.  Later, send a callback to notify the client that
>> the copy has finished.
> 
> I believe you need to implement the referring triple support described
> in http://tools.ietf.org/html/rfc5661#section-2.10.6.3 to fix the race
> described in
> http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion2-19#section-15.1.3
> .

I'll re-read and re-write.

> 
> I see cb_delay initialized below, but not otherwise used.  Am I missing
> anything?

Whoops!  I was using that earlier to try to fake up a callback, but I eventually decided it's easier to just do the copy asynchronously.  I must have forgotten to take it out :(

> 
> What about OFFLOAD_STATUS and OFFLOAD_ABORT?

I haven't thought those out too much... I haven't thought of a use for them on the client yet.

> 
> In some common cases the reply will be very quick, and we might be
> better off handling it synchronously.  Could we implement a heuristic
> like "copy synchronously if the filesystem has special support or the
> range is less than the maximum iosize, otherwise copy asynchronously"?

I'm sure that can be done, I'm just not sure how to do it yet...

> 
> --b.
> 
> 
>> Signed-off-by: Bryan Schumaker <bjschuma@netapp.com>
>> ---
>>  fs/nfsd/nfs4callback.c | 136 +++++++++++++++++++++++++++++++++++++++++++++++++
>>  fs/nfsd/nfs4proc.c     |  59 +++++++++++++++------
>>  fs/nfsd/nfs4state.c    |  11 ++++
>>  fs/nfsd/state.h        |  21 ++++++++
>>  fs/nfsd/xdr4cb.h       |   9 ++++
>>  5 files changed, 221 insertions(+), 15 deletions(-)
>>
>> diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
>> index 7f05cd1..8f797e1 100644
>> --- a/fs/nfsd/nfs4callback.c
>> +++ b/fs/nfsd/nfs4callback.c
>> @@ -52,6 +52,9 @@ enum {
>>  	NFSPROC4_CLNT_CB_NULL = 0,
>>  	NFSPROC4_CLNT_CB_RECALL,
>>  	NFSPROC4_CLNT_CB_SEQUENCE,
>> +
>> +	/* NFS v4.2 callback */
>> +	NFSPROC4_CLNT_CB_OFFLOAD,
>>  };
>>  
>>  struct nfs4_cb_compound_hdr {
>> @@ -110,6 +113,7 @@ enum nfs_cb_opnum4 {
>>  	OP_CB_WANTS_CANCELLED		= 12,
>>  	OP_CB_NOTIFY_LOCK		= 13,
>>  	OP_CB_NOTIFY_DEVICEID		= 14,
>> +	OP_CB_OFFLOAD			= 15,
>>  	OP_CB_ILLEGAL			= 10044
>>  };
>>  
>> @@ -469,6 +473,31 @@ out_default:
>>  	return nfs_cb_stat_to_errno(nfserr);
>>  }
>>  
>> +static void encode_cb_offload4args(struct xdr_stream *xdr,
>> +				   const struct nfs4_cb_offload *offload,
>> +				   struct nfs4_cb_compound_hdr *hdr)
>> +{
>> +	__be32 *p;
>> +
>> +	if (hdr->minorversion < 2)
>> +		return;
>> +
>> +	encode_nfs_cb_opnum4(xdr, OP_CB_OFFLOAD);
>> +	encode_nfs_fh4(xdr, &offload->co_dst_fh);
>> +	encode_stateid4(xdr, &offload->co_stid->sc_stateid);
>> +
>> +	p = xdr_reserve_space(xdr, 4);
>> +	*p = cpu_to_be32(1);
>> +	encode_stateid4(xdr, &offload->co_stid->sc_stateid);
>> +
>> +	p = xdr_reserve_space(xdr, 12 + NFS4_VERIFIER_SIZE);
>> +	p = xdr_encode_hyper(p, offload->co_count);
>> +	*p++ = cpu_to_be32(offload->co_stable_how);
>> +	xdr_encode_opaque_fixed(p, offload->co_verifier.data, NFS4_VERIFIER_SIZE);
>> +
>> +	hdr->nops++;
>> +}
>> +
>>  /*
>>   * NFSv4.0 and NFSv4.1 XDR encode functions
>>   *
>> @@ -505,6 +534,23 @@ static void nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, struct xdr_stream *xdr,
>>  	encode_cb_nops(&hdr);
>>  }
>>  
>> +/*
>> + * CB_OFFLOAD
>> + */
>> +static void nfs4_xdr_enc_cb_offload(struct rpc_rqst *req, struct xdr_stream *xdr,
>> +				    const struct nfsd4_callback *cb)
>> +{
>> +	const struct nfs4_cb_offload *args = cb->cb_op;
>> +	struct nfs4_cb_compound_hdr hdr = {
>> +		.ident = cb->cb_clp->cl_cb_ident,
>> +		.minorversion = cb->cb_minorversion,
>> +	};
>> +
>> +	encode_cb_compound4args(xdr, &hdr);
>> +	encode_cb_sequence4args(xdr, cb, &hdr);
>> +	encode_cb_offload4args(xdr, args, &hdr);
>> +	encode_cb_nops(&hdr);
>> +}
>>  
>>  /*
>>   * NFSv4.0 and NFSv4.1 XDR decode functions
>> @@ -552,6 +598,36 @@ out:
>>  }
>>  
>>  /*
>> + * CB_OFFLOAD
>> + */
>> +static int nfs4_xdr_dec_cb_offload(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
>> +				   struct nfsd4_callback *cb)
>> +{
>> +	struct nfs4_cb_compound_hdr hdr;
>> +	enum nfsstat4 nfserr;
>> +	int status;
>> +
>> +	status = decode_cb_compound4res(xdr, &hdr);
>> +	if (unlikely(status))
>> +		goto out;
>> +
>> +	if (cb != NULL) {
>> +		status = decode_cb_sequence4res(xdr, cb);
>> +		if (unlikely(status))
>> +			goto out;
>> +	}
>> +
>> +	status = decode_cb_op_status(xdr, OP_CB_OFFLOAD, &nfserr);
>> +	if (unlikely(status))
>> +		goto out;
>> +	if (unlikely(nfserr != NFS4_OK))
>> +		status = nfs_cb_stat_to_errno(nfserr);
>> +
>> +out:
>> +	return status;
>> +}
>> +
>> +/*
>>   * RPC procedure tables
>>   */
>>  #define PROC(proc, call, argtype, restype)				\
>> @@ -568,6 +644,7 @@ out:
>>  static struct rpc_procinfo nfs4_cb_procedures[] = {
>>  	PROC(CB_NULL,	NULL,		cb_null,	cb_null),
>>  	PROC(CB_RECALL,	COMPOUND,	cb_recall,	cb_recall),
>> +	PROC(CB_OFFLOAD, COMPOUND,	cb_offload,	cb_offload),
>>  };
>>  
>>  static struct rpc_version nfs_cb_version4 = {
>> @@ -1017,6 +1094,11 @@ void nfsd4_init_callback(struct nfsd4_callback *cb)
>>  	INIT_WORK(&cb->cb_work, nfsd4_do_callback_rpc);
>>  }
>>  
>> +void nfsd4_init_delayed_callback(struct nfsd4_callback *cb)
>> +{
>> +	INIT_DELAYED_WORK(&cb->cb_delay, nfsd4_do_callback_rpc);
>> +}
>> +
>>  void nfsd4_cb_recall(struct nfs4_delegation *dp)
>>  {
>>  	struct nfsd4_callback *cb = &dp->dl_recall;
>> @@ -1036,3 +1118,57 @@ void nfsd4_cb_recall(struct nfs4_delegation *dp)
>>  
>>  	run_nfsd4_cb(&dp->dl_recall);
>>  }
>> +
>> +static void nfsd4_cb_offload_done(struct rpc_task *task, void *calldata)
>> +{
>> +	struct nfsd4_callback *cb = calldata;
>> +	struct nfs4_client *clp = cb->cb_clp;
>> +	struct rpc_clnt *current_rpc_client = clp->cl_cb_client;
>> +
>> +	nfsd4_cb_done(task, calldata);
>> +
>> +	if (current_rpc_client != task->tk_client)
>> +		return;
>> +
>> +	if (cb->cb_done)
>> +		return;
>> +
>> +	if (task->tk_status != 0)
>> +		nfsd4_mark_cb_down(clp, task->tk_status);
>> +	cb->cb_done = true;
>> +}
>> +
>> +static void nfsd4_cb_offload_release(void *calldata)
>> +{
>> +	struct nfsd4_callback *cb = calldata;
>> +	struct nfs4_cb_offload *offload = container_of(cb, struct nfs4_cb_offload, co_callback);
>> +
>> +	if (cb->cb_done) {
>> +		nfs4_free_offload_stateid(offload->co_stid);
>> +		kfree(offload);
>> +	}
>> +}
>> +
>> +static const struct rpc_call_ops nfsd4_cb_offload_ops = {
>> +	.rpc_call_prepare = nfsd4_cb_prepare,
>> +	.rpc_call_done    = nfsd4_cb_offload_done,
>> +	.rpc_release      = nfsd4_cb_offload_release,
>> +};
>> +
>> +void nfsd4_cb_offload(struct nfs4_cb_offload *offload)
>> +{
>> +	struct nfsd4_callback *cb = &offload->co_callback;
>> +
>> +	cb->cb_op = offload;
>> +	cb->cb_clp = offload->co_stid->sc_client;
>> +	cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_OFFLOAD];
>> +	cb->cb_msg.rpc_argp = cb;
>> +	cb->cb_msg.rpc_resp = cb;
>> +
>> +	cb->cb_ops = &nfsd4_cb_offload_ops;
>> +
>> +	INIT_LIST_HEAD(&cb->cb_per_client);
>> +	cb->cb_done = true;
>> +
>> +	run_nfsd4_cb(cb);
>> +}
>> diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
>> index d4584ea..66a787f 100644
>> --- a/fs/nfsd/nfs4proc.c
>> +++ b/fs/nfsd/nfs4proc.c
>> @@ -35,6 +35,7 @@
>>  #include <linux/file.h>
>>  #include <linux/slab.h>
>>  
>> +#include "state.h"
>>  #include "idmap.h"
>>  #include "cache.h"
>>  #include "xdr4.h"
>> @@ -1062,29 +1063,57 @@ out:
>>  	return status;
>>  }
>>  
>> +static void
>> +nfsd4_copy_async(struct work_struct *w)
>> +{
>> +	__be32 status;
>> +	struct nfs4_cb_offload *offload;
>> +
>> +	offload = container_of(w, struct nfs4_cb_offload, co_work);
>> +	status  = nfsd_copy_range(offload->co_src_file, offload->co_src_pos,
>> +				  offload->co_dst_file, offload->co_dst_pos,
>> +				  offload->co_count);
>> +
>> +	if (status == nfs_ok) {
>> +		offload->co_stable_how = NFS_FILE_SYNC;
>> +		gen_boot_verifier(&offload->co_verifier, offload->co_net);
>> +		fput(offload->co_src_file);
>> +		fput(offload->co_dst_file);
>> +	}
>> +	nfsd4_cb_offload(offload);
>> +}
>> +
>>  static __be32
>>  nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
>>  		struct nfsd4_copy *copy)
>>  {
>> -	__be32 status;
>>  	struct file *src = NULL, *dst = NULL;
>> +	struct nfs4_cb_offload *offload;
>>  
>> -	status = nfsd4_verify_copy(rqstp, cstate, copy, &src, &dst);
>> -	if (status)
>> -		return status;
>> -
>> -	status = nfsd_copy_range(src, copy->cp_src_pos,
>> -				 dst, copy->cp_dst_pos,
>> -				 copy->cp_count);
>> +	if (nfsd4_verify_copy(rqstp, cstate, copy, &src, &dst))
>> +		return nfserr_jukebox;
>>  
>> -	if (status == nfs_ok) {
>> -		copy->cp_res.wr_stateid = NULL;
>> -		copy->cp_res.wr_bytes_written = copy->cp_count;
>> -		copy->cp_res.wr_stable_how = NFS_FILE_SYNC;
>> -		gen_boot_verifier(&copy->cp_res.wr_verifier, SVC_NET(rqstp));
>> -	}
>> +	offload = kmalloc(sizeof(struct nfs4_cb_offload), GFP_KERNEL);
>> +	if (!offload)
>> +		return nfserr_jukebox;
>>  
>> -	return status;
>> +	offload->co_src_file = get_file(src);
>> +	offload->co_dst_file = get_file(dst);
>> +	offload->co_src_pos  = copy->cp_src_pos;
>> +	offload->co_dst_pos  = copy->cp_dst_pos;
>> +	offload->co_count    = copy->cp_count;
>> +	offload->co_stid     = nfs4_alloc_offload_stateid(cstate->session->se_client);
>> +	offload->co_net      = SVC_NET(rqstp);
>> +	INIT_WORK(&offload->co_work, nfsd4_copy_async);
>> +	nfsd4_init_callback(&offload->co_callback);
>> +	memcpy(&offload->co_dst_fh, &cstate->current_fh, sizeof(struct knfsd_fh));
>> +
>> +	copy->cp_res.wr_stateid = &offload->co_stid->sc_stateid;
>> +	copy->cp_res.wr_bytes_written = 0;
>> +	copy->cp_res.wr_stable_how = NFS_UNSTABLE;
>> +
>> +	schedule_work(&offload->co_work);
>> +	return nfs_ok;
>>  }
>>  
>>  /* This routine never returns NFS_OK!  If there are no other errors, it
>> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
>> index c4e270e..582edb5 100644
>> --- a/fs/nfsd/nfs4state.c
>> +++ b/fs/nfsd/nfs4state.c
>> @@ -364,6 +364,11 @@ static struct nfs4_ol_stateid * nfs4_alloc_stateid(struct nfs4_client *clp)
>>  	return openlockstateid(nfs4_alloc_stid(clp, stateid_slab));
>>  }
>>  
>> +struct nfs4_stid *nfs4_alloc_offload_stateid(struct nfs4_client *clp)
>> +{
>> +	return nfs4_alloc_stid(clp, stateid_slab);
>> +}
>> +
>>  static struct nfs4_delegation *
>>  alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct svc_fh *current_fh)
>>  {
>> @@ -617,6 +622,12 @@ static void free_generic_stateid(struct nfs4_ol_stateid *stp)
>>  	kmem_cache_free(stateid_slab, stp);
>>  }
>>  
>> +void nfs4_free_offload_stateid(struct nfs4_stid *stid)
>> +{
>> +	remove_stid(stid);
>> +	kmem_cache_free(stateid_slab, stid);
>> +}
>> +
>>  static void release_lock_stateid(struct nfs4_ol_stateid *stp)
>>  {
>>  	struct file *file;
>> diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
>> index 2478805..56682fb 100644
>> --- a/fs/nfsd/state.h
>> +++ b/fs/nfsd/state.h
>> @@ -70,6 +70,7 @@ struct nfsd4_callback {
>>  	struct rpc_message cb_msg;
>>  	const struct rpc_call_ops *cb_ops;
>>  	struct work_struct cb_work;
>> +	struct delayed_work cb_delay;
>>  	bool cb_done;
>>  };
>>  
>> @@ -101,6 +102,22 @@ struct nfs4_delegation {
>>  	struct nfsd4_callback	dl_recall;
>>  };
>>  
>> +struct nfs4_cb_offload {
>> +	struct file	*co_src_file;
>> +	struct file	*co_dst_file;
>> +	u64		co_src_pos;
>> +	u64		co_dst_pos;
>> +	u64		co_count;
>> +	u32		co_stable_how;
>> +	struct knfsd_fh	co_dst_fh;
>> +	nfs4_verifier	co_verifier;
>> +	struct net	*co_net;
>> +
>> +	struct nfs4_stid		*co_stid;
>> +	struct work_struct		co_work;
>> +	struct nfsd4_callback		co_callback;
>> +};
>> +
>>  /* client delegation callback info */
>>  struct nfs4_cb_conn {
>>  	/* SETCLIENTID info */
>> @@ -468,10 +485,12 @@ extern void nfs4_free_openowner(struct nfs4_openowner *);
>>  extern void nfs4_free_lockowner(struct nfs4_lockowner *);
>>  extern int set_callback_cred(void);
>>  extern void nfsd4_init_callback(struct nfsd4_callback *);
>> +extern void nfsd4_init_delayed_callback(struct nfsd4_callback *);
>>  extern void nfsd4_probe_callback(struct nfs4_client *clp);
>>  extern void nfsd4_probe_callback_sync(struct nfs4_client *clp);
>>  extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *);
>>  extern void nfsd4_cb_recall(struct nfs4_delegation *dp);
>> +extern void nfsd4_cb_offload(struct nfs4_cb_offload *);
>>  extern int nfsd4_create_callback_queue(void);
>>  extern void nfsd4_destroy_callback_queue(void);
>>  extern void nfsd4_shutdown_callback(struct nfs4_client *);
>> @@ -480,6 +499,8 @@ extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(const char *name,
>>  							struct nfsd_net *nn);
>>  extern bool nfs4_has_reclaimed_state(const char *name, struct nfsd_net *nn);
>>  extern void put_client_renew(struct nfs4_client *clp);
>> +extern struct nfs4_stid *nfs4_alloc_offload_stateid(struct nfs4_client *);
>> +extern void nfs4_free_offload_stateid(struct nfs4_stid *);
>>  
>>  /* nfs4recover operations */
>>  extern int nfsd4_client_tracking_init(struct net *net);
>> diff --git a/fs/nfsd/xdr4cb.h b/fs/nfsd/xdr4cb.h
>> index c5c55df..75b0ef7 100644
>> --- a/fs/nfsd/xdr4cb.h
>> +++ b/fs/nfsd/xdr4cb.h
>> @@ -21,3 +21,12 @@
>>  #define NFS4_dec_cb_recall_sz		(cb_compound_dec_hdr_sz  +      \
>>  					cb_sequence_dec_sz +            \
>>  					op_dec_sz)
>> +
>> +#define NFS4_enc_cb_offload_sz		(cb_compound_enc_hdr_sz +      \
>> +					 cb_sequence_enc_sz +          \
>> +					 1 + enc_stateid_sz + 2 + 1 +  \
>> +					 XDR_QUADLEN(NFS4_VERIFIER_SIZE))
>> +
>> +#define NFS4_dec_cb_offload_sz		(cb_compound_dec_hdr_sz +  \
>> +					 cb_sequence_dec_sz +      \
>> +					 op_dec_sz)
>> -- 
>> 1.8.3.3
>>
J. Bruce Fields July 22, 2013, 7:30 p.m. UTC | #3
On Mon, Jul 22, 2013 at 03:17:29PM -0400, Bryan Schumaker wrote:
> On 07/22/2013 02:50 PM, J. Bruce Fields wrote:
> > On Fri, Jul 19, 2013 at 05:03:49PM -0400, bjschuma@netapp.com wrote:
> >> From: Bryan Schumaker <bjschuma@netapp.com>
> >>
> >> Rather than performing the copy right away, schedule it to run later and
> >> reply to the client.  Later, send a callback to notify the client that
> >> the copy has finished.
> > 
> > I believe you need to implement the referring triple support described
> > in http://tools.ietf.org/html/rfc5661#section-2.10.6.3 to fix the race
> > described in
> > http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion2-19#section-15.1.3
> > .
> 
> I'll re-read and re-write.
> 
> > 
> > I see cb_delay initialized below, but not otherwise used.  Am I missing
> > anything?
> 
> Whoops!  I was using that earlier to try to fake up a callback, but I eventually decided it's easier to just do the copy asynchronously.  I must have forgotten to take it out :(
> 
> > 
> > What about OFFLOAD_STATUS and OFFLOAD_ABORT?
> 
> I haven't thought out those too much... I haven't thought about a use for them on the client yet.

If it might be a long-running copy, I assume the client needs the
ability to abort if the caller is killed.

(Dumb question: what happens on a network partition?  Does the server
abort the copy when it expires the client state?)

In any case,
http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion2-19#section-15.1.3
says "If a server's COPY operation returns a stateid, then the server
MUST also support these operations: CB_OFFLOAD, OFFLOAD_ABORT, and
OFFLOAD_STATUS."

So even if we've no use for them on the client then we still need to
implement them (and probably just write a basic pynfs test).  Either
that or update the spec.

> > In some common cases the reply will be very quick, and we might be
> > better off handling it synchronously.  Could we implement a heuristic
> > like "copy synchronously if the filesystem has special support or the
> > range is less than the maximum iosize, otherwise copy asynchronously"?
> 
> I'm sure that can be done, I'm just not sure how to do it yet...

OK, thanks.

--b.
Bryan Schumaker July 22, 2013, 7:37 p.m. UTC | #4
On 07/22/2013 03:30 PM, J. Bruce Fields wrote:
> On Mon, Jul 22, 2013 at 03:17:29PM -0400, Bryan Schumaker wrote:
>> On 07/22/2013 02:50 PM, J. Bruce Fields wrote:
>>> On Fri, Jul 19, 2013 at 05:03:49PM -0400, bjschuma@netapp.com wrote:
>>>> From: Bryan Schumaker <bjschuma@netapp.com>
>>>>
>>>> Rather than performing the copy right away, schedule it to run later and
>>>> reply to the client.  Later, send a callback to notify the client that
>>>> the copy has finished.
>>>
>>> I believe you need to implement the referring triple support described
>>> in http://tools.ietf.org/html/rfc5661#section-2.10.6.3 to fix the race
>>> described in
>>> http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion2-19#section-15.1.3
>>> .
>>
>> I'll re-read and re-write.
>>
>>>
>>> I see cb_delay initialized below, but not otherwise used.  Am I missing
>>> anything?
>>
>> Whoops!  I was using that earlier to try to fake up a callback, but I eventually decided it's easier to just do the copy asynchronously.  I must have forgotten to take it out :(
>>
>>>
>>> What about OFFLOAD_STATUS and OFFLOAD_ABORT?
>>
>> I haven't thought out those too much... I haven't thought about a use for them on the client yet.
> 
> If it might be a long-running copy, I assume the client needs the
> ability to abort if the caller is killed.
> 
> (Dumb question: what happens on the network partition?  Does the server
> abort the copy when it expires the client state?)
> 
> In any case,
> http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion2-19#section-15.1.3
> says "If a server's COPY operation returns a stateid, then the server
> MUST also support these operations: CB_OFFLOAD, OFFLOAD_ABORT, and
> OFFLOAD_STATUS."
> 
> So even if we've no use for them on the client then we still need to
> implement them (and probably just write a basic pynfs test).  Either
> that or update the spec.

Fair enough.  I'll think it out and do something!  Easy solution: save this patch for later and only support the sync version of copy for the final version of this patch series.

- Bryan

> 
>>> In some common cases the reply will be very quick, and we might be
>>> better off handling it synchronously.  Could we implement a heuristic
>>> like "copy synchronously if the filesystem has special support or the
>>> range is less than the maximum iosize, otherwise copy asynchronously"?
>>
>> I'm sure that can be done, I'm just not sure how to do it yet...
> 
> OK, thanks.
> 
> --b.
> 

J. Bruce Fields July 22, 2013, 7:43 p.m. UTC | #5
On Mon, Jul 22, 2013 at 03:37:00PM -0400, Bryan Schumaker wrote:
> On 07/22/2013 03:30 PM, J. Bruce Fields wrote:
> > On Mon, Jul 22, 2013 at 03:17:29PM -0400, Bryan Schumaker wrote:
> >> On 07/22/2013 02:50 PM, J. Bruce Fields wrote:
> >>> On Fri, Jul 19, 2013 at 05:03:49PM -0400, bjschuma@netapp.com wrote:
> >>>> From: Bryan Schumaker <bjschuma@netapp.com>
> >>>>
> >>>> Rather than performing the copy right away, schedule it to run later and
> >>>> reply to the client.  Later, send a callback to notify the client that
> >>>> the copy has finished.
> >>>
> >>> I believe you need to implement the referring triple support described
> >>> in http://tools.ietf.org/html/rfc5661#section-2.10.6.3 to fix the race
> >>> described in
> >>> http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion2-19#section-15.1.3
> >>> .
> >>
> >> I'll re-read and re-write.
> >>
> >>>
> >>> I see cb_delay initialized below, but not otherwise used.  Am I missing
> >>> anything?
> >>
> >> Whoops!  I was using that earlier to try to fake up a callback, but I eventually decided it's easier to just do the copy asynchronously.  I must have forgotten to take it out :(
> >>
> >>>
> >>> What about OFFLOAD_STATUS and OFFLOAD_ABORT?
> >>
> >> I haven't thought out those too much... I haven't thought about a use for them on the client yet.
> > 
> > If it might be a long-running copy, I assume the client needs the
> > ability to abort if the caller is killed.
> > 
> > (Dumb question: what happens on the network partition?  Does the server
> > abort the copy when it expires the client state?)
> > 
> > In any case,
> > http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion2-19#section-15.1.3
> > says "If a server's COPY operation returns a stateid, then the server
> > MUST also support these operations: CB_OFFLOAD, OFFLOAD_ABORT, and
> > OFFLOAD_STATUS."
> > 
> > So even if we've no use for them on the client then we still need to
> > implement them (and probably just write a basic pynfs test).  Either
> > that or update the spec.
> 
> Fair enough.  I'll think it out and do something!  Easy solution: save this patch for later and only support the sync version of copy for the final version of this patch series.

I can't remember--does the spec give the server a clear way to bail out
and tell the client to fall back on a normal copy in cases where the
server knows the copy could take an unreasonable amount of time?

--b.

> 
> - Bryan
> 
> > 
> >>> In some common cases the reply will be very quick, and we might be
> >>> better off handling it synchronously.  Could we implement a heuristic
> >>> like "copy synchronously if the filesystem has special support or the
> >>> range is less than the maximum iosize, otherwise copy asynchronously"?
> >>
> >> I'm sure that can be done, I'm just not sure how to do it yet...
> > 
> > OK, thanks.
> > 
> > --b.
> > 
> 
Bryan Schumaker July 22, 2013, 7:54 p.m. UTC | #6
On 07/22/2013 03:43 PM, J. Bruce Fields wrote:
> On Mon, Jul 22, 2013 at 03:37:00PM -0400, Bryan Schumaker wrote:
>> On 07/22/2013 03:30 PM, J. Bruce Fields wrote:
>>> On Mon, Jul 22, 2013 at 03:17:29PM -0400, Bryan Schumaker wrote:
>>>> On 07/22/2013 02:50 PM, J. Bruce Fields wrote:
>>>>> On Fri, Jul 19, 2013 at 05:03:49PM -0400, bjschuma@netapp.com wrote:
>>>>>> From: Bryan Schumaker <bjschuma@netapp.com>
>>>>>>
>>>>>> Rather than performing the copy right away, schedule it to run later and
>>>>>> reply to the client.  Later, send a callback to notify the client that
>>>>>> the copy has finished.
>>>>>
>>>>> I believe you need to implement the referring triple support described
>>>>> in http://tools.ietf.org/html/rfc5661#section-2.10.6.3 to fix the race
>>>>> described in
>>>>> http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion2-19#section-15.1.3
>>>>> .
>>>>
>>>> I'll re-read and re-write.
>>>>
>>>>>
>>>>> I see cb_delay initialized below, but not otherwise used.  Am I missing
>>>>> anything?
>>>>
>>>> Whoops!  I was using that earlier to try to fake up a callback, but I eventually decided it's easier to just do the copy asynchronously.  I must have forgotten to take it out :(
>>>>
>>>>>
>>>>> What about OFFLOAD_STATUS and OFFLOAD_ABORT?
>>>>
>>>> I haven't thought out those too much... I haven't thought about a use for them on the client yet.
>>>
>>> If it might be a long-running copy, I assume the client needs the
>>> ability to abort if the caller is killed.
>>>
>>> (Dumb question: what happens on the network partition?  Does the server
>>> abort the copy when it expires the client state?)
>>>
>>> In any case,
>>> http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion2-19#section-15.1.3
>>> says "If a server's COPY operation returns a stateid, then the server
>>> MUST also support these operations: CB_OFFLOAD, OFFLOAD_ABORT, and
>>> OFFLOAD_STATUS."
>>>
>>> So even if we've no use for them on the client then we still need to
>>> implement them (and probably just write a basic pynfs test).  Either
>>> that or update the spec.
>>
>> Fair enough.  I'll think it out and do something!  Easy solution: save this patch for later and only support the sync version of copy for the final version of this patch series.
> 
> I can't remember--does the spec give the server a clear way to bail out
> and tell the client to fall back on a normal copy in cases where the
> server knows the copy could take an unreasonable amount of time?
> 
> --b.

I don't think so.  Is there ever a case where copying on the server would be slower than copying over the network?

> 
>>
>> - Bryan
>>
>>>
>>>>> In some common cases the reply will be very quick, and we might be
>>>>> better off handling it synchronously.  Could we implement a heuristic
>>>>> like "copy synchronously if the filesystem has special support or the
>>>>> range is less than the maximum iosize, otherwise copy asynchronously"?
>>>>
>>>> I'm sure that can be done, I'm just not sure how to do it yet...
>>>
>>> OK, thanks.
>>>
>>> --b.
>>>
>>

J. Bruce Fields July 22, 2013, 7:55 p.m. UTC | #7
On Mon, Jul 22, 2013 at 03:54:00PM -0400, Bryan Schumaker wrote:
> On 07/22/2013 03:43 PM, J. Bruce Fields wrote:
> > On Mon, Jul 22, 2013 at 03:37:00PM -0400, Bryan Schumaker wrote:
> >> On 07/22/2013 03:30 PM, J. Bruce Fields wrote:
> >>> On Mon, Jul 22, 2013 at 03:17:29PM -0400, Bryan Schumaker wrote:
> >>>> On 07/22/2013 02:50 PM, J. Bruce Fields wrote:
> >>>>> On Fri, Jul 19, 2013 at 05:03:49PM -0400, bjschuma@netapp.com wrote:
> >>>>>> From: Bryan Schumaker <bjschuma@netapp.com>
> >>>>>>
> >>>>>> Rather than performing the copy right away, schedule it to run later and
> >>>>>> reply to the client.  Later, send a callback to notify the client that
> >>>>>> the copy has finished.
> >>>>>
> >>>>> I believe you need to implement the referring triple support described
> >>>>> in http://tools.ietf.org/html/rfc5661#section-2.10.6.3 to fix the race
> >>>>> described in
> >>>>> http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion2-19#section-15.1.3
> >>>>> .
> >>>>
> >>>> I'll re-read and re-write.
> >>>>
> >>>>>
> >>>>> I see cb_delay initialized below, but not otherwise used.  Am I missing
> >>>>> anything?
> >>>>
> >>>> Whoops!  I was using that earlier to try to fake up a callback, but I eventually decided it's easier to just do the copy asynchronously.  I must have forgotten to take it out :(
> >>>>
> >>>>>
> >>>>> What about OFFLOAD_STATUS and OFFLOAD_ABORT?
> >>>>
> >>>> I haven't thought out those too much... I haven't thought about a use for them on the client yet.
> >>>
> >>> If it might be a long-running copy, I assume the client needs the
> >>> ability to abort if the caller is killed.
> >>>
> >>> (Dumb question: what happens on the network partition?  Does the server
> >>> abort the copy when it expires the client state?)
> >>>
> >>> In any case,
> >>> http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion2-19#section-15.1.3
> >>> says "If a server's COPY operation returns a stateid, then the server
> >>> MUST also support these operations: CB_OFFLOAD, OFFLOAD_ABORT, and
> >>> OFFLOAD_STATUS."
> >>>
> >>> So even if we've no use for them on the client then we still need to
> >>> implement them (and probably just write a basic pynfs test).  Either
> >>> that or update the spec.
> >>
> >> Fair enough.  I'll think it out and do something!  Easy solution: save this patch for later and only support the sync version of copy for the final version of this patch series.
> > 
> > I can't remember--does the spec give the server a clear way to bail out
> > and tell the client to fall back on a normal copy in cases where the
> > server knows the copy could take an unreasonable amount of time?
> > 
> > --b.
> 
> I don't think so.  Is there ever a case where copying over the network would be slower than copying on the server?

Maybe not, but if the copy will take a minute, then we don't want to tie
up an rpc slot for a minute.

--b.
Ric Wheeler Aug. 5, 2013, 8:38 a.m. UTC | #8
On 07/22/2013 08:55 PM, J. Bruce Fields wrote:
> On Mon, Jul 22, 2013 at 03:54:00PM -0400, Bryan Schumaker wrote:
>> On 07/22/2013 03:43 PM, J. Bruce Fields wrote:
>>> On Mon, Jul 22, 2013 at 03:37:00PM -0400, Bryan Schumaker wrote:
>>>> On 07/22/2013 03:30 PM, J. Bruce Fields wrote:
>>>>> On Mon, Jul 22, 2013 at 03:17:29PM -0400, Bryan Schumaker wrote:
>>>>>> On 07/22/2013 02:50 PM, J. Bruce Fields wrote:
>>>>>>> On Fri, Jul 19, 2013 at 05:03:49PM -0400, bjschuma@netapp.com wrote:
>>>>>>>> From: Bryan Schumaker <bjschuma@netapp.com>
>>>>>>>>
>>>>>>>> Rather than performing the copy right away, schedule it to run later and
>>>>>>>> reply to the client.  Later, send a callback to notify the client that
>>>>>>>> the copy has finished.
>>>>>>> I believe you need to implement the referring triple support described
>>>>>>> in http://tools.ietf.org/html/rfc5661#section-2.10.6.3 to fix the race
>>>>>>> described in
>>>>>>> http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion2-19#section-15.1.3
>>>>>>> .
>>>>>> I'll re-read and re-write.
>>>>>>
>>>>>>> I see cb_delay initialized below, but not otherwise used.  Am I missing
>>>>>>> anything?
>>>>>> Whoops!  I was using that earlier to try to fake up a callback, but I eventually decided it's easier to just do the copy asynchronously.  I must have forgotten to take it out :(
>>>>>>
>>>>>>> What about OFFLOAD_STATUS and OFFLOAD_ABORT?
>>>>>> I haven't thought out those too much... I haven't thought about a use for them on the client yet.
>>>>> If it might be a long-running copy, I assume the client needs the
>>>>> ability to abort if the caller is killed.
>>>>>
>>>>> (Dumb question: what happens on the network partition?  Does the server
>>>>> abort the copy when it expires the client state?)
>>>>>
>>>>> In any case,
>>>>> http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion2-19#section-15.1.3
>>>>> says "If a server's COPY operation returns a stateid, then the server
>>>>> MUST also support these operations: CB_OFFLOAD, OFFLOAD_ABORT, and
>>>>> OFFLOAD_STATUS."
>>>>>
>>>>> So even if we've no use for them on the client then we still need to
>>>>> implement them (and probably just write a basic pynfs test).  Either
>>>>> that or update the spec.
>>>> Fair enough.  I'll think it out and do something!  Easy solution: save this patch for later and only support the sync version of copy for the final version of this patch series.
>>> I can't remember--does the spec give the server a clear way to bail out
>>> and tell the client to fall back on a normal copy in cases where the
>>> server knows the copy could take an unreasonable amount of time?
>>>
>>> --b.
>> I don't think so.  Is there ever a case where copying over the network would be slower than copying on the server?
> Mybe not, but if the copy will take a minute, then we don't want to tie
> up an rpc slot for a minute.
>
> --b.

I think that we need to be able to handle copies that would take a lot longer
than just a minute - this offload could take a very long time, I assume,
depending on the size of the data getting copied and the back-end storage
device....

ric


J. Bruce Fields Aug. 5, 2013, 2:41 p.m. UTC | #9
On Mon, Aug 05, 2013 at 09:38:20AM +0100, Ric Wheeler wrote:
> On 07/22/2013 08:55 PM, J. Bruce Fields wrote:
> >On Mon, Jul 22, 2013 at 03:54:00PM -0400, Bryan Schumaker wrote:
> >>On 07/22/2013 03:43 PM, J. Bruce Fields wrote:
> >>>On Mon, Jul 22, 2013 at 03:37:00PM -0400, Bryan Schumaker wrote:
> >>>>On 07/22/2013 03:30 PM, J. Bruce Fields wrote:
> >>>>>On Mon, Jul 22, 2013 at 03:17:29PM -0400, Bryan Schumaker wrote:
> >>>>>>On 07/22/2013 02:50 PM, J. Bruce Fields wrote:
> >>>>>>>On Fri, Jul 19, 2013 at 05:03:49PM -0400, bjschuma@netapp.com wrote:
> >>>>>>>>From: Bryan Schumaker <bjschuma@netapp.com>
> >>>>>>>>
> >>>>>>>>Rather than performing the copy right away, schedule it to run later and
> >>>>>>>>reply to the client.  Later, send a callback to notify the client that
> >>>>>>>>the copy has finished.
> >>>>>>>I believe you need to implement the referring triple support described
> >>>>>>>in http://tools.ietf.org/html/rfc5661#section-2.10.6.3 to fix the race
> >>>>>>>described in
> >>>>>>>http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion2-19#section-15.1.3
> >>>>>>>.
> >>>>>>I'll re-read and re-write.
> >>>>>>
> >>>>>>>I see cb_delay initialized below, but not otherwise used.  Am I missing
> >>>>>>>anything?
> >>>>>>Whoops!  I was using that earlier to try to fake up a callback, but I eventually decided it's easier to just do the copy asynchronously.  I must have forgotten to take it out :(
> >>>>>>
> >>>>>>>What about OFFLOAD_STATUS and OFFLOAD_ABORT?
> >>>>>>I haven't thought out those too much... I haven't thought about a use for them on the client yet.
> >>>>>If it might be a long-running copy, I assume the client needs the
> >>>>>ability to abort if the caller is killed.
> >>>>>
> >>>>>(Dumb question: what happens on the network partition?  Does the server
> >>>>>abort the copy when it expires the client state?)
> >>>>>
> >>>>>In any case,
> >>>>>http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion2-19#section-15.1.3
> >>>>>says "If a server's COPY operation returns a stateid, then the server
> >>>>>MUST also support these operations: CB_OFFLOAD, OFFLOAD_ABORT, and
> >>>>>OFFLOAD_STATUS."
> >>>>>
> >>>>>So even if we've no use for them on the client then we still need to
> >>>>>implement them (and probably just write a basic pynfs test).  Either
> >>>>>that or update the spec.
> >>>>Fair enough.  I'll think it out and do something!  Easy solution: save this patch for later and only support the sync version of copy for the final version of this patch series.
> >>>I can't remember--does the spec give the server a clear way to bail out
> >>>and tell the client to fall back on a normal copy in cases where the
> >>>server knows the copy could take an unreasonable amount of time?
> >>>
> >>>--b.
> >>I don't think so.  Is there ever a case where copying over the network would be slower than copying on the server?
> >Mybe not, but if the copy will take a minute, then we don't want to tie
> >up an rpc slot for a minute.
> >
> >--b.
> 
> I think that we need to be able to handle copies that would take a
> lot longer than just a minute - this offload could take a very long
> time I assume depending on the size of the data getting copied and
> the back end storage device....

Bryan suggested in offline discussion that one possibility might be to
copy, say, at most a gigabyte at a time before returning and making the
client continue the copy.

Where for "a gigabyte" read, "some amount that doesn't take too long to
copy but is still enough to allow close to full bandwidth".  Hopefully
that's an easy number to find.
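
A sketch of that, just to make the shape concrete (nfsd_copy_range() is the
helper from the patch; the chunk size is only a placeholder):

#define NFSD4_COPY_CHUNK	(1ULL << 30)	/* "a gigabyte", tunable */

static __be32
nfsd4_copy_bounded(struct file *src, u64 src_pos,
		   struct file *dst, u64 dst_pos,
		   u64 count, u64 *copied)
{
	u64 chunk = min_t(u64, count, NFSD4_COPY_CHUNK);
	__be32 status;

	status = nfsd_copy_range(src, src_pos, dst, dst_pos, chunk);
	*copied = (status == nfs_ok) ? chunk : 0;
	return status;
}

Whether the COPY reply may legitimately report a short byte count on success
is the spec question below.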

But based on
http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion2-19#section-14.1.2
the COPY operation isn't designed for that--it doesn't give the option
of returning bytes_copied in the successful case.

Maybe we should fix that in the spec, or maybe we just need to implement
the asynchronous case.  I guess it depends on which is easier,

	a) implementing the asynchronous case (and the referring-triple
	   support to fix the COPY/callback races), or
	b) implementing this sort of "short copy" loop in a way that gives
	   good performance.

On the client side it's clearly a) since you're forced to handle that
case anyway.  (Unless we argue that *all* copies should work that way,
and that the spec should ditch the asynchronous case.) On the server
side, b) looks easier.

--b.
Ric Wheeler Aug. 5, 2013, 2:44 p.m. UTC | #10
On 08/05/2013 03:41 PM, J. Bruce Fields wrote:
> On Mon, Aug 05, 2013 at 09:38:20AM +0100, Ric Wheeler wrote:
>> On 07/22/2013 08:55 PM, J. Bruce Fields wrote:
>>> On Mon, Jul 22, 2013 at 03:54:00PM -0400, Bryan Schumaker wrote:
>>>> On 07/22/2013 03:43 PM, J. Bruce Fields wrote:
>>>>> On Mon, Jul 22, 2013 at 03:37:00PM -0400, Bryan Schumaker wrote:
>>>>>> On 07/22/2013 03:30 PM, J. Bruce Fields wrote:
>>>>>>> On Mon, Jul 22, 2013 at 03:17:29PM -0400, Bryan Schumaker wrote:
>>>>>>>> On 07/22/2013 02:50 PM, J. Bruce Fields wrote:
>>>>>>>>> On Fri, Jul 19, 2013 at 05:03:49PM -0400, bjschuma@netapp.com wrote:
>>>>>>>>>> From: Bryan Schumaker <bjschuma@netapp.com>
>>>>>>>>>>
>>>>>>>>>> Rather than performing the copy right away, schedule it to run later and
>>>>>>>>>> reply to the client.  Later, send a callback to notify the client that
>>>>>>>>>> the copy has finished.
>>>>>>>>> I believe you need to implement the referring triple support described
>>>>>>>>> in http://tools.ietf.org/html/rfc5661#section-2.10.6.3 to fix the race
>>>>>>>>> described in
>>>>>>>>> http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion2-19#section-15.1.3
>>>>>>>>> .
>>>>>>>> I'll re-read and re-write.
>>>>>>>>
>>>>>>>>> I see cb_delay initialized below, but not otherwise used.  Am I missing
>>>>>>>>> anything?
>>>>>>>> Whoops!  I was using that earlier to try to fake up a callback, but I eventually decided it's easier to just do the copy asynchronously.  I must have forgotten to take it out :(
>>>>>>>>
>>>>>>>>> What about OFFLOAD_STATUS and OFFLOAD_ABORT?
>>>>>>>> I haven't thought out those too much... I haven't thought about a use for them on the client yet.
>>>>>>> If it might be a long-running copy, I assume the client needs the
>>>>>>> ability to abort if the caller is killed.
>>>>>>>
>>>>>>> (Dumb question: what happens on the network partition?  Does the server
>>>>>>> abort the copy when it expires the client state?)
>>>>>>>
>>>>>>> In any case,
>>>>>>> http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion2-19#section-15.1.3
>>>>>>> says "If a server's COPY operation returns a stateid, then the server
>>>>>>> MUST also support these operations: CB_OFFLOAD, OFFLOAD_ABORT, and
>>>>>>> OFFLOAD_STATUS."
>>>>>>>
>>>>>>> So even if we've no use for them on the client then we still need to
>>>>>>> implement them (and probably just write a basic pynfs test).  Either
>>>>>>> that or update the spec.
>>>>>> Fair enough.  I'll think it out and do something!  Easy solution: save this patch for later and only support the sync version of copy for the final version of this patch series.
>>>>> I can't remember--does the spec give the server a clear way to bail out
>>>>> and tell the client to fall back on a normal copy in cases where the
>>>>> server knows the copy could take an unreasonable amount of time?
>>>>>
>>>>> --b.
>>>> I don't think so.  Is there ever a case where copying over the network would be slower than copying on the server?
>>> Mybe not, but if the copy will take a minute, then we don't want to tie
>>> up an rpc slot for a minute.
>>>
>>> --b.
>> I think that we need to be able to handle copies that would take a
>> lot longer than just a minute - this offload could take a very long
>> time I assume depending on the size of the data getting copied and
>> the back end storage device....
> Bryan suggested in offline discussion that one possibility might be to
> copy, say, at most a gigabyte at a time before returning and making the
> client continue the copy.
>
> Where for "a gigabyte" read, "some amount that doesn't take too long to
> copy but is still enough to allow close to full bandwidth".  Hopefully
> that's an easy number to find.
>
> But based on
> http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion2-19#section-14.1.2
> the COPY operation isn't designed for that--it doesn't give the option
> of returning bytes_copied in the successful case.
>
> Maybe we should fix that in the spec, or maybe we just need to implement
> the asynchronous case.  I guess it depends on which is easier,
>
> 	a) implementing the asynchronous case (and the referring-triple
> 	   support to fix the COPY/callback races), or
> 	b) implementing this sort of "short copy" loop in a way that gives
> 	   good performance.
>
> On the client side it's clearly a) since you're forced to handle that
> case anyway.  (Unless we argue that *all* copies should work that way,
> and that the spec should ditch the asynchronous case.) On the server
> side, b) looks easier.
>
> --b.

I am not sure that 1GB at a time is enough - for a lot of servers, you could do
an enormous range, since no data is actually moved inside the target (just
pointers updated, as with reflinked files, for example)....

ric

Bryan Schumaker Aug. 5, 2013, 2:44 p.m. UTC | #11
On 08/05/2013 10:41 AM, J. Bruce Fields wrote:
> On Mon, Aug 05, 2013 at 09:38:20AM +0100, Ric Wheeler wrote:
>> On 07/22/2013 08:55 PM, J. Bruce Fields wrote:
>>> On Mon, Jul 22, 2013 at 03:54:00PM -0400, Bryan Schumaker wrote:
>>>> On 07/22/2013 03:43 PM, J. Bruce Fields wrote:
>>>>> On Mon, Jul 22, 2013 at 03:37:00PM -0400, Bryan Schumaker wrote:
>>>>>> On 07/22/2013 03:30 PM, J. Bruce Fields wrote:
>>>>>>> On Mon, Jul 22, 2013 at 03:17:29PM -0400, Bryan Schumaker wrote:
>>>>>>>> On 07/22/2013 02:50 PM, J. Bruce Fields wrote:
>>>>>>>>> On Fri, Jul 19, 2013 at 05:03:49PM -0400, bjschuma@netapp.com wrote:
>>>>>>>>>> From: Bryan Schumaker <bjschuma@netapp.com>
>>>>>>>>>>
>>>>>>>>>> Rather than performing the copy right away, schedule it to run later and
>>>>>>>>>> reply to the client.  Later, send a callback to notify the client that
>>>>>>>>>> the copy has finished.
>>>>>>>>> I believe you need to implement the referring triple support described
>>>>>>>>> in http://tools.ietf.org/html/rfc5661#section-2.10.6.3 to fix the race
>>>>>>>>> described in
>>>>>>>>> http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion2-19#section-15.1.3
>>>>>>>>> .
>>>>>>>> I'll re-read and re-write.
>>>>>>>>
>>>>>>>>> I see cb_delay initialized below, but not otherwise used.  Am I missing
>>>>>>>>> anything?
>>>>>>>> Whoops!  I was using that earlier to try to fake up a callback, but I eventually decided it's easier to just do the copy asynchronously.  I must have forgotten to take it out :(
>>>>>>>>
>>>>>>>>> What about OFFLOAD_STATUS and OFFLOAD_ABORT?
>>>>>>>> I haven't thought out those too much... I haven't thought about a use for them on the client yet.
>>>>>>> If it might be a long-running copy, I assume the client needs the
>>>>>>> ability to abort if the caller is killed.
>>>>>>>
>>>>>>> (Dumb question: what happens on the network partition?  Does the server
>>>>>>> abort the copy when it expires the client state?)
>>>>>>>
>>>>>>> In any case,
>>>>>>> http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion2-19#section-15.1.3
>>>>>>> says "If a server's COPY operation returns a stateid, then the server
>>>>>>> MUST also support these operations: CB_OFFLOAD, OFFLOAD_ABORT, and
>>>>>>> OFFLOAD_STATUS."
>>>>>>>
>>>>>>> So even if we've no use for them on the client then we still need to
>>>>>>> implement them (and probably just write a basic pynfs test).  Either
>>>>>>> that or update the spec.
>>>>>> Fair enough.  I'll think it out and do something!  Easy solution: save this patch for later and only support the sync version of copy for the final version of this patch series.
>>>>> I can't remember--does the spec give the server a clear way to bail out
>>>>> and tell the client to fall back on a normal copy in cases where the
>>>>> server knows the copy could take an unreasonable amount of time?
>>>>>
>>>>> --b.
>>>> I don't think so.  Is there ever a case where copying over the network would be slower than copying on the server?
>>>Maybe not, but if the copy will take a minute, then we don't want to tie
>>> up an rpc slot for a minute.
>>>
>>> --b.
>>
>> I think that we need to be able to handle copies that would take a
>> lot longer than just a minute - this offload could take a very long
>> time I assume depending on the size of the data getting copied and
>> the back end storage device....
> 
> Bryan suggested in offline discussion that one possibility might be to
> copy, say, at most a gigabyte at a time before returning and making the
> client continue the copy.
> 
> Where for "a gigabyte" read, "some amount that doesn't take too long to
> copy but is still enough to allow close to full bandwidth".  Hopefully
> that's an easy number to find.
> 
> But based on
> http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion2-19#section-14.1.2
> the COPY operation isn't designed for that--it doesn't give the option
> of returning bytes_copied in the successful case.

Wouldn't the wr_count field in a write_response4 struct be the bytes copied?  I'm working on a patch for putting a limit on the amount copied - is there a "1 gigabyte in bytes" constant somewhere?

- Bryan

> 
> Maybe we should fix that in the spec, or maybe we just need to implement
> the asynchronous case.  I guess it depends on which is easier,
> 
> 	a) implementing the asynchronous case (and the referring-triple
> 	   support to fix the COPY/callback races), or
> 	b) implementing this sort of "short copy" loop in a way that gives
> 	   good performance.
> 
> On the client side it's clearly a) since you're forced to handle that
> case anyway.  (Unless we argue that *all* copies should work that way,
> and that the spec should ditch the asynchronous case.) On the server
> side, b) looks easier.
> 
> --b.
> 

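To make Bryan's question above concrete: below is a minimal sketch of a capped server-side copy, reusing helpers that appear in the posted patch (nfsd4_verify_copy, nfsd_copy_range, gen_boot_verifier) and reporting the shortened length through wr_bytes_written.  It is illustrative only, not part of the series; NFSD_MAX_COPY_BYTES is a hypothetical constant, and the kernel's SZ_1G from <linux/sizes.h> is one answer to the "1 gigabyte in bytes" question.

#include <linux/sizes.h>

/* Hypothetical cap on how much a single COPY moves before replying. */
#define NFSD_MAX_COPY_BYTES	SZ_1G

static __be32
nfsd4_copy_capped(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
		  struct nfsd4_copy *copy)
{
	struct file *src = NULL, *dst = NULL;
	u64 count = min_t(u64, copy->cp_count, NFSD_MAX_COPY_BYTES);
	__be32 status;

	status = nfsd4_verify_copy(rqstp, cstate, copy, &src, &dst);
	if (status)
		return status;

	/* Copy at most NFSD_MAX_COPY_BYTES of the requested range. */
	status = nfsd_copy_range(src, copy->cp_src_pos,
				 dst, copy->cp_dst_pos, count);
	if (status == nfs_ok) {
		copy->cp_res.wr_stateid = NULL;
		/* Tell the client how much was actually copied. */
		copy->cp_res.wr_bytes_written = count;
		copy->cp_res.wr_stable_how = NFS_FILE_SYNC;
		gen_boot_verifier(&copy->cp_res.wr_verifier, SVC_NET(rqstp));
	}
	return status;
}
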
Trond Myklebust Aug. 5, 2013, 2:50 p.m. UTC | #12
On Mon, 2013-08-05 at 10:41 -0400, J. Bruce Fields wrote:
> On Mon, Aug 05, 2013 at 09:38:20AM +0100, Ric Wheeler wrote:
> > On 07/22/2013 08:55 PM, J. Bruce Fields wrote:
> > >On Mon, Jul 22, 2013 at 03:54:00PM -0400, Bryan Schumaker wrote:
> > >>On 07/22/2013 03:43 PM, J. Bruce Fields wrote:
> > >>>On Mon, Jul 22, 2013 at 03:37:00PM -0400, Bryan Schumaker wrote:
> > >>>>On 07/22/2013 03:30 PM, J. Bruce Fields wrote:
> > >>>>>On Mon, Jul 22, 2013 at 03:17:29PM -0400, Bryan Schumaker wrote:
> > >>>>>>On 07/22/2013 02:50 PM, J. Bruce Fields wrote:
> > >>>>>>>On Fri, Jul 19, 2013 at 05:03:49PM -0400, bjschuma@netapp.com wrote:
> > >>>>>>>>From: Bryan Schumaker <bjschuma@netapp.com>
> > >>>>>>>>
> > >>>>>>>>Rather than performing the copy right away, schedule it to run later and
> > >>>>>>>>reply to the client.  Later, send a callback to notify the client that
> > >>>>>>>>the copy has finished.
> > >>>>>>>I believe you need to implement the referring triple support described
> > >>>>>>>in http://tools.ietf.org/html/rfc5661#section-2.10.6.3 to fix the race
> > >>>>>>>described in
> > >>>>>>>http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion2-19#section-15.1.3
> > >>>>>>>.
> > >>>>>>I'll re-read and re-write.
> > >>>>>>
> > >>>>>>>I see cb_delay initialized below, but not otherwise used.  Am I missing
> > >>>>>>>anything?
> > >>>>>>Whoops!  I was using that earlier to try to fake up a callback, but I eventually decided it's easier to just do the copy asynchronously.  I must have forgotten to take it out :(
> > >>>>>>
> > >>>>>>>What about OFFLOAD_STATUS and OFFLOAD_ABORT?
> > >>>>>>I haven't thought out those too much... I haven't thought about a use for them on the client yet.
> > >>>>>If it might be a long-running copy, I assume the client needs the
> > >>>>>ability to abort if the caller is killed.
> > >>>>>
> > >>>>>(Dumb question: what happens on the network partition?  Does the server
> > >>>>>abort the copy when it expires the client state?)
> > >>>>>
> > >>>>>In any case,
> > >>>>>http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion2-19#section-15.1.3
> > >>>>>says "If a server's COPY operation returns a stateid, then the server
> > >>>>>MUST also support these operations: CB_OFFLOAD, OFFLOAD_ABORT, and
> > >>>>>OFFLOAD_STATUS."
> > >>>>>
> > >>>>>So even if we've no use for them on the client then we still need to
> > >>>>>implement them (and probably just write a basic pynfs test).  Either
> > >>>>>that or update the spec.
> > >>>>Fair enough.  I'll think it out and do something!  Easy solution: save this patch for later and only support the sync version of copy for the final version of this patch series.
> > >>>I can't remember--does the spec give the server a clear way to bail out
> > >>>and tell the client to fall back on a normal copy in cases where the
> > >>>server knows the copy could take an unreasonable amount of time?
> > >>>
> > >>>--b.
> > >>I don't think so.  Is there ever a case where copying over the network would be slower than copying on the server?
> > >Maybe not, but if the copy will take a minute, then we don't want to tie
> > >up an rpc slot for a minute.
> > >
> > >--b.
> > 
> > I think that we need to be able to handle copies that would take a
> > lot longer than just a minute - this offload could take a very long
> > time I assume depending on the size of the data getting copied and
> > the back end storage device....
> 
> Bryan suggested in offline discussion that one possibility might be to
> copy, say, at most a gigabyte at a time before returning and making the
> client continue the copy.
> 
> Where for "a gigabyte" read, "some amount that doesn't take too long to
> copy but is still enough to allow close to full bandwidth".  Hopefully
> that's an easy number to find.
> 
> But based on
> http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion2-19#section-14.1.2
> the COPY operation isn't designed for that--it doesn't give the option
> of returning bytes_copied in the successful case.

The reason is that the spec writers did not want to force the server to
copy the data in sequential order (or any other particular order for
that matter).

If the copy was short, then the client can't know which bytes were
copied; they could be at the beginning of the file, in the middle, or
even the very end. Basically, it needs to redo the entire copy in order
to be certain.

> Maybe we should fix that in the spec, or maybe we just need to implement
> the asynchronous case.  I guess it depends on which is easier,
> 
> 	a) implementing the asynchronous case (and the referring-triple
> 	   support to fix the COPY/callback races), or
> 	b) implementing this sort of "short copy" loop in a way that gives
> 	   good performance.
> 
> On the client side it's clearly a) since you're forced to handle that
> case anyway.  (Unless we argue that *all* copies should work that way,
> and that the spec should ditch the asynchronous case.) On the server
> side, b) looks easier.
> 
> --b.

-- 
Trond Myklebust
Linux NFS client maintainer

NetApp
Trond.Myklebust@netapp.com
www.netapp.com
J. Bruce Fields Aug. 5, 2013, 2:56 p.m. UTC | #13
On Mon, Aug 05, 2013 at 03:44:18PM +0100, Ric Wheeler wrote:
> On 08/05/2013 03:41 PM, J. Bruce Fields wrote:
> >On Mon, Aug 05, 2013 at 09:38:20AM +0100, Ric Wheeler wrote:
> >>On 07/22/2013 08:55 PM, J. Bruce Fields wrote:
> >>>On Mon, Jul 22, 2013 at 03:54:00PM -0400, Bryan Schumaker wrote:
> >>>>On 07/22/2013 03:43 PM, J. Bruce Fields wrote:
> >>>>>On Mon, Jul 22, 2013 at 03:37:00PM -0400, Bryan Schumaker wrote:
> >>>>>>On 07/22/2013 03:30 PM, J. Bruce Fields wrote:
> >>>>>>>On Mon, Jul 22, 2013 at 03:17:29PM -0400, Bryan Schumaker wrote:
> >>>>>>>>On 07/22/2013 02:50 PM, J. Bruce Fields wrote:
> >>>>>>>>>On Fri, Jul 19, 2013 at 05:03:49PM -0400, bjschuma@netapp.com wrote:
> >>>>>>>>>>From: Bryan Schumaker <bjschuma@netapp.com>
> >>>>>>>>>>
> >>>>>>>>>>Rather than performing the copy right away, schedule it to run later and
> >>>>>>>>>>reply to the client.  Later, send a callback to notify the client that
> >>>>>>>>>>the copy has finished.
> >>>>>>>>>I believe you need to implement the referring triple support described
> >>>>>>>>>in http://tools.ietf.org/html/rfc5661#section-2.10.6.3 to fix the race
> >>>>>>>>>described in
> >>>>>>>>>http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion2-19#section-15.1.3
> >>>>>>>>>.
> >>>>>>>>I'll re-read and re-write.
> >>>>>>>>
> >>>>>>>>>I see cb_delay initialized below, but not otherwise used.  Am I missing
> >>>>>>>>>anything?
> >>>>>>>>Whoops!  I was using that earlier to try to fake up a callback, but I eventually decided it's easier to just do the copy asynchronously.  I must have forgotten to take it out :(
> >>>>>>>>
> >>>>>>>>>What about OFFLOAD_STATUS and OFFLOAD_ABORT?
> >>>>>>>>I haven't thought out those too much... I haven't thought about a use for them on the client yet.
> >>>>>>>If it might be a long-running copy, I assume the client needs the
> >>>>>>>ability to abort if the caller is killed.
> >>>>>>>
> >>>>>>>(Dumb question: what happens on the network partition?  Does the server
> >>>>>>>abort the copy when it expires the client state?)
> >>>>>>>
> >>>>>>>In any case,
> >>>>>>>http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion2-19#section-15.1.3
> >>>>>>>says "If a server's COPY operation returns a stateid, then the server
> >>>>>>>MUST also support these operations: CB_OFFLOAD, OFFLOAD_ABORT, and
> >>>>>>>OFFLOAD_STATUS."
> >>>>>>>
> >>>>>>>So even if we've no use for them on the client then we still need to
> >>>>>>>implement them (and probably just write a basic pynfs test).  Either
> >>>>>>>that or update the spec.
> >>>>>>Fair enough.  I'll think it out and do something!  Easy solution: save this patch for later and only support the sync version of copy for the final version of this patch series.
> >>>>>I can't remember--does the spec give the server a clear way to bail out
> >>>>>and tell the client to fall back on a normal copy in cases where the
> >>>>>server knows the copy could take an unreasonable amount of time?
> >>>>>
> >>>>>--b.
> >>>>I don't think so.  Is there ever a case where copying over the network would be slower than copying on the server?
> >>>Maybe not, but if the copy will take a minute, then we don't want to tie
> >>>up an rpc slot for a minute.
> >>>
> >>>--b.
> >>I think that we need to be able to handle copies that would take a
> >>lot longer than just a minute - this offload could take a very long
> >>time I assume depending on the size of the data getting copied and
> >>the back end storage device....
> >Bryan suggested in offline discussion that one possibility might be to
> >copy, say, at most a gigabyte at a time before returning and making the
> >client continue the copy.
> >
> >Where for "a gigabyte" read, "some amount that doesn't take too long to
> >copy but is still enough to allow close to full bandwidth".  Hopefully
> >that's an easy number to find.
> >
> >But based on
> >http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion2-19#section-14.1.2
> >the COPY operation isn't designed for that--it doesn't give the option
> >of returning bytes_copied in the successful case.
> >
> >Maybe we should fix that in the spec, or maybe we just need to implement
> >the asynchronous case.  I guess it depends on which is easier,
> >
> >	a) implementing the asynchronous case (and the referring-triple
> >	   support to fix the COPY/callback races), or
> >	b) implementing this sort of "short copy" loop in a way that gives
> >	   good performance.
> >
> >On the client side it's clearly a) since you're forced to handle that
> >case anyway.  (Unless we argue that *all* copies should work that way,
> >and that the spec should ditch the asynchronous case.) On the server
> >side, b) looks easier.
> >
> >--b.
> 
> I am not sure that 1GB/time is enough - for a lot of servers, you
> could do an enormous range since no data is actually moved inside of
> the target (just pointers updated like in reflinked files for
> example)....

Right, but the short copy return would be optional on the server's
part--so the client would request the whole range, and the server could
copy it all in the quick update-some-pointers case while copying only
the first gigabyte (or whatever) in the "dumb" read-write-loop case.

But in the "dumb" case the server still needs a large enough range to
make its IO efficient and to amortize the cost of the extra round trips,
since the client is forced to serialize the COPY requests by the need to
see the bytes_copied result.

--b.
J. Bruce Fields Aug. 5, 2013, 6:11 p.m. UTC | #14
On Mon, Aug 05, 2013 at 02:50:38PM +0000, Myklebust, Trond wrote:
> On Mon, 2013-08-05 at 10:41 -0400, J. Bruce Fields wrote:
> > Bryan suggested in offline discussion that one possibility might be to
> > copy, say, at most a gigabyte at a time before returning and making the
> > client continue the copy.
> > 
> > Where for "a gigabyte" read, "some amount that doesn't take too long to
> > copy but is still enough to allow close to full bandwidth".  Hopefully
> > that's an easy number to find.
> > 
> > But based on
> > http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion2-19#section-14.1.2
> > the COPY operation isn't designed for that--it doesn't give the option
> > of returning bytes_copied in the successful case.
> 
> The reason is that the spec writers did not want to force the server to
> copy the data in sequential order (or any other particular order for
> that matter).

Well, servers would still have the option not to return success unless
the whole copy succeeded, so I'm not sure this *forces* servers to do
sequential copies.

(Unless we also got rid of the callback.)

--b.

> 
> If the copy was short, then the client can't know which bytes were
> copied; they could be at the beginning of the file, in the middle, or
> even the very end. Basically, it needs to redo the entire copy in order
> to be certain.
> 
> > Maybe we should fix that in the spec, or maybe we just need to implement
> > the asynchronous case.  I guess it depends on which is easier,
> > 
> > 	a) implementing the asynchronous case (and the referring-triple
> > 	   support to fix the COPY/callback races), or
> > 	b) implementing this sort of "short copy" loop in a way that gives
> > 	   good performance.
> > 
> > On the client side it's clearly a) since you're forced to handle that
> > case anyway.  (Unless we argue that *all* copies should work that way,
> > and that the spec should ditch the asynchronous case.) On the server
> > side, b) looks easier.
> > 
> > --b.
> 
> -- 
> Trond Myklebust
> Linux NFS client maintainer
> 
> NetApp
> Trond.Myklebust@netapp.com
> www.netapp.com
Chuck Lever III Aug. 5, 2013, 6:17 p.m. UTC | #15
On Aug 5, 2013, at 2:11 PM, "J. Bruce Fields" <bfields@fieldses.org> wrote:

> On Mon, Aug 05, 2013 at 02:50:38PM +0000, Myklebust, Trond wrote:
>> On Mon, 2013-08-05 at 10:41 -0400, J. Bruce Fields wrote:
>>> Bryan suggested in offline discussion that one possibility might be to
>>> copy, say, at most a gigabyte at a time before returning and making the
>>> client continue the copy.
>>> 
>>> Where for "a gigabyte" read, "some amount that doesn't take too long to
>>> copy but is still enough to allow close to full bandwidth".  Hopefully
>>> that's an easy number to find.
>>> 
>>> But based on
>>> http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion2-19#section-14.1.2
>>> the COPY operation isn't designed for that--it doesn't give the option
>>> of returning bytes_copied in the successful case.
>> 
>> The reason is that the spec writers did not want to force the server to
>> copy the data in sequential order (or any other particular order for
>> that matter).
> 
> Well, servers would still have the option not to return success unless
> the whole copy succeeded, so I'm not sure this *forces* servers to do
> sequential copies.
> 
> (Unless we also got rid of the callback.)

If the client initiates a full-file copy and the operation fails, I would think that the client itself can try copying sufficiently large chunks of the file via separate individual COPY operations.  If any of those operations fails, then the client can fall back again to a traditional over-the-wire copy operation.


> --b.
> 
>> 
>> If the copy was short, then the client can't know which bytes were
>> copied; they could be at the beginning of the file, in the middle, or
>> even the very end. Basically, it needs to redo the entire copy in order
>> to be certain.
>> 
>>> Maybe we should fix that in the spec, or maybe we just need to implement
>>> the asynchronous case.  I guess it depends on which is easier,
>>> 
>>> 	a) implementing the asynchronous case (and the referring-triple
>>> 	   support to fix the COPY/callback races), or
>>> 	b) implementing this sort of "short copy" loop in a way that gives
>>> 	   good performance.
>>> 
>>> On the client side it's clearly a) since you're forced to handle that
>>> case anyway.  (Unless we argue that *all* copies should work that way,
>>> and that the spec should ditch the asynchronous case.) On the server
>>> side, b) looks easier.
>>> 
>>> --b.
>> 
>> -- 
>> Trond Myklebust
>> Linux NFS client maintainer
>> 
>> NetApp
>> Trond.Myklebust@netapp.com
>> www.netapp.com
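
A rough sketch of the chunked fallback Chuck describes above, subject to the objection Trond raises next (the client has no way to know what chunk size a given server considers reasonable).  nfs42_proc_copy() and nfs42_fallback_copy() are hypothetical stand-ins here for "issue one COPY RPC" and "do a traditional read/write copy"; the 1 GiB chunk size is arbitrary.

/* Arbitrary "sufficiently large chunk" for illustration: 1 GiB. */
#define COPY_CHUNK_BYTES	(1ULL << 30)

static int copy_file_in_chunks(struct file *src, struct file *dst, u64 count)
{
	u64 pos = 0;

	/* Try one whole-file COPY first. */
	if (nfs42_proc_copy(src, 0, dst, 0, count) == 0)
		return 0;

	/* Retry in large chunks, one COPY operation per chunk. */
	while (pos < count) {
		u64 len = min_t(u64, count - pos, COPY_CHUNK_BYTES);

		if (nfs42_proc_copy(src, pos, dst, pos, len) != 0)
			/* Offload failed: finish the remainder over the wire. */
			return nfs42_fallback_copy(src, pos, dst, pos, count - pos);
		pos += len;
	}
	return 0;
}
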
Trond Myklebust Aug. 5, 2013, 6:24 p.m. UTC | #16
On Mon, 2013-08-05 at 14:17 -0400, Chuck Lever wrote:
> On Aug 5, 2013, at 2:11 PM, "J. Bruce Fields" <bfields@fieldses.org> wrote:
> 
> > On Mon, Aug 05, 2013 at 02:50:38PM +0000, Myklebust, Trond wrote:
> >> On Mon, 2013-08-05 at 10:41 -0400, J. Bruce Fields wrote:
> >>> Bryan suggested in offline discussion that one possibility might be to
> >>> copy, say, at most a gigabyte at a time before returning and making the
> >>> client continue the copy.
> >>> 
> >>> Where for "a gigabyte" read, "some amount that doesn't take too long to
> >>> copy but is still enough to allow close to full bandwidth".  Hopefully
> >>> that's an easy number to find.
> >>> 
> >>> But based on
> >>> http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion2-19#section-14.1.2
> >>> the COPY operation isn't designed for that--it doesn't give the option
> >>> of returning bytes_copied in the successful case.
> >> 
> >> The reason is that the spec writers did not want to force the server to
> >> copy the data in sequential order (or any other particular order for
> >> that matter).
> > 
> > Well, servers would still have the option not to return success unless
> > the whole copy succeeded, so I'm not sure this *forces* servers to do
> > sequential copies.
> > 
> > (Unless we also got rid of the callback.)
> 
> If the client initiates a full-file copy and the operation fails, I would think that the client itself can try copying sufficiently large chunks of the file via separate individual COPY operations.  If any of those operations fails, then the client can fall back again to a traditional over-the-wire copy operation.

How does the client determine what constitutes a "sufficiently large
chunk" in the mind of the server, and why do we want to add that
functionality in the first place? Fallback to traditional copy in the
case where the server doesn't support offload is fine, but all these
screwball special cases are not. We already have sync vs async. Now you
want to add chunked sync and chunked async too? NACK...

-- 
Trond Myklebust
Linux NFS client maintainer

NetApp
Trond.Myklebust@netapp.com
www.netapp.com
J. Bruce Fields Aug. 5, 2013, 6:30 p.m. UTC | #17
On Mon, Aug 05, 2013 at 02:11:21PM -0400, J. Bruce Fields wrote:
> On Mon, Aug 05, 2013 at 02:50:38PM +0000, Myklebust, Trond wrote:
> > On Mon, 2013-08-05 at 10:41 -0400, J. Bruce Fields wrote:
> > > Bryan suggested in offline discussion that one possibility might be to
> > > copy, say, at most a gigabyte at a time before returning and making the
> > > client continue the copy.
> > > 
> > > Where for "a gigabyte" read, "some amount that doesn't take too long to
> > > copy but is still enough to allow close to full bandwidth".  Hopefully
> > > that's an easy number to find.
> > > 
> > > But based on
> > > http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion2-19#section-14.1.2
> > > the COPY operation isn't designed for that--it doesn't give the option
> > > of returning bytes_copied in the successful case.
> > 
> > The reason is that the spec writers did not want to force the server to
> > copy the data in sequential order (or any other particular order for
> > that matter).
> 
> Well, servers would still have the option not to return success unless
> the whole copy succeeded, so I'm not sure this *forces* servers to do
> sequential copies.

Uh, sorry, I was confused, I missed the write_response4 in the COPY
result entirely.

Yeah obviously that's useless.  (Why's it there anyway?  No client or
application is going to care about anything other than whether it's 0 or
not, right?)

So maybe it would be useful to add a way for a server to optionally
communicate a sequential bytes_written, I don't know.

Without that, at least, I think the only reasonable implementation of
"dumb" server-side copies will need to implement the asynchronous case
(and referring triples).  Which might be worth doing.

But for the first cut maybe we should instead *only* implement this on
btrfs (or whoever else can do quick copies).

--b.

> 
> (Unless we also got rid of the callback.)
> > If the copy was short, then the client can't know which bytes were
> > copied; they could be at the beginning of the file, in the middle, or
> > even the very end. Basically, it needs to redo the entire copy in order
> > to be certain.
> > 
> > > Maybe we should fix that in the spec, or maybe we just need to implement
> > > the asynchronous case.  I guess it depends on which is easier,
> > > 
> > > 	a) implementing the asynchronous case (and the referring-triple
> > > 	   support to fix the COPY/callback races), or
> > > 	b) implementing this sort of "short copy" loop in a way that gives
> > > 	   good performance.
> > > 
> > > On the client side it's clearly a) since you're forced to handle that
> > > case anyway.  (Unless we argue that *all* copies should work that way,
> > > and that the spec should ditch the asynchronous case.) On the server
> > > side, b) looks easier.
> > > 
> > > --b.
> > 
> > -- 
> > Trond Myklebust
> > Linux NFS client maintainer
> > 
> > NetApp
> > Trond.Myklebust@netapp.com
> > www.netapp.com
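
One way to read Bruce's "first cut" suggestion above as code: dispatch between a synchronous reply and the deferred CB_OFFLOAD path depending on whether the exported filesystem can copy quickly (btrfs-style clone/reflink) or the range is small.  This is only a sketch; nfsd_fh_can_clone(), nfsd4_copy_sync() and nfsd4_copy_defer() are hypothetical names, not functions from the posted patch, while svc_max_payload() is the existing sunrpc helper.

static __be32
nfsd4_copy_dispatch(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
		    struct nfsd4_copy *copy)
{
	/*
	 * Answer in the COPY reply itself when the filesystem can clone the
	 * range cheaply, or when the range is small enough that we won't tie
	 * up an rpc slot for long.
	 */
	if (nfsd_fh_can_clone(&cstate->current_fh) ||
	    copy->cp_count <= svc_max_payload(rqstp))
		return nfsd4_copy_sync(rqstp, cstate, copy);

	/* Otherwise defer the work and complete it with CB_OFFLOAD. */
	return nfsd4_copy_defer(rqstp, cstate, copy);
}
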
diff mbox

Patch

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 7f05cd1..8f797e1 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -52,6 +52,9 @@  enum {
 	NFSPROC4_CLNT_CB_NULL = 0,
 	NFSPROC4_CLNT_CB_RECALL,
 	NFSPROC4_CLNT_CB_SEQUENCE,
+
+	/* NFS v4.2 callback */
+	NFSPROC4_CLNT_CB_OFFLOAD,
 };
 
 struct nfs4_cb_compound_hdr {
@@ -110,6 +113,7 @@  enum nfs_cb_opnum4 {
 	OP_CB_WANTS_CANCELLED		= 12,
 	OP_CB_NOTIFY_LOCK		= 13,
 	OP_CB_NOTIFY_DEVICEID		= 14,
+	OP_CB_OFFLOAD			= 15,
 	OP_CB_ILLEGAL			= 10044
 };
 
@@ -469,6 +473,31 @@  out_default:
 	return nfs_cb_stat_to_errno(nfserr);
 }
 
+static void encode_cb_offload4args(struct xdr_stream *xdr,
+				   const struct nfs4_cb_offload *offload,
+				   struct nfs4_cb_compound_hdr *hdr)
+{
+	__be32 *p;
+
+	if (hdr->minorversion < 2)
+		return;
+
+	encode_nfs_cb_opnum4(xdr, OP_CB_OFFLOAD);
+	encode_nfs_fh4(xdr, &offload->co_dst_fh);
+	encode_stateid4(xdr, &offload->co_stid->sc_stateid);
+
+	p = xdr_reserve_space(xdr, 4);
+	*p = cpu_to_be32(1);
+	encode_stateid4(xdr, &offload->co_stid->sc_stateid);
+
+	p = xdr_reserve_space(xdr, 12 + NFS4_VERIFIER_SIZE);
+	p = xdr_encode_hyper(p, offload->co_count);
+	*p++ = cpu_to_be32(offload->co_stable_how);
+	xdr_encode_opaque_fixed(p, offload->co_verifier.data, NFS4_VERIFIER_SIZE);
+
+	hdr->nops++;
+}
+
 /*
  * NFSv4.0 and NFSv4.1 XDR encode functions
  *
@@ -505,6 +534,23 @@  static void nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, struct xdr_stream *xdr,
 	encode_cb_nops(&hdr);
 }
 
+/*
+ * CB_OFFLOAD
+ */
+static void nfs4_xdr_enc_cb_offload(struct rpc_rqst *req, struct xdr_stream *xdr,
+				    const struct nfsd4_callback *cb)
+{
+	const struct nfs4_cb_offload *args = cb->cb_op;
+	struct nfs4_cb_compound_hdr hdr = {
+		.ident = cb->cb_clp->cl_cb_ident,
+		.minorversion = cb->cb_minorversion,
+	};
+
+	encode_cb_compound4args(xdr, &hdr);
+	encode_cb_sequence4args(xdr, cb, &hdr);
+	encode_cb_offload4args(xdr, args, &hdr);
+	encode_cb_nops(&hdr);
+}
 
 /*
  * NFSv4.0 and NFSv4.1 XDR decode functions
@@ -552,6 +598,36 @@  out:
 }
 
 /*
+ * CB_OFFLOAD
+ */
+static int nfs4_xdr_dec_cb_offload(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+				   struct nfsd4_callback *cb)
+{
+	struct nfs4_cb_compound_hdr hdr;
+	enum nfsstat4 nfserr;
+	int status;
+
+	status = decode_cb_compound4res(xdr, &hdr);
+	if (unlikely(status))
+		goto out;
+
+	if (cb != NULL) {
+		status = decode_cb_sequence4res(xdr, cb);
+		if (unlikely(status))
+			goto out;
+	}
+
+	status = decode_cb_op_status(xdr, OP_CB_OFFLOAD, &nfserr);
+	if (unlikely(status))
+		goto out;
+	if (unlikely(nfserr != NFS4_OK))
+		status = nfs_cb_stat_to_errno(nfserr);
+
+out:
+	return status;
+}
+
+/*
  * RPC procedure tables
  */
 #define PROC(proc, call, argtype, restype)				\
@@ -568,6 +644,7 @@  out:
 static struct rpc_procinfo nfs4_cb_procedures[] = {
 	PROC(CB_NULL,	NULL,		cb_null,	cb_null),
 	PROC(CB_RECALL,	COMPOUND,	cb_recall,	cb_recall),
+	PROC(CB_OFFLOAD, COMPOUND,	cb_offload,	cb_offload),
 };
 
 static struct rpc_version nfs_cb_version4 = {
@@ -1017,6 +1094,11 @@  void nfsd4_init_callback(struct nfsd4_callback *cb)
 	INIT_WORK(&cb->cb_work, nfsd4_do_callback_rpc);
 }
 
+void nfsd4_init_delayed_callback(struct nfsd4_callback *cb)
+{
+	INIT_DELAYED_WORK(&cb->cb_delay, nfsd4_do_callback_rpc);
+}
+
 void nfsd4_cb_recall(struct nfs4_delegation *dp)
 {
 	struct nfsd4_callback *cb = &dp->dl_recall;
@@ -1036,3 +1118,57 @@  void nfsd4_cb_recall(struct nfs4_delegation *dp)
 
 	run_nfsd4_cb(&dp->dl_recall);
 }
+
+static void nfsd4_cb_offload_done(struct rpc_task *task, void *calldata)
+{
+	struct nfsd4_callback *cb = calldata;
+	struct nfs4_client *clp = cb->cb_clp;
+	struct rpc_clnt *current_rpc_client = clp->cl_cb_client;
+
+	nfsd4_cb_done(task, calldata);
+
+	if (current_rpc_client != task->tk_client)
+		return;
+
+	if (cb->cb_done)
+		return;
+
+	if (task->tk_status != 0)
+		nfsd4_mark_cb_down(clp, task->tk_status);
+	cb->cb_done = true;
+}
+
+static void nfsd4_cb_offload_release(void *calldata)
+{
+	struct nfsd4_callback *cb = calldata;
+	struct nfs4_cb_offload *offload = container_of(cb, struct nfs4_cb_offload, co_callback);
+
+	if (cb->cb_done) {
+		nfs4_free_offload_stateid(offload->co_stid);
+		kfree(offload);
+	}
+}
+
+static const struct rpc_call_ops nfsd4_cb_offload_ops = {
+	.rpc_call_prepare = nfsd4_cb_prepare,
+	.rpc_call_done    = nfsd4_cb_offload_done,
+	.rpc_release      = nfsd4_cb_offload_release,
+};
+
+void nfsd4_cb_offload(struct nfs4_cb_offload *offload)
+{
+	struct nfsd4_callback *cb = &offload->co_callback;
+
+	cb->cb_op = offload;
+	cb->cb_clp = offload->co_stid->sc_client;
+	cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_OFFLOAD];
+	cb->cb_msg.rpc_argp = cb;
+	cb->cb_msg.rpc_resp = cb;
+
+	cb->cb_ops = &nfsd4_cb_offload_ops;
+
+	INIT_LIST_HEAD(&cb->cb_per_client);
+	cb->cb_done = true;
+
+	run_nfsd4_cb(cb);
+}
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index d4584ea..66a787f 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -35,6 +35,7 @@ 
 #include <linux/file.h>
 #include <linux/slab.h>
 
+#include "state.h"
 #include "idmap.h"
 #include "cache.h"
 #include "xdr4.h"
@@ -1062,29 +1063,57 @@  out:
 	return status;
 }
 
+static void
+nfsd4_copy_async(struct work_struct *w)
+{
+	__be32 status;
+	struct nfs4_cb_offload *offload;
+
+	offload = container_of(w, struct nfs4_cb_offload, co_work);
+	status  = nfsd_copy_range(offload->co_src_file, offload->co_src_pos,
+				  offload->co_dst_file, offload->co_dst_pos,
+				  offload->co_count);
+
+	if (status == nfs_ok) {
+		offload->co_stable_how = NFS_FILE_SYNC;
+		gen_boot_verifier(&offload->co_verifier, offload->co_net);
+		fput(offload->co_src_file);
+		fput(offload->co_dst_file);
+	}
+	nfsd4_cb_offload(offload);
+}
+
 static __be32
 nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		struct nfsd4_copy *copy)
 {
-	__be32 status;
 	struct file *src = NULL, *dst = NULL;
+	struct nfs4_cb_offload *offload;
 
-	status = nfsd4_verify_copy(rqstp, cstate, copy, &src, &dst);
-	if (status)
-		return status;
-
-	status = nfsd_copy_range(src, copy->cp_src_pos,
-				 dst, copy->cp_dst_pos,
-				 copy->cp_count);
+	if (nfsd4_verify_copy(rqstp, cstate, copy, &src, &dst))
+		return nfserr_jukebox;
 
-	if (status == nfs_ok) {
-		copy->cp_res.wr_stateid = NULL;
-		copy->cp_res.wr_bytes_written = copy->cp_count;
-		copy->cp_res.wr_stable_how = NFS_FILE_SYNC;
-		gen_boot_verifier(&copy->cp_res.wr_verifier, SVC_NET(rqstp));
-	}
+	offload = kmalloc(sizeof(struct nfs4_cb_offload), GFP_KERNEL);
+	if (!offload)
+		return nfserr_jukebox;
 
-	return status;
+	offload->co_src_file = get_file(src);
+	offload->co_dst_file = get_file(dst);
+	offload->co_src_pos  = copy->cp_src_pos;
+	offload->co_dst_pos  = copy->cp_dst_pos;
+	offload->co_count    = copy->cp_count;
+	offload->co_stid     = nfs4_alloc_offload_stateid(cstate->session->se_client);
+	offload->co_net      = SVC_NET(rqstp);
+	INIT_WORK(&offload->co_work, nfsd4_copy_async);
+	nfsd4_init_callback(&offload->co_callback);
+	memcpy(&offload->co_dst_fh, &cstate->current_fh, sizeof(struct knfsd_fh));
+
+	copy->cp_res.wr_stateid = &offload->co_stid->sc_stateid;
+	copy->cp_res.wr_bytes_written = 0;
+	copy->cp_res.wr_stable_how = NFS_UNSTABLE;
+
+	schedule_work(&offload->co_work);
+	return nfs_ok;
 }
 
 /* This routine never returns NFS_OK!  If there are no other errors, it
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index c4e270e..582edb5 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -364,6 +364,11 @@  static struct nfs4_ol_stateid * nfs4_alloc_stateid(struct nfs4_client *clp)
 	return openlockstateid(nfs4_alloc_stid(clp, stateid_slab));
 }
 
+struct nfs4_stid *nfs4_alloc_offload_stateid(struct nfs4_client *clp)
+{
+	return nfs4_alloc_stid(clp, stateid_slab);
+}
+
 static struct nfs4_delegation *
 alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct svc_fh *current_fh)
 {
@@ -617,6 +622,12 @@  static void free_generic_stateid(struct nfs4_ol_stateid *stp)
 	kmem_cache_free(stateid_slab, stp);
 }
 
+void nfs4_free_offload_stateid(struct nfs4_stid *stid)
+{
+	remove_stid(stid);
+	kmem_cache_free(stateid_slab, stid);
+}
+
 static void release_lock_stateid(struct nfs4_ol_stateid *stp)
 {
 	struct file *file;
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 2478805..56682fb 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -70,6 +70,7 @@  struct nfsd4_callback {
 	struct rpc_message cb_msg;
 	const struct rpc_call_ops *cb_ops;
 	struct work_struct cb_work;
+	struct delayed_work cb_delay;
 	bool cb_done;
 };
 
@@ -101,6 +102,22 @@  struct nfs4_delegation {
 	struct nfsd4_callback	dl_recall;
 };
 
+struct nfs4_cb_offload {
+	struct file	*co_src_file;
+	struct file	*co_dst_file;
+	u64		co_src_pos;
+	u64		co_dst_pos;
+	u64		co_count;
+	u32		co_stable_how;
+	struct knfsd_fh	co_dst_fh;
+	nfs4_verifier	co_verifier;
+	struct net	*co_net;
+
+	struct nfs4_stid		*co_stid;
+	struct work_struct		co_work;
+	struct nfsd4_callback		co_callback;
+};
+
 /* client delegation callback info */
 struct nfs4_cb_conn {
 	/* SETCLIENTID info */
@@ -468,10 +485,12 @@  extern void nfs4_free_openowner(struct nfs4_openowner *);
 extern void nfs4_free_lockowner(struct nfs4_lockowner *);
 extern int set_callback_cred(void);
 extern void nfsd4_init_callback(struct nfsd4_callback *);
+extern void nfsd4_init_delayed_callback(struct nfsd4_callback *);
 extern void nfsd4_probe_callback(struct nfs4_client *clp);
 extern void nfsd4_probe_callback_sync(struct nfs4_client *clp);
 extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *);
 extern void nfsd4_cb_recall(struct nfs4_delegation *dp);
+extern void nfsd4_cb_offload(struct nfs4_cb_offload *);
 extern int nfsd4_create_callback_queue(void);
 extern void nfsd4_destroy_callback_queue(void);
 extern void nfsd4_shutdown_callback(struct nfs4_client *);
@@ -480,6 +499,8 @@  extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(const char *name,
 							struct nfsd_net *nn);
 extern bool nfs4_has_reclaimed_state(const char *name, struct nfsd_net *nn);
 extern void put_client_renew(struct nfs4_client *clp);
+extern struct nfs4_stid *nfs4_alloc_offload_stateid(struct nfs4_client *);
+extern void nfs4_free_offload_stateid(struct nfs4_stid *);
 
 /* nfs4recover operations */
 extern int nfsd4_client_tracking_init(struct net *net);
diff --git a/fs/nfsd/xdr4cb.h b/fs/nfsd/xdr4cb.h
index c5c55df..75b0ef7 100644
--- a/fs/nfsd/xdr4cb.h
+++ b/fs/nfsd/xdr4cb.h
@@ -21,3 +21,12 @@ 
 #define NFS4_dec_cb_recall_sz		(cb_compound_dec_hdr_sz  +      \
 					cb_sequence_dec_sz +            \
 					op_dec_sz)
+
+#define NFS4_enc_cb_offload_sz		(cb_compound_enc_hdr_sz +      \
+					 cb_sequence_enc_sz +          \
+					 1 + enc_stateid_sz + 2 + 1 +  \
+					 XDR_QUADLEN(NFS4_VERIFIER_SIZE))
+
+#define NFS4_dec_cb_offload_sz		(cb_compound_dec_hdr_sz +  \
+					 cb_sequence_dec_sz +      \
+					 op_dec_sz)