[RFC,v10,44/48] ceph: plumb in decryption during sync reads

Message ID: 20220111191608.88762-45-jlayton@kernel.org (mailing list archive)
State: Superseded
Series: ceph+fscrypt: full support

Commit Message

Jeff Layton Jan. 11, 2022, 7:16 p.m. UTC
Note that the crypto block may be smaller than a page, but the reverse
cannot be true.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 fs/ceph/file.c | 94 ++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 69 insertions(+), 25 deletions(-)
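
To make the range adjustment in this patch easier to follow: the sync read path expands the caller's (off, len) outward to crypto block boundaries before issuing the OSD read, decrypts what comes back, and then trims the result to the caller's original range. Below is a minimal standalone sketch of that rounding step, assuming a 4096-byte crypto block; the helper name and the block size here are illustrative only and are not the actual fscrypt_adjust_off_and_len() implementation used in the patch.

/*
 * Illustrative sketch only: expand an I/O range so it starts and ends
 * on crypto block boundaries (4096 bytes assumed for the example).
 */
#include <stdint.h>

#define EXAMPLE_CRYPTO_BLOCK_SIZE 4096ULL

static void example_adjust_off_and_len(uint64_t *off, uint64_t *len)
{
	/* round the start down to a block boundary */
	uint64_t start = *off & ~(EXAMPLE_CRYPTO_BLOCK_SIZE - 1);
	/* round the end up to a block boundary */
	uint64_t end = (*off + *len + EXAMPLE_CRYPTO_BLOCK_SIZE - 1) &
		       ~(EXAMPLE_CRYPTO_BLOCK_SIZE - 1);

	*off = start;
	*len = end - start;
}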

Comments

Xiubo Li Jan. 19, 2022, 5:18 a.m. UTC | #1
On 1/12/22 3:16 AM, Jeff Layton wrote:
> Note that the crypto block may be smaller than a page, but the reverse
> cannot be true.
>
> Signed-off-by: Jeff Layton <jlayton@kernel.org>
> ---
>   fs/ceph/file.c | 94 ++++++++++++++++++++++++++++++++++++--------------
>   1 file changed, 69 insertions(+), 25 deletions(-)
>
> diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> index 41766b2012e9..b4f2fcd33837 100644
> --- a/fs/ceph/file.c
> +++ b/fs/ceph/file.c
> @@ -926,9 +926,17 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos,
>   		bool more;
>   		int idx;
>   		size_t left;
> +		u64 read_off = off;
> +		u64 read_len = len;
> +
> +		/* determine new offset/length if encrypted */
> +		fscrypt_adjust_off_and_len(inode, &read_off, &read_len);
> +
> +		dout("sync_read orig %llu~%llu reading %llu~%llu",
> +		     off, len, read_off, read_len);
>   
>   		req = ceph_osdc_new_request(osdc, &ci->i_layout,
> -					ci->i_vino, off, &len, 0, 1,
> +					ci->i_vino, read_off, &read_len, 0, 1,
>   					CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
>   					NULL, ci->i_truncate_seq,
>   					ci->i_truncate_size, false);
> @@ -937,10 +945,13 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos,
>   			break;
>   		}
>   
> +		/* adjust len downward if the request truncated the len */
> +		if (off + len > read_off + read_len)
> +			len = read_off + read_len - off;
>   		more = len < iov_iter_count(to);
>   
> -		num_pages = calc_pages_for(off, len);
> -		page_off = off & ~PAGE_MASK;
> +		num_pages = calc_pages_for(read_off, read_len);
> +		page_off = offset_in_page(off);
>   		pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
>   		if (IS_ERR(pages)) {
>   			ceph_osdc_put_request(req);
> @@ -948,7 +959,8 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos,
>   			break;
>   		}
>   
> -		osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_off,
> +		osd_req_op_extent_osd_data_pages(req, 0, pages, read_len,
> +						 offset_in_page(read_off),
>   						 false, false);
>   		ret = ceph_osdc_start_request(osdc, req, false);
>   		if (!ret)
> @@ -957,23 +969,50 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos,
>   		ceph_update_read_metrics(&fsc->mdsc->metric,
>   					 req->r_start_latency,
>   					 req->r_end_latency,
> -					 len, ret);
> +					 read_len, ret);
>   
>   		if (ret > 0)
>   			objver = req->r_version;
>   		ceph_osdc_put_request(req);
> -
>   		i_size = i_size_read(inode);
>   		dout("sync_read %llu~%llu got %zd i_size %llu%s\n",
>   		     off, len, ret, i_size, (more ? " MORE" : ""));
>   
> -		if (ret == -ENOENT)
> +		if (ret == -ENOENT) {
> +			/* No object? Then this is a hole */
>   			ret = 0;
> +		} else if (ret > 0 && IS_ENCRYPTED(inode)) {
> +			int fret;
> +
> +			fret = ceph_fscrypt_decrypt_pages(inode, pages, read_off, ret);
> +			if (fret < 0) {
> +				ceph_release_page_vector(pages, num_pages);
> +				ret = fret;
> +				break;
> +			}
> +
> +			dout("sync_read decrypted fret %d\n", fret);
> +
> +			/* account for any partial block at the beginning */
> +			fret -= (off - read_off);
> +
> +			/*
> +			 * Short read after big offset adjustment?
> +			 * Nothing is usable, just call it a zero
> +			 * len read.
> +			 */
> +			fret = max(fret, 0);
> +
> +			/* account for partial block at the end */
> +			ret = min_t(ssize_t, fret, len);
> +		}
> +
> +		/* Short read but not EOF? Zero out the remainder. */
>   		if (ret >= 0 && ret < len && (off + ret < i_size)) {
>   			int zlen = min(len - ret, i_size - off - ret);
>   			int zoff = page_off + ret;
>   			dout("sync_read zero gap %llu~%llu\n",
> -                             off + ret, off + ret + zlen);
> +			     off + ret, off + ret + zlen);
>   			ceph_zero_page_vector_range(zoff, zlen, pages);
>   			ret += zlen;
>   		}
> @@ -981,15 +1020,15 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos,
>   		idx = 0;
>   		left = ret > 0 ? ret : 0;
>   		while (left > 0) {
> -			size_t len, copied;
> -			page_off = off & ~PAGE_MASK;
> -			len = min_t(size_t, left, PAGE_SIZE - page_off);
> +			size_t plen, copied;
> +			plen = min_t(size_t, left, PAGE_SIZE - page_off);
>   			SetPageUptodate(pages[idx]);
>   			copied = copy_page_to_iter(pages[idx++],
> -						   page_off, len, to);
> +						   page_off, plen, to);
>   			off += copied;
>   			left -= copied;
> -			if (copied < len) {
> +			page_off = 0;
> +			if (copied < plen) {
>   				ret = -EFAULT;
>   				break;
>   			}
> @@ -1006,20 +1045,21 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos,
>   			break;
>   	}
>   
> -	if (off > *ki_pos) {
> -		if (off >= i_size) {
> -			*retry_op = CHECK_EOF;
> -			ret = i_size - *ki_pos;
> -			*ki_pos = i_size;
> -		} else {
> -			ret = off - *ki_pos;
> -			*ki_pos = off;
> +	if (ret > 0) {
> +		if (off > *ki_pos) {
> +			if (off >= i_size) {
> +				*retry_op = CHECK_EOF;
> +				ret = i_size - *ki_pos;
> +				*ki_pos = i_size;
> +			} else {
> +				ret = off - *ki_pos;
> +				*ki_pos = off;
> +			}
>   		}
> -	}
> -
> -	if (last_objver && ret > 0)
> -		*last_objver = objver;
>   
> +		if (last_objver)
> +			*last_objver = objver;
> +	}
>   	dout("sync_read result %zd retry_op %d\n", ret, *retry_op);
>   	return ret;
>   }
> @@ -1532,6 +1572,9 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
>   		last = (pos + len) != (write_pos + write_len);
>   		rmw = first || last;
>   
> +		dout("sync_write ino %llx %lld~%llu adjusted %lld~%llu -- %srmw\n",
> +		     ci->i_vino.ino, pos, len, write_pos, write_len, rmw ? "" : "no ");
> +

Should this move to the previous patch?


>   		/*
>   		 * The data is emplaced into the page as it would be if it were in
>   		 * an array of pagecache pages.
> @@ -1761,6 +1804,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
>   		ceph_clear_error_write(ci);
>   		pos += len;
>   		written += len;
> +		dout("sync_write written %d\n", written);
>   		if (pos > i_size_read(inode)) {
>   			check_caps = ceph_inode_set_size(inode, pos);
>   			if (check_caps)
Jeff Layton Jan. 19, 2022, 6:49 p.m. UTC | #2
On Wed, 2022-01-19 at 13:18 +0800, Xiubo Li wrote:
> On 1/12/22 3:16 AM, Jeff Layton wrote:
> > @@ -1532,6 +1572,9 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
> >   		last = (pos + len) != (write_pos + write_len);
> >   		rmw = first || last;
> >   
> > +		dout("sync_write ino %llx %lld~%llu adjusted %lld~%llu -- %srmw\n",
> > +		     ci->i_vino.ino, pos, len, write_pos, write_len, rmw ? "" : "no ");
> > +
> 
> Should this move to the previous patch?
> 
> 

Yes, fixed in wip-fscrypt. Thanks!


Patch

diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 41766b2012e9..b4f2fcd33837 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -926,9 +926,17 @@  ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos,
 		bool more;
 		int idx;
 		size_t left;
+		u64 read_off = off;
+		u64 read_len = len;
+
+		/* determine new offset/length if encrypted */
+		fscrypt_adjust_off_and_len(inode, &read_off, &read_len);
+
+		dout("sync_read orig %llu~%llu reading %llu~%llu",
+		     off, len, read_off, read_len);
 
 		req = ceph_osdc_new_request(osdc, &ci->i_layout,
-					ci->i_vino, off, &len, 0, 1,
+					ci->i_vino, read_off, &read_len, 0, 1,
 					CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
 					NULL, ci->i_truncate_seq,
 					ci->i_truncate_size, false);
@@ -937,10 +945,13 @@  ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos,
 			break;
 		}
 
+		/* adjust len downward if the request truncated the len */
+		if (off + len > read_off + read_len)
+			len = read_off + read_len - off;
 		more = len < iov_iter_count(to);
 
-		num_pages = calc_pages_for(off, len);
-		page_off = off & ~PAGE_MASK;
+		num_pages = calc_pages_for(read_off, read_len);
+		page_off = offset_in_page(off);
 		pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
 		if (IS_ERR(pages)) {
 			ceph_osdc_put_request(req);
@@ -948,7 +959,8 @@  ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos,
 			break;
 		}
 
-		osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_off,
+		osd_req_op_extent_osd_data_pages(req, 0, pages, read_len,
+						 offset_in_page(read_off),
 						 false, false);
 		ret = ceph_osdc_start_request(osdc, req, false);
 		if (!ret)
@@ -957,23 +969,50 @@  ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos,
 		ceph_update_read_metrics(&fsc->mdsc->metric,
 					 req->r_start_latency,
 					 req->r_end_latency,
-					 len, ret);
+					 read_len, ret);
 
 		if (ret > 0)
 			objver = req->r_version;
 		ceph_osdc_put_request(req);
-
 		i_size = i_size_read(inode);
 		dout("sync_read %llu~%llu got %zd i_size %llu%s\n",
 		     off, len, ret, i_size, (more ? " MORE" : ""));
 
-		if (ret == -ENOENT)
+		if (ret == -ENOENT) {
+			/* No object? Then this is a hole */
 			ret = 0;
+		} else if (ret > 0 && IS_ENCRYPTED(inode)) {
+			int fret;
+
+			fret = ceph_fscrypt_decrypt_pages(inode, pages, read_off, ret);
+			if (fret < 0) {
+				ceph_release_page_vector(pages, num_pages);
+				ret = fret;
+				break;
+			}
+
+			dout("sync_read decrypted fret %d\n", fret);
+
+			/* account for any partial block at the beginning */
+			fret -= (off - read_off);
+
+			/*
+			 * Short read after big offset adjustment?
+			 * Nothing is usable, just call it a zero
+			 * len read.
+			 */
+			fret = max(fret, 0);
+
+			/* account for partial block at the end */
+			ret = min_t(ssize_t, fret, len);
+		}
+
+		/* Short read but not EOF? Zero out the remainder. */
 		if (ret >= 0 && ret < len && (off + ret < i_size)) {
 			int zlen = min(len - ret, i_size - off - ret);
 			int zoff = page_off + ret;
 			dout("sync_read zero gap %llu~%llu\n",
-                             off + ret, off + ret + zlen);
+			     off + ret, off + ret + zlen);
 			ceph_zero_page_vector_range(zoff, zlen, pages);
 			ret += zlen;
 		}
@@ -981,15 +1020,15 @@  ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos,
 		idx = 0;
 		left = ret > 0 ? ret : 0;
 		while (left > 0) {
-			size_t len, copied;
-			page_off = off & ~PAGE_MASK;
-			len = min_t(size_t, left, PAGE_SIZE - page_off);
+			size_t plen, copied;
+			plen = min_t(size_t, left, PAGE_SIZE - page_off);
 			SetPageUptodate(pages[idx]);
 			copied = copy_page_to_iter(pages[idx++],
-						   page_off, len, to);
+						   page_off, plen, to);
 			off += copied;
 			left -= copied;
-			if (copied < len) {
+			page_off = 0;
+			if (copied < plen) {
 				ret = -EFAULT;
 				break;
 			}
@@ -1006,20 +1045,21 @@  ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos,
 			break;
 	}
 
-	if (off > *ki_pos) {
-		if (off >= i_size) {
-			*retry_op = CHECK_EOF;
-			ret = i_size - *ki_pos;
-			*ki_pos = i_size;
-		} else {
-			ret = off - *ki_pos;
-			*ki_pos = off;
+	if (ret > 0) {
+		if (off > *ki_pos) {
+			if (off >= i_size) {
+				*retry_op = CHECK_EOF;
+				ret = i_size - *ki_pos;
+				*ki_pos = i_size;
+			} else {
+				ret = off - *ki_pos;
+				*ki_pos = off;
+			}
 		}
-	}
-
-	if (last_objver && ret > 0)
-		*last_objver = objver;
 
+		if (last_objver)
+			*last_objver = objver;
+	}
 	dout("sync_read result %zd retry_op %d\n", ret, *retry_op);
 	return ret;
 }
@@ -1532,6 +1572,9 @@  ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
 		last = (pos + len) != (write_pos + write_len);
 		rmw = first || last;
 
+		dout("sync_write ino %llx %lld~%llu adjusted %lld~%llu -- %srmw\n",
+		     ci->i_vino.ino, pos, len, write_pos, write_len, rmw ? "" : "no ");
+
 		/*
 		 * The data is emplaced into the page as it would be if it were in
 		 * an array of pagecache pages.
@@ -1761,6 +1804,7 @@  ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
 		ceph_clear_error_write(ci);
 		pos += len;
 		written += len;
+		dout("sync_write written %d\n", written);
 		if (pos > i_size_read(inode)) {
 			check_caps = ceph_inode_set_size(inode, pos);
 			if (check_caps)
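
As a footnote for readers working through the partial-block accounting in the decrypt branch above (the fret -= (off - read_off), max(), and min_t(ssize_t, fret, len) steps), here is a small self-contained example of the arithmetic; the request sizes and the 4096-byte block size are made-up numbers for illustration, not values taken from the patch.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* caller asked for 100 bytes at offset 5000 (made-up numbers) */
	uint64_t off = 5000, len = 100;
	/* range expanded to 4096-byte block boundaries before the OSD read */
	uint64_t read_off = 4096, read_len = 4096;
	/* suppose the whole expanded range was read and decrypted */
	int64_t fret = (int64_t)read_len;

	/* drop the leading partial block: 4096 - (5000 - 4096) = 3192 */
	fret -= (int64_t)(off - read_off);
	/* a short read that ends before 'off' yields nothing usable */
	if (fret < 0)
		fret = 0;
	/* trim the trailing partial block back to the caller's length */
	int64_t ret = fret < (int64_t)len ? fret : (int64_t)len;

	printf("usable bytes: %lld\n", (long long)ret); /* prints 100 */
	return 0;
}

In the patch itself the same trimming is applied to the value returned by ceph_fscrypt_decrypt_pages() before the zero-fill and copy-to-iter steps run on the caller's original range.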