Message ID | 1478259273-3471-8-git-send-email-jlayton@redhat.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Fri, 2016-11-04 at 07:34 -0400, Jeff Layton wrote: > The userland ceph has MClientCaps at struct version 9. This brings the > kernel up the same version. > > With this change, we have to start tracking the btime and change_attr, > so that the client can pass back sane values in cap messages. The > client doesn't care about the btime at all, so this is just passed > around, but the change_attr is used when ceph is exported via NFS. > > For now, the new "sync" parm is left at 0, to preserve the existing > behavior of the client. > > Signed-off-by: Jeff Layton <jlayton@redhat.com> > --- > fs/ceph/caps.c | 33 +++++++++++++++++++++++++-------- > 1 file changed, 25 insertions(+), 8 deletions(-) > > diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c > index 6e99866b1946..452f5024589f 100644 > --- a/fs/ceph/caps.c > +++ b/fs/ceph/caps.c > @@ -991,9 +991,9 @@ struct cap_msg_args { > struct ceph_mds_session *session; > u64 ino, cid, follows; > u64 flush_tid, oldest_flush_tid, size, max_size; > - u64 xattr_version; > + u64 xattr_version, change_attr; > struct ceph_buffer *xattr_buf; > - struct timespec atime, mtime, ctime; > + struct timespec atime, mtime, ctime, btime; > int op, caps, wanted, dirty; > u32 seq, issue_seq, mseq, time_warp_seq; > kuid_t uid; > @@ -1026,13 +1026,13 @@ static int send_cap_msg(struct cap_msg_args *arg) > > /* flock buffer size + inline version + inline data size + > * osd_epoch_barrier + oldest_flush_tid */ > - extra_len = 4 + 8 + 4 + 4 + 8; > + extra_len = 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 1; > msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc) + extra_len, > GFP_NOFS, false); > if (!msg) > return -ENOMEM; > > - msg->hdr.version = cpu_to_le16(6); > + msg->hdr.version = cpu_to_le16(9); > msg->hdr.tid = cpu_to_le64(arg->flush_tid); > > fc = msg->front.iov_base; > @@ -1068,17 +1068,30 @@ static int send_cap_msg(struct cap_msg_args *arg) > } > > p = fc + 1; > - /* flock buffer size */ > + /* flock buffer size (version 2) */ > ceph_encode_32(&p, 
0); > - /* inline version */ > + /* inline version (version 4) */ > ceph_encode_64(&p, arg->inline_data ? 0 : CEPH_INLINE_NONE); > /* inline data size */ > ceph_encode_32(&p, 0); > - /* osd_epoch_barrier */ > + /* osd_epoch_barrier (version 5) */ > ceph_encode_32(&p, 0); > - /* oldest_flush_tid */ > + /* oldest_flush_tid (version 6) */ > ceph_encode_64(&p, arg->oldest_flush_tid); > > + /* caller_uid/caller_gid (version 7) */ > + ceph_encode_32(&p, (u32)-1); > + ceph_encode_32(&p, (u32)-1); A bit of self-review... Not sure if we want to set the above to something else -- maybe 0 or to current's creds? That may not always make sense though (during e.g. writeback). > + > + /* pool namespace (version 8) */ > + ceph_encode_32(&p, 0); > + I'm a little unclear on how the above should be set, but I'll look over the userland code and ape what it does. > + /* btime, change_attr, sync (version 9) */ > + ceph_encode_timespec(p, &arg->btime); > + p += sizeof(struct ceph_timespec); > + ceph_encode_64(&p, arg->change_attr); > + ceph_encode_8(&p, 0); > + > ceph_con_send(&arg->session->s_con, msg); > return 0; > } > @@ -1189,9 +1202,11 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, > arg.xattr_buf = NULL; > } > > + arg.change_attr = inode->i_version; > arg.mtime = inode->i_mtime; > arg.atime = inode->i_atime; > arg.ctime = inode->i_ctime; > + arg.btime = ci->i_btime; > > arg.op = op; > arg.caps = cap->implemented; > @@ -1241,10 +1256,12 @@ static inline int __send_flush_snap(struct inode *inode, > arg.max_size = 0; > arg.xattr_version = capsnap->xattr_version; > arg.xattr_buf = capsnap->xattr_blob; > + arg.change_attr = capsnap->change_attr; > > arg.atime = capsnap->atime; > arg.mtime = capsnap->mtime; > arg.ctime = capsnap->ctime; > + arg.btime = capsnap->btime; > > arg.op = CEPH_CAP_OP_FLUSHSNAP; > arg.caps = capsnap->issued;
On Fri, Nov 4, 2016 at 8:57 PM, Jeff Layton <jlayton@redhat.com> wrote: > On Fri, 2016-11-04 at 07:34 -0400, Jeff Layton wrote: >> The userland ceph has MClientCaps at struct version 9. This brings the >> kernel up the same version. >> >> With this change, we have to start tracking the btime and change_attr, >> so that the client can pass back sane values in cap messages. The >> client doesn't care about the btime at all, so this is just passed >> around, but the change_attr is used when ceph is exported via NFS. >> >> For now, the new "sync" parm is left at 0, to preserve the existing >> behavior of the client. >> >> Signed-off-by: Jeff Layton <jlayton@redhat.com> >> --- >> fs/ceph/caps.c | 33 +++++++++++++++++++++++++-------- >> 1 file changed, 25 insertions(+), 8 deletions(-) >> >> diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c >> index 6e99866b1946..452f5024589f 100644 >> --- a/fs/ceph/caps.c >> +++ b/fs/ceph/caps.c >> @@ -991,9 +991,9 @@ struct cap_msg_args { >> struct ceph_mds_session *session; >> u64 ino, cid, follows; >> u64 flush_tid, oldest_flush_tid, size, max_size; >> - u64 xattr_version; >> + u64 xattr_version, change_attr; >> struct ceph_buffer *xattr_buf; >> - struct timespec atime, mtime, ctime; >> + struct timespec atime, mtime, ctime, btime; >> int op, caps, wanted, dirty; >> u32 seq, issue_seq, mseq, time_warp_seq; >> kuid_t uid; >> @@ -1026,13 +1026,13 @@ static int send_cap_msg(struct cap_msg_args *arg) >> >> /* flock buffer size + inline version + inline data size + >> * osd_epoch_barrier + oldest_flush_tid */ >> - extra_len = 4 + 8 + 4 + 4 + 8; >> + extra_len = 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 1; >> msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc) + extra_len, >> GFP_NOFS, false); >> if (!msg) >> return -ENOMEM; >> >> - msg->hdr.version = cpu_to_le16(6); >> + msg->hdr.version = cpu_to_le16(9); >> msg->hdr.tid = cpu_to_le64(arg->flush_tid); >> >> fc = msg->front.iov_base; >> @@ -1068,17 +1068,30 @@ static int send_cap_msg(struct 
cap_msg_args *arg) >> } >> >> p = fc + 1; >> - /* flock buffer size */ >> + /* flock buffer size (version 2) */ >> ceph_encode_32(&p, 0); >> - /* inline version */ >> + /* inline version (version 4) */ >> ceph_encode_64(&p, arg->inline_data ? 0 : CEPH_INLINE_NONE); >> /* inline data size */ >> ceph_encode_32(&p, 0); >> - /* osd_epoch_barrier */ >> + /* osd_epoch_barrier (version 5) */ >> ceph_encode_32(&p, 0); >> - /* oldest_flush_tid */ >> + /* oldest_flush_tid (version 6) */ >> ceph_encode_64(&p, arg->oldest_flush_tid); >> >> + /* caller_uid/caller_gid (version 7) */ >> + ceph_encode_32(&p, (u32)-1); >> + ceph_encode_32(&p, (u32)-1); > > A bit of self-review... > > Not sure if we want to set the above to something else -- maybe 0 or to > current's creds? That may not always make sense though (during e.g. > writeback). > >> + >> + /* pool namespace (version 8) */ >> + ceph_encode_32(&p, 0); >> + > > I'm a little unclear on how the above should be set, but I'll look over > the userland code and ape what it does. pool namespace is useless for client->mds cap message, set its length to 0 should be OK. 
> >> + /* btime, change_attr, sync (version 9) */ >> + ceph_encode_timespec(p, &arg->btime); >> + p += sizeof(struct ceph_timespec); >> + ceph_encode_64(&p, arg->change_attr); >> + ceph_encode_8(&p, 0); >> + >> ceph_con_send(&arg->session->s_con, msg); >> return 0; >> } >> @@ -1189,9 +1202,11 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, >> arg.xattr_buf = NULL; >> } >> >> + arg.change_attr = inode->i_version; >> arg.mtime = inode->i_mtime; >> arg.atime = inode->i_atime; >> arg.ctime = inode->i_ctime; >> + arg.btime = ci->i_btime; >> >> arg.op = op; >> arg.caps = cap->implemented; >> @@ -1241,10 +1256,12 @@ static inline int __send_flush_snap(struct inode *inode, >> arg.max_size = 0; >> arg.xattr_version = capsnap->xattr_version; >> arg.xattr_buf = capsnap->xattr_blob; >> + arg.change_attr = capsnap->change_attr; >> >> arg.atime = capsnap->atime; >> arg.mtime = capsnap->mtime; >> arg.ctime = capsnap->ctime; >> + arg.btime = capsnap->btime; >> >> arg.op = CEPH_CAP_OP_FLUSHSNAP; >> arg.caps = capsnap->issued; > > -- > Jeff Layton <jlayton@redhat.com> > -- > To unsubscribe from this list: send the line "unsubscribe ceph-devel" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, 2016-11-07 at 16:43 +0800, Yan, Zheng wrote: > On Fri, Nov 4, 2016 at 8:57 PM, Jeff Layton <jlayton@redhat.com> wrote: > > > > On Fri, 2016-11-04 at 07:34 -0400, Jeff Layton wrote: > > > > > > The userland ceph has MClientCaps at struct version 9. This brings the > > > kernel up the same version. > > > > > > With this change, we have to start tracking the btime and change_attr, > > > so that the client can pass back sane values in cap messages. The > > > client doesn't care about the btime at all, so this is just passed > > > around, but the change_attr is used when ceph is exported via NFS. > > > > > > For now, the new "sync" parm is left at 0, to preserve the existing > > > behavior of the client. > > > > > > Signed-off-by: Jeff Layton <jlayton@redhat.com> > > > --- > > > fs/ceph/caps.c | 33 +++++++++++++++++++++++++-------- > > > 1 file changed, 25 insertions(+), 8 deletions(-) > > > > > > diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c > > > index 6e99866b1946..452f5024589f 100644 > > > --- a/fs/ceph/caps.c > > > +++ b/fs/ceph/caps.c > > > @@ -991,9 +991,9 @@ struct cap_msg_args { > > > struct ceph_mds_session *session; > > > u64 ino, cid, follows; > > > u64 flush_tid, oldest_flush_tid, size, max_size; > > > - u64 xattr_version; > > > + u64 xattr_version, change_attr; > > > struct ceph_buffer *xattr_buf; > > > - struct timespec atime, mtime, ctime; > > > + struct timespec atime, mtime, ctime, btime; > > > int op, caps, wanted, dirty; > > > u32 seq, issue_seq, mseq, time_warp_seq; > > > kuid_t uid; > > > @@ -1026,13 +1026,13 @@ static int send_cap_msg(struct cap_msg_args *arg) > > > > > > /* flock buffer size + inline version + inline data size + > > > * osd_epoch_barrier + oldest_flush_tid */ > > > - extra_len = 4 + 8 + 4 + 4 + 8; > > > + extra_len = 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 1; > > > msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc) + extra_len, > > > GFP_NOFS, false); > > > if (!msg) > > > return -ENOMEM; > > > > > > - msg->hdr.version = 
cpu_to_le16(6); > > > + msg->hdr.version = cpu_to_le16(9); > > > msg->hdr.tid = cpu_to_le64(arg->flush_tid); > > > > > > fc = msg->front.iov_base; > > > @@ -1068,17 +1068,30 @@ static int send_cap_msg(struct cap_msg_args *arg) > > > } > > > > > > p = fc + 1; > > > - /* flock buffer size */ > > > + /* flock buffer size (version 2) */ > > > ceph_encode_32(&p, 0); > > > - /* inline version */ > > > + /* inline version (version 4) */ > > > ceph_encode_64(&p, arg->inline_data ? 0 : CEPH_INLINE_NONE); > > > /* inline data size */ > > > ceph_encode_32(&p, 0); > > > - /* osd_epoch_barrier */ > > > + /* osd_epoch_barrier (version 5) */ > > > ceph_encode_32(&p, 0); > > > - /* oldest_flush_tid */ > > > + /* oldest_flush_tid (version 6) */ > > > ceph_encode_64(&p, arg->oldest_flush_tid); > > > > > > + /* caller_uid/caller_gid (version 7) */ > > > + ceph_encode_32(&p, (u32)-1); > > > + ceph_encode_32(&p, (u32)-1); > > > > A bit of self-review... > > > > Not sure if we want to set the above to something else -- maybe 0 or to > > current's creds? That may not always make sense though (during e.g. > > writeback). > > Looking further, I'm not quite sure I understand why we send creds at all in cap messages. Can you clarify where that matters? The way I look at it, would be to consider caps to be something like a more granular NFS delegation or SMB oplock. In that light, a cap flush is just the client sending updated attrs for the exclusive caps that it has already been granted. Is there a situation where we would ever want to refuse that update? Note that nothing ever checks the return code for _do_cap_update in the userland code. If the permissions check fails, then we'll end up silently dropping the updated attrs on the floor. > > > > > > + > > > + /* pool namespace (version 8) */ > > > + ceph_encode_32(&p, 0); > > > + > > > > I'm a little unclear on how the above should be set, but I'll look over > > the userland code and ape what it does. 
> > pool namespace is useless for client->mds cap message, set its length > to 0 should be OK. > Thanks. I went ahead and added a comment to that effect in the updated set I'm testing now. > > > > > > > > > > + /* btime, change_attr, sync (version 9) */ > > > + ceph_encode_timespec(p, &arg->btime); > > > + p += sizeof(struct ceph_timespec); > > > + ceph_encode_64(&p, arg->change_attr); > > > + ceph_encode_8(&p, 0); > > > + > > > ceph_con_send(&arg->session->s_con, msg); > > > return 0; > > > } > > > @@ -1189,9 +1202,11 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, > > > arg.xattr_buf = NULL; > > > } > > > > > > + arg.change_attr = inode->i_version; > > > arg.mtime = inode->i_mtime; > > > arg.atime = inode->i_atime; > > > arg.ctime = inode->i_ctime; > > > + arg.btime = ci->i_btime; > > > > > > arg.op = op; > > > arg.caps = cap->implemented; > > > @@ -1241,10 +1256,12 @@ static inline int __send_flush_snap(struct inode *inode, > > > arg.max_size = 0; > > > arg.xattr_version = capsnap->xattr_version; > > > arg.xattr_buf = capsnap->xattr_blob; > > > + arg.change_attr = capsnap->change_attr; > > > > > > arg.atime = capsnap->atime; > > > arg.mtime = capsnap->mtime; > > > arg.ctime = capsnap->ctime; > > > + arg.btime = capsnap->btime; > > > > > > arg.op = CEPH_CAP_OP_FLUSHSNAP; > > > arg.caps = capsnap->issued; > > > > -- > > Jeff Layton <jlayton@redhat.com> > > -- > > To unsubscribe from this list: send the line "unsubscribe ceph-devel" in > > the body of a message to majordomo@vger.kernel.org > > More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, 7 Nov 2016, Jeff Layton wrote: > On Mon, 2016-11-07 at 16:43 +0800, Yan, Zheng wrote: > > On Fri, Nov 4, 2016 at 8:57 PM, Jeff Layton <jlayton@redhat.com> wrote: > > > > > > On Fri, 2016-11-04 at 07:34 -0400, Jeff Layton wrote: > > > > > > > > The userland ceph has MClientCaps at struct version 9. This brings the > > > > kernel up the same version. > > > > > > > > With this change, we have to start tracking the btime and change_attr, > > > > so that the client can pass back sane values in cap messages. The > > > > client doesn't care about the btime at all, so this is just passed > > > > around, but the change_attr is used when ceph is exported via NFS. > > > > > > > > For now, the new "sync" parm is left at 0, to preserve the existing > > > > behavior of the client. > > > > > > > > Signed-off-by: Jeff Layton <jlayton@redhat.com> > > > > --- > > > > fs/ceph/caps.c | 33 +++++++++++++++++++++++++-------- > > > > 1 file changed, 25 insertions(+), 8 deletions(-) > > > > > > > > diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c > > > > index 6e99866b1946..452f5024589f 100644 > > > > --- a/fs/ceph/caps.c > > > > +++ b/fs/ceph/caps.c > > > > @@ -991,9 +991,9 @@ struct cap_msg_args { > > > > struct ceph_mds_session *session; > > > > u64 ino, cid, follows; > > > > u64 flush_tid, oldest_flush_tid, size, max_size; > > > > - u64 xattr_version; > > > > + u64 xattr_version, change_attr; > > > > struct ceph_buffer *xattr_buf; > > > > - struct timespec atime, mtime, ctime; > > > > + struct timespec atime, mtime, ctime, btime; > > > > int op, caps, wanted, dirty; > > > > u32 seq, issue_seq, mseq, time_warp_seq; > > > > kuid_t uid; > > > > @@ -1026,13 +1026,13 @@ static int send_cap_msg(struct cap_msg_args *arg) > > > > > > > > /* flock buffer size + inline version + inline data size + > > > > * osd_epoch_barrier + oldest_flush_tid */ > > > > - extra_len = 4 + 8 + 4 + 4 + 8; > > > > + extra_len = 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 1; > > > > msg = 
ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc) + extra_len, > > > > GFP_NOFS, false); > > > > if (!msg) > > > > return -ENOMEM; > > > > > > > > - msg->hdr.version = cpu_to_le16(6); > > > > + msg->hdr.version = cpu_to_le16(9); > > > > msg->hdr.tid = cpu_to_le64(arg->flush_tid); > > > > > > > > fc = msg->front.iov_base; > > > > @@ -1068,17 +1068,30 @@ static int send_cap_msg(struct cap_msg_args *arg) > > > > } > > > > > > > > p = fc + 1; > > > > - /* flock buffer size */ > > > > + /* flock buffer size (version 2) */ > > > > ceph_encode_32(&p, 0); > > > > - /* inline version */ > > > > + /* inline version (version 4) */ > > > > ceph_encode_64(&p, arg->inline_data ? 0 : CEPH_INLINE_NONE); > > > > /* inline data size */ > > > > ceph_encode_32(&p, 0); > > > > - /* osd_epoch_barrier */ > > > > + /* osd_epoch_barrier (version 5) */ > > > > ceph_encode_32(&p, 0); > > > > - /* oldest_flush_tid */ > > > > + /* oldest_flush_tid (version 6) */ > > > > ceph_encode_64(&p, arg->oldest_flush_tid); > > > > > > > > + /* caller_uid/caller_gid (version 7) */ > > > > + ceph_encode_32(&p, (u32)-1); > > > > + ceph_encode_32(&p, (u32)-1); > > > > > > A bit of self-review... > > > > > > Not sure if we want to set the above to something else -- maybe 0 or to > > > current's creds? That may not always make sense though (during e.g. > > > writeback). > > > > > Looking further, I'm not quite sure I understand why we send creds at > all in cap messages. Can you clarify where that matters? > > The way I look at it, would be to consider caps to be something like a > more granular NFS delegation or SMB oplock. > > In that light, a cap flush is just the client sending updated attrs for > the exclusive caps that it has already been granted. Is there a > situation where we would ever want to refuse that update? A chmod or chown can be done locally if you have excl caps and flushed back to the MDS via a caps message. We need to verify the user has permission to make the change. 
> Note that nothing ever checks the return code for _do_cap_update in the > userland code. If the permissions check fails, then we'll end up > silently dropping the updated attrs on the floor. Yeah. This was mainly for expediency... the protocol assumes that flushes don't fail. Given that the client does its own permissions check, I think the way to improve this is to have it prevent the flush in the first place, so that it's only nefarious clients that are affected (and who cares if they get confused). I don't think we have a particularly good way to tell the client it can't, say, sudo chmod 0:0 a file, though. sage > > > > > > > > > + > > > > + /* pool namespace (version 8) */ > > > > + ceph_encode_32(&p, 0); > > > > + > > > > > > I'm a little unclear on how the above should be set, but I'll look over > > > the userland code and ape what it does. > > > > pool namespace is useless for client->mds cap message, set its length > > to 0 should be OK. > > > > Thanks. I went ahead and added a comment to that effect in the updated > set I'm testing now. 
> > > > > > > > > > > > > > > + /* btime, change_attr, sync (version 9) */ > > > > + ceph_encode_timespec(p, &arg->btime); > > > > + p += sizeof(struct ceph_timespec); > > > > + ceph_encode_64(&p, arg->change_attr); > > > > + ceph_encode_8(&p, 0); > > > > + > > > > ceph_con_send(&arg->session->s_con, msg); > > > > return 0; > > > > } > > > > @@ -1189,9 +1202,11 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, > > > > arg.xattr_buf = NULL; > > > > } > > > > > > > > + arg.change_attr = inode->i_version; > > > > arg.mtime = inode->i_mtime; > > > > arg.atime = inode->i_atime; > > > > arg.ctime = inode->i_ctime; > > > > + arg.btime = ci->i_btime; > > > > > > > > arg.op = op; > > > > arg.caps = cap->implemented; > > > > @@ -1241,10 +1256,12 @@ static inline int __send_flush_snap(struct inode *inode, > > > > arg.max_size = 0; > > > > arg.xattr_version = capsnap->xattr_version; > > > > arg.xattr_buf = capsnap->xattr_blob; > > > > + arg.change_attr = capsnap->change_attr; > > > > > > > > arg.atime = capsnap->atime; > > > > arg.mtime = capsnap->mtime; > > > > arg.ctime = capsnap->ctime; > > > > + arg.btime = capsnap->btime; > > > > > > > > arg.op = CEPH_CAP_OP_FLUSHSNAP; > > > > arg.caps = capsnap->issued; > > > > > > -- > > > Jeff Layton <jlayton@redhat.com> > > > -- > > > To unsubscribe from this list: send the line "unsubscribe ceph-devel" in > > > the body of a message to majordomo@vger.kernel.org > > > More majordomo info at http://vger.kernel.org/majordomo-info.html > > -- > Jeff Layton <jlayton@redhat.com> > > -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, 2016-11-07 at 14:05 +0000, Sage Weil wrote: > On Mon, 7 Nov 2016, Jeff Layton wrote: > > On Mon, 2016-11-07 at 16:43 +0800, Yan, Zheng wrote: > > > On Fri, Nov 4, 2016 at 8:57 PM, Jeff Layton <jlayton@redhat.com> wrote: > > > > > > > > On Fri, 2016-11-04 at 07:34 -0400, Jeff Layton wrote: > > > > > > > > > > The userland ceph has MClientCaps at struct version 9. This brings the > > > > > kernel up the same version. > > > > > > > > > > With this change, we have to start tracking the btime and change_attr, > > > > > so that the client can pass back sane values in cap messages. The > > > > > client doesn't care about the btime at all, so this is just passed > > > > > around, but the change_attr is used when ceph is exported via NFS. > > > > > > > > > > For now, the new "sync" parm is left at 0, to preserve the existing > > > > > behavior of the client. > > > > > > > > > > Signed-off-by: Jeff Layton <jlayton@redhat.com> > > > > > --- > > > > > fs/ceph/caps.c | 33 +++++++++++++++++++++++++-------- > > > > > 1 file changed, 25 insertions(+), 8 deletions(-) > > > > > > > > > > diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c > > > > > index 6e99866b1946..452f5024589f 100644 > > > > > --- a/fs/ceph/caps.c > > > > > +++ b/fs/ceph/caps.c > > > > > @@ -991,9 +991,9 @@ struct cap_msg_args { > > > > > struct ceph_mds_session *session; > > > > > u64 ino, cid, follows; > > > > > u64 flush_tid, oldest_flush_tid, size, max_size; > > > > > - u64 xattr_version; > > > > > + u64 xattr_version, change_attr; > > > > > struct ceph_buffer *xattr_buf; > > > > > - struct timespec atime, mtime, ctime; > > > > > + struct timespec atime, mtime, ctime, btime; > > > > > int op, caps, wanted, dirty; > > > > > u32 seq, issue_seq, mseq, time_warp_seq; > > > > > kuid_t uid; > > > > > @@ -1026,13 +1026,13 @@ static int send_cap_msg(struct cap_msg_args *arg) > > > > > > > > > > /* flock buffer size + inline version + inline data size + > > > > > * osd_epoch_barrier + oldest_flush_tid */ > > > > > - 
extra_len = 4 + 8 + 4 + 4 + 8; > > > > > + extra_len = 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 1; > > > > > msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc) + extra_len, > > > > > GFP_NOFS, false); > > > > > if (!msg) > > > > > return -ENOMEM; > > > > > > > > > > - msg->hdr.version = cpu_to_le16(6); > > > > > + msg->hdr.version = cpu_to_le16(9); > > > > > msg->hdr.tid = cpu_to_le64(arg->flush_tid); > > > > > > > > > > fc = msg->front.iov_base; > > > > > @@ -1068,17 +1068,30 @@ static int send_cap_msg(struct cap_msg_args *arg) > > > > > } > > > > > > > > > > p = fc + 1; > > > > > - /* flock buffer size */ > > > > > + /* flock buffer size (version 2) */ > > > > > ceph_encode_32(&p, 0); > > > > > - /* inline version */ > > > > > + /* inline version (version 4) */ > > > > > ceph_encode_64(&p, arg->inline_data ? 0 : CEPH_INLINE_NONE); > > > > > /* inline data size */ > > > > > ceph_encode_32(&p, 0); > > > > > - /* osd_epoch_barrier */ > > > > > + /* osd_epoch_barrier (version 5) */ > > > > > ceph_encode_32(&p, 0); > > > > > - /* oldest_flush_tid */ > > > > > + /* oldest_flush_tid (version 6) */ > > > > > ceph_encode_64(&p, arg->oldest_flush_tid); > > > > > > > > > > + /* caller_uid/caller_gid (version 7) */ > > > > > + ceph_encode_32(&p, (u32)-1); > > > > > + ceph_encode_32(&p, (u32)-1); > > > > > > > > A bit of self-review... > > > > > > > > Not sure if we want to set the above to something else -- maybe 0 or to > > > > current's creds? That may not always make sense though (during e.g. > > > > writeback). > > > > > > > > Looking further, I'm not quite sure I understand why we send creds at > > all in cap messages. Can you clarify where that matters? > > > > The way I look at it, would be to consider caps to be something like a > > more granular NFS delegation or SMB oplock. > > > > In that light, a cap flush is just the client sending updated attrs for > > the exclusive caps that it has already been granted. 
Is there a > > situation where we would ever want to refuse that update? > > A chmod or chown can be done locally if you have excl caps and flushed > back to the MDS via a caps message. We need to verify the user has > permission to make the change. > My take is that once the MDS has delegated Ax to the client, then it's effectively trusting the client to handle permissions enforcement correctly. I don't see why we should second guess that. > > Note that nothing ever checks the return code for _do_cap_update in the > > userland code. If the permissions check fails, then we'll end up > > silently dropping the updated attrs on the floor. > > Yeah. This was mainly for expediency... the protocol assumes that flushes > don't fail. Given that the client does it's own permissions check, I > think the way to improve this is to have it prevent the flush in the first > place, so that it's only nefarious clients that are effected (and who > cares if they get confused). I don't think we have a particularly good > way to tell the client it can't, say, sudo chmod 0:0 a file, though. > Sorry, I don't quite follow. How would we prevent the flush from a nefarious client (which is not something we can really control)? In any case...ISTM that the permissions check in _do_cap_update ought to be replaced by a cephx key check. IOW, what we really want to know is whether the client is truly the one to which we delegated the caps. If so, then we sort of have to trust that it's doing the right thing with respect to permissions checking here. Does that make sense? > > > > > > > > > > > > > + > > > > > + /* pool namespace (version 8) */ > > > > > + ceph_encode_32(&p, 0); > > > > > + > > > > > > > > I'm a little unclear on how the above should be set, but I'll look over > > > > the userland code and ape what it does. > > > > > > pool namespace is useless for client->mds cap message, set its length > > > to 0 should be OK. > > > > > > > Thanks. 
I went ahead and added a comment to that effect in the updated > > set I'm testing now. > > > > > > > > > > > > > > > > > > > > + /* btime, change_attr, sync (version 9) */ > > > > > + ceph_encode_timespec(p, &arg->btime); > > > > > + p += sizeof(struct ceph_timespec); > > > > > + ceph_encode_64(&p, arg->change_attr); > > > > > + ceph_encode_8(&p, 0); > > > > > + > > > > > ceph_con_send(&arg->session->s_con, msg); > > > > > return 0; > > > > > } > > > > > @@ -1189,9 +1202,11 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, > > > > > arg.xattr_buf = NULL; > > > > > } > > > > > > > > > > + arg.change_attr = inode->i_version; > > > > > arg.mtime = inode->i_mtime; > > > > > arg.atime = inode->i_atime; > > > > > arg.ctime = inode->i_ctime; > > > > > + arg.btime = ci->i_btime; > > > > > > > > > > arg.op = op; > > > > > arg.caps = cap->implemented; > > > > > @@ -1241,10 +1256,12 @@ static inline int __send_flush_snap(struct inode *inode, > > > > > arg.max_size = 0; > > > > > arg.xattr_version = capsnap->xattr_version; > > > > > arg.xattr_buf = capsnap->xattr_blob; > > > > > + arg.change_attr = capsnap->change_attr; > > > > > > > > > > arg.atime = capsnap->atime; > > > > > arg.mtime = capsnap->mtime; > > > > > arg.ctime = capsnap->ctime; > > > > > + arg.btime = capsnap->btime; > > > > > > > > > > arg.op = CEPH_CAP_OP_FLUSHSNAP; > > > > > arg.caps = capsnap->issued; > > > > > > > > -- > > > > Jeff Layton <jlayton@redhat.com> > > > > -- > > > > To unsubscribe from this list: send the line "unsubscribe ceph-devel" in > > > > the body of a message to majordomo@vger.kernel.org > > > > More majordomo info at http://vger.kernel.org/majordomo-info.html > > > > -- > > Jeff Layton <jlayton@redhat.com> > > > >
On Mon, 7 Nov 2016, Jeff Layton wrote: > On Mon, 2016-11-07 at 14:05 +0000, Sage Weil wrote: > > On Mon, 7 Nov 2016, Jeff Layton wrote: > > > On Mon, 2016-11-07 at 16:43 +0800, Yan, Zheng wrote: > > > > On Fri, Nov 4, 2016 at 8:57 PM, Jeff Layton <jlayton@redhat.com> wrote: > > > > > > > > > > On Fri, 2016-11-04 at 07:34 -0400, Jeff Layton wrote: > > > > > > > > > > > > The userland ceph has MClientCaps at struct version 9. This brings the > > > > > > kernel up the same version. > > > > > > > > > > > > With this change, we have to start tracking the btime and change_attr, > > > > > > so that the client can pass back sane values in cap messages. The > > > > > > client doesn't care about the btime at all, so this is just passed > > > > > > around, but the change_attr is used when ceph is exported via NFS. > > > > > > > > > > > > For now, the new "sync" parm is left at 0, to preserve the existing > > > > > > behavior of the client. > > > > > > > > > > > > Signed-off-by: Jeff Layton <jlayton@redhat.com> > > > > > > --- > > > > > > fs/ceph/caps.c | 33 +++++++++++++++++++++++++-------- > > > > > > 1 file changed, 25 insertions(+), 8 deletions(-) > > > > > > > > > > > > diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c > > > > > > index 6e99866b1946..452f5024589f 100644 > > > > > > --- a/fs/ceph/caps.c > > > > > > +++ b/fs/ceph/caps.c > > > > > > @@ -991,9 +991,9 @@ struct cap_msg_args { > > > > > > struct ceph_mds_session *session; > > > > > > u64 ino, cid, follows; > > > > > > u64 flush_tid, oldest_flush_tid, size, max_size; > > > > > > - u64 xattr_version; > > > > > > + u64 xattr_version, change_attr; > > > > > > struct ceph_buffer *xattr_buf; > > > > > > - struct timespec atime, mtime, ctime; > > > > > > + struct timespec atime, mtime, ctime, btime; > > > > > > int op, caps, wanted, dirty; > > > > > > u32 seq, issue_seq, mseq, time_warp_seq; > > > > > > kuid_t uid; > > > > > > @@ -1026,13 +1026,13 @@ static int send_cap_msg(struct cap_msg_args *arg) > > > > > > > > > > > > 
/* flock buffer size + inline version + inline data size + > > > > > > * osd_epoch_barrier + oldest_flush_tid */ > > > > > > - extra_len = 4 + 8 + 4 + 4 + 8; > > > > > > + extra_len = 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 1; > > > > > > msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc) + extra_len, > > > > > > GFP_NOFS, false); > > > > > > if (!msg) > > > > > > return -ENOMEM; > > > > > > > > > > > > - msg->hdr.version = cpu_to_le16(6); > > > > > > + msg->hdr.version = cpu_to_le16(9); > > > > > > msg->hdr.tid = cpu_to_le64(arg->flush_tid); > > > > > > > > > > > > fc = msg->front.iov_base; > > > > > > @@ -1068,17 +1068,30 @@ static int send_cap_msg(struct cap_msg_args *arg) > > > > > > } > > > > > > > > > > > > p = fc + 1; > > > > > > - /* flock buffer size */ > > > > > > + /* flock buffer size (version 2) */ > > > > > > ceph_encode_32(&p, 0); > > > > > > - /* inline version */ > > > > > > + /* inline version (version 4) */ > > > > > > ceph_encode_64(&p, arg->inline_data ? 0 : CEPH_INLINE_NONE); > > > > > > /* inline data size */ > > > > > > ceph_encode_32(&p, 0); > > > > > > - /* osd_epoch_barrier */ > > > > > > + /* osd_epoch_barrier (version 5) */ > > > > > > ceph_encode_32(&p, 0); > > > > > > - /* oldest_flush_tid */ > > > > > > + /* oldest_flush_tid (version 6) */ > > > > > > ceph_encode_64(&p, arg->oldest_flush_tid); > > > > > > > > > > > > + /* caller_uid/caller_gid (version 7) */ > > > > > > + ceph_encode_32(&p, (u32)-1); > > > > > > + ceph_encode_32(&p, (u32)-1); > > > > > > > > > > A bit of self-review... > > > > > > > > > > Not sure if we want to set the above to something else -- maybe 0 or to > > > > > current's creds? That may not always make sense though (during e.g. > > > > > writeback). > > > > > > > > > > > Looking further, I'm not quite sure I understand why we send creds at > > > all in cap messages. Can you clarify where that matters? 
> > > > > > The way I look at it, would be to consider caps to be something like a > > > more granular NFS delegation or SMB oplock. > > > > > > In that light, a cap flush is just the client sending updated attrs for > > > the exclusive caps that it has already been granted. Is there a > > > situation where we would ever want to refuse that update? > > > > A chmod or chown can be done locally if you have excl caps and flushed > > back to the MDS via a caps message. We need to verify the user has > > permission to make the change. > > > > My take is that once the MDS has delegated Ax to the client, then it's > effectively trusting the client to handle permissions enforcement > correctly. I don't see why we should second guess that. > > > > Note that nothing ever checks the return code for _do_cap_update in the > > > userland code. If the permissions check fails, then we'll end up > > > silently dropping the updated attrs on the floor. > > > > Yeah. This was mainly for expediency... the protocol assumes that flushes > > don't fail. Given that the client does it's own permissions check, I > > think the way to improve this is to have it prevent the flush in the first > > place, so that it's only nefarious clients that are effected (and who > > cares if they get confused). I don't think we have a particularly good > > way to tell the client it can't, say, sudo chmod 0:0 a file, though. > > > > Sorry, I don't quite follow. How would we prevent the flush from a > nefarious client (which is not something we can really control)? > > In any case...ISTM that the permissions check in _do_cap_update ought to > be replaced by a cephx key check. IOW, what we really want to know is > whether the client is truly the one to which we delegated the caps. If > so, then we sort of have to trust that it's doing the right thing with > respect to permissions checking here. The capability can say "you are allowed to be uid 1000 or uid 1020." 
We want to delegate the EXCL caps to the client so that a create + chmod + chown + write can all happen efficiently, but we still need to ensure that the values they set are legal (a permitted uid/gid combo). A common example would be user workstations that are allowed access to /home/user and restricted via their mds caps to their uid/gid. We need to prevent them from doing a 'sudo chown 0:0 foo'... sage > > Does that make sense? > > > > > > > > > > > > > > > > > > + > > > > > > + /* pool namespace (version 8) */ > > > > > > + ceph_encode_32(&p, 0); > > > > > > + > > > > > > > > > > I'm a little unclear on how the above should be set, but I'll look over > > > > > the userland code and ape what it does. > > > > > > > > pool namespace is useless for client->mds cap message, set its length > > > > to 0 should be OK. > > > > > > > > > > Thanks. I went ahead and added a comment to that effect in the updated > > > set I'm testing now. > > > > > > > > > > > > > > > > > > > > > > > > > + /* btime, change_attr, sync (version 9) */ > > > > > > + ceph_encode_timespec(p, &arg->btime); > > > > > > + p += sizeof(struct ceph_timespec); > > > > > > + ceph_encode_64(&p, arg->change_attr); > > > > > > + ceph_encode_8(&p, 0); > > > > > > + > > > > > > ceph_con_send(&arg->session->s_con, msg); > > > > > > return 0; > > > > > > } > > > > > > @@ -1189,9 +1202,11 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, > > > > > > arg.xattr_buf = NULL; > > > > > > } > > > > > > > > > > > > + arg.change_attr = inode->i_version; > > > > > > arg.mtime = inode->i_mtime; > > > > > > arg.atime = inode->i_atime; > > > > > > arg.ctime = inode->i_ctime; > > > > > > + arg.btime = ci->i_btime; > > > > > > > > > > > > arg.op = op; > > > > > > arg.caps = cap->implemented; > > > > > > @@ -1241,10 +1256,12 @@ static inline int __send_flush_snap(struct inode *inode, > > > > > > arg.max_size = 0; > > > > > > arg.xattr_version = capsnap->xattr_version; > > > > > > arg.xattr_buf = 
capsnap->xattr_blob; > > > > > > + arg.change_attr = capsnap->change_attr; > > > > > > > > > > > > arg.atime = capsnap->atime; > > > > > > arg.mtime = capsnap->mtime; > > > > > > arg.ctime = capsnap->ctime; > > > > > > + arg.btime = capsnap->btime; > > > > > > > > > > > > arg.op = CEPH_CAP_OP_FLUSHSNAP; > > > > > > arg.caps = capsnap->issued; > > > > > > > > > > -- > > > > > Jeff Layton <jlayton@redhat.com> > > > > > -- > > > > > To unsubscribe from this list: send the line "unsubscribe ceph-devel" in > > > > > the body of a message to majordomo@vger.kernel.org > > > > > More majordomo info at http://vger.kernel.org/majordomo-info.html > > > > > > -- > > > Jeff Layton <jlayton@redhat.com> > > > > > > > > -- > Jeff Layton <jlayton@redhat.com> > -- > To unsubscribe from this list: send the line "unsubscribe ceph-devel" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html > >
On Mon, 2016-11-07 at 14:36 +0000, Sage Weil wrote: > On Mon, 7 Nov 2016, Jeff Layton wrote: > > On Mon, 2016-11-07 at 14:05 +0000, Sage Weil wrote: > > > On Mon, 7 Nov 2016, Jeff Layton wrote: > > > > On Mon, 2016-11-07 at 16:43 +0800, Yan, Zheng wrote: > > > > > On Fri, Nov 4, 2016 at 8:57 PM, Jeff Layton <jlayton@redhat.com> wrote: > > > > > > > > > > > > On Fri, 2016-11-04 at 07:34 -0400, Jeff Layton wrote: > > > > > > > > > > > > > > The userland ceph has MClientCaps at struct version 9. This brings the > > > > > > > kernel up the same version. > > > > > > > > > > > > > > With this change, we have to start tracking the btime and change_attr, > > > > > > > so that the client can pass back sane values in cap messages. The > > > > > > > client doesn't care about the btime at all, so this is just passed > > > > > > > around, but the change_attr is used when ceph is exported via NFS. > > > > > > > > > > > > > > For now, the new "sync" parm is left at 0, to preserve the existing > > > > > > > behavior of the client. 
> > > > > > > > > > > > > > Signed-off-by: Jeff Layton <jlayton@redhat.com> > > > > > > > --- > > > > > > > fs/ceph/caps.c | 33 +++++++++++++++++++++++++-------- > > > > > > > 1 file changed, 25 insertions(+), 8 deletions(-) > > > > > > > > > > > > > > diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c > > > > > > > index 6e99866b1946..452f5024589f 100644 > > > > > > > --- a/fs/ceph/caps.c > > > > > > > +++ b/fs/ceph/caps.c > > > > > > > @@ -991,9 +991,9 @@ struct cap_msg_args { > > > > > > > struct ceph_mds_session *session; > > > > > > > u64 ino, cid, follows; > > > > > > > u64 flush_tid, oldest_flush_tid, size, max_size; > > > > > > > - u64 xattr_version; > > > > > > > + u64 xattr_version, change_attr; > > > > > > > struct ceph_buffer *xattr_buf; > > > > > > > - struct timespec atime, mtime, ctime; > > > > > > > + struct timespec atime, mtime, ctime, btime; > > > > > > > int op, caps, wanted, dirty; > > > > > > > u32 seq, issue_seq, mseq, time_warp_seq; > > > > > > > kuid_t uid; > > > > > > > @@ -1026,13 +1026,13 @@ static int send_cap_msg(struct cap_msg_args *arg) > > > > > > > > > > > > > > /* flock buffer size + inline version + inline data size + > > > > > > > * osd_epoch_barrier + oldest_flush_tid */ > > > > > > > - extra_len = 4 + 8 + 4 + 4 + 8; > > > > > > > + extra_len = 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 1; > > > > > > > msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc) + extra_len, > > > > > > > GFP_NOFS, false); > > > > > > > if (!msg) > > > > > > > return -ENOMEM; > > > > > > > > > > > > > > - msg->hdr.version = cpu_to_le16(6); > > > > > > > + msg->hdr.version = cpu_to_le16(9); > > > > > > > msg->hdr.tid = cpu_to_le64(arg->flush_tid); > > > > > > > > > > > > > > fc = msg->front.iov_base; > > > > > > > @@ -1068,17 +1068,30 @@ static int send_cap_msg(struct cap_msg_args *arg) > > > > > > > } > > > > > > > > > > > > > > p = fc + 1; > > > > > > > - /* flock buffer size */ > > > > > > > + /* flock buffer size (version 2) */ > > > > > > > 
ceph_encode_32(&p, 0); > > > > > > > - /* inline version */ > > > > > > > + /* inline version (version 4) */ > > > > > > > ceph_encode_64(&p, arg->inline_data ? 0 : CEPH_INLINE_NONE); > > > > > > > /* inline data size */ > > > > > > > ceph_encode_32(&p, 0); > > > > > > > - /* osd_epoch_barrier */ > > > > > > > + /* osd_epoch_barrier (version 5) */ > > > > > > > ceph_encode_32(&p, 0); > > > > > > > - /* oldest_flush_tid */ > > > > > > > + /* oldest_flush_tid (version 6) */ > > > > > > > ceph_encode_64(&p, arg->oldest_flush_tid); > > > > > > > > > > > > > > + /* caller_uid/caller_gid (version 7) */ > > > > > > > + ceph_encode_32(&p, (u32)-1); > > > > > > > + ceph_encode_32(&p, (u32)-1); > > > > > > > > > > > > A bit of self-review... > > > > > > > > > > > > Not sure if we want to set the above to something else -- maybe 0 or to > > > > > > current's creds? That may not always make sense though (during e.g. > > > > > > writeback). > > > > > > > > > > > > > > Looking further, I'm not quite sure I understand why we send creds at > > > > all in cap messages. Can you clarify where that matters? > > > > > > > > The way I look at it, would be to consider caps to be something like a > > > > more granular NFS delegation or SMB oplock. > > > > > > > > In that light, a cap flush is just the client sending updated attrs for > > > > the exclusive caps that it has already been granted. Is there a > > > > situation where we would ever want to refuse that update? > > > > > > A chmod or chown can be done locally if you have excl caps and flushed > > > back to the MDS via a caps message. We need to verify the user has > > > permission to make the change. > > > > > > > My take is that once the MDS has delegated Ax to the client, then it's > > effectively trusting the client to handle permissions enforcement > > correctly. I don't see why we should second guess that. > > > > > > Note that nothing ever checks the return code for _do_cap_update in the > > > > userland code. 
If the permissions check fails, then we'll end up > > > > silently dropping the updated attrs on the floor. > > > > > > Yeah. This was mainly for expediency... the protocol assumes that flushes > > > don't fail. Given that the client does it's own permissions check, I > > > think the way to improve this is to have it prevent the flush in the first > > > place, so that it's only nefarious clients that are effected (and who > > > cares if they get confused). I don't think we have a particularly good > > > way to tell the client it can't, say, sudo chmod 0:0 a file, though. > > > > > > > Sorry, I don't quite follow. How would we prevent the flush from a > > nefarious client (which is not something we can really control)? > > > > In any case...ISTM that the permissions check in _do_cap_update ought to > > be replaced by a cephx key check. IOW, what we really want to know is > > whether the client is truly the one to which we delegated the caps. If > > so, then we sort of have to trust that it's doing the right thing with > > respect to permissions checking here. > > The capability can say "you are allowed to be uid 1000 or uid 1020." We > want to delegate the EXCL caps to the client so that a create + chmod + > chown + write can all happen efficiently, but we still need to ensure that > the values they set are legal (a permitted uid/gid combo). > > A common example would be user workstations that are allowed access to > /home/user and restricted via their mds caps to their uid/gid. We need to > prevent them from doing a 'sudo chown 0:0 foo'... > > On what basis do you make such a decision though? For instance, NFS does root-squashing which is (generally) a per-export+per-client thing. It sounds like you're saying that ceph has different semantics here? 
(cc'ing Greg here) Also, chown (at least under POSIX) is reserved for superuser only, and now that I look, I think this check in MDSAuthCaps::is_capable may be wrong: // chown/chgrp if (mask & MAY_CHOWN) { if (new_uid != caller_uid || // you can't chown to someone else inode_uid != caller_uid) { // you can't chown from someone else continue; } } Shouldn't this just be a check for whether the caller_uid is 0 (or whatever the correct check for the equivalent to the kernel's CAP_CHOWN would be)?
On Mon, 7 Nov 2016, Jeff Layton wrote: > On Mon, 2016-11-07 at 14:36 +0000, Sage Weil wrote: > > On Mon, 7 Nov 2016, Jeff Layton wrote: > > > On Mon, 2016-11-07 at 14:05 +0000, Sage Weil wrote: > > > > On Mon, 7 Nov 2016, Jeff Layton wrote: > > > > > On Mon, 2016-11-07 at 16:43 +0800, Yan, Zheng wrote: > > > > > > On Fri, Nov 4, 2016 at 8:57 PM, Jeff Layton <jlayton@redhat.com> wrote: > > > > > > > > > > > > > > On Fri, 2016-11-04 at 07:34 -0400, Jeff Layton wrote: > > > > > > > > > > > > > > > > The userland ceph has MClientCaps at struct version 9. This brings the > > > > > > > > kernel up the same version. > > > > > > > > > > > > > > > > With this change, we have to start tracking the btime and change_attr, > > > > > > > > so that the client can pass back sane values in cap messages. The > > > > > > > > client doesn't care about the btime at all, so this is just passed > > > > > > > > around, but the change_attr is used when ceph is exported via NFS. > > > > > > > > > > > > > > > > For now, the new "sync" parm is left at 0, to preserve the existing > > > > > > > > behavior of the client. 
> > > > > > > > > > > > > > > > Signed-off-by: Jeff Layton <jlayton@redhat.com> > > > > > > > > --- > > > > > > > > fs/ceph/caps.c | 33 +++++++++++++++++++++++++-------- > > > > > > > > 1 file changed, 25 insertions(+), 8 deletions(-) > > > > > > > > > > > > > > > > diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c > > > > > > > > index 6e99866b1946..452f5024589f 100644 > > > > > > > > --- a/fs/ceph/caps.c > > > > > > > > +++ b/fs/ceph/caps.c > > > > > > > > @@ -991,9 +991,9 @@ struct cap_msg_args { > > > > > > > > struct ceph_mds_session *session; > > > > > > > > u64 ino, cid, follows; > > > > > > > > u64 flush_tid, oldest_flush_tid, size, max_size; > > > > > > > > - u64 xattr_version; > > > > > > > > + u64 xattr_version, change_attr; > > > > > > > > struct ceph_buffer *xattr_buf; > > > > > > > > - struct timespec atime, mtime, ctime; > > > > > > > > + struct timespec atime, mtime, ctime, btime; > > > > > > > > int op, caps, wanted, dirty; > > > > > > > > u32 seq, issue_seq, mseq, time_warp_seq; > > > > > > > > kuid_t uid; > > > > > > > > @@ -1026,13 +1026,13 @@ static int send_cap_msg(struct cap_msg_args *arg) > > > > > > > > > > > > > > > > /* flock buffer size + inline version + inline data size + > > > > > > > > * osd_epoch_barrier + oldest_flush_tid */ > > > > > > > > - extra_len = 4 + 8 + 4 + 4 + 8; > > > > > > > > + extra_len = 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 1; > > > > > > > > msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc) + extra_len, > > > > > > > > GFP_NOFS, false); > > > > > > > > if (!msg) > > > > > > > > return -ENOMEM; > > > > > > > > > > > > > > > > - msg->hdr.version = cpu_to_le16(6); > > > > > > > > + msg->hdr.version = cpu_to_le16(9); > > > > > > > > msg->hdr.tid = cpu_to_le64(arg->flush_tid); > > > > > > > > > > > > > > > > fc = msg->front.iov_base; > > > > > > > > @@ -1068,17 +1068,30 @@ static int send_cap_msg(struct cap_msg_args *arg) > > > > > > > > } > > > > > > > > > > > > > > > > p = fc + 1; > > > > > > > > - /* flock buffer 
size */ > > > > > > > > + /* flock buffer size (version 2) */ > > > > > > > > ceph_encode_32(&p, 0); > > > > > > > > - /* inline version */ > > > > > > > > + /* inline version (version 4) */ > > > > > > > > ceph_encode_64(&p, arg->inline_data ? 0 : CEPH_INLINE_NONE); > > > > > > > > /* inline data size */ > > > > > > > > ceph_encode_32(&p, 0); > > > > > > > > - /* osd_epoch_barrier */ > > > > > > > > + /* osd_epoch_barrier (version 5) */ > > > > > > > > ceph_encode_32(&p, 0); > > > > > > > > - /* oldest_flush_tid */ > > > > > > > > + /* oldest_flush_tid (version 6) */ > > > > > > > > ceph_encode_64(&p, arg->oldest_flush_tid); > > > > > > > > > > > > > > > > + /* caller_uid/caller_gid (version 7) */ > > > > > > > > + ceph_encode_32(&p, (u32)-1); > > > > > > > > + ceph_encode_32(&p, (u32)-1); > > > > > > > > > > > > > > A bit of self-review... > > > > > > > > > > > > > > Not sure if we want to set the above to something else -- maybe 0 or to > > > > > > > current's creds? That may not always make sense though (during e.g. > > > > > > > writeback). > > > > > > > > > > > > > > > > > Looking further, I'm not quite sure I understand why we send creds at > > > > > all in cap messages. Can you clarify where that matters? > > > > > > > > > > The way I look at it, would be to consider caps to be something like a > > > > > more granular NFS delegation or SMB oplock. > > > > > > > > > > In that light, a cap flush is just the client sending updated attrs for > > > > > the exclusive caps that it has already been granted. Is there a > > > > > situation where we would ever want to refuse that update? > > > > > > > > A chmod or chown can be done locally if you have excl caps and flushed > > > > back to the MDS via a caps message. We need to verify the user has > > > > permission to make the change. > > > > > > > > > > My take is that once the MDS has delegated Ax to the client, then it's > > > effectively trusting the client to handle permissions enforcement > > > correctly. 
I don't see why we should second guess that. > > > > > > > > Note that nothing ever checks the return code for _do_cap_update in the > > > > > userland code. If the permissions check fails, then we'll end up > > > > > silently dropping the updated attrs on the floor. > > > > > > > > Yeah. This was mainly for expediency... the protocol assumes that flushes > > > > don't fail. Given that the client does it's own permissions check, I > > > > think the way to improve this is to have it prevent the flush in the first > > > > place, so that it's only nefarious clients that are effected (and who > > > > cares if they get confused). I don't think we have a particularly good > > > > way to tell the client it can't, say, sudo chmod 0:0 a file, though. > > > > > > > > > > Sorry, I don't quite follow. How would we prevent the flush from a > > > nefarious client (which is not something we can really control)? > > > > > > In any case...ISTM that the permissions check in _do_cap_update ought to > > > be replaced by a cephx key check. IOW, what we really want to know is > > > whether the client is truly the one to which we delegated the caps. If > > > so, then we sort of have to trust that it's doing the right thing with > > > respect to permissions checking here. > > > > The capability can say "you are allowed to be uid 1000 or uid 1020." We > > want to delegate the EXCL caps to the client so that a create + chmod + > > chown + write can all happen efficiently, but we still need to ensure that > > the values they set are legal (a permitted uid/gid combo). > > > > A common example would be user workstations that are allowed access to > > /home/user and restricted via their mds caps to their uid/gid. We need to > > prevent them from doing a 'sudo chown 0:0 foo'... > > > > > > > On what basis do you make such a decision though? For instance, NFS does > root-squashing which is (generally) a per-export+per-client thing. 
> It sounds like you're saying that ceph has different semantics here? I don't remember all the specifics :(, but I do remember a long discussion about whether root_squash made sense and ultimately deciding that the semantics were weak enough to not bother implementing. Instead, we explicitly enumerate the uids/gids that are allowed and just match against that. > (cc'ing Greg here) > > Also, chown (at least under POSIX) is reserved for superuser only, and > now that I look, I think this check in MDSAuthCaps::is_capable may be > wrong: > > // chown/chgrp > if (mask & MAY_CHOWN) { > if (new_uid != caller_uid || // you can't chown to someone else > inode_uid != caller_uid) { // you can't chown from someone else > continue; > } > } > > Shouldn't this just be a check for whether the caller_uid is 0 (or > whatever the correct check for the equivalent to the kernel's CAP_CHOWN > would be)? In the single-user case root_squash would be sufficient, provided that we also somehow ensured that whatever uid the new file was assigned was set to the "correct" value. It would have been a kludgey and/or limited solution, though. And that only addresses the UID portion... Instead, we validate whatever values come back against what is in the cap (uid *and* gid) to ensure it is allowed. Note that the capability can take a list of gids, e.g., "allow rw path=/foo uid=1 gids=1,2,3" This is also laying some of the groundwork for the future world in which the client *host* gets a capability saying something like "I trust you to be an honest broker of kerberos identities" and to pass per-user tickets to the MDS as users come and go from the host, such that a given request will be validated against some specific user's session (kerberos identity). 
Of course, in such a situation a malicious host could allow one user to impersonate some other user that is also logged into the host, but that is still a much stronger model than simply trusting the host/client completely (ala AUTH_UNIX in NFS-land). The idea is that eventually the check_caps() would validate against a dynamic "user" session as well as the client session capability which would have per-user context pulled from kerberos or LDAP or whatever. In any case, the core issue is that it wouldn't be sufficient to, say, refuse to issue EXCL caps on a file to the client unless we are prepared to trust any values they send back. Or I guess it would, but it would rule out lots of common scenarios where there are huge benefits (performance-wise) to issuing the EXCL caps. That's why we have to validate the contents of the metadata in the cap flush messages... Does that make sense? sage
On Mon, Nov 7, 2016 at 10:39 AM, Jeff Layton <jlayton@redhat.com> wrote: > On Mon, 2016-11-07 at 14:36 +0000, Sage Weil wrote: >> On Mon, 7 Nov 2016, Jeff Layton wrote: >> > On Mon, 2016-11-07 at 14:05 +0000, Sage Weil wrote: >> > > On Mon, 7 Nov 2016, Jeff Layton wrote: >> > > > On Mon, 2016-11-07 at 16:43 +0800, Yan, Zheng wrote: >> > > > > On Fri, Nov 4, 2016 at 8:57 PM, Jeff Layton <jlayton@redhat.com> wrote: >> > > > > > >> > > > > > On Fri, 2016-11-04 at 07:34 -0400, Jeff Layton wrote: >> > > > > > > >> > > > > > > The userland ceph has MClientCaps at struct version 9. This brings the >> > > > > > > kernel up the same version. >> > > > > > > >> > > > > > > With this change, we have to start tracking the btime and change_attr, >> > > > > > > so that the client can pass back sane values in cap messages. The >> > > > > > > client doesn't care about the btime at all, so this is just passed >> > > > > > > around, but the change_attr is used when ceph is exported via NFS. >> > > > > > > >> > > > > > > For now, the new "sync" parm is left at 0, to preserve the existing >> > > > > > > behavior of the client. 
>> > > > > > > >> > > > > > > Signed-off-by: Jeff Layton <jlayton@redhat.com> >> > > > > > > --- >> > > > > > > fs/ceph/caps.c | 33 +++++++++++++++++++++++++-------- >> > > > > > > 1 file changed, 25 insertions(+), 8 deletions(-) >> > > > > > > >> > > > > > > diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c >> > > > > > > index 6e99866b1946..452f5024589f 100644 >> > > > > > > --- a/fs/ceph/caps.c >> > > > > > > +++ b/fs/ceph/caps.c >> > > > > > > @@ -991,9 +991,9 @@ struct cap_msg_args { >> > > > > > > struct ceph_mds_session *session; >> > > > > > > u64 ino, cid, follows; >> > > > > > > u64 flush_tid, oldest_flush_tid, size, max_size; >> > > > > > > - u64 xattr_version; >> > > > > > > + u64 xattr_version, change_attr; >> > > > > > > struct ceph_buffer *xattr_buf; >> > > > > > > - struct timespec atime, mtime, ctime; >> > > > > > > + struct timespec atime, mtime, ctime, btime; >> > > > > > > int op, caps, wanted, dirty; >> > > > > > > u32 seq, issue_seq, mseq, time_warp_seq; >> > > > > > > kuid_t uid; >> > > > > > > @@ -1026,13 +1026,13 @@ static int send_cap_msg(struct cap_msg_args *arg) >> > > > > > > >> > > > > > > /* flock buffer size + inline version + inline data size + >> > > > > > > * osd_epoch_barrier + oldest_flush_tid */ >> > > > > > > - extra_len = 4 + 8 + 4 + 4 + 8; >> > > > > > > + extra_len = 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 1; >> > > > > > > msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc) + extra_len, >> > > > > > > GFP_NOFS, false); >> > > > > > > if (!msg) >> > > > > > > return -ENOMEM; >> > > > > > > >> > > > > > > - msg->hdr.version = cpu_to_le16(6); >> > > > > > > + msg->hdr.version = cpu_to_le16(9); >> > > > > > > msg->hdr.tid = cpu_to_le64(arg->flush_tid); >> > > > > > > >> > > > > > > fc = msg->front.iov_base; >> > > > > > > @@ -1068,17 +1068,30 @@ static int send_cap_msg(struct cap_msg_args *arg) >> > > > > > > } >> > > > > > > >> > > > > > > p = fc + 1; >> > > > > > > - /* flock buffer size */ >> > > > > > > + /* flock buffer size 
(version 2) */ >> > > > > > > ceph_encode_32(&p, 0); >> > > > > > > - /* inline version */ >> > > > > > > + /* inline version (version 4) */ >> > > > > > > ceph_encode_64(&p, arg->inline_data ? 0 : CEPH_INLINE_NONE); >> > > > > > > /* inline data size */ >> > > > > > > ceph_encode_32(&p, 0); >> > > > > > > - /* osd_epoch_barrier */ >> > > > > > > + /* osd_epoch_barrier (version 5) */ >> > > > > > > ceph_encode_32(&p, 0); >> > > > > > > - /* oldest_flush_tid */ >> > > > > > > + /* oldest_flush_tid (version 6) */ >> > > > > > > ceph_encode_64(&p, arg->oldest_flush_tid); >> > > > > > > >> > > > > > > + /* caller_uid/caller_gid (version 7) */ >> > > > > > > + ceph_encode_32(&p, (u32)-1); >> > > > > > > + ceph_encode_32(&p, (u32)-1); >> > > > > > >> > > > > > A bit of self-review... >> > > > > > >> > > > > > Not sure if we want to set the above to something else -- maybe 0 or to >> > > > > > current's creds? That may not always make sense though (during e.g. >> > > > > > writeback). >> > > > > > >> > > > >> > > > Looking further, I'm not quite sure I understand why we send creds at >> > > > all in cap messages. Can you clarify where that matters? >> > > > >> > > > The way I look at it, would be to consider caps to be something like a >> > > > more granular NFS delegation or SMB oplock. >> > > > >> > > > In that light, a cap flush is just the client sending updated attrs for >> > > > the exclusive caps that it has already been granted. Is there a >> > > > situation where we would ever want to refuse that update? >> > > >> > > A chmod or chown can be done locally if you have excl caps and flushed >> > > back to the MDS via a caps message. We need to verify the user has >> > > permission to make the change. >> > > >> > >> > My take is that once the MDS has delegated Ax to the client, then it's >> > effectively trusting the client to handle permissions enforcement >> > correctly. I don't see why we should second guess that. 
>> > >> > > > Note that nothing ever checks the return code for _do_cap_update in the >> > > > userland code. If the permissions check fails, then we'll end up >> > > > silently dropping the updated attrs on the floor. >> > > >> > > Yeah. This was mainly for expediency... the protocol assumes that flushes >> > > don't fail. Given that the client does it's own permissions check, I >> > > think the way to improve this is to have it prevent the flush in the first >> > > place, so that it's only nefarious clients that are effected (and who >> > > cares if they get confused). I don't think we have a particularly good >> > > way to tell the client it can't, say, sudo chmod 0:0 a file, though. >> > > >> > >> > Sorry, I don't quite follow. How would we prevent the flush from a >> > nefarious client (which is not something we can really control)? >> > >> > In any case...ISTM that the permissions check in _do_cap_update ought to >> > be replaced by a cephx key check. IOW, what we really want to know is >> > whether the client is truly the one to which we delegated the caps. If >> > so, then we sort of have to trust that it's doing the right thing with >> > respect to permissions checking here. >> >> The capability can say "you are allowed to be uid 1000 or uid 1020." We >> want to delegate the EXCL caps to the client so that a create + chmod + >> chown + write can all happen efficiently, but we still need to ensure that >> the values they set are legal (a permitted uid/gid combo). >> >> A common example would be user workstations that are allowed access to >> /home/user and restricted via their mds caps to their uid/gid. We need to >> prevent them from doing a 'sudo chown 0:0 foo'... >> >> > > > On what basis do you make such a decision though? For instance, NFS does > root-squashing which is (generally) a per-export+per-client thing. > It sounds like you're saying that ceph has different semantics here? 
> > (cc'ing Greg here) As Sage says, we definitely avoid the root squash semantics. We discussed them last year and concluded they were an inappropriate match for Ceph's permission model. > > Also, chown (at least under POSIX) is reserved for superuser only, and > now that I look, I think this check in MDSAuthCaps::is_capable may be > wrong: > > // chown/chgrp > if (mask & MAY_CHOWN) { > if (new_uid != caller_uid || // you can't chown to someone else > inode_uid != caller_uid) { // you can't chown from someone else > continue; > } > } > > Shouldn't this just be a check for whether the caller_uid is 0 (or > whatever the correct check for the equivalent to the kernel's CAP_CHOWN > would be)? Without context, this does look a little weird — does it allow *any* change, given caller_uid needs to match both new and inode uid? Sort of the common case would be that the admin cap gets hit toward the beginning of the function and just allows it without ever reaching this point. -Greg > > -- > Jeff Layton <jlayton@redhat.com> -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, 7 Nov 2016, Gregory Farnum wrote: > On Mon, Nov 7, 2016 at 10:39 AM, Jeff Layton <jlayton@redhat.com> wrote: > > On Mon, 2016-11-07 at 14:36 +0000, Sage Weil wrote: > >> On Mon, 7 Nov 2016, Jeff Layton wrote: > >> > On Mon, 2016-11-07 at 14:05 +0000, Sage Weil wrote: > >> > > On Mon, 7 Nov 2016, Jeff Layton wrote: > >> > > > On Mon, 2016-11-07 at 16:43 +0800, Yan, Zheng wrote: > >> > > > > On Fri, Nov 4, 2016 at 8:57 PM, Jeff Layton <jlayton@redhat.com> wrote: > >> > > > > > > >> > > > > > On Fri, 2016-11-04 at 07:34 -0400, Jeff Layton wrote: > >> > > > > > > > >> > > > > > > The userland ceph has MClientCaps at struct version 9. This brings the > >> > > > > > > kernel up the same version. > >> > > > > > > > >> > > > > > > With this change, we have to start tracking the btime and change_attr, > >> > > > > > > so that the client can pass back sane values in cap messages. The > >> > > > > > > client doesn't care about the btime at all, so this is just passed > >> > > > > > > around, but the change_attr is used when ceph is exported via NFS. > >> > > > > > > > >> > > > > > > For now, the new "sync" parm is left at 0, to preserve the existing > >> > > > > > > behavior of the client. 
> >> > > > > > > > >> > > > > > > Signed-off-by: Jeff Layton <jlayton@redhat.com> > >> > > > > > > --- > >> > > > > > > fs/ceph/caps.c | 33 +++++++++++++++++++++++++-------- > >> > > > > > > 1 file changed, 25 insertions(+), 8 deletions(-) > >> > > > > > > > >> > > > > > > diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c > >> > > > > > > index 6e99866b1946..452f5024589f 100644 > >> > > > > > > --- a/fs/ceph/caps.c > >> > > > > > > +++ b/fs/ceph/caps.c > >> > > > > > > @@ -991,9 +991,9 @@ struct cap_msg_args { > >> > > > > > > struct ceph_mds_session *session; > >> > > > > > > u64 ino, cid, follows; > >> > > > > > > u64 flush_tid, oldest_flush_tid, size, max_size; > >> > > > > > > - u64 xattr_version; > >> > > > > > > + u64 xattr_version, change_attr; > >> > > > > > > struct ceph_buffer *xattr_buf; > >> > > > > > > - struct timespec atime, mtime, ctime; > >> > > > > > > + struct timespec atime, mtime, ctime, btime; > >> > > > > > > int op, caps, wanted, dirty; > >> > > > > > > u32 seq, issue_seq, mseq, time_warp_seq; > >> > > > > > > kuid_t uid; > >> > > > > > > @@ -1026,13 +1026,13 @@ static int send_cap_msg(struct cap_msg_args *arg) > >> > > > > > > > >> > > > > > > /* flock buffer size + inline version + inline data size + > >> > > > > > > * osd_epoch_barrier + oldest_flush_tid */ > >> > > > > > > - extra_len = 4 + 8 + 4 + 4 + 8; > >> > > > > > > + extra_len = 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 1; > >> > > > > > > msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc) + extra_len, > >> > > > > > > GFP_NOFS, false); > >> > > > > > > if (!msg) > >> > > > > > > return -ENOMEM; > >> > > > > > > > >> > > > > > > - msg->hdr.version = cpu_to_le16(6); > >> > > > > > > + msg->hdr.version = cpu_to_le16(9); > >> > > > > > > msg->hdr.tid = cpu_to_le64(arg->flush_tid); > >> > > > > > > > >> > > > > > > fc = msg->front.iov_base; > >> > > > > > > @@ -1068,17 +1068,30 @@ static int send_cap_msg(struct cap_msg_args *arg) > >> > > > > > > } > >> > > > > > > > >> > > > > > > p = fc 
+ 1; > >> > > > > > > - /* flock buffer size */ > >> > > > > > > + /* flock buffer size (version 2) */ > >> > > > > > > ceph_encode_32(&p, 0); > >> > > > > > > - /* inline version */ > >> > > > > > > + /* inline version (version 4) */ > >> > > > > > > ceph_encode_64(&p, arg->inline_data ? 0 : CEPH_INLINE_NONE); > >> > > > > > > /* inline data size */ > >> > > > > > > ceph_encode_32(&p, 0); > >> > > > > > > - /* osd_epoch_barrier */ > >> > > > > > > + /* osd_epoch_barrier (version 5) */ > >> > > > > > > ceph_encode_32(&p, 0); > >> > > > > > > - /* oldest_flush_tid */ > >> > > > > > > + /* oldest_flush_tid (version 6) */ > >> > > > > > > ceph_encode_64(&p, arg->oldest_flush_tid); > >> > > > > > > > >> > > > > > > + /* caller_uid/caller_gid (version 7) */ > >> > > > > > > + ceph_encode_32(&p, (u32)-1); > >> > > > > > > + ceph_encode_32(&p, (u32)-1); > >> > > > > > > >> > > > > > A bit of self-review... > >> > > > > > > >> > > > > > Not sure if we want to set the above to something else -- maybe 0 or to > >> > > > > > current's creds? That may not always make sense though (during e.g. > >> > > > > > writeback). > >> > > > > > > >> > > > > >> > > > Looking further, I'm not quite sure I understand why we send creds at > >> > > > all in cap messages. Can you clarify where that matters? > >> > > > > >> > > > The way I look at it, would be to consider caps to be something like a > >> > > > more granular NFS delegation or SMB oplock. > >> > > > > >> > > > In that light, a cap flush is just the client sending updated attrs for > >> > > > the exclusive caps that it has already been granted. Is there a > >> > > > situation where we would ever want to refuse that update? > >> > > > >> > > A chmod or chown can be done locally if you have excl caps and flushed > >> > > back to the MDS via a caps message. We need to verify the user has > >> > > permission to make the change. 
> >> > > > >> > > >> > My take is that once the MDS has delegated Ax to the client, then it's > >> > effectively trusting the client to handle permissions enforcement > >> > correctly. I don't see why we should second guess that. > >> > > >> > > > Note that nothing ever checks the return code for _do_cap_update in the > >> > > > userland code. If the permissions check fails, then we'll end up > >> > > > silently dropping the updated attrs on the floor. > >> > > > >> > > Yeah. This was mainly for expediency... the protocol assumes that flushes > >> > > don't fail. Given that the client does its own permissions check, I > >> > > think the way to improve this is to have it prevent the flush in the first > >> > > place, so that it's only nefarious clients that are affected (and who > >> > > cares if they get confused). I don't think we have a particularly good > >> > > way to tell the client it can't, say, sudo chown 0:0 a file, though. > >> > > > >> > > >> > Sorry, I don't quite follow. How would we prevent the flush from a > >> > nefarious client (which is not something we can really control)? > >> > > >> > In any case...ISTM that the permissions check in _do_cap_update ought to > >> > be replaced by a cephx key check. IOW, what we really want to know is > >> > whether the client is truly the one to which we delegated the caps. If > >> > so, then we sort of have to trust that it's doing the right thing with > >> > respect to permissions checking here. > >> > >> The capability can say "you are allowed to be uid 1000 or uid 1020." We > >> want to delegate the EXCL caps to the client so that a create + chmod + > >> chown + write can all happen efficiently, but we still need to ensure that > >> the values they set are legal (a permitted uid/gid combo). > >> > >> A common example would be user workstations that are allowed access to > >> /home/user and restricted via their mds caps to their uid/gid. We need to > >> prevent them from doing a 'sudo chown 0:0 foo'... 
> >> > >> > > > > > > On what basis do you make such a decision though? For instance, NFS does > > root-squashing which is (generally) a per-export+per-client thing. > > It sounds like you're saying that ceph has different semantics here? > > > > (cc'ing Greg here) > > As Sage says, we definitely avoid the root squash semantics. We > discussed them last year and concluded they were an inappropriate > match for Ceph's permission model. > > > > > Also, chown (at least under POSIX) is reserved for superuser only, and > > now that I look, I think this check in MDSAuthCaps::is_capable may be > > wrong: > > > > // chown/chgrp > > if (mask & MAY_CHOWN) { > > if (new_uid != caller_uid || // you can't chown to someone else > > inode_uid != caller_uid) { // you can't chown from someone else > > continue; > > } > > } > > > > Shouldn't this just be a check for whether the caller_uid is 0 (or > > whatever the correct check for the equivalent to the kernel's CAP_CHOWN > > would be)? Oops, I skipped over this part ^ > Without context, this does look a little weird — does it allow *any* > change, given caller_uid needs to match both new and inode uid? > Sort of the common case would be that the admin cap gets hit toward > the beginning of the function and just allows it without ever reaching > this point. Yeah, the check is a bit weird. It looks like 1- A normal cap that specifies a uid can't ever change the uid. This conditional could be simplified/clarified... 2- If you have a pair of caps, like allow * uid=1, allow * uid=2 we still don't let you chown between uid 1 and 2. Well, not as caller_uid 1 or 2 (which is fine), but 3- Jeff is right, we don't allow root to chown between allowed uids. Like if you had allow * uid=0 shouldn't that let you chown anything? I didn't really consider this case since most users would just do allow * which can do anything (including chown). But probably the 'allow * uid=0' case should be handled properly. sage
On Mon, 2016-11-07 at 20:09 +0000, Sage Weil wrote: > On Mon, 7 Nov 2016, Gregory Farnum wrote: > > On Mon, Nov 7, 2016 at 10:39 AM, Jeff Layton <jlayton@redhat.com> wrote: > > > On Mon, 2016-11-07 at 14:36 +0000, Sage Weil wrote: > > >> On Mon, 7 Nov 2016, Jeff Layton wrote: > > >> > On Mon, 2016-11-07 at 14:05 +0000, Sage Weil wrote: > > >> > > On Mon, 7 Nov 2016, Jeff Layton wrote: > > >> > > > On Mon, 2016-11-07 at 16:43 +0800, Yan, Zheng wrote: > > >> > > > > On Fri, Nov 4, 2016 at 8:57 PM, Jeff Layton <jlayton@redhat.com> wrote: > > >> > > > > > > > >> > > > > > On Fri, 2016-11-04 at 07:34 -0400, Jeff Layton wrote: > > >> > > > > > > > > >> > > > > > > The userland ceph has MClientCaps at struct version 9. This brings the > > >> > > > > > > kernel up the same version. > > >> > > > > > > > > >> > > > > > > With this change, we have to start tracking the btime and change_attr, > > >> > > > > > > so that the client can pass back sane values in cap messages. The > > >> > > > > > > client doesn't care about the btime at all, so this is just passed > > >> > > > > > > around, but the change_attr is used when ceph is exported via NFS. > > >> > > > > > > > > >> > > > > > > For now, the new "sync" parm is left at 0, to preserve the existing > > >> > > > > > > behavior of the client. 
> > >> > > > > > > > > >> > > > > > > Signed-off-by: Jeff Layton <jlayton@redhat.com> > > >> > > > > > > --- > > >> > > > > > > fs/ceph/caps.c | 33 +++++++++++++++++++++++++-------- > > >> > > > > > > 1 file changed, 25 insertions(+), 8 deletions(-) > > >> > > > > > > > > >> > > > > > > diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c > > >> > > > > > > index 6e99866b1946..452f5024589f 100644 > > >> > > > > > > --- a/fs/ceph/caps.c > > >> > > > > > > +++ b/fs/ceph/caps.c > > >> > > > > > > @@ -991,9 +991,9 @@ struct cap_msg_args { > > >> > > > > > > struct ceph_mds_session *session; > > >> > > > > > > u64 ino, cid, follows; > > >> > > > > > > u64 flush_tid, oldest_flush_tid, size, max_size; > > >> > > > > > > - u64 xattr_version; > > >> > > > > > > + u64 xattr_version, change_attr; > > >> > > > > > > struct ceph_buffer *xattr_buf; > > >> > > > > > > - struct timespec atime, mtime, ctime; > > >> > > > > > > + struct timespec atime, mtime, ctime, btime; > > >> > > > > > > int op, caps, wanted, dirty; > > >> > > > > > > u32 seq, issue_seq, mseq, time_warp_seq; > > >> > > > > > > kuid_t uid; > > >> > > > > > > @@ -1026,13 +1026,13 @@ static int send_cap_msg(struct cap_msg_args *arg) > > >> > > > > > > > > >> > > > > > > /* flock buffer size + inline version + inline data size + > > >> > > > > > > * osd_epoch_barrier + oldest_flush_tid */ > > >> > > > > > > - extra_len = 4 + 8 + 4 + 4 + 8; > > >> > > > > > > + extra_len = 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 1; > > >> > > > > > > msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc) + extra_len, > > >> > > > > > > GFP_NOFS, false); > > >> > > > > > > if (!msg) > > >> > > > > > > return -ENOMEM; > > >> > > > > > > > > >> > > > > > > - msg->hdr.version = cpu_to_le16(6); > > >> > > > > > > + msg->hdr.version = cpu_to_le16(9); > > >> > > > > > > msg->hdr.tid = cpu_to_le64(arg->flush_tid); > > >> > > > > > > > > >> > > > > > > fc = msg->front.iov_base; > > >> > > > > > > @@ -1068,17 +1068,30 @@ static int send_cap_msg(struct 
cap_msg_args *arg) > > >> > > > > > > } > > >> > > > > > > > > >> > > > > > > p = fc + 1; > > >> > > > > > > - /* flock buffer size */ > > >> > > > > > > + /* flock buffer size (version 2) */ > > >> > > > > > > ceph_encode_32(&p, 0); > > >> > > > > > > - /* inline version */ > > >> > > > > > > + /* inline version (version 4) */ > > >> > > > > > > ceph_encode_64(&p, arg->inline_data ? 0 : CEPH_INLINE_NONE); > > >> > > > > > > /* inline data size */ > > >> > > > > > > ceph_encode_32(&p, 0); > > >> > > > > > > - /* osd_epoch_barrier */ > > >> > > > > > > + /* osd_epoch_barrier (version 5) */ > > >> > > > > > > ceph_encode_32(&p, 0); > > >> > > > > > > - /* oldest_flush_tid */ > > >> > > > > > > + /* oldest_flush_tid (version 6) */ > > >> > > > > > > ceph_encode_64(&p, arg->oldest_flush_tid); > > >> > > > > > > > > >> > > > > > > + /* caller_uid/caller_gid (version 7) */ > > >> > > > > > > + ceph_encode_32(&p, (u32)-1); > > >> > > > > > > + ceph_encode_32(&p, (u32)-1); > > >> > > > > > > > >> > > > > > A bit of self-review... > > >> > > > > > > > >> > > > > > Not sure if we want to set the above to something else -- maybe 0 or to > > >> > > > > > current's creds? That may not always make sense though (during e.g. > > >> > > > > > writeback). > > >> > > > > > > > >> > > > > > >> > > > Looking further, I'm not quite sure I understand why we send creds at > > >> > > > all in cap messages. Can you clarify where that matters? > > >> > > > > > >> > > > The way I look at it, would be to consider caps to be something like a > > >> > > > more granular NFS delegation or SMB oplock. > > >> > > > > > >> > > > In that light, a cap flush is just the client sending updated attrs for > > >> > > > the exclusive caps that it has already been granted. Is there a > > >> > > > situation where we would ever want to refuse that update? > > >> > > > > >> > > A chmod or chown can be done locally if you have excl caps and flushed > > >> > > back to the MDS via a caps message. 
We need to verify the user has > > >> > > permission to make the change. > > >> > > > > >> > > > >> > My take is that once the MDS has delegated Ax to the client, then it's > > >> > effectively trusting the client to handle permissions enforcement > > >> > correctly. I don't see why we should second guess that. > > >> > > > >> > > > Note that nothing ever checks the return code for _do_cap_update in the > > >> > > > userland code. If the permissions check fails, then we'll end up > > >> > > > silently dropping the updated attrs on the floor. > > >> > > > > >> > > Yeah. This was mainly for expediency... the protocol assumes that flushes > > >> > > don't fail. Given that the client does its own permissions check, I > > >> > > think the way to improve this is to have it prevent the flush in the first > > >> > > place, so that it's only nefarious clients that are affected (and who > > >> > > cares if they get confused). I don't think we have a particularly good > > >> > > way to tell the client it can't, say, sudo chown 0:0 a file, though. > > >> > > > > >> > > > >> > Sorry, I don't quite follow. How would we prevent the flush from a > > >> > nefarious client (which is not something we can really control)? > > >> > > > >> > In any case...ISTM that the permissions check in _do_cap_update ought to > > >> > be replaced by a cephx key check. IOW, what we really want to know is > > >> > whether the client is truly the one to which we delegated the caps. If > > >> > so, then we sort of have to trust that it's doing the right thing with > > >> > respect to permissions checking here. > > >> > > >> The capability can say "you are allowed to be uid 1000 or uid 1020." We > > >> want to delegate the EXCL caps to the client so that a create + chmod + > > >> chown + write can all happen efficiently, but we still need to ensure that > > >> the values they set are legal (a permitted uid/gid combo). 
> > >> > > >> A common example would be user workstations that are allowed access to > > >> /home/user and restricted via their mds caps to their uid/gid. We need to > > >> prevent them from doing a 'sudo chown 0:0 foo'... > > >> > > >> > > > > > > > > > On what basis do you make such a decision though? For instance, NFS does > > > root-squashing which is (generally) a per-export+per-client thing. > > > It sounds like you're saying that ceph has different semantics here? > > > > > > (cc'ing Greg here) > > > > As Sage says, we definitely avoid the root squash semantics. We > > discussed them last year and concluded they were an inappropriate > > match for Ceph's permission model. > > > > > > > > Also, chown (at least under POSIX) is reserved for superuser only, and > > > now that I look, I think this check in MDSAuthCaps::is_capable may be > > > wrong: > > > > > > // chown/chgrp > > > if (mask & MAY_CHOWN) { > > > if (new_uid != caller_uid || // you can't chown to someone else > > > inode_uid != caller_uid) { // you can't chown from someone else > > > continue; > > > } > > > } > > > > > > Shouldn't this just be a check for whether the caller_uid is 0 (or > > > whatever the correct check for the equivalent to the kernel's CAP_CHOWN > > > would be)? > > Oops, I skipped over this part ^ > > > Without context, this does look a little weird — does it allow *any* > > change, given caller_uid needs to match both new and inode uid? > > Sort of the common case would be that the admin cap gets hit toward > > the beginning of the function and just allows it without ever reaching > > this point. > > Yeah, the check is a bit weird. It looks like > > 1- A normal cap that specifies a uid can't ever change the uid. This > conditional could be simplified/clarified... > > 2- If you have a pair of caps, like > > allow * uid=1, allow * uid=2 > > we still don't let you chown between uid 1 and 2. 
Well, not as caller_uid > 1 or 2 (which is fine), but > > 3- Jeff is right, we don't allow root to chown between allowed uids. > Like if you had > > allow * uid=0 > > shouldn't that let you chown anything? I didn't really consider this > case since most users would just do > > allow * > > which can do anything (including chown). But probably the 'allow * uid=0' > case should be handled properly. > > sage It still seems to me like that should just be a check for superuser status. Something like: if (mask & MAY_CHOWN) { // only root can chown if (i->match.uid != 0 || caller_uid != 0) continue; } } i.e. only allow chown if the capability has a uid of 0 and the caller_uid is also 0. I don't think we want to ever grant an unprivileged user the ability to chown, do we?
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 6e99866b1946..452f5024589f 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -991,9 +991,9 @@ struct cap_msg_args { struct ceph_mds_session *session; u64 ino, cid, follows; u64 flush_tid, oldest_flush_tid, size, max_size; - u64 xattr_version; + u64 xattr_version, change_attr; struct ceph_buffer *xattr_buf; - struct timespec atime, mtime, ctime; + struct timespec atime, mtime, ctime, btime; int op, caps, wanted, dirty; u32 seq, issue_seq, mseq, time_warp_seq; kuid_t uid; @@ -1026,13 +1026,13 @@ static int send_cap_msg(struct cap_msg_args *arg) /* flock buffer size + inline version + inline data size + * osd_epoch_barrier + oldest_flush_tid */ - extra_len = 4 + 8 + 4 + 4 + 8; + extra_len = 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 1; msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc) + extra_len, GFP_NOFS, false); if (!msg) return -ENOMEM; - msg->hdr.version = cpu_to_le16(6); + msg->hdr.version = cpu_to_le16(9); msg->hdr.tid = cpu_to_le64(arg->flush_tid); fc = msg->front.iov_base; @@ -1068,17 +1068,30 @@ static int send_cap_msg(struct cap_msg_args *arg) } p = fc + 1; - /* flock buffer size */ + /* flock buffer size (version 2) */ ceph_encode_32(&p, 0); - /* inline version */ + /* inline version (version 4) */ ceph_encode_64(&p, arg->inline_data ? 
0 : CEPH_INLINE_NONE); /* inline data size */ ceph_encode_32(&p, 0); - /* osd_epoch_barrier */ + /* osd_epoch_barrier (version 5) */ ceph_encode_32(&p, 0); - /* oldest_flush_tid */ + /* oldest_flush_tid (version 6) */ ceph_encode_64(&p, arg->oldest_flush_tid); + /* caller_uid/caller_gid (version 7) */ + ceph_encode_32(&p, (u32)-1); + ceph_encode_32(&p, (u32)-1); + + /* pool namespace (version 8) */ + ceph_encode_32(&p, 0); + + /* btime, change_attr, sync (version 9) */ + ceph_encode_timespec(p, &arg->btime); + p += sizeof(struct ceph_timespec); + ceph_encode_64(&p, arg->change_attr); + ceph_encode_8(&p, 0); + ceph_con_send(&arg->session->s_con, msg); return 0; } @@ -1189,9 +1202,11 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, arg.xattr_buf = NULL; } + arg.change_attr = inode->i_version; arg.mtime = inode->i_mtime; arg.atime = inode->i_atime; arg.ctime = inode->i_ctime; + arg.btime = ci->i_btime; arg.op = op; arg.caps = cap->implemented; @@ -1241,10 +1256,12 @@ static inline int __send_flush_snap(struct inode *inode, arg.max_size = 0; arg.xattr_version = capsnap->xattr_version; arg.xattr_buf = capsnap->xattr_blob; + arg.change_attr = capsnap->change_attr; arg.atime = capsnap->atime; arg.mtime = capsnap->mtime; arg.ctime = capsnap->ctime; + arg.btime = capsnap->btime; arg.op = CEPH_CAP_OP_FLUSHSNAP; arg.caps = capsnap->issued;
The userland ceph has MClientCaps at struct version 9. This brings the kernel up to the same version. With this change, we have to start tracking the btime and change_attr, so that the client can pass back sane values in cap messages. The client doesn't care about the btime at all, so this is just passed around, but the change_attr is used when ceph is exported via NFS. For now, the new "sync" parm is left at 0, to preserve the existing behavior of the client. Signed-off-by: Jeff Layton <jlayton@redhat.com> --- fs/ceph/caps.c | 33 +++++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 8 deletions(-)