Message ID | 20160706181212.16267-8-ebiederm@xmission.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Wed 06-07-16 13:12:08, Eric W. Biederman wrote: > Introduce the helper qid_has_mapping and use it to ensure that the > quota system only considers qids that map to the filesystems > s_user_ns. > > In practice for quota supporting filesystems today this is the exact > same check as qid_valid. As only 0xffffffff aka (qid_t)-1 does not > map into init_user_ns. > > Replace the qid_valid calls with qid_has_mapping as values come in > from userspace. This is harmless today and it prepares the quota > system to work on filesystems with quotas but mounted by unprivileged > users. > > Call qid_has_mapping from dqget. This ensures the passed in qid has a > prepresentation on the underlying filesystem. Previously this was > unnecessary as filesystesm never had qids that could not map. With > the introduction of filesystems outside of s_user_ns this will not > remain true. > > All of this ensures the quota code never has to deal with qids that > don't map to the underlying filesystem. Does this and the following patch make sense when we actually don't allow quotas when s_user_ns is set? If things are untested, they will get broken so I think it is wiser to do the conversion only once we decide quota should be supported for namespaces != init_user_ns. Or do I miss something? Honza > > Cc: Jan Kara <jack@suse.cz> > Acked-by: Seth Forshee <seth.forshee@canonical.com> > Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com> > --- > fs/quota/dquot.c | 3 +++ > fs/quota/quota.c | 12 ++++++------ > include/linux/quota.h | 10 ++++++++++ > 3 files changed, 19 insertions(+), 6 deletions(-) > > diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c > index ff21980d0119..74706b6aa747 100644 > --- a/fs/quota/dquot.c > +++ b/fs/quota/dquot.c > @@ -841,6 +841,9 @@ struct dquot *dqget(struct super_block *sb, struct kqid qid) > unsigned int hashent = hashfn(sb, qid); > struct dquot *dquot, *empty = NULL; > > + if (!qid_has_mapping(sb->s_user_ns, qid)) > + return ERR_PTR(-EINVAL); > + > if (!sb_has_quota_active(sb, qid.type)) > return ERR_PTR(-ESRCH); > we_slept: > diff --git a/fs/quota/quota.c b/fs/quota/quota.c > index 0f10ee9892ce..73f6f4cf0a21 100644 > --- a/fs/quota/quota.c > +++ b/fs/quota/quota.c > @@ -211,7 +211,7 @@ static int quota_getquota(struct super_block *sb, int type, qid_t id, > if (!sb->s_qcop->get_dqblk) > return -ENOSYS; > qid = make_kqid(current_user_ns(), type, id); > - if (!qid_valid(qid)) > + if (!qid_has_mapping(sb->s_user_ns, qid)) > return -EINVAL; > ret = sb->s_qcop->get_dqblk(sb, qid, &fdq); > if (ret) > @@ -237,7 +237,7 @@ static int quota_getnextquota(struct super_block *sb, int type, qid_t id, > if (!sb->s_qcop->get_nextdqblk) > return -ENOSYS; > qid = make_kqid(current_user_ns(), type, id); > - if (!qid_valid(qid)) > + if (!qid_has_mapping(sb->s_user_ns, qid)) > return -EINVAL; > ret = sb->s_qcop->get_nextdqblk(sb, &qid, &fdq); > if (ret) > @@ -288,7 +288,7 @@ static int quota_setquota(struct super_block *sb, int type, qid_t id, > if (!sb->s_qcop->set_dqblk) > return -ENOSYS; > qid = make_kqid(current_user_ns(), type, id); > - if (!qid_valid(qid)) > + if (!qid_has_mapping(sb->s_user_ns, qid)) > return -EINVAL; > copy_from_if_dqblk(&fdq, &idq); > return sb->s_qcop->set_dqblk(sb, qid, &fdq); > @@ -581,7 +581,7 @@ static int quota_setxquota(struct super_block *sb, int type, qid_t id, > if (!sb->s_qcop->set_dqblk) > return -ENOSYS; > qid = make_kqid(current_user_ns(), type, id); > - if (!qid_valid(qid)) > + if (!qid_has_mapping(sb->s_user_ns, qid)) > return -EINVAL; > /* Are we actually setting timer / warning limits for all users? */ > if (from_kqid(&init_user_ns, qid) == 0 && > @@ -642,7 +642,7 @@ static int quota_getxquota(struct super_block *sb, int type, qid_t id, > if (!sb->s_qcop->get_dqblk) > return -ENOSYS; > qid = make_kqid(current_user_ns(), type, id); > - if (!qid_valid(qid)) > + if (!qid_has_mapping(sb->s_user_ns, qid)) > return -EINVAL; > ret = sb->s_qcop->get_dqblk(sb, qid, &qdq); > if (ret) > @@ -669,7 +669,7 @@ static int quota_getnextxquota(struct super_block *sb, int type, qid_t id, > if (!sb->s_qcop->get_nextdqblk) > return -ENOSYS; > qid = make_kqid(current_user_ns(), type, id); > - if (!qid_valid(qid)) > + if (!qid_has_mapping(sb->s_user_ns, qid)) > return -EINVAL; > ret = sb->s_qcop->get_nextdqblk(sb, &qid, &qdq); > if (ret) > diff --git a/include/linux/quota.h b/include/linux/quota.h > index 9dfb6bce8c9e..1db16ee39b31 100644 > --- a/include/linux/quota.h > +++ b/include/linux/quota.h > @@ -179,6 +179,16 @@ static inline struct kqid make_kqid_projid(kprojid_t projid) > return kqid; > } > > +/** > + * qid_has_mapping - Report if a qid maps into a user namespace. > + * @ns: The user namespace to see if a value maps into. > + * @qid: The kernel internal quota identifier to test. > + */ > +static inline bool qid_has_mapping(struct user_namespace *ns, struct kqid qid) > +{ > + return from_kqid(ns, qid) != (qid_t) -1; > +} > + > > extern spinlock_t dq_data_lock; > > -- > 2.8.3 >
Jan Kara <jack@suse.cz> writes: > On Wed 06-07-16 13:12:08, Eric W. Biederman wrote: >> Introduce the helper qid_has_mapping and use it to ensure that the >> quota system only considers qids that map to the filesystems >> s_user_ns. >> >> In practice for quota supporting filesystems today this is the exact >> same check as qid_valid. As only 0xffffffff aka (qid_t)-1 does not >> map into init_user_ns. >> >> Replace the qid_valid calls with qid_has_mapping as values come in >> from userspace. This is harmless today and it prepares the quota >> system to work on filesystems with quotas but mounted by unprivileged >> users. >> >> Call qid_has_mapping from dqget. This ensures the passed in qid has a >> prepresentation on the underlying filesystem. Previously this was >> unnecessary as filesystesm never had qids that could not map. With >> the introduction of filesystems outside of s_user_ns this will not >> remain true. >> >> All of this ensures the quota code never has to deal with qids that >> don't map to the underlying filesystem. > > Does this and the following patch make sense when we actually don't allow > quotas when s_user_ns is set? If things are untested, they will get broken > so I think it is wiser to do the conversion only once we decide quota > should be supported for namespaces != init_user_ns. Or do I miss > something? All of the code will get exercised in the success case. So I don't think there is a strong chance of bitrot. The patches are trivially correct so I am not concerned about them. What the patches add are a logical consistency with the rest of the vfs. In my tree the rest of the vfs now handles uids and gids that can not be mapped to a filesystem and returns an error. In my tree the rest of the vfs now effectively assumes that uids and gids are stored in s_user_ns on a filesystem. So I freely admit there is a slight chance that some bit-rot will occur and the quota code will be changed in such a way that those assumptions are broken. Bugs happen. I think it much less likely if all of the generic code in the vfs makes the same assumptions. The place where I am concerned about thorough review and testing is someone poisoning quota files and then the kernel trying to use them. In the preliminary work we have done in other places in the kernel and for other filesystems there almost always winds up being some way to confuse the kernel and get it to misbave if you can poison the disk based inputs. As poison disk based inputs is not something filesystems are stronlgy concerned about. In most cases the disk the filesystem resides on is in the box and therefore under control of the OS at all times. Dave Chinner has even said he will never consider handling poisoned disk based inputs for XFS as the run time cost is too high. Between actually finding issues that can cause problems, and the increased amount of kernel code executed (and thus the increase in kernel attack surface) I am very paranoid about enabling code that trusts data that could be poisoned data from a hostile party. At the same time I am very uncomfortable with the fact the kernel does not protect against malicious disks and poisoned disk images. As poisoned disk images are a well known exploit vector in the wild. A well known and demonstrated attack vector that works is to leave a usb stick in a public place, and helpful people will place it into their computer to try and figure out who it belongs to. In trying to be helpful their computer will unbeknownst to them start executing code that does not serve the interests of the computer owner. I hate that we can not currently protect people from shenanigans like that. So I intend to be as responsible and as careful as I can, but also to push forward in the hopes that we can eventually not have to worry about our computers betraying us when we as human beings perform reasonable actions. Eric -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, Jul 11, 2016 at 01:12:49PM -0500, Eric W. Biederman wrote: > The place where I am concerned about thorough review and testing is > someone poisoning quota files and then the kernel trying to use them. > In the preliminary work we have done in other places in the kernel and > for other filesystems there almost always winds up being some way to > confuse the kernel and get it to misbave if you can poison the disk > based inputs. As poison disk based inputs is not something filesystems > are stronlgy concerned about. In most cases the disk the filesystem > resides on is in the box and therefore under control of the OS at all > times. Dave Chinner has even said he will never consider handling > poisoned disk based inputs for XFS as the run time cost is too high. I didn't say that. I said that comprehensive checks to catch all possible malicious inputs is too expensive to consider a viable solution for allowing user-mounts of arbitrary filesystem images through the kernel. We already have runtime validation that bounds check most on-disk fields when they are read - that deals with fuzz based poisoning testing quite well and provides some protection against directed structural attacks as well. IOWs, we already handle a large scope of poisoned inputs safely, but it's not comprehensive because we can't easily determine cross-object reference validity within the format-determined limits. e.g. we can check that the number of records in a btree block is within the valid bounds of a block, but we cannot determine that the record count has been incremented by 1 and a bogus record has been inserted somewhere inside the block and the CRC recalculated to match the modification. We can also check the records themselves for being within bound (e.g. we can check a freespace record points to block within range and of valid length) but we can't check that the extent is actually free space. That requires doing a full filesystem traversal to determine if the extent is actually free space or not. Of course, we could look up the rmap tree (if we have one) to determine if the space is actually used or not, but an attacker can also insert/remove records in that tree, too, so if we can't trust the free space tree, we can't trust the rmap tree either. Hence we have to fall back to brute force validation if we want to be certain that the metadata has not been tampered with. To bring this back to quota files, the only way to validate that a quota file has not been tampered with is to run a quotacheck on the filesystem once it has been mounted. This requires visiting every inode in the filesystem, so it an expensive operation. Only XFS has this functionality in kernel, so for untrusted mounts we could simply run it on every mount that has quotas enabled. Of course, users won't care that mounting their filesystem now takes several minutes (hours, even, when we have millions of inodes in the fs) while these checks are run... Detecting malicious corruptions that specifically manipulate the on-disk structure within the bounds of format validity are difficult to detect and costly to protect against. We'd need to move large parts of fsck into the kernel and run it to validate every piece of metadata read into the kernel. Then we've got a much larger attack surface in the kernel (all the validity checking code needs to be robust against invalid structures, too!), a lot more complexity (more bugs!) and a lot of additional runtime overhead (slow filesystem = unhappy users!). It's just not a practical solution to the problem. > Between actually finding issues that can cause problems, and the > increased amount of kernel code executed (and thus the increase in > kernel attack surface) I am very paranoid about enabling code that > trusts data that could be poisoned data from a hostile party. > > At the same time I am very uncomfortable with the fact the kernel does > not protect against malicious disks and poisoned disk images. As > poisoned disk images are a well known exploit vector in the wild. A > well known and demonstrated attack vector that works is to leave a usb > stick in a public place, and helpful people will place it into their > computer to try and figure out who it belongs to. In trying to be > helpful their computer will unbeknownst to them start executing code > that does not serve the interests of the computer owner. I hate that we > can not currently protect people from shenanigans like that. Yes, we know all about these problems. Unfortunately someone appears to not be listening when they being repeatedly told that hardening all the kernel filesystem implementations against poisoned images is simply not a viable solution to the problem. Move the parsing of untrusted structures out of the kernel - work with the various filesystem teams to build viable FUSE implementations (where it's much easier to incorporate parts of the userspace fsck code) and provide the FUSE filesystems to container users wanting to mount their own filesystem images. Cheers, Dave.
On Wed, Jul 13, 2016 at 11:34:36AM +1000, Dave Chinner wrote: > On Mon, Jul 11, 2016 at 01:12:49PM -0500, Eric W. Biederman wrote: > > The place where I am concerned about thorough review and testing is > > someone poisoning quota files and then the kernel trying to use them. > > In the preliminary work we have done in other places in the kernel and > > for other filesystems there almost always winds up being some way to > > confuse the kernel and get it to misbave if you can poison the disk > > based inputs. As poison disk based inputs is not something filesystems > > are stronlgy concerned about. In most cases the disk the filesystem > > resides on is in the box and therefore under control of the OS at all > > times. Dave Chinner has even said he will never consider handling > > poisoned disk based inputs for XFS as the run time cost is too high. > > I didn't say that. I said that comprehensive checks to catch all > possible malicious inputs is too expensive to consider a viable > solution for allowing user-mounts of arbitrary filesystem images > through the kernel. [.....] > To bring this back to quota files, the only way to validate that a > quota file has not been tampered with is to run a quotacheck on the > filesystem once it has been mounted. This requires visiting every > inode in the filesystem, so it an expensive operation. Only XFS has > this functionality in kernel, so for untrusted mounts we could > simply run it on every mount that has quotas enabled. Of course, > users won't care that mounting their filesystem now takes several > minutes (hours, even, when we have millions of inodes in the fs) > while these checks are run... So, over lunch I realised the problem with this. quotacheck is verifying the contents of the quota file, but we haven't verified the structure of the quota file to begin with. Hence just enabling quotas could cause the filesystem to do bad things in the kernel on mount if the quota file metadata has been tampered with. IOWs, it's not just quota data parsing that we have to be concerned with here - parsing the quota file structure itself could be an attack vector that triggers on mount. Cheers, Dave.
On Wed, Jul 13, 2016 at 11:34:36AM +1000, Dave Chinner wrote: > On Mon, Jul 11, 2016 at 01:12:49PM -0500, Eric W. Biederman wrote: > > The place where I am concerned about thorough review and testing is > > someone poisoning quota files and then the kernel trying to use them. > > In the preliminary work we have done in other places in the kernel and > > for other filesystems there almost always winds up being some way to > > confuse the kernel and get it to misbave if you can poison the disk > > based inputs. As poison disk based inputs is not something filesystems > > are stronlgy concerned about. In most cases the disk the filesystem > > resides on is in the box and therefore under control of the OS at all > > times. Dave Chinner has even said he will never consider handling > > poisoned disk based inputs for XFS as the run time cost is too high. > > I didn't say that. I said that comprehensive checks to catch all > possible malicious inputs is too expensive to consider a viable > solution for allowing user-mounts of arbitrary filesystem images > through the kernel. > [...] > > To bring this back to quota files, the only way to validate that a > quota file has not been tampered with is to run a quotacheck on the > filesystem once it has been mounted. This requires visiting every > inode in the filesystem, so it an expensive operation. Only XFS has > this functionality in kernel, so for untrusted mounts we could > simply run it on every mount that has quotas enabled. Of course, > users won't care that mounting their filesystem now takes several > minutes (hours, even, when we have millions of inodes in the fs) > while these checks are run... > > Detecting malicious corruptions that specifically manipulate the > on-disk structure within the bounds of format validity are difficult > to detect and costly to protect against. We'd need to move large > parts of fsck into the kernel and run it to validate every piece of > metadata read into the kernel. Then we've got a much larger attack > surface in the kernel (all the validity checking code needs to be > robust against invalid structures, too!), a lot more complexity > (more bugs!) and a lot of additional runtime overhead (slow > filesystem = unhappy users!). It's just not a practical solution to > the problem. And ideally, you'd want to also guard against an evil disk that suddenly changes its contents after you've run fsck on it, and you can't easily do that without making things complicated.
Jann Horn <jann@thejh.net> writes: > On Wed, Jul 13, 2016 at 11:34:36AM +1000, Dave Chinner wrote: >> On Mon, Jul 11, 2016 at 01:12:49PM -0500, Eric W. Biederman wrote: >> > The place where I am concerned about thorough review and testing is >> > someone poisoning quota files and then the kernel trying to use them. >> > In the preliminary work we have done in other places in the kernel and >> > for other filesystems there almost always winds up being some way to >> > confuse the kernel and get it to misbave if you can poison the disk >> > based inputs. As poison disk based inputs is not something filesystems >> > are stronlgy concerned about. In most cases the disk the filesystem >> > resides on is in the box and therefore under control of the OS at all >> > times. Dave Chinner has even said he will never consider handling >> > poisoned disk based inputs for XFS as the run time cost is too high. >> >> I didn't say that. I said that comprehensive checks to catch all >> possible malicious inputs is too expensive to consider a viable >> solution for allowing user-mounts of arbitrary filesystem images >> through the kernel. Dave you said that speaking as the XFS maintainer. So I take that to be your position and to refer to XFS. > [...] >> >> To bring this back to quota files, the only way to validate that a >> quota file has not been tampered with is to run a quotacheck on the >> filesystem once it has been mounted. This requires visiting every >> inode in the filesystem, so it an expensive operation. Only XFS has >> this functionality in kernel, so for untrusted mounts we could >> simply run it on every mount that has quotas enabled. Of course, >> users won't care that mounting their filesystem now takes several >> minutes (hours, even, when we have millions of inodes in the fs) >> while these checks are run... >> >> Detecting malicious corruptions that specifically manipulate the >> on-disk structure within the bounds of format validity are difficult >> to detect and costly to protect against. We'd need to move large >> parts of fsck into the kernel and run it to validate every piece of >> metadata read into the kernel. Then we've got a much larger attack >> surface in the kernel (all the validity checking code needs to be >> robust against invalid structures, too!), a lot more complexity >> (more bugs!) and a lot of additional runtime overhead (slow >> filesystem = unhappy users!). It's just not a practical solution to >> the problem. This critiqute as I read it confuses data integrity and safety from privilege escalation. If a filesystem image or it's backing store are malicious there is no need to be concerned about data integrity. > And ideally, you'd want to also guard against an evil disk that > suddenly changes its contents after you've run fsck on it, and you > can't easily do that without making things complicated. I agree an evil disk definitely should be part of any threat analysis. I also agree that anything that is complicated is not a practical as complicated means lots of code, and bugs are in general a function of the amount of code. At the same time my only concern when analyzing something for safety against a malicious filesystem is can the malicious data cause the kernel to misbehave. This includes things like stack overflows, and memory corruption. In the specific case of a quota file, if there is a quota file that has little to no resemblence to reality but the kernel doesn't misbehave, I don't think that is a problem from an unprivileged mount perspective. On the flip side if a malicious quota file allows an in kernel quota variable to go negative and that causes the kernel to misbehave that is a show stopper for allowing an unprivileged mount. Personally I see the quantity of code in current filesystems as making it hard to have a low enough probability of problems to allow unprivileged mounts. Changes for all of the weird cases a backing store for unprivileged mounts brings with it have been added to the VFS because that is the right place to implement them. Working at a filesystem independent level allows all of the right people involved in the review, and it allows clean and general solutions to the weird cases that come up with a uid or gid does not map into the kernel. All of that said the only filesystem with backing store that I see as a reasonable target to support right now is fuse. As fuse was designed from the get go to support filesystems from unprivileged users. Will fuse someday support quotas? I don't know. But the there is no extra cost to support that case in fs/quota/quota.c so I have added the necessary code. Eric -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index ff21980d0119..74706b6aa747 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -841,6 +841,9 @@ struct dquot *dqget(struct super_block *sb, struct kqid qid) unsigned int hashent = hashfn(sb, qid); struct dquot *dquot, *empty = NULL; + if (!qid_has_mapping(sb->s_user_ns, qid)) + return ERR_PTR(-EINVAL); + if (!sb_has_quota_active(sb, qid.type)) return ERR_PTR(-ESRCH); we_slept: diff --git a/fs/quota/quota.c b/fs/quota/quota.c index 0f10ee9892ce..73f6f4cf0a21 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -211,7 +211,7 @@ static int quota_getquota(struct super_block *sb, int type, qid_t id, if (!sb->s_qcop->get_dqblk) return -ENOSYS; qid = make_kqid(current_user_ns(), type, id); - if (!qid_valid(qid)) + if (!qid_has_mapping(sb->s_user_ns, qid)) return -EINVAL; ret = sb->s_qcop->get_dqblk(sb, qid, &fdq); if (ret) @@ -237,7 +237,7 @@ static int quota_getnextquota(struct super_block *sb, int type, qid_t id, if (!sb->s_qcop->get_nextdqblk) return -ENOSYS; qid = make_kqid(current_user_ns(), type, id); - if (!qid_valid(qid)) + if (!qid_has_mapping(sb->s_user_ns, qid)) return -EINVAL; ret = sb->s_qcop->get_nextdqblk(sb, &qid, &fdq); if (ret) @@ -288,7 +288,7 @@ static int quota_setquota(struct super_block *sb, int type, qid_t id, if (!sb->s_qcop->set_dqblk) return -ENOSYS; qid = make_kqid(current_user_ns(), type, id); - if (!qid_valid(qid)) + if (!qid_has_mapping(sb->s_user_ns, qid)) return -EINVAL; copy_from_if_dqblk(&fdq, &idq); return sb->s_qcop->set_dqblk(sb, qid, &fdq); @@ -581,7 +581,7 @@ static int quota_setxquota(struct super_block *sb, int type, qid_t id, if (!sb->s_qcop->set_dqblk) return -ENOSYS; qid = make_kqid(current_user_ns(), type, id); - if (!qid_valid(qid)) + if (!qid_has_mapping(sb->s_user_ns, qid)) return -EINVAL; /* Are we actually setting timer / warning limits for all users? */ if (from_kqid(&init_user_ns, qid) == 0 && @@ -642,7 +642,7 @@ static int quota_getxquota(struct super_block *sb, int type, qid_t id, if (!sb->s_qcop->get_dqblk) return -ENOSYS; qid = make_kqid(current_user_ns(), type, id); - if (!qid_valid(qid)) + if (!qid_has_mapping(sb->s_user_ns, qid)) return -EINVAL; ret = sb->s_qcop->get_dqblk(sb, qid, &qdq); if (ret) @@ -669,7 +669,7 @@ static int quota_getnextxquota(struct super_block *sb, int type, qid_t id, if (!sb->s_qcop->get_nextdqblk) return -ENOSYS; qid = make_kqid(current_user_ns(), type, id); - if (!qid_valid(qid)) + if (!qid_has_mapping(sb->s_user_ns, qid)) return -EINVAL; ret = sb->s_qcop->get_nextdqblk(sb, &qid, &qdq); if (ret) diff --git a/include/linux/quota.h b/include/linux/quota.h index 9dfb6bce8c9e..1db16ee39b31 100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -179,6 +179,16 @@ static inline struct kqid make_kqid_projid(kprojid_t projid) return kqid; } +/** + * qid_has_mapping - Report if a qid maps into a user namespace. + * @ns: The user namespace to see if a value maps into. + * @qid: The kernel internal quota identifier to test. + */ +static inline bool qid_has_mapping(struct user_namespace *ns, struct kqid qid) +{ + return from_kqid(ns, qid) != (qid_t) -1; +} + extern spinlock_t dq_data_lock;