Message ID | 1472097235-6332-3-git-send-email-kwankhede@nvidia.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Thu, 25 Aug 2016 09:23:53 +0530 Kirti Wankhede <kwankhede@nvidia.com> wrote: [...] Dear Kirti, I just rebased my vfio-ccw patches to this series. With a little fix, which was pointed it out in my reply to the #3 patch, it works fine. > +static long vfio_mdev_unlocked_ioctl(void *device_data, > + unsigned int cmd, unsigned long arg) > +{ > + int ret = 0; > + struct vfio_mdev *vmdev = device_data; > + struct parent_device *parent = vmdev->mdev->parent; > + unsigned long minsz; > + > + switch (cmd) { > + case VFIO_DEVICE_GET_INFO: > + { > + struct vfio_device_info info; > + > + minsz = offsetofend(struct vfio_device_info, num_irqs); > + > + if (copy_from_user(&info, (void __user *)arg, minsz)) > + return -EFAULT; > + > + if (info.argsz < minsz) > + return -EINVAL; > + > + if (parent->ops->get_device_info) > + ret = parent->ops->get_device_info(vmdev->mdev, &info); > + else > + return -EINVAL; > + > + if (ret) > + return ret; > + > + if (parent->ops->reset) > + info.flags |= VFIO_DEVICE_FLAGS_RESET; Shouldn't this be done inside the get_device_info callback? > + > + memcpy(&vmdev->dev_info, &info, sizeof(info)); > + > + return copy_to_user((void __user *)arg, &info, minsz); > + } [...] > + > +static ssize_t vfio_mdev_read(void *device_data, char __user *buf, > + size_t count, loff_t *ppos) > +{ > + struct vfio_mdev *vmdev = device_data; > + struct mdev_device *mdev = vmdev->mdev; > + struct parent_device *parent = mdev->parent; > + unsigned int done = 0; > + int ret; > + > + if (!parent->ops->read) > + return -EINVAL; > + > + while (count) { Here, I have to say sorry to you guys for that I didn't notice the bad impact of this change to my patches during the v6 discussion. For vfio-ccw, I introduced an I/O region to input/output I/O instruction parameters and results for Qemu. The @count of these data currently is 140. So supporting arbitrary lengths in one shot here, and also in vfio_mdev_write, seems the better option for this case. I believe that if the pci drivers want to iterate in a 4 bytes step, you can do that in the parent read/write callbacks instead. What do you think? > + size_t filled; > + > + if (count >= 4 && !(*ppos % 4)) { > + u32 val; > + > + ret = parent->ops->read(mdev, (char *)&val, sizeof(val), > + *ppos); > + if (ret <= 0) > + goto read_err; > + > + if (copy_to_user(buf, &val, sizeof(val))) > + goto read_err; > + > + filled = 4; > + } else if (count >= 2 && !(*ppos % 2)) { > + u16 val; > + > + ret = parent->ops->read(mdev, (char *)&val, sizeof(val), > + *ppos); > + if (ret <= 0) > + goto read_err; > + > + if (copy_to_user(buf, &val, sizeof(val))) > + goto read_err; > + > + filled = 2; > + } else { > + u8 val; > + > + ret = parent->ops->read(mdev, &val, sizeof(val), *ppos); > + if (ret <= 0) > + goto read_err; > + > + if (copy_to_user(buf, &val, sizeof(val))) > + goto read_err; > + > + filled = 1; > + } > + > + count -= filled; > + done += filled; > + *ppos += filled; > + buf += filled; > + } > + > + return done; > + > +read_err: > + return -EFAULT; > +} [...] -------- Dong Jia -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 8/25/2016 2:52 PM, Dong Jia wrote: > On Thu, 25 Aug 2016 09:23:53 +0530 > Kirti Wankhede <kwankhede@nvidia.com> wrote: > > [...] > > Dear Kirti, > > I just rebased my vfio-ccw patches to this series. > With a little fix, which was pointed it out in my reply to the #3 > patch, it works fine. > Thanks for update. Glad to know this works for you. >> +static long vfio_mdev_unlocked_ioctl(void *device_data, >> + unsigned int cmd, unsigned long arg) >> +{ >> + int ret = 0; >> + struct vfio_mdev *vmdev = device_data; >> + struct parent_device *parent = vmdev->mdev->parent; >> + unsigned long minsz; >> + >> + switch (cmd) { >> + case VFIO_DEVICE_GET_INFO: >> + { >> + struct vfio_device_info info; >> + >> + minsz = offsetofend(struct vfio_device_info, num_irqs); >> + >> + if (copy_from_user(&info, (void __user *)arg, minsz)) >> + return -EFAULT; >> + >> + if (info.argsz < minsz) >> + return -EINVAL; >> + >> + if (parent->ops->get_device_info) >> + ret = parent->ops->get_device_info(vmdev->mdev, &info); >> + else >> + return -EINVAL; >> + >> + if (ret) >> + return ret; >> + >> + if (parent->ops->reset) >> + info.flags |= VFIO_DEVICE_FLAGS_RESET; > Shouldn't this be done inside the get_device_info callback? > I would like Vendor driver to set device type only. Reset flag should be set on basis of reset() callback provided. >> + >> + memcpy(&vmdev->dev_info, &info, sizeof(info)); >> + >> + return copy_to_user((void __user *)arg, &info, minsz); >> + } > [...] > >> + >> +static ssize_t vfio_mdev_read(void *device_data, char __user *buf, >> + size_t count, loff_t *ppos) >> +{ >> + struct vfio_mdev *vmdev = device_data; >> + struct mdev_device *mdev = vmdev->mdev; >> + struct parent_device *parent = mdev->parent; >> + unsigned int done = 0; >> + int ret; >> + >> + if (!parent->ops->read) >> + return -EINVAL; >> + >> + while (count) { > Here, I have to say sorry to you guys for that I didn't notice the > bad impact of this change to my patches during the v6 discussion. > > For vfio-ccw, I introduced an I/O region to input/output I/O > instruction parameters and results for Qemu. The @count of these data > currently is 140. So supporting arbitrary lengths in one shot here, and > also in vfio_mdev_write, seems the better option for this case. > > I believe that if the pci drivers want to iterate in a 4 bytes step, you > can do that in the parent read/write callbacks instead. > > What do you think? > I would like to know Alex's thought on this. He raised concern with this approach in v6 reviews: "But I think this is exploitable, it lets the user make the kernel allocate an arbitrarily sized buffer." Thanks, Kirti >> + size_t filled; >> + >> + if (count >= 4 && !(*ppos % 4)) { >> + u32 val; >> + >> + ret = parent->ops->read(mdev, (char *)&val, sizeof(val), >> + *ppos); >> + if (ret <= 0) >> + goto read_err; >> + >> + if (copy_to_user(buf, &val, sizeof(val))) >> + goto read_err; >> + >> + filled = 4; >> + } else if (count >= 2 && !(*ppos % 2)) { >> + u16 val; >> + >> + ret = parent->ops->read(mdev, (char *)&val, sizeof(val), >> + *ppos); >> + if (ret <= 0) >> + goto read_err; >> + >> + if (copy_to_user(buf, &val, sizeof(val))) >> + goto read_err; >> + >> + filled = 2; >> + } else { >> + u8 val; >> + >> + ret = parent->ops->read(mdev, &val, sizeof(val), *ppos); >> + if (ret <= 0) >> + goto read_err; >> + >> + if (copy_to_user(buf, &val, sizeof(val))) >> + goto read_err; >> + >> + filled = 1; >> + } >> + >> + count -= filled; >> + done += filled; >> + *ppos += filled; >> + buf += filled; >> + } >> + >> + return done; >> + >> +read_err: >> + return -EFAULT; >> +} > [...] > > -------- > Dong Jia > -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 08/26/2016 10:13 PM, Kirti Wankhede wrote: > > > On 8/25/2016 2:52 PM, Dong Jia wrote: >> On Thu, 25 Aug 2016 09:23:53 +0530 >> Kirti Wankhede <kwankhede@nvidia.com> wrote: >> >> [...] >> >> Dear Kirti, >> >> I just rebased my vfio-ccw patches to this series. >> With a little fix, which was pointed it out in my reply to the #3 >> patch, it works fine. >> > > Thanks for update. Glad to know this works for you. > > >>> +static long vfio_mdev_unlocked_ioctl(void *device_data, >>> + unsigned int cmd, unsigned long arg) >>> +{ >>> + int ret = 0; >>> + struct vfio_mdev *vmdev = device_data; >>> + struct parent_device *parent = vmdev->mdev->parent; >>> + unsigned long minsz; >>> + >>> + switch (cmd) { >>> + case VFIO_DEVICE_GET_INFO: >>> + { >>> + struct vfio_device_info info; >>> + >>> + minsz = offsetofend(struct vfio_device_info, num_irqs); >>> + >>> + if (copy_from_user(&info, (void __user *)arg, minsz)) >>> + return -EFAULT; >>> + >>> + if (info.argsz < minsz) >>> + return -EINVAL; >>> + >>> + if (parent->ops->get_device_info) >>> + ret = parent->ops->get_device_info(vmdev->mdev, &info); >>> + else >>> + return -EINVAL; >>> + >>> + if (ret) >>> + return ret; >>> + >>> + if (parent->ops->reset) >>> + info.flags |= VFIO_DEVICE_FLAGS_RESET; >> Shouldn't this be done inside the get_device_info callback? >> > > I would like Vendor driver to set device type only. Reset flag should be > set on basis of reset() callback provided. > >>> + >>> + memcpy(&vmdev->dev_info, &info, sizeof(info)); >>> + >>> + return copy_to_user((void __user *)arg, &info, minsz); >>> + } >> [...] >> >>> + >>> +static ssize_t vfio_mdev_read(void *device_data, char __user *buf, >>> + size_t count, loff_t *ppos) >>> +{ >>> + struct vfio_mdev *vmdev = device_data; >>> + struct mdev_device *mdev = vmdev->mdev; >>> + struct parent_device *parent = mdev->parent; >>> + unsigned int done = 0; >>> + int ret; >>> + >>> + if (!parent->ops->read) >>> + return -EINVAL; >>> + >>> + while (count) { >> Here, I have to say sorry to you guys for that I didn't notice the >> bad impact of this change to my patches during the v6 discussion. >> >> For vfio-ccw, I introduced an I/O region to input/output I/O >> instruction parameters and results for Qemu. The @count of these data >> currently is 140. So supporting arbitrary lengths in one shot here, and >> also in vfio_mdev_write, seems the better option for this case. >> >> I believe that if the pci drivers want to iterate in a 4 bytes step, you >> can do that in the parent read/write callbacks instead. >> >> What do you think? >> > > I would like to know Alex's thought on this. He raised concern with this > approach in v6 reviews: > "But I think this is exploitable, it lets the user make the kernel > allocate an arbitrarily sized buffer." It is impossible to check count here, because one simply doesn't have the knowledge of this region. VFIO_DEVICE_GET_REGION_INFO was implemented in vfio-mdev.ko, while decoding the vfio_mdev_read to a particular MMIO region was expected to be implemented in vendor driver, that results in unbalanced interfaces. To have balanced interfaces, you either: - call ioctl instead of GET_REGION_INFO - call read instead of decoding REGION or: - call GET_REGION_INFO instead of ioctl - decode REGION in read, and check its validity, call region-specific read function V6 was the latter, v7 is kind of a mixture of these two, while I believe the former will completely address such problem :) -- Thanks, Jike >>> + size_t filled; >>> + >>> + if (count >= 4 && !(*ppos % 4)) { >>> + u32 val; >>> + >>> + ret = parent->ops->read(mdev, (char *)&val, sizeof(val), >>> + *ppos); >>> + if (ret <= 0) >>> + goto read_err; >>> + >>> + if (copy_to_user(buf, &val, sizeof(val))) >>> + goto read_err; >>> + >>> + filled = 4; >>> + } else if (count >= 2 && !(*ppos % 2)) { >>> + u16 val; >>> + >>> + ret = parent->ops->read(mdev, (char *)&val, sizeof(val), >>> + *ppos); >>> + if (ret <= 0) >>> + goto read_err; >>> + >>> + if (copy_to_user(buf, &val, sizeof(val))) >>> + goto read_err; >>> + >>> + filled = 2; >>> + } else { >>> + u8 val; >>> + >>> + ret = parent->ops->read(mdev, &val, sizeof(val), *ppos); >>> + if (ret <= 0) >>> + goto read_err; >>> + >>> + if (copy_to_user(buf, &val, sizeof(val))) >>> + goto read_err; >>> + >>> + filled = 1; >>> + } >>> + >>> + count -= filled; >>> + done += filled; >>> + *ppos += filled; >>> + buf += filled; >>> + } >>> + >>> + return done; >>> + >>> +read_err: >>> + return -EFAULT; >>> +} >> [...] >> >> -------- >> Dong Jia >> -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 08/25/2016 05:22 PM, Dong Jia wrote: > On Thu, 25 Aug 2016 09:23:53 +0530 > Kirti Wankhede <kwankhede@nvidia.com> wrote: > > [...] > > Dear Kirti, > > I just rebased my vfio-ccw patches to this series. > With a little fix, which was pointed it out in my reply to the #3 > patch, it works fine. > Hi Jia, Sorry I didn't follow a lot in previous discussion, but since vfio-mdev in v7 patchset is at least PCI-agnostic, would you share with us why you still need a vfio-ccw? -- Thanks, Jike >> +static long vfio_mdev_unlocked_ioctl(void *device_data, >> + unsigned int cmd, unsigned long arg) >> +{ >> + int ret = 0; >> + struct vfio_mdev *vmdev = device_data; >> + struct parent_device *parent = vmdev->mdev->parent; >> + unsigned long minsz; >> + >> + switch (cmd) { >> + case VFIO_DEVICE_GET_INFO: >> + { >> + struct vfio_device_info info; >> + >> + minsz = offsetofend(struct vfio_device_info, num_irqs); >> + >> + if (copy_from_user(&info, (void __user *)arg, minsz)) >> + return -EFAULT; >> + >> + if (info.argsz < minsz) >> + return -EINVAL; >> + >> + if (parent->ops->get_device_info) >> + ret = parent->ops->get_device_info(vmdev->mdev, &info); >> + else >> + return -EINVAL; >> + >> + if (ret) >> + return ret; >> + >> + if (parent->ops->reset) >> + info.flags |= VFIO_DEVICE_FLAGS_RESET; > Shouldn't this be done inside the get_device_info callback? > >> + >> + memcpy(&vmdev->dev_info, &info, sizeof(info)); >> + >> + return copy_to_user((void __user *)arg, &info, minsz); >> + } > [...] > >> + >> +static ssize_t vfio_mdev_read(void *device_data, char __user *buf, >> + size_t count, loff_t *ppos) >> +{ >> + struct vfio_mdev *vmdev = device_data; >> + struct mdev_device *mdev = vmdev->mdev; >> + struct parent_device *parent = mdev->parent; >> + unsigned int done = 0; >> + int ret; >> + >> + if (!parent->ops->read) >> + return -EINVAL; >> + >> + while (count) { > Here, I have to say sorry to you guys for that I didn't notice the > bad impact of this change to my patches during the v6 discussion. > > For vfio-ccw, I introduced an I/O region to input/output I/O > instruction parameters and results for Qemu. The @count of these data > currently is 140. So supporting arbitrary lengths in one shot here, and > also in vfio_mdev_write, seems the better option for this case. > > I believe that if the pci drivers want to iterate in a 4 bytes step, you > can do that in the parent read/write callbacks instead. > > What do you think? > >> + size_t filled; >> + >> + if (count >= 4 && !(*ppos % 4)) { >> + u32 val; >> + >> + ret = parent->ops->read(mdev, (char *)&val, sizeof(val), >> + *ppos); >> + if (ret <= 0) >> + goto read_err; >> + >> + if (copy_to_user(buf, &val, sizeof(val))) >> + goto read_err; >> + >> + filled = 4; >> + } else if (count >= 2 && !(*ppos % 2)) { >> + u16 val; >> + >> + ret = parent->ops->read(mdev, (char *)&val, sizeof(val), >> + *ppos); >> + if (ret <= 0) >> + goto read_err; >> + >> + if (copy_to_user(buf, &val, sizeof(val))) >> + goto read_err; >> + >> + filled = 2; >> + } else { >> + u8 val; >> + >> + ret = parent->ops->read(mdev, &val, sizeof(val), *ppos); >> + if (ret <= 0) >> + goto read_err; >> + >> + if (copy_to_user(buf, &val, sizeof(val))) >> + goto read_err; >> + >> + filled = 1; >> + } >> + >> + count -= filled; >> + done += filled; >> + *ppos += filled; >> + buf += filled; >> + } >> + >> + return done; >> + >> +read_err: >> + return -EFAULT; >> +} > [...] > > -------- > Dong Jia > -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 09/08/2016 10:45 AM, Jike Song wrote: > On 08/25/2016 05:22 PM, Dong Jia wrote: >> On Thu, 25 Aug 2016 09:23:53 +0530 >> Kirti Wankhede <kwankhede@nvidia.com> wrote: >> >> [...] >> >> Dear Kirti, >> >> I just rebased my vfio-ccw patches to this series. >> With a little fix, which was pointed it out in my reply to the #3 >> patch, it works fine. >> > > Hi Jia, > > Sorry I didn't follow a lot in previous discussion, but since > vfio-mdev in v7 patchset is at least PCI-agnostic, would you share > with us why you still need a vfio-ccw? Kind ping :) Hi Dong Jia, Since Kirti has confirmed that in v7 it is designed to have only one vfio-mdev driver for all mdev devices, would you please tell us the reason of your vfio-ccw? It could possibly be an architectural gap and the earlier we discuss it the better :) -- Thanks, Jike >>> +static long vfio_mdev_unlocked_ioctl(void *device_data, >>> + unsigned int cmd, unsigned long arg) >>> +{ >>> + int ret = 0; >>> + struct vfio_mdev *vmdev = device_data; >>> + struct parent_device *parent = vmdev->mdev->parent; >>> + unsigned long minsz; >>> + >>> + switch (cmd) { >>> + case VFIO_DEVICE_GET_INFO: >>> + { >>> + struct vfio_device_info info; >>> + >>> + minsz = offsetofend(struct vfio_device_info, num_irqs); >>> + >>> + if (copy_from_user(&info, (void __user *)arg, minsz)) >>> + return -EFAULT; >>> + >>> + if (info.argsz < minsz) >>> + return -EINVAL; >>> + >>> + if (parent->ops->get_device_info) >>> + ret = parent->ops->get_device_info(vmdev->mdev, &info); >>> + else >>> + return -EINVAL; >>> + >>> + if (ret) >>> + return ret; >>> + >>> + if (parent->ops->reset) >>> + info.flags |= VFIO_DEVICE_FLAGS_RESET; >> Shouldn't this be done inside the get_device_info callback? >> >>> + >>> + memcpy(&vmdev->dev_info, &info, sizeof(info)); >>> + >>> + return copy_to_user((void __user *)arg, &info, minsz); >>> + } >> [...] >> >>> + >>> +static ssize_t vfio_mdev_read(void *device_data, char __user *buf, >>> + size_t count, loff_t *ppos) >>> +{ >>> + struct vfio_mdev *vmdev = device_data; >>> + struct mdev_device *mdev = vmdev->mdev; >>> + struct parent_device *parent = mdev->parent; >>> + unsigned int done = 0; >>> + int ret; >>> + >>> + if (!parent->ops->read) >>> + return -EINVAL; >>> + >>> + while (count) { >> Here, I have to say sorry to you guys for that I didn't notice the >> bad impact of this change to my patches during the v6 discussion. >> >> For vfio-ccw, I introduced an I/O region to input/output I/O >> instruction parameters and results for Qemu. The @count of these data >> currently is 140. So supporting arbitrary lengths in one shot here, and >> also in vfio_mdev_write, seems the better option for this case. >> >> I believe that if the pci drivers want to iterate in a 4 bytes step, you >> can do that in the parent read/write callbacks instead. >> >> What do you think? >> >>> + size_t filled; >>> + >>> + if (count >= 4 && !(*ppos % 4)) { >>> + u32 val; >>> + >>> + ret = parent->ops->read(mdev, (char *)&val, sizeof(val), >>> + *ppos); >>> + if (ret <= 0) >>> + goto read_err; >>> + >>> + if (copy_to_user(buf, &val, sizeof(val))) >>> + goto read_err; >>> + >>> + filled = 4; >>> + } else if (count >= 2 && !(*ppos % 2)) { >>> + u16 val; >>> + >>> + ret = parent->ops->read(mdev, (char *)&val, sizeof(val), >>> + *ppos); >>> + if (ret <= 0) >>> + goto read_err; >>> + >>> + if (copy_to_user(buf, &val, sizeof(val))) >>> + goto read_err; >>> + >>> + filled = 2; >>> + } else { >>> + u8 val; >>> + >>> + ret = parent->ops->read(mdev, &val, sizeof(val), *ppos); >>> + if (ret <= 0) >>> + goto read_err; >>> + >>> + if (copy_to_user(buf, &val, sizeof(val))) >>> + goto read_err; >>> + >>> + filled = 1; >>> + } >>> + >>> + count -= filled; >>> + done += filled; >>> + *ppos += filled; >>> + buf += filled; >>> + } >>> + >>> + return done; >>> + >>> +read_err: >>> + return -EFAULT; >>> +} >> [...] >> >> -------- >> Dong Jia >> -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 8/26/2016 7:43 PM, Kirti Wankhede wrote: > * PGP Signed: 08/26/2016 at 07:15:44 AM, Decrypted > On 8/25/2016 2:52 PM, Dong Jia wrote: >> On Thu, 25 Aug 2016 09:23:53 +0530 >>> + >>> +static ssize_t vfio_mdev_read(void *device_data, char __user *buf, >>> + size_t count, loff_t *ppos) >>> +{ >>> + struct vfio_mdev *vmdev = device_data; >>> + struct mdev_device *mdev = vmdev->mdev; >>> + struct parent_device *parent = mdev->parent; >>> + unsigned int done = 0; >>> + int ret; >>> + >>> + if (!parent->ops->read) >>> + return -EINVAL; >>> + >>> + while (count) { >> Here, I have to say sorry to you guys for that I didn't notice the >> bad impact of this change to my patches during the v6 discussion. >> >> For vfio-ccw, I introduced an I/O region to input/output I/O >> instruction parameters and results for Qemu. The @count of these data >> currently is 140. So supporting arbitrary lengths in one shot here, and >> also in vfio_mdev_write, seems the better option for this case. >> >> I believe that if the pci drivers want to iterate in a 4 bytes step, you >> can do that in the parent read/write callbacks instead. >> >> What do you think? >> > > I would like to know Alex's thought on this. He raised concern with this > approach in v6 reviews: > "But I think this is exploitable, it lets the user make the kernel > allocate an arbitrarily sized buffer." > Read/write callbacks are for slow path, emulation of mmio region which are mainly device registers. I do feel it shouldn't support arbitrary lengths. Alex, I would like to know your thoughts. Thanks, Kirti -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, 19 Sep 2016 23:52:36 +0530 Kirti Wankhede <kwankhede@nvidia.com> wrote: > On 8/26/2016 7:43 PM, Kirti Wankhede wrote: > > * PGP Signed: 08/26/2016 at 07:15:44 AM, Decrypted > > On 8/25/2016 2:52 PM, Dong Jia wrote: > >> On Thu, 25 Aug 2016 09:23:53 +0530 > > >>> + > >>> +static ssize_t vfio_mdev_read(void *device_data, char __user *buf, > >>> + size_t count, loff_t *ppos) > >>> +{ > >>> + struct vfio_mdev *vmdev = device_data; > >>> + struct mdev_device *mdev = vmdev->mdev; > >>> + struct parent_device *parent = mdev->parent; > >>> + unsigned int done = 0; > >>> + int ret; > >>> + > >>> + if (!parent->ops->read) > >>> + return -EINVAL; > >>> + > >>> + while (count) { > >> Here, I have to say sorry to you guys for that I didn't notice the > >> bad impact of this change to my patches during the v6 discussion. > >> > >> For vfio-ccw, I introduced an I/O region to input/output I/O > >> instruction parameters and results for Qemu. The @count of these data > >> currently is 140. So supporting arbitrary lengths in one shot here, and > >> also in vfio_mdev_write, seems the better option for this case. > >> > >> I believe that if the pci drivers want to iterate in a 4 bytes step, you > >> can do that in the parent read/write callbacks instead. > >> > >> What do you think? > >> > > > > I would like to know Alex's thought on this. He raised concern with this > > approach in v6 reviews: > > "But I think this is exploitable, it lets the user make the kernel > > allocate an arbitrarily sized buffer." > > > > Read/write callbacks are for slow path, emulation of mmio region which > are mainly device registers. I do feel it shouldn't support arbitrary > lengths. > Alex, I would like to know your thoughts. The exploit was that the mdev layer allocated a buffer and copied the entire user buffer into kernel space before passing it to the vendor driver. The solution is to pass the __user *buf to the vendor driver and let them sanitize and split it however makes sense for their device. We shouldn't be assuming naturally aligned, up to dword accesses in the generic mdev layers. Those sorts of semantics are defined by the device type. This is another case where making the mdev layer as thin as possible is actually the best thing to do to make it really device type agnostic. Thanks, Alex -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 9/20/2016 12:06 AM, Alex Williamson wrote: > On Mon, 19 Sep 2016 23:52:36 +0530 > Kirti Wankhede <kwankhede@nvidia.com> wrote: > >> On 8/26/2016 7:43 PM, Kirti Wankhede wrote: >>> * PGP Signed: 08/26/2016 at 07:15:44 AM, Decrypted >>> On 8/25/2016 2:52 PM, Dong Jia wrote: >>>> On Thu, 25 Aug 2016 09:23:53 +0530 >> >>>>> + >>>>> +static ssize_t vfio_mdev_read(void *device_data, char __user *buf, >>>>> + size_t count, loff_t *ppos) >>>>> +{ >>>>> + struct vfio_mdev *vmdev = device_data; >>>>> + struct mdev_device *mdev = vmdev->mdev; >>>>> + struct parent_device *parent = mdev->parent; >>>>> + unsigned int done = 0; >>>>> + int ret; >>>>> + >>>>> + if (!parent->ops->read) >>>>> + return -EINVAL; >>>>> + >>>>> + while (count) { >>>> Here, I have to say sorry to you guys for that I didn't notice the >>>> bad impact of this change to my patches during the v6 discussion. >>>> >>>> For vfio-ccw, I introduced an I/O region to input/output I/O >>>> instruction parameters and results for Qemu. The @count of these data >>>> currently is 140. So supporting arbitrary lengths in one shot here, and >>>> also in vfio_mdev_write, seems the better option for this case. >>>> >>>> I believe that if the pci drivers want to iterate in a 4 bytes step, you >>>> can do that in the parent read/write callbacks instead. >>>> >>>> What do you think? >>>> >>> >>> I would like to know Alex's thought on this. He raised concern with this >>> approach in v6 reviews: >>> "But I think this is exploitable, it lets the user make the kernel >>> allocate an arbitrarily sized buffer." >>> >> >> Read/write callbacks are for slow path, emulation of mmio region which >> are mainly device registers. I do feel it shouldn't support arbitrary >> lengths. >> Alex, I would like to know your thoughts. > > The exploit was that the mdev layer allocated a buffer and copied the > entire user buffer into kernel space before passing it to the vendor > driver. The solution is to pass the __user *buf to the vendor driver > and let them sanitize and split it however makes sense for their > device. We shouldn't be assuming naturally aligned, up to dword > accesses in the generic mdev layers. Those sorts of semantics are > defined by the device type. This is another case where making > the mdev layer as thin as possible is actually the best thing to > do to make it really device type agnostic. Thanks, > Alex, These were the comments on v6 patch: >>> Do we really need to support arbitrary lengths in one shot? Seems >>> like >>> we could just use a 4 or 8 byte variable on the stack and iterate >>> until >>> done. >>> >> >> We just want to pass the arguments to vendor driver as is here.Vendor >> driver could take care of that. > But I think this is exploitable, it lets the user make the kernel > allocate an arbitrarily sized buffer. As per above discussion in v7 version, this module don't allocated memory from heap. If vendor driver allocates arbitrary memory in kernel space through mdev module interface, isn't that would be exploit? Thanks, Kirti ----------------------------------------------------------------------------------- This email message is for the sole use of the intended recipient(s) and may contain confidential information. Any unauthorized review, use, disclosure or distribution is prohibited. If you are not the intended recipient, please contact the sender by reply email and destroy all copies of the original message. ----------------------------------------------------------------------------------- -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, 20 Sep 2016 00:43:15 +0530 Kirti Wankhede <kwankhede@nvidia.com> wrote: > On 9/20/2016 12:06 AM, Alex Williamson wrote: > > On Mon, 19 Sep 2016 23:52:36 +0530 > > Kirti Wankhede <kwankhede@nvidia.com> wrote: > > > >> On 8/26/2016 7:43 PM, Kirti Wankhede wrote: > >>> * PGP Signed: 08/26/2016 at 07:15:44 AM, Decrypted > >>> On 8/25/2016 2:52 PM, Dong Jia wrote: > >>>> On Thu, 25 Aug 2016 09:23:53 +0530 > >> > >>>>> + > >>>>> +static ssize_t vfio_mdev_read(void *device_data, char __user *buf, > >>>>> + size_t count, loff_t *ppos) > >>>>> +{ > >>>>> + struct vfio_mdev *vmdev = device_data; > >>>>> + struct mdev_device *mdev = vmdev->mdev; > >>>>> + struct parent_device *parent = mdev->parent; > >>>>> + unsigned int done = 0; > >>>>> + int ret; > >>>>> + > >>>>> + if (!parent->ops->read) > >>>>> + return -EINVAL; > >>>>> + > >>>>> + while (count) { > >>>> Here, I have to say sorry to you guys for that I didn't notice the > >>>> bad impact of this change to my patches during the v6 discussion. > >>>> > >>>> For vfio-ccw, I introduced an I/O region to input/output I/O > >>>> instruction parameters and results for Qemu. The @count of these data > >>>> currently is 140. So supporting arbitrary lengths in one shot here, and > >>>> also in vfio_mdev_write, seems the better option for this case. > >>>> > >>>> I believe that if the pci drivers want to iterate in a 4 bytes step, you > >>>> can do that in the parent read/write callbacks instead. > >>>> > >>>> What do you think? > >>>> > >>> > >>> I would like to know Alex's thought on this. He raised concern with this > >>> approach in v6 reviews: > >>> "But I think this is exploitable, it lets the user make the kernel > >>> allocate an arbitrarily sized buffer." > >>> > >> > >> Read/write callbacks are for slow path, emulation of mmio region which > >> are mainly device registers. I do feel it shouldn't support arbitrary > >> lengths. > >> Alex, I would like to know your thoughts. > > > > The exploit was that the mdev layer allocated a buffer and copied the > > entire user buffer into kernel space before passing it to the vendor > > driver. The solution is to pass the __user *buf to the vendor driver > > and let them sanitize and split it however makes sense for their > > device. We shouldn't be assuming naturally aligned, up to dword > > accesses in the generic mdev layers. Those sorts of semantics are > > defined by the device type. This is another case where making > > the mdev layer as thin as possible is actually the best thing to > > do to make it really device type agnostic. Thanks, > > > > > Alex, > > These were the comments on v6 patch: > > >>> Do we really need to support arbitrary lengths in one shot? Seems > >>> like > >>> we could just use a 4 or 8 byte variable on the stack and iterate > >>> until > >>> done. > >>> > >> > >> We just want to pass the arguments to vendor driver as is here.Vendor > >> driver could take care of that. > > > But I think this is exploitable, it lets the user make the kernel > > allocate an arbitrarily sized buffer. > > As per above discussion in v7 version, this module don't allocated > memory from heap. > > If vendor driver allocates arbitrary memory in kernel space through mdev > module interface, isn't that would be exploit? Yep, my 4-8/byte chunking idea was too PCI specific. If a vendor driver introduces an exploit, that's a bug in the vendor driver. I'm not sure if we can or should attempt to guard against that. Ultimately the vendor driver is either open source and we can inspect it for such exploits or it's closed source, taints the kernel, and we hope for the best. It might make a good unit test to perform substantially sized reads/writes to the mdev device. Perhaps the only sanity test we can make in the core is to verify the access doesn't exceed the size of the region as advertised by the vendor driver. Thanks, Alex -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 09/20/2016 04:03 AM, Alex Williamson wrote: > On Tue, 20 Sep 2016 00:43:15 +0530 > Kirti Wankhede <kwankhede@nvidia.com> wrote: > >> On 9/20/2016 12:06 AM, Alex Williamson wrote: >>> On Mon, 19 Sep 2016 23:52:36 +0530 >>> Kirti Wankhede <kwankhede@nvidia.com> wrote: >>> >>>> On 8/26/2016 7:43 PM, Kirti Wankhede wrote: >>>>> * PGP Signed: 08/26/2016 at 07:15:44 AM, Decrypted >>>>> On 8/25/2016 2:52 PM, Dong Jia wrote: >>>>>> On Thu, 25 Aug 2016 09:23:53 +0530 >>>> >>>>>>> + >>>>>>> +static ssize_t vfio_mdev_read(void *device_data, char __user *buf, >>>>>>> + size_t count, loff_t *ppos) >>>>>>> +{ >>>>>>> + struct vfio_mdev *vmdev = device_data; >>>>>>> + struct mdev_device *mdev = vmdev->mdev; >>>>>>> + struct parent_device *parent = mdev->parent; >>>>>>> + unsigned int done = 0; >>>>>>> + int ret; >>>>>>> + >>>>>>> + if (!parent->ops->read) >>>>>>> + return -EINVAL; >>>>>>> + >>>>>>> + while (count) { >>>>>> Here, I have to say sorry to you guys for that I didn't notice the >>>>>> bad impact of this change to my patches during the v6 discussion. >>>>>> >>>>>> For vfio-ccw, I introduced an I/O region to input/output I/O >>>>>> instruction parameters and results for Qemu. The @count of these data >>>>>> currently is 140. So supporting arbitrary lengths in one shot here, and >>>>>> also in vfio_mdev_write, seems the better option for this case. >>>>>> >>>>>> I believe that if the pci drivers want to iterate in a 4 bytes step, you >>>>>> can do that in the parent read/write callbacks instead. >>>>>> >>>>>> What do you think? >>>>>> >>>>> >>>>> I would like to know Alex's thought on this. He raised concern with this >>>>> approach in v6 reviews: >>>>> "But I think this is exploitable, it lets the user make the kernel >>>>> allocate an arbitrarily sized buffer." >>>>> >>>> >>>> Read/write callbacks are for slow path, emulation of mmio region which >>>> are mainly device registers. I do feel it shouldn't support arbitrary >>>> lengths. >>>> Alex, I would like to know your thoughts. >>> >>> The exploit was that the mdev layer allocated a buffer and copied the >>> entire user buffer into kernel space before passing it to the vendor >>> driver. The solution is to pass the __user *buf to the vendor driver >>> and let them sanitize and split it however makes sense for their >>> device. We shouldn't be assuming naturally aligned, up to dword >>> accesses in the generic mdev layers. Those sorts of semantics are >>> defined by the device type. This is another case where making >>> the mdev layer as thin as possible is actually the best thing to >>> do to make it really device type agnostic. Thanks, >>> >> >> >> Alex, >> >> These were the comments on v6 patch: >> >>>>> Do we really need to support arbitrary lengths in one shot? Seems >>>>> like >>>>> we could just use a 4 or 8 byte variable on the stack and iterate >>>>> until >>>>> done. >>>>> >>>> >>>> We just want to pass the arguments to vendor driver as is here.Vendor >>>> driver could take care of that. >> >>> But I think this is exploitable, it lets the user make the kernel >>> allocate an arbitrarily sized buffer. >> >> As per above discussion in v7 version, this module don't allocated >> memory from heap. >> >> If vendor driver allocates arbitrary memory in kernel space through mdev >> module interface, isn't that would be exploit? > > Yep, my 4-8/byte chunking idea was too PCI specific. If a vendor > driver introduces an exploit, that's a bug in the vendor driver. I'm > not sure if we can or should attempt to guard against that. Ultimately > the vendor driver is either open source and we can inspect it for such > exploits or it's closed source, taints the kernel, and we hope for the > best. It might make a good unit test to perform substantially sized > reads/writes to the mdev device. Can't agree more! :-) > Perhaps the only sanity test we can > make in the core is to verify the access doesn't exceed the size of > the region as advertised by the vendor driver. Thanks, Even performing a lightweight sanity check, would require vfio-mdev to be able to decode the ppos into a particular region, that means information of all regions should be stored in the framework. I guess it is not your preferred way :) -- Thanks, Jike -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 09/20/2016 01:48 PM, Dong Jia Shi wrote: > * Jike Song <jike.song@intel.com> [2016-09-13 10:35:11 +0800]: > >> On 09/08/2016 10:45 AM, Jike Song wrote: >>> On 08/25/2016 05:22 PM, Dong Jia wrote: >>>> On Thu, 25 Aug 2016 09:23:53 +0530 >>>> Kirti Wankhede <kwankhede@nvidia.com> wrote: >>>> >>>> [...] >>>> >>>> Dear Kirti, >>>> >>>> I just rebased my vfio-ccw patches to this series. >>>> With a little fix, which was pointed it out in my reply to the #3 >>>> patch, it works fine. >>>> >>> >>> Hi Jia, >>> >>> Sorry I didn't follow a lot in previous discussion, but since >>> vfio-mdev in v7 patchset is at least PCI-agnostic, would you share >>> with us why you still need a vfio-ccw? >> >> Kind ping :) >> >> >> Hi Dong Jia, >> >> Since Kirti has confirmed that in v7 it is designed to have only one >> vfio-mdev driver for all mdev devices, would you please tell us the >> reason of your vfio-ccw? It could possibly be an architectural gap and >> the earlier we discuss it the better :) > > Hi Jike, > > Sorry for the late response. > NP, thanks for the info :) > I think you may mix up vfio-ccw with vfio-mccw, which is in the same > level with vfio-mpci in v6 (or vfio-mdev in v7). :> > > The term of vfio-ccw is in the same semantic level with vfio-pci. You > can understand vfio-ccw as the parent device driver in the ccw case. > So 'vfio-ccw' is actually the driver of physical device. > As you mentioned above, v7 provides an universal mdev driver (vfio-mdev) > for the mediated devices. So I don't need to provide a vfio-mccw (a > vendor specific mediated device driver) anymore, but I'd still need my > vfio-ccw (the parent device driver). Glad to know that vfio-mdev works for all mdev devices :) -- Thanks, Jike -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, 20 Sep 2016 10:50:47 +0800 Jike Song <jike.song@intel.com> wrote: > On 09/20/2016 04:03 AM, Alex Williamson wrote: > > On Tue, 20 Sep 2016 00:43:15 +0530 > > Kirti Wankhede <kwankhede@nvidia.com> wrote: > > > >> On 9/20/2016 12:06 AM, Alex Williamson wrote: > >>> On Mon, 19 Sep 2016 23:52:36 +0530 > >>> Kirti Wankhede <kwankhede@nvidia.com> wrote: > >>> > >>>> On 8/26/2016 7:43 PM, Kirti Wankhede wrote: > >>>>> * PGP Signed: 08/26/2016 at 07:15:44 AM, Decrypted > >>>>> On 8/25/2016 2:52 PM, Dong Jia wrote: > >>>>>> On Thu, 25 Aug 2016 09:23:53 +0530 > >>>> > >>>>>>> + > >>>>>>> +static ssize_t vfio_mdev_read(void *device_data, char __user *buf, > >>>>>>> + size_t count, loff_t *ppos) > >>>>>>> +{ > >>>>>>> + struct vfio_mdev *vmdev = device_data; > >>>>>>> + struct mdev_device *mdev = vmdev->mdev; > >>>>>>> + struct parent_device *parent = mdev->parent; > >>>>>>> + unsigned int done = 0; > >>>>>>> + int ret; > >>>>>>> + > >>>>>>> + if (!parent->ops->read) > >>>>>>> + return -EINVAL; > >>>>>>> + > >>>>>>> + while (count) { > >>>>>> Here, I have to say sorry to you guys for that I didn't notice the > >>>>>> bad impact of this change to my patches during the v6 discussion. > >>>>>> > >>>>>> For vfio-ccw, I introduced an I/O region to input/output I/O > >>>>>> instruction parameters and results for Qemu. The @count of these data > >>>>>> currently is 140. So supporting arbitrary lengths in one shot here, and > >>>>>> also in vfio_mdev_write, seems the better option for this case. > >>>>>> > >>>>>> I believe that if the pci drivers want to iterate in a 4 bytes step, you > >>>>>> can do that in the parent read/write callbacks instead. > >>>>>> > >>>>>> What do you think? > >>>>>> > >>>>> > >>>>> I would like to know Alex's thought on this. He raised concern with this > >>>>> approach in v6 reviews: > >>>>> "But I think this is exploitable, it lets the user make the kernel > >>>>> allocate an arbitrarily sized buffer." > >>>>> > >>>> > >>>> Read/write callbacks are for slow path, emulation of mmio region which > >>>> are mainly device registers. I do feel it shouldn't support arbitrary > >>>> lengths. > >>>> Alex, I would like to know your thoughts. > >>> > >>> The exploit was that the mdev layer allocated a buffer and copied the > >>> entire user buffer into kernel space before passing it to the vendor > >>> driver. The solution is to pass the __user *buf to the vendor driver > >>> and let them sanitize and split it however makes sense for their > >>> device. We shouldn't be assuming naturally aligned, up to dword > >>> accesses in the generic mdev layers. Those sorts of semantics are > >>> defined by the device type. This is another case where making > >>> the mdev layer as thin as possible is actually the best thing to > >>> do to make it really device type agnostic. Thanks, > >>> > >> > >> > >> Alex, > >> > >> These were the comments on v6 patch: > >> > >>>>> Do we really need to support arbitrary lengths in one shot? Seems > >>>>> like > >>>>> we could just use a 4 or 8 byte variable on the stack and iterate > >>>>> until > >>>>> done. > >>>>> > >>>> > >>>> We just want to pass the arguments to vendor driver as is here.Vendor > >>>> driver could take care of that. > >> > >>> But I think this is exploitable, it lets the user make the kernel > >>> allocate an arbitrarily sized buffer. > >> > >> As per above discussion in v7 version, this module don't allocated > >> memory from heap. > >> > >> If vendor driver allocates arbitrary memory in kernel space through mdev > >> module interface, isn't that would be exploit? > > > > Yep, my 4-8/byte chunking idea was too PCI specific. If a vendor > > driver introduces an exploit, that's a bug in the vendor driver. I'm > > not sure if we can or should attempt to guard against that. Ultimately > > the vendor driver is either open source and we can inspect it for such > > exploits or it's closed source, taints the kernel, and we hope for the > > best. It might make a good unit test to perform substantially sized > > reads/writes to the mdev device. > > Can't agree more! :-) > > > Perhaps the only sanity test we can > > make in the core is to verify the access doesn't exceed the size of > > the region as advertised by the vendor driver. Thanks, > > Even performing a lightweight sanity check, would require vfio-mdev > to be able to decode the ppos into a particular region, that means > information of all regions should be stored in the framework. I guess > it is not your preferred way :) There's certainly a trade-off there, we don't support dynamic regions, the user expects them to be stable and the mdev-core code can expect that also. It might simplify the vendor drivers slightly if the core could perform such a basic sanity test, but the cost to do so would be that the core needs to have an understanding of the region layout of the device. That seems like non-trivial overhead to consolidate testing that the vendor driver itself can do much more efficiently. Thanks, Alex -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 09/21/2016 12:24 AM, Alex Williamson wrote: > On Tue, 20 Sep 2016 10:50:47 +0800 > Jike Song <jike.song@intel.com> wrote: /* trim the quotations */ >> Even performing a lightweight sanity check, would require vfio-mdev >> to be able to decode the ppos into a particular region, that means >> information of all regions should be stored in the framework. I guess >> it is not your preferred way :) > > There's certainly a trade-off there, we don't support dynamic regions, > the user expects them to be stable and the mdev-core code can expect > that also. It might simplify the vendor drivers slightly if the core > could perform such a basic sanity test, but the cost to do so would be > that the core needs to have an understanding of the region layout of > the device. I agree with why the requirement is, but I am suspicious that, if we assume the regions are stable, try to encode/decode that within the mdev-core framework - instead of vendor drivers - that is because we want mdev to be API compatible with vfio-pci? Being API compatible with vfio-pci is (IMHO) the most beautiful thing in current mdev design, but is it necessary to make it mandatory? How about letting the underlining vendor drivers to decide whether it is API compatible with vfio-pci, or will have a different set of userspace API? > That seems like non-trivial overhead to consolidate > testing that the vendor driver itself can do much more efficiently. Yes, this is also a trade-off if adopted :( -- Thanks, Jike -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, 21 Sep 2016 11:19:17 +0800 Jike Song <jike.song@intel.com> wrote: > On 09/21/2016 12:24 AM, Alex Williamson wrote: > > On Tue, 20 Sep 2016 10:50:47 +0800 > > Jike Song <jike.song@intel.com> wrote: > > /* trim the quotations */ > > >> Even performing a lightweight sanity check, would require vfio-mdev > >> to be able to decode the ppos into a particular region, that means > >> information of all regions should be stored in the framework. I guess > >> it is not your preferred way :) > > > > There's certainly a trade-off there, we don't support dynamic regions, > > the user expects them to be stable and the mdev-core code can expect > > that also. It might simplify the vendor drivers slightly if the core > > could perform such a basic sanity test, but the cost to do so would be > > that the core needs to have an understanding of the region layout of > > the device. > > I agree with why the requirement is, but I am suspicious that, > if we assume the regions are stable, try to encode/decode that within > the mdev-core framework - instead of vendor drivers - that is because > we want mdev to be API compatible with vfio-pci? > > Being API compatible with vfio-pci is (IMHO) the most beautiful thing > in current mdev design, but is it necessary to make it mandatory? > How about letting the underlining vendor drivers to decide whether > it is API compatible with vfio-pci, or will have a different set of > userspace API? Are you assuming that I'm suggesting using VFIO_PCI_OFFSET_TO_INDEX in the mdev core? We've been through that, I've rejected it, that's not at all what I'm describing. The vfio bus driver defines the region layout, but once defined it is fixed for a given device instance. A user does not need to call ioctl(VFIO_DEVICE_GET_REGION_INFO) prior to every region access to make sure the region offsets haven't changed dynamically. If it's fixed to the user than it's also fixed to the mdev core for a given device instance, so nothing prevents the core code from doing its own enumeration of the region offsets and sizes and caching them into data structures. That has nothing whatsoever to do with vfio-pci and makes no assumptions about the layout of regions within device fd. Thanks, Alex -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 09/21/2016 12:51 PM, Alex Williamson wrote: > On Wed, 21 Sep 2016 11:19:17 +0800 > Jike Song <jike.song@intel.com> wrote: > >> On 09/21/2016 12:24 AM, Alex Williamson wrote: >>> On Tue, 20 Sep 2016 10:50:47 +0800 >>> Jike Song <jike.song@intel.com> wrote: >> >> /* trim the quotations */ >> >>>> Even performing a lightweight sanity check, would require vfio-mdev >>>> to be able to decode the ppos into a particular region, that means >>>> information of all regions should be stored in the framework. I guess >>>> it is not your preferred way :) >>> >>> There's certainly a trade-off there, we don't support dynamic regions, >>> the user expects them to be stable and the mdev-core code can expect >>> that also. It might simplify the vendor drivers slightly if the core >>> could perform such a basic sanity test, but the cost to do so would be >>> that the core needs to have an understanding of the region layout of >>> the device. >> >> I agree with why the requirement is, but I am suspicious that, >> if we assume the regions are stable, try to encode/decode that within >> the mdev-core framework - instead of vendor drivers - that is because >> we want mdev to be API compatible with vfio-pci? >> >> Being API compatible with vfio-pci is (IMHO) the most beautiful thing >> in current mdev design, but is it necessary to make it mandatory? >> How about letting the underlining vendor drivers to decide whether >> it is API compatible with vfio-pci, or will have a different set of >> userspace API? > > Are you assuming that I'm suggesting using VFIO_PCI_OFFSET_TO_INDEX in > the mdev core? We've been through that, I've rejected it, that's not > at all what I'm describing. The vfio bus driver defines the region > layout, but once defined it is fixed for a given device instance. A > user does not need to call ioctl(VFIO_DEVICE_GET_REGION_INFO) prior to > every region access to make sure the region offsets haven't changed > dynamically. If it's fixed to the user than it's also fixed to the > mdev core for a given device instance, so nothing prevents the core > code from doing its own enumeration of the region offsets and sizes and > caching them into data structures. That has nothing whatsoever to do > with vfio-pci and makes no assumptions about the layout of regions > within device fd. Thanks, > I misunderstood that previously and I understand the whole idea now. Thanks for the kind explanation! :) -- Thanks, Jike -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/drivers/vfio/mdev/Kconfig b/drivers/vfio/mdev/Kconfig index a34fbc66f92f..703abd0a9bff 100644 --- a/drivers/vfio/mdev/Kconfig +++ b/drivers/vfio/mdev/Kconfig @@ -9,4 +9,10 @@ config VFIO_MDEV If you don't know what do here, say N. +config VFIO_MDEV_DEVICE + tristate "VFIO support for Mediated devices" + depends on VFIO && VFIO_MDEV + default n + help + VFIO based driver for mediated devices. diff --git a/drivers/vfio/mdev/Makefile b/drivers/vfio/mdev/Makefile index 56a75e689582..e5087ed83a34 100644 --- a/drivers/vfio/mdev/Makefile +++ b/drivers/vfio/mdev/Makefile @@ -2,4 +2,5 @@ mdev-y := mdev_core.o mdev_sysfs.o mdev_driver.o obj-$(CONFIG_VFIO_MDEV) += mdev.o +obj-$(CONFIG_VFIO_MDEV_DEVICE) += vfio_mdev.o diff --git a/drivers/vfio/mdev/vfio_mdev.c b/drivers/vfio/mdev/vfio_mdev.c new file mode 100644 index 000000000000..28f13aeaa46b --- /dev/null +++ b/drivers/vfio/mdev/vfio_mdev.c @@ -0,0 +1,467 @@ +/* + * VFIO based Mediated PCI device driver + * + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. + * Author: Neo Jia <cjia@nvidia.com> + * Kirti Wankhede <kwankhede@nvidia.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/device.h> +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/uuid.h> +#include <linux/vfio.h> +#include <linux/iommu.h> +#include <linux/mdev.h> + +#include "mdev_private.h" + +#define DRIVER_VERSION "0.1" +#define DRIVER_AUTHOR "NVIDIA Corporation" +#define DRIVER_DESC "VFIO based Mediated PCI device driver" + +struct vfio_mdev { + struct iommu_group *group; + struct mdev_device *mdev; + struct vfio_device_info dev_info; +}; + +static int vfio_mdev_open(void *device_data) +{ + int ret = 0; + + if (!try_module_get(THIS_MODULE)) + return -ENODEV; + + return ret; +} + +static void vfio_mdev_close(void *device_data) +{ + module_put(THIS_MODULE); +} + +static int sparse_mmap_cap(struct vfio_info_cap *caps, void *cap_type) +{ + struct vfio_info_cap_header *header; + struct vfio_region_info_cap_sparse_mmap *sparse_cap, *sparse = cap_type; + size_t size; + + size = sizeof(*sparse) + sparse->nr_areas * sizeof(*sparse->areas); + header = vfio_info_cap_add(caps, size, + VFIO_REGION_INFO_CAP_SPARSE_MMAP, 1); + if (IS_ERR(header)) + return PTR_ERR(header); + + sparse_cap = container_of(header, + struct vfio_region_info_cap_sparse_mmap, header); + sparse_cap->nr_areas = sparse->nr_areas; + memcpy(sparse_cap->areas, sparse->areas, + sparse->nr_areas * sizeof(*sparse->areas)); + return 0; +} + +static int region_type_cap(struct vfio_info_cap *caps, void *cap_type) +{ + struct vfio_info_cap_header *header; + struct vfio_region_info_cap_type *type_cap, *cap = cap_type; + + header = vfio_info_cap_add(caps, sizeof(*cap), + VFIO_REGION_INFO_CAP_TYPE, 1); + if (IS_ERR(header)) + return PTR_ERR(header); + + type_cap = container_of(header, struct vfio_region_info_cap_type, + header); + type_cap->type = cap->type; + type_cap->subtype = cap->type; + return 0; +} + +static long vfio_mdev_unlocked_ioctl(void *device_data, + unsigned int cmd, unsigned long arg) +{ + int ret = 0; + struct vfio_mdev *vmdev = device_data; + struct parent_device *parent = vmdev->mdev->parent; + unsigned long minsz; + + switch (cmd) { + case VFIO_DEVICE_GET_INFO: + { + struct vfio_device_info info; + + minsz = offsetofend(struct vfio_device_info, num_irqs); + + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz) + return -EINVAL; + + if (parent->ops->get_device_info) + ret = parent->ops->get_device_info(vmdev->mdev, &info); + else + return -EINVAL; + + if (ret) + return ret; + + if (parent->ops->reset) + info.flags |= VFIO_DEVICE_FLAGS_RESET; + + memcpy(&vmdev->dev_info, &info, sizeof(info)); + + return copy_to_user((void __user *)arg, &info, minsz); + } + case VFIO_DEVICE_GET_REGION_INFO: + { + struct vfio_region_info info; + struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; + u16 cap_type_id = 0; + void *cap_type = NULL; + + minsz = offsetofend(struct vfio_region_info, offset); + + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz) + return -EINVAL; + + if (parent->ops->get_region_info) + ret = parent->ops->get_region_info(vmdev->mdev, &info, + &cap_type_id, &cap_type); + else + return -EINVAL; + + if (ret) + return ret; + + if ((info.flags & VFIO_REGION_INFO_FLAG_CAPS) && cap_type) { + switch (cap_type_id) { + case VFIO_REGION_INFO_CAP_SPARSE_MMAP: + ret = sparse_mmap_cap(&caps, cap_type); + if (ret) + return ret; + break; + + case VFIO_REGION_INFO_CAP_TYPE: + ret = region_type_cap(&caps, cap_type); + if (ret) + return ret; + break; + default: + return -EINVAL; + } + } + + if (caps.size) { + if (info.argsz < sizeof(info) + caps.size) { + info.argsz = sizeof(info) + caps.size; + info.cap_offset = 0; + } else { + vfio_info_cap_shift(&caps, sizeof(info)); + if (copy_to_user((void __user *)arg + + sizeof(info), caps.buf, + caps.size)) { + kfree(caps.buf); + return -EFAULT; + } + info.cap_offset = sizeof(info); + } + kfree(caps.buf); + } + + return copy_to_user((void __user *)arg, &info, minsz); + } + case VFIO_DEVICE_GET_IRQ_INFO: + { + struct vfio_irq_info info; + + minsz = offsetofend(struct vfio_irq_info, count); + + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; + + if ((info.argsz < minsz) || + (info.index >= vmdev->dev_info.num_irqs)) + return -EINVAL; + + if (parent->ops->get_irq_info) + ret = parent->ops->get_irq_info(vmdev->mdev, &info); + else + return -EINVAL; + + if (ret) + return ret; + + if (info.count == -1) + return -EINVAL; + + return copy_to_user((void __user *)arg, &info, minsz); + } + case VFIO_DEVICE_SET_IRQS: + { + struct vfio_irq_set hdr; + u8 *data = NULL, *ptr = NULL; + + minsz = offsetofend(struct vfio_irq_set, count); + + if (copy_from_user(&hdr, (void __user *)arg, minsz)) + return -EFAULT; + + if ((hdr.argsz < minsz) || + (hdr.index >= vmdev->dev_info.num_irqs) || + (hdr.flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK | + VFIO_IRQ_SET_ACTION_TYPE_MASK))) + return -EINVAL; + + if (!(hdr.flags & VFIO_IRQ_SET_DATA_NONE)) { + size_t size; + + if (hdr.flags & VFIO_IRQ_SET_DATA_BOOL) + size = sizeof(uint8_t); + else if (hdr.flags & VFIO_IRQ_SET_DATA_EVENTFD) + size = sizeof(int32_t); + else + return -EINVAL; + + if (hdr.argsz - minsz < hdr.count * size) + return -EINVAL; + + ptr = data = memdup_user((void __user *)(arg + minsz), + hdr.count * size); + if (IS_ERR(data)) + return PTR_ERR(data); + } + + if (parent->ops->set_irqs) + ret = parent->ops->set_irqs(vmdev->mdev, hdr.flags, + hdr.index, hdr.start, + hdr.count, data); + else + ret = -EINVAL; + + kfree(ptr); + return ret; + } + case VFIO_DEVICE_RESET: + { + if (parent->ops->reset) + return parent->ops->reset(vmdev->mdev); + + return -EINVAL; + } + } + return -ENOTTY; +} + +static ssize_t vfio_mdev_read(void *device_data, char __user *buf, + size_t count, loff_t *ppos) +{ + struct vfio_mdev *vmdev = device_data; + struct mdev_device *mdev = vmdev->mdev; + struct parent_device *parent = mdev->parent; + unsigned int done = 0; + int ret; + + if (!parent->ops->read) + return -EINVAL; + + while (count) { + size_t filled; + + if (count >= 4 && !(*ppos % 4)) { + u32 val; + + ret = parent->ops->read(mdev, (char *)&val, sizeof(val), + *ppos); + if (ret <= 0) + goto read_err; + + if (copy_to_user(buf, &val, sizeof(val))) + goto read_err; + + filled = 4; + } else if (count >= 2 && !(*ppos % 2)) { + u16 val; + + ret = parent->ops->read(mdev, (char *)&val, sizeof(val), + *ppos); + if (ret <= 0) + goto read_err; + + if (copy_to_user(buf, &val, sizeof(val))) + goto read_err; + + filled = 2; + } else { + u8 val; + + ret = parent->ops->read(mdev, &val, sizeof(val), *ppos); + if (ret <= 0) + goto read_err; + + if (copy_to_user(buf, &val, sizeof(val))) + goto read_err; + + filled = 1; + } + + count -= filled; + done += filled; + *ppos += filled; + buf += filled; + } + + return done; + +read_err: + return -EFAULT; +} + +static ssize_t vfio_mdev_write(void *device_data, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct vfio_mdev *vmdev = device_data; + struct mdev_device *mdev = vmdev->mdev; + struct parent_device *parent = mdev->parent; + unsigned int done = 0; + int ret; + + if (!parent->ops->write) + return -EINVAL; + + while (count) { + size_t filled; + + if (count >= 4 && !(*ppos % 4)) { + u32 val; + + if (copy_from_user(&val, buf, sizeof(val))) + goto write_err; + + ret = parent->ops->write(mdev, (char *)&val, + sizeof(val), *ppos); + if (ret <= 0) + goto write_err; + + filled = 4; + } else if (count >= 2 && !(*ppos % 2)) { + u16 val; + + if (copy_from_user(&val, buf, sizeof(val))) + goto write_err; + + ret = parent->ops->write(mdev, (char *)&val, + sizeof(val), *ppos); + if (ret <= 0) + goto write_err; + + filled = 2; + } else { + u8 val; + + if (copy_from_user(&val, buf, sizeof(val))) + goto write_err; + + ret = parent->ops->write(mdev, &val, sizeof(val), + *ppos); + if (ret <= 0) + goto write_err; + + filled = 1; + } + + count -= filled; + done += filled; + *ppos += filled; + buf += filled; + } + + return done; +write_err: + return -EFAULT; +} + +static int vfio_mdev_mmap(void *device_data, struct vm_area_struct *vma) +{ + struct vfio_mdev *vmdev = device_data; + struct mdev_device *mdev = vmdev->mdev; + struct parent_device *parent = mdev->parent; + + if (parent->ops->mmap) + return parent->ops->mmap(mdev, vma); + + return -EINVAL; +} + +static const struct vfio_device_ops vfio_mdev_dev_ops = { + .name = "vfio-mdev", + .open = vfio_mdev_open, + .release = vfio_mdev_close, + .ioctl = vfio_mdev_unlocked_ioctl, + .read = vfio_mdev_read, + .write = vfio_mdev_write, + .mmap = vfio_mdev_mmap, +}; + +int vfio_mdev_probe(struct device *dev) +{ + struct vfio_mdev *vmdev; + struct mdev_device *mdev = to_mdev_device(dev); + int ret; + + vmdev = kzalloc(sizeof(*vmdev), GFP_KERNEL); + if (IS_ERR(vmdev)) + return PTR_ERR(vmdev); + + vmdev->mdev = mdev_get_device(mdev); + vmdev->group = mdev->group; + + ret = vfio_add_group_dev(dev, &vfio_mdev_dev_ops, vmdev); + if (ret) + kfree(vmdev); + + mdev_put_device(mdev); + return ret; +} + +void vfio_mdev_remove(struct device *dev) +{ + struct vfio_mdev *vmdev; + + vmdev = vfio_del_group_dev(dev); + kfree(vmdev); +} + +struct mdev_driver vfio_mdev_driver = { + .name = "vfio_mdev", + .probe = vfio_mdev_probe, + .remove = vfio_mdev_remove, +}; + +static int __init vfio_mdev_init(void) +{ + return mdev_register_driver(&vfio_mdev_driver, THIS_MODULE); +} + +static void __exit vfio_mdev_exit(void) +{ + mdev_unregister_driver(&vfio_mdev_driver); +} + +module_init(vfio_mdev_init) +module_exit(vfio_mdev_exit) + +MODULE_VERSION(DRIVER_VERSION); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR(DRIVER_AUTHOR); +MODULE_DESCRIPTION(DRIVER_DESC); diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h index 016c14a1b454..776cc2b063d4 100644 --- a/drivers/vfio/pci/vfio_pci_private.h +++ b/drivers/vfio/pci/vfio_pci_private.h @@ -21,9 +21,9 @@ #define VFIO_PCI_OFFSET_SHIFT 40 -#define VFIO_PCI_OFFSET_TO_INDEX(off) (off >> VFIO_PCI_OFFSET_SHIFT) -#define VFIO_PCI_INDEX_TO_OFFSET(index) ((u64)(index) << VFIO_PCI_OFFSET_SHIFT) -#define VFIO_PCI_OFFSET_MASK (((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1) +#define VFIO_PCI_OFFSET_TO_INDEX(off) (off >> VFIO_PCI_OFFSET_SHIFT) +#define VFIO_PCI_INDEX_TO_OFFSET(index) ((u64)(index) << VFIO_PCI_OFFSET_SHIFT) +#define VFIO_PCI_OFFSET_MASK (((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1) /* Special capability IDs predefined access */ #define PCI_CAP_ID_INVALID 0xFF /* default raw access */