Message ID | 92fb16181e71a1c4049f9995294dbd7ff4270627.1629131628.git.elena.ufimtseva@oracle.com |
---|---|
State | New, archived |
Series | vfio-user implementation |
On Mon, Aug 16, 2021 at 09:42:42AM -0700, Elena Ufimtseva wrote:
> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> index 7d667b0533..a8b1ea9358 100644
> --- a/hw/vfio/common.c
> +++ b/hw/vfio/common.c
> @@ -215,6 +215,7 @@ void vfio_region_write(void *opaque, hwaddr addr,
>          uint32_t dword;
>          uint64_t qword;
>      } buf;
> +    int ret;
>
>      switch (size) {
>      case 1:
> @@ -234,7 +235,12 @@ void vfio_region_write(void *opaque, hwaddr addr,
>          break;
>      }
>
> -    if (pwrite(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) {
> +    if (vbasedev->proxy != NULL) {
> +        ret = vfio_user_region_write(vbasedev, region->nr, addr, size, &data);
> +    } else {
> +        ret = pwrite(vbasedev->fd, &buf, size, region->fd_offset + addr);
> +    }

The vfio-user spec says everything is little-endian. Why does
vfio_user_region_write() take the host-endian uint64_t data value instead of
the little-endian buf value?

> +    if (ret != size) {
>          error_report("%s(%s:region%d+0x%"HWADDR_PRIx", 0x%"PRIx64
>                       ",%d) failed: %m",
>                       __func__, vbasedev->name, region->nr,
> @@ -266,8 +272,14 @@ uint64_t vfio_region_read(void *opaque,
>          uint64_t qword;
>      } buf;
>      uint64_t data = 0;
> +    int ret;
>
> -    if (pread(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) {
> +    if (vbasedev->proxy != NULL) {
> +        ret = vfio_user_region_read(vbasedev, region->nr, addr, size, &buf);
> +    } else {
> +        ret = pread(vbasedev->fd, &buf, size, region->fd_offset + addr);
> +    }
> +    if (ret != size) {
>          error_report("%s(%s:region%d+0x%"HWADDR_PRIx", %d) failed: %m",
>                       __func__, vbasedev->name, region->nr,
>                       addr, size);
> diff --git a/hw/vfio/user.c b/hw/vfio/user.c
> index 91b51f37df..83235b2411 100644
> --- a/hw/vfio/user.c
> +++ b/hw/vfio/user.c
> @@ -767,3 +767,46 @@ int vfio_user_get_region_info(VFIODevice *vbasedev, int index,
>      memcpy(info, &msgp->argsz, info->argsz);
>      return 0;
>  }
> +
> +int vfio_user_region_read(VFIODevice *vbasedev, uint32_t index, uint64_t offset,
> +                          uint32_t count, void *data)
> +{
> +    g_autofree VFIOUserRegionRW *msgp = NULL;
> +    int size = sizeof(*msgp) + count;
> +
> +    msgp = g_malloc0(size);
> +    vfio_user_request_msg(&msgp->hdr, VFIO_USER_REGION_READ, sizeof(*msgp), 0);
> +    msgp->offset = offset;
> +    msgp->region = index;
> +    msgp->count = count;
> +
> +    vfio_user_send_recv(vbasedev->proxy, &msgp->hdr, NULL, size, 0);
> +    if (msgp->hdr.flags & VFIO_USER_ERROR) {
> +        return -msgp->hdr.error_reply;
> +    } else if (msgp->count > count) {
> +        return -E2BIG;
> +    } else {
> +        memcpy(data, &msgp->data, msgp->count);
> +    }
> +
> +    return msgp->count;
> +}
> +
> +int vfio_user_region_write(VFIODevice *vbasedev, uint32_t index,
> +                           uint64_t offset, uint32_t count, void *data)
> +{
> +    g_autofree VFIOUserRegionRW *msgp = NULL;
> +    int size = sizeof(*msgp) + count;
> +
> +    msgp = g_malloc0(size);
> +    vfio_user_request_msg(&msgp->hdr, VFIO_USER_REGION_WRITE, size,
> +                          VFIO_USER_NO_REPLY);
> +    msgp->offset = offset;
> +    msgp->region = index;
> +    msgp->count = count;
> +    memcpy(&msgp->data, data, count);
> +
> +    vfio_user_send(vbasedev->proxy, &msgp->hdr, NULL);

Are VFIO region writes posted writes (VFIO_USER_NO_REPLY)? This can be a
problem if the device driver performs a write to the region followed by
another access (e.g. to an mmap region) and expected the write to complete
before the second access takes place.
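For reference on the endianness point: a minimal sketch of packing a
host-endian access value into little-endian wire bytes, mirroring the
cpu_to_le*() switch already at the top of vfio_region_write(). It assumes
QEMU's "qemu/bswap.h" helpers; the helper name vfio_user_pack_le is invented
for illustration and is not part of the posted series.

    #include "qemu/osdep.h"
    #include "qemu/bswap.h"

    /* Sketch only: produce the little-endian bytes the vfio-user spec
     * expects on the socket, independent of host byte order. */
    static void vfio_user_pack_le(uint64_t data, unsigned size, void *wire)
    {
        uint16_t w16;
        uint32_t w32;
        uint64_t w64;

        switch (size) {
        case 1:
            *(uint8_t *)wire = data;
            break;
        case 2:
            w16 = cpu_to_le16(data);
            memcpy(wire, &w16, sizeof(w16));
            break;
        case 4:
            w32 = cpu_to_le32(data);
            memcpy(wire, &w32, sizeof(w32));
            break;
        case 8:
            w64 = cpu_to_le64(data);
            memcpy(wire, &w64, sizeof(w64));
            break;
        }
    }

Passing the already converted buf (rather than the host-endian data value)
into vfio_user_region_write() would have the same effect.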
On Mon, Aug 16, 2021 at 09:42:42AM -0700, Elena Ufimtseva wrote:

> +int vfio_user_region_write(VFIODevice *vbasedev, uint32_t index,
> +                           uint64_t offset, uint32_t count, void *data)
> +{
> +    g_autofree VFIOUserRegionRW *msgp = NULL;
> +    int size = sizeof(*msgp) + count;
> +
> +    msgp = g_malloc0(size);
> +    vfio_user_request_msg(&msgp->hdr, VFIO_USER_REGION_WRITE, size,
> +                          VFIO_USER_NO_REPLY);

Mirroring https://github.com/oracle/qemu/issues/10 here for visibility:

Currently, vfio_user_region_write uses VFIO_USER_NO_REPLY unconditionally,
meaning essentially all writes are posted. But that shouldn't be the case, for
example for PCI config space, where it's expected that writes will wait for an
ack before the VCPU continues.

regards
john
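A minimal sketch of one way the posting could be made conditional, reusing the
series' own vfio_user_* helpers; the 'post' parameter and the policy of who
sets it (e.g. config space vs. MMIO) are assumptions for illustration, not the
fix adopted by the series.

    /* Sketch: post the write only when the caller says it is safe;
     * otherwise block until the server acknowledges it. */
    int vfio_user_region_write(VFIODevice *vbasedev, uint32_t index,
                               uint64_t offset, uint32_t count, void *data,
                               bool post)
    {
        g_autofree VFIOUserRegionRW *msgp = NULL;
        int size = sizeof(*msgp) + count;
        int flags = post ? VFIO_USER_NO_REPLY : 0;

        msgp = g_malloc0(size);
        vfio_user_request_msg(&msgp->hdr, VFIO_USER_REGION_WRITE, size, flags);
        msgp->offset = offset;
        msgp->region = index;
        msgp->count = count;
        memcpy(&msgp->data, data, count);

        if (post) {
            /* Posted: fire and forget. */
            vfio_user_send(vbasedev->proxy, &msgp->hdr, NULL);
            return count;
        }

        /* Non-posted: wait for the reply (no payload expected back). */
        vfio_user_send_recv(vbasedev->proxy, &msgp->hdr, NULL, 0, 0);
        if (msgp->hdr.flags & VFIO_USER_ERROR) {
            return -msgp->hdr.error_reply;
        }
        return count;
    }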
> On Sep 7, 2021, at 10:24 AM, John Levon <john.levon@nutanix.com> wrote:
>
> On Mon, Aug 16, 2021 at 09:42:42AM -0700, Elena Ufimtseva wrote:
>
>> +int vfio_user_region_write(VFIODevice *vbasedev, uint32_t index,
>> +                           uint64_t offset, uint32_t count, void *data)
>> +{
>> +    g_autofree VFIOUserRegionRW *msgp = NULL;
>> +    int size = sizeof(*msgp) + count;
>> +
>> +    msgp = g_malloc0(size);
>> +    vfio_user_request_msg(&msgp->hdr, VFIO_USER_REGION_WRITE, size,
>> +                          VFIO_USER_NO_REPLY);
>
> Mirroring https://github.com/oracle/qemu/issues/10 here for visibility:
>
> Currently, vfio_user_region_write uses VFIO_USER_NO_REPLY unconditionally,
> meaning essentially all writes are posted. But that shouldn't be the case, for
> example for PCI config space, where it's expected that writes will wait for an
> ack before the VCPU continues.
>

I’m not sure following the PCI spec (mem writes posted, config & IO
are not) completely solves the issue if the device uses sparse mmap. A store
that went over the socket can be passed by a load that goes directly to memory,
which could break a driver that assumes a load completion means older stores
to the same device have also completed.

JJ
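To make the hazard concrete, a hypothetical guest-driver fragment; the
register offsets, 'regs' pointer, and function name are invented for
illustration and assume the doorbell register is trapped while the status
register sits in a sparse-mmap'd window.

    #include <linux/io.h>

    #define DOORBELL 0x00   /* trapped: forwarded over the socket, posted */
    #define STATUS   0x04   /* sparse mmap: read straight from memory     */

    /* The readl() can complete before the server has processed the
     * doorbell write, breaking a driver that assumes load completion
     * implies its earlier stores to the device have landed. */
    static u32 kick_then_check(void __iomem *regs)
    {
        writel(1, regs + DOORBELL);
        return readl(regs + STATUS);
    }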
On Thu, Sep 09, 2021 at 06:00:36AM +0000, John Johnson wrote:
> > On Sep 7, 2021, at 10:24 AM, John Levon <john.levon@nutanix.com> wrote:
> >
> > On Mon, Aug 16, 2021 at 09:42:42AM -0700, Elena Ufimtseva wrote:
> >
> >> +int vfio_user_region_write(VFIODevice *vbasedev, uint32_t index,
> >> +                           uint64_t offset, uint32_t count, void *data)
> >> +{
> >> +    g_autofree VFIOUserRegionRW *msgp = NULL;
> >> +    int size = sizeof(*msgp) + count;
> >> +
> >> +    msgp = g_malloc0(size);
> >> +    vfio_user_request_msg(&msgp->hdr, VFIO_USER_REGION_WRITE, size,
> >> +                          VFIO_USER_NO_REPLY);
> >
> > Mirroring https://github.com/oracle/qemu/issues/10 here for visibility:
> >
> > Currently, vfio_user_region_write uses VFIO_USER_NO_REPLY unconditionally,
> > meaning essentially all writes are posted. But that shouldn't be the case, for
> > example for PCI config space, where it's expected that writes will wait for an
> > ack before the VCPU continues.
>
> I’m not sure following the PCI spec (mem writes posted, config & IO
> are not) completely solves the issue if the device uses sparse mmap. A store
> that went over the socket can be passed by a load that goes directly to memory,
> which could break a driver that assumes a load completion means older stores
> to the same device have also completed.

Sure, but sparse mmaps are under the device's control - so wouldn't that be
something of a "don't do that" scenario?

regards
john
> On Sep 9, 2021, at 5:05 AM, John Levon <john.levon@nutanix.com> wrote:
>
> On Thu, Sep 09, 2021 at 06:00:36AM +0000, John Johnson wrote:
>
>>> On Sep 7, 2021, at 10:24 AM, John Levon <john.levon@nutanix.com> wrote:
>>>
>>> On Mon, Aug 16, 2021 at 09:42:42AM -0700, Elena Ufimtseva wrote:
>>>
>>>> +int vfio_user_region_write(VFIODevice *vbasedev, uint32_t index,
>>>> +                           uint64_t offset, uint32_t count, void *data)
>>>> +{
>>>> +    g_autofree VFIOUserRegionRW *msgp = NULL;
>>>> +    int size = sizeof(*msgp) + count;
>>>> +
>>>> +    msgp = g_malloc0(size);
>>>> +    vfio_user_request_msg(&msgp->hdr, VFIO_USER_REGION_WRITE, size,
>>>> +                          VFIO_USER_NO_REPLY);
>>>
>>> Mirroring https://github.com/oracle/qemu/issues/10 here for visibility:
>>>
>>> Currently, vfio_user_region_write uses VFIO_USER_NO_REPLY unconditionally,
>>> meaning essentially all writes are posted. But that shouldn't be the case, for
>>> example for PCI config space, where it's expected that writes will wait for an
>>> ack before the VCPU continues.
>>
>> I’m not sure following the PCI spec (mem writes posted, config & IO
>> are not) completely solves the issue if the device uses sparse mmap. A store
>> that went over the socket can be passed by a load that goes directly to memory,
>> which could break a driver that assumes a load completion means older stores
>> to the same device have also completed.
>
> Sure, but sparse mmaps are under the device's control - so wouldn't that be
> something of a "don't do that" scenario?
>

The sparse mmaps are under the emulation program’s control, but it
doesn’t know what registers the guest device driver is using to force stores
to complete. The Linux device drivers doc on kernel.org just says the driver
must read from the same device.

JJ

https://www.kernel.org/doc/Documentation/driver-api/device-io.rst

    While the basic functions are defined to be synchronous with respect to
    each other and ordered with respect to each other the busses the devices
    sit on may themselves have asynchronicity. In particular many authors are
    burned by the fact that PCI bus writes are posted asynchronously. A driver
    author must issue a read from the same device to ensure that writes have
    occurred in the specific cases the author cares.
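For reference, the driver-side idiom the quoted document describes is a
read-back from the same device after the write. This is an illustrative
sketch only; REG_CTRL and the function name are invented, and the point of
the discussion above is that the emulation cannot know which register a
driver will pick for this read-back.

    #include <linux/io.h>

    #define REG_CTRL 0x10   /* invented register offset */

    /* Classic flush idiom for posted PCI writes: the read-back forces
     * earlier posted writes to the device to complete first. */
    static void write_then_flush(void __iomem *regs, u32 val)
    {
        writel(val, regs + REG_CTRL);   /* may be posted            */
        (void)readl(regs + REG_CTRL);   /* read-back flushes it     */
    }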
On Fri, Sep 10, 2021 at 06:07:56AM +0000, John Johnson wrote:
> >>> On Mon, Aug 16, 2021 at 09:42:42AM -0700, Elena Ufimtseva wrote:
> >>>
> >>>> +int vfio_user_region_write(VFIODevice *vbasedev, uint32_t index,
> >>>> +                           uint64_t offset, uint32_t count, void *data)
> >>>> +{
> >>>> +    g_autofree VFIOUserRegionRW *msgp = NULL;
> >>>> +    int size = sizeof(*msgp) + count;
> >>>> +
> >>>> +    msgp = g_malloc0(size);
> >>>> +    vfio_user_request_msg(&msgp->hdr, VFIO_USER_REGION_WRITE, size,
> >>>> +                          VFIO_USER_NO_REPLY);
> >>>
> >>> Mirroring https://github.com/oracle/qemu/issues/10 here for visibility:
> >>>
> >>> Currently, vfio_user_region_write uses VFIO_USER_NO_REPLY unconditionally,
> >>> meaning essentially all writes are posted. But that shouldn't be the case, for
> >>> example for PCI config space, where it's expected that writes will wait for an
> >>> ack before the VCPU continues.
> >>
> >> I’m not sure following the PCI spec (mem writes posted, config & IO
> >> are not) completely solves the issue if the device uses sparse mmap. A store
> >> that went over the socket can be passed by a load that goes directly to memory,
> >> which could break a driver that assumes a load completion means older stores
> >> to the same device have also completed.
> >
> > Sure, but sparse mmaps are under the device's control - so wouldn't that be
> > something of a "don't do that" scenario?
>
> The sparse mmaps are under the emulation program’s control, but it
> doesn’t know what registers the guest device driver is using to force stores
> to complete. The Linux device drivers doc on kernel.org just says the driver
> must read from the same device.

Sure, but any device where that is important wouldn't use the sparse mmaps, no?
There's no other alternative.

regards
john
diff --git a/hw/vfio/user-protocol.h b/hw/vfio/user-protocol.h
index 104bf4ff31..56904cf872 100644
--- a/hw/vfio/user-protocol.h
+++ b/hw/vfio/user-protocol.h
@@ -109,4 +109,16 @@ typedef struct {
     uint64_t offset;
 } VFIOUserRegionInfo;
 
+/*
+ * VFIO_USER_REGION_READ
+ * VFIO_USER_REGION_WRITE
+ */
+typedef struct {
+    VFIOUserHdr hdr;
+    uint64_t offset;
+    uint32_t region;
+    uint32_t count;
+    char data[];
+} VFIOUserRegionRW;
+
 #endif /* VFIO_USER_PROTOCOL_H */
diff --git a/hw/vfio/user.h b/hw/vfio/user.h
index f0122539ba..02f832a173 100644
--- a/hw/vfio/user.h
+++ b/hw/vfio/user.h
@@ -74,5 +74,9 @@ int vfio_user_validate_version(VFIODevice *vbasedev, Error **errp);
 int vfio_user_get_info(VFIODevice *vbasedev);
 int vfio_user_get_region_info(VFIODevice *vbasedev, int index,
                               struct vfio_region_info *info, VFIOUserFDs *fds);
+int vfio_user_region_read(VFIODevice *vbasedev, uint32_t index, uint64_t offset,
+                          uint32_t count, void *data);
+int vfio_user_region_write(VFIODevice *vbasedev, uint32_t index,
+                           uint64_t offset, uint32_t count, void *data);
 
 #endif /* VFIO_USER_H */
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 7d667b0533..a8b1ea9358 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -215,6 +215,7 @@ void vfio_region_write(void *opaque, hwaddr addr,
         uint32_t dword;
         uint64_t qword;
     } buf;
+    int ret;
 
     switch (size) {
     case 1:
@@ -234,7 +235,12 @@ void vfio_region_write(void *opaque, hwaddr addr,
         break;
     }
 
-    if (pwrite(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) {
+    if (vbasedev->proxy != NULL) {
+        ret = vfio_user_region_write(vbasedev, region->nr, addr, size, &data);
+    } else {
+        ret = pwrite(vbasedev->fd, &buf, size, region->fd_offset + addr);
+    }
+    if (ret != size) {
         error_report("%s(%s:region%d+0x%"HWADDR_PRIx", 0x%"PRIx64
                      ",%d) failed: %m",
                      __func__, vbasedev->name, region->nr,
@@ -266,8 +272,14 @@ uint64_t vfio_region_read(void *opaque,
         uint64_t qword;
     } buf;
     uint64_t data = 0;
+    int ret;
 
-    if (pread(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) {
+    if (vbasedev->proxy != NULL) {
+        ret = vfio_user_region_read(vbasedev, region->nr, addr, size, &buf);
+    } else {
+        ret = pread(vbasedev->fd, &buf, size, region->fd_offset + addr);
+    }
+    if (ret != size) {
         error_report("%s(%s:region%d+0x%"HWADDR_PRIx", %d) failed: %m",
                      __func__, vbasedev->name, region->nr,
                      addr, size);
diff --git a/hw/vfio/user.c b/hw/vfio/user.c
index 91b51f37df..83235b2411 100644
--- a/hw/vfio/user.c
+++ b/hw/vfio/user.c
@@ -767,3 +767,46 @@ int vfio_user_get_region_info(VFIODevice *vbasedev, int index,
     memcpy(info, &msgp->argsz, info->argsz);
     return 0;
 }
+
+int vfio_user_region_read(VFIODevice *vbasedev, uint32_t index, uint64_t offset,
+                          uint32_t count, void *data)
+{
+    g_autofree VFIOUserRegionRW *msgp = NULL;
+    int size = sizeof(*msgp) + count;
+
+    msgp = g_malloc0(size);
+    vfio_user_request_msg(&msgp->hdr, VFIO_USER_REGION_READ, sizeof(*msgp), 0);
+    msgp->offset = offset;
+    msgp->region = index;
+    msgp->count = count;
+
+    vfio_user_send_recv(vbasedev->proxy, &msgp->hdr, NULL, size, 0);
+    if (msgp->hdr.flags & VFIO_USER_ERROR) {
+        return -msgp->hdr.error_reply;
+    } else if (msgp->count > count) {
+        return -E2BIG;
+    } else {
+        memcpy(data, &msgp->data, msgp->count);
+    }
+
+    return msgp->count;
+}
+
+int vfio_user_region_write(VFIODevice *vbasedev, uint32_t index,
+                           uint64_t offset, uint32_t count, void *data)
+{
+    g_autofree VFIOUserRegionRW *msgp = NULL;
+    int size = sizeof(*msgp) + count;
+
+    msgp = g_malloc0(size);
+    vfio_user_request_msg(&msgp->hdr, VFIO_USER_REGION_WRITE, size,
+                          VFIO_USER_NO_REPLY);
+    msgp->offset = offset;
+    msgp->region = index;
+    msgp->count = count;
+    memcpy(&msgp->data, data, count);
+
+    vfio_user_send(vbasedev->proxy, &msgp->hdr, NULL);
+
+    return count;
+}