Message ID | 20090605102315.GD26770@redhat.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Fri, Jun 05, 2009 at 01:23:15PM +0300, Michael S. Tsirkin wrote: > Add routines to manage PCI capability list. First user will be MSI-X. > > Signed-off-by: Michael S. Tsirkin <mst@redhat.com> > --- > hw/pci.c | 98 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++------ > hw/pci.h | 18 +++++++++++- > 2 files changed, 106 insertions(+), 10 deletions(-) > > diff --git a/hw/pci.c b/hw/pci.c > index 361d741..ed011b5 100644 > --- a/hw/pci.c > +++ b/hw/pci.c > @@ -130,12 +130,13 @@ void pci_device_save(PCIDevice *s, QEMUFile *f) > int version = s->cap_present ? 3 : 2; > int i; > > - qemu_put_be32(f, version); /* PCI device version */ > + /* PCI device version and capabilities */ > + qemu_put_be32(f, version); > + if (version >= 3) > + qemu_put_be32(f, s->cap_present); > qemu_put_buffer(f, s->config, 256); > for (i = 0; i < 4; i++) > qemu_put_be32(f, s->irq_state[i]); > - if (version >= 3) > - qemu_put_be32(f, s->cap_present); > } What is it doing here? You should just do it right in the first patch, instead of doing in one way there, and fixing here. > > int pci_device_load(PCIDevice *s, QEMUFile *f) > @@ -146,12 +147,6 @@ int pci_device_load(PCIDevice *s, QEMUFile *f) > version_id = qemu_get_be32(f); > if (version_id > 3) > return -EINVAL; > - qemu_get_buffer(f, s->config, 256); > - pci_update_mappings(s); > - > - if (version_id >= 2) > - for (i = 0; i < 4; i ++) > - s->irq_state[i] = qemu_get_be32(f); > if (version_id >= 3) > s->cap_present = qemu_get_be32(f); > else ditto. > @@ -160,6 +155,18 @@ int pci_device_load(PCIDevice *s, QEMUFile *f) > if (s->cap_present & ~s->cap_supported) > return -EINVAL; > > + qemu_get_buffer(f, s->config, 256); > + pci_update_mappings(s); > + > + if (version_id >= 2) > + for (i = 0; i < 4; i ++) > + s->irq_state[i] = qemu_get_be32(f); > + /* Clear wmask and used bits for capabilities. > + Must be restored separately, since capabilities can > + be placed anywhere in config space. 
*/ > + memset(s->used, 0, PCI_CONFIG_SPACE_SIZE); > + for (i = PCI_CONFIG_HEADER_SIZE; i < PCI_CONFIG_SPACE_SIZE; ++i) > + s->wmask[i] = 0xff; > return 0; > } Sorry, I don't exactly understand it. Although it can be anywhere, what do we actually lose by keeping it at the same place in config space? > > @@ -870,3 +877,76 @@ PCIDevice *pci_create_simple(PCIBus *bus, int devfn, const char *name) > > return (PCIDevice *)dev; > } > + > +static int pci_find_space(PCIDevice *pdev, uint8_t size) > +{ > + int offset = PCI_CONFIG_HEADER_SIZE; > + int i; > + for (i = PCI_CONFIG_HEADER_SIZE; i < PCI_CONFIG_SPACE_SIZE; ++i) > + if (pdev->used[i]) > + offset = i + 1; > + else if (i - offset + 1 == size) > + return offset; > + return 0; > +} > + > +static uint8_t pci_find_capability_list(PCIDevice *pdev, uint8_t cap_id, > + uint8_t *prev_p) > +{ > + uint8_t next, prev; > + > + if (!(pdev->config[PCI_STATUS] & PCI_STATUS_CAP_LIST)) > + return 0; > + > + for (prev = PCI_CAPABILITY_LIST; (next = pdev->config[prev]); > + prev = next + PCI_CAP_LIST_NEXT) > + if (pdev->config[next + PCI_CAP_LIST_ID] == cap_id) > + break; > + > + *prev_p = prev; > + return next; > +} I'd prefer to do: if (prev_p) *prev_p = prev; so we don't have to always pass a prev_p pointer. You have yourself a user where you don't need it in this very patch. 
> + > +/* Reserve space and add capability to the linked list in pci config space */ > +int pci_add_capability(PCIDevice *pdev, uint8_t cap_id, uint8_t size) > +{ > + uint8_t offset = pci_find_space(pdev, size); > + uint8_t *config = pdev->config + offset; > + if (!offset) > + return -ENOSPC; > + config[PCI_CAP_LIST_ID] = cap_id; > + config[PCI_CAP_LIST_NEXT] = pdev->config[PCI_CAPABILITY_LIST]; > + pdev->config[PCI_CAPABILITY_LIST] = offset; > + pdev->config[PCI_STATUS] |= PCI_STATUS_CAP_LIST; > + memset(pdev->used + offset, 0xFF, size); > + /* Make capability read-only by default */ > + memset(pdev->wmask + offset, 0, size); > + return offset; > +} > + > +/* Unlink capability from the pci config space. */ > +void pci_del_capability(PCIDevice *pdev, uint8_t cap_id, uint8_t size) > +{ > + uint8_t prev, offset = pci_find_capability_list(pdev, cap_id, &prev); > + if (!offset) > + return; > + pdev->config[prev] = pdev->config[offset + PCI_CAP_LIST_NEXT]; > + /* Make capability writeable again */ > + memset(pdev->wmask + offset, 0xff, size); > + memset(pdev->used + offset, 0, size); > + > + if (!pdev->config[PCI_CAPABILITY_LIST]) > + pdev->config[PCI_STATUS] &= ~PCI_STATUS_CAP_LIST; > +} > + > +/* Reserve space for capability at a known offset (to call after load). 
*/ > +void pci_reserve_capability(PCIDevice *pdev, uint8_t offset, uint8_t size) > +{ > + memset(pdev->used + offset, 0xff, size); > +} > + > +uint8_t pci_find_capability(PCIDevice *pdev, uint8_t cap_id) > +{ > + uint8_t prev; > + return pci_find_capability_list(pdev, cap_id, &prev); > +} > diff --git a/hw/pci.h b/hw/pci.h > index 6f0803f..4838c59 100644 > --- a/hw/pci.h > +++ b/hw/pci.h > @@ -123,6 +123,10 @@ typedef struct PCIIORegion { > #define PCI_MIN_GNT 0x3e /* 8 bits */ > #define PCI_MAX_LAT 0x3f /* 8 bits */ > > +/* Capability lists */ > +#define PCI_CAP_LIST_ID 0 /* Capability ID */ > +#define PCI_CAP_LIST_NEXT 1 /* Next capability in the list */ > + > #define PCI_REVISION 0x08 /* obsolete, use PCI_REVISION_ID */ > #define PCI_SUBVENDOR_ID 0x2c /* obsolete, use PCI_SUBSYSTEM_VENDOR_ID */ > #define PCI_SUBDEVICE_ID 0x2e /* obsolete, use PCI_SUBSYSTEM_ID */ > @@ -130,7 +134,7 @@ typedef struct PCIIORegion { > /* Bits in the PCI Status Register (PCI 2.3 spec) */ > #define PCI_STATUS_RESERVED1 0x007 > #define PCI_STATUS_INT_STATUS 0x008 > -#define PCI_STATUS_CAPABILITIES 0x010 > +#define PCI_STATUS_CAP_LIST 0x010 > #define PCI_STATUS_66MHZ 0x020 > #define PCI_STATUS_RESERVED2 0x040 > #define PCI_STATUS_FAST_BACK 0x080 > @@ -160,6 +164,9 @@ struct PCIDevice { > /* Used to implement R/W bytes */ > uint8_t wmask[PCI_CONFIG_SPACE_SIZE]; > > + /* Used to allocate config space for capabilities. 
*/ > + uint8_t used[PCI_CONFIG_SPACE_SIZE]; > + > /* the following fields are read only */ > PCIBus *bus; > int devfn; > @@ -194,6 +201,15 @@ void pci_register_io_region(PCIDevice *pci_dev, int region_num, > uint32_t size, int type, > PCIMapIORegionFunc *map_func); > > +int pci_add_capability(PCIDevice *pci_dev, uint8_t cap_id, uint8_t cap_size); > + > +void pci_del_capability(PCIDevice *pci_dev, uint8_t cap_id, uint8_t cap_size); > + > +void pci_reserve_capability(PCIDevice *pci_dev, uint8_t offset, uint8_t size); > + > +uint8_t pci_find_capability(PCIDevice *pci_dev, uint8_t cap_id); > + > + > uint32_t pci_default_read_config(PCIDevice *d, > uint32_t address, int len); > void pci_default_write_config(PCIDevice *d, > -- > 1.6.3.1.56.g79e1.dirty > > > -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Jun 09, 2009 at 02:11:14PM -0300, Glauber Costa wrote: > On Fri, Jun 05, 2009 at 01:23:15PM +0300, Michael S. Tsirkin wrote: > > Add routines to manage PCI capability list. First user will be MSI-X. > > > > Signed-off-by: Michael S. Tsirkin <mst@redhat.com> > > --- > > hw/pci.c | 98 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++------ > > hw/pci.h | 18 +++++++++++- > > 2 files changed, 106 insertions(+), 10 deletions(-) > > > > diff --git a/hw/pci.c b/hw/pci.c > > index 361d741..ed011b5 100644 > > --- a/hw/pci.c > > +++ b/hw/pci.c > > @@ -130,12 +130,13 @@ void pci_device_save(PCIDevice *s, QEMUFile *f) > > int version = s->cap_present ? 3 : 2; > > int i; > > > > - qemu_put_be32(f, version); /* PCI device version */ > > + /* PCI device version and capabilities */ > > + qemu_put_be32(f, version); > > + if (version >= 3) > > + qemu_put_be32(f, s->cap_present); > > qemu_put_buffer(f, s->config, 256); > > for (i = 0; i < 4; i++) > > qemu_put_be32(f, s->irq_state[i]); > > - if (version >= 3) > > - qemu_put_be32(f, s->cap_present); > > } > What is it doing here? > You should just do it right in the first patch, instead of doing in > one way there, and fixing here. > > > > > int pci_device_load(PCIDevice *s, QEMUFile *f) > > @@ -146,12 +147,6 @@ int pci_device_load(PCIDevice *s, QEMUFile *f) > > version_id = qemu_get_be32(f); > > if (version_id > 3) > > return -EINVAL; > > - qemu_get_buffer(f, s->config, 256); > > - pci_update_mappings(s); > > - > > - if (version_id >= 2) > > - for (i = 0; i < 4; i ++) > > - s->irq_state[i] = qemu_get_be32(f); > > if (version_id >= 3) > > s->cap_present = qemu_get_be32(f); > > else > ditto. 
> > @@ -160,6 +155,18 @@ int pci_device_load(PCIDevice *s, QEMUFile *f) > > if (s->cap_present & ~s->cap_supported) > > return -EINVAL; > > > > + qemu_get_buffer(f, s->config, 256); > > + pci_update_mappings(s); > > + > > + if (version_id >= 2) > > + for (i = 0; i < 4; i ++) > > + s->irq_state[i] = qemu_get_be32(f); > > + /* Clear wmask and used bits for capabilities. > > + Must be restored separately, since capabilities can > > + be placed anywhere in config space. */ > > + memset(s->used, 0, PCI_CONFIG_SPACE_SIZE); > > + for (i = PCI_CONFIG_HEADER_SIZE; i < PCI_CONFIG_SPACE_SIZE; ++i) > > + s->wmask[i] = 0xff; > > return 0; > > } > Sorry, I don't exactly understand it. Although it can be anywhere, what do we actually > lose by keeping it at the same place in config space? We lose the ability to let the user control the capabilities exposed by the device. And generally, I dislike arbitrary limitations. The PCI spec says the capability can be anywhere, implementing a linked list of caps is simple enough to not invent arbitrary restrictions.
> > > > @@ -870,3 +877,76 @@ PCIDevice *pci_create_simple(PCIBus *bus, int devfn, const char *name) > > > > return (PCIDevice *)dev; > > } > > + > > +static int pci_find_space(PCIDevice *pdev, uint8_t size) > > +{ > > + int offset = PCI_CONFIG_HEADER_SIZE; > > + int i; > > + for (i = PCI_CONFIG_HEADER_SIZE; i < PCI_CONFIG_SPACE_SIZE; ++i) > > + if (pdev->used[i]) > > + offset = i + 1; > > + else if (i - offset + 1 == size) > > + return offset; > > + return 0; > > +} > > + > > +static uint8_t pci_find_capability_list(PCIDevice *pdev, uint8_t cap_id, > > + uint8_t *prev_p) > > +{ > > + uint8_t next, prev; > > + > > + if (!(pdev->config[PCI_STATUS] & PCI_STATUS_CAP_LIST)) > > + return 0; > > + > > + for (prev = PCI_CAPABILITY_LIST; (next = pdev->config[prev]); > > + prev = next + PCI_CAP_LIST_NEXT) > > + if (pdev->config[next + PCI_CAP_LIST_ID] == cap_id) > > + break; > > + > > + *prev_p = prev; > > + return next; > > +} > I'd prefer to do: > if (prev_p) > *prev_p = prev; > so we don't have to always pass a prev_p pointer. You have yourself a user > where you don't need it in this very patch. Good idea. > > + > > +/* Reserve space and add capability to the linked list in pci config space */ > > +int pci_add_capability(PCIDevice *pdev, uint8_t cap_id, uint8_t size) > > +{ > > + uint8_t offset = pci_find_space(pdev, size); > > + uint8_t *config = pdev->config + offset; > > + if (!offset) > > + return -ENOSPC; > > + config[PCI_CAP_LIST_ID] = cap_id; > > + config[PCI_CAP_LIST_NEXT] = pdev->config[PCI_CAPABILITY_LIST]; > > + pdev->config[PCI_CAPABILITY_LIST] = offset; > > + pdev->config[PCI_STATUS] |= PCI_STATUS_CAP_LIST; > > + memset(pdev->used + offset, 0xFF, size); > > + /* Make capability read-only by default */ > > + memset(pdev->wmask + offset, 0, size); > > + return offset; > > +} > > + > > +/* Unlink capability from the pci config space. 
*/ > > +void pci_del_capability(PCIDevice *pdev, uint8_t cap_id, uint8_t size) > > +{ > > + uint8_t prev, offset = pci_find_capability_list(pdev, cap_id, &prev); > > + if (!offset) > > + return; > > + pdev->config[prev] = pdev->config[offset + PCI_CAP_LIST_NEXT]; > > + /* Make capability writeable again */ > > + memset(pdev->wmask + offset, 0xff, size); > > + memset(pdev->used + offset, 0, size); > > + > > + if (!pdev->config[PCI_CAPABILITY_LIST]) > > + pdev->config[PCI_STATUS] &= ~PCI_STATUS_CAP_LIST; > > +} > > + > > +/* Reserve space for capability at a known offset (to call after load). */ > > +void pci_reserve_capability(PCIDevice *pdev, uint8_t offset, uint8_t size) > > +{ > > + memset(pdev->used + offset, 0xff, size); > > +} > > + > > +uint8_t pci_find_capability(PCIDevice *pdev, uint8_t cap_id) > > +{ > > + uint8_t prev; > > + return pci_find_capability_list(pdev, cap_id, &prev); > > +} > > diff --git a/hw/pci.h b/hw/pci.h > > index 6f0803f..4838c59 100644 > > --- a/hw/pci.h > > +++ b/hw/pci.h > > @@ -123,6 +123,10 @@ typedef struct PCIIORegion { > > #define PCI_MIN_GNT 0x3e /* 8 bits */ > > #define PCI_MAX_LAT 0x3f /* 8 bits */ > > > > +/* Capability lists */ > > +#define PCI_CAP_LIST_ID 0 /* Capability ID */ > > +#define PCI_CAP_LIST_NEXT 1 /* Next capability in the list */ > > + > > #define PCI_REVISION 0x08 /* obsolete, use PCI_REVISION_ID */ > > #define PCI_SUBVENDOR_ID 0x2c /* obsolete, use PCI_SUBSYSTEM_VENDOR_ID */ > > #define PCI_SUBDEVICE_ID 0x2e /* obsolete, use PCI_SUBSYSTEM_ID */ > > @@ -130,7 +134,7 @@ typedef struct PCIIORegion { > > /* Bits in the PCI Status Register (PCI 2.3 spec) */ > > #define PCI_STATUS_RESERVED1 0x007 > > #define PCI_STATUS_INT_STATUS 0x008 > > -#define PCI_STATUS_CAPABILITIES 0x010 > > +#define PCI_STATUS_CAP_LIST 0x010 > > #define PCI_STATUS_66MHZ 0x020 > > #define PCI_STATUS_RESERVED2 0x040 > > #define PCI_STATUS_FAST_BACK 0x080 > > @@ -160,6 +164,9 @@ struct PCIDevice { > > /* Used to implement R/W bytes */ > > 
uint8_t wmask[PCI_CONFIG_SPACE_SIZE]; > > > > + /* Used to allocate config space for capabilities. */ > > + uint8_t used[PCI_CONFIG_SPACE_SIZE]; > > + > > /* the following fields are read only */ > > PCIBus *bus; > > int devfn; > > @@ -194,6 +201,15 @@ void pci_register_io_region(PCIDevice *pci_dev, int region_num, > > uint32_t size, int type, > > PCIMapIORegionFunc *map_func); > > > > +int pci_add_capability(PCIDevice *pci_dev, uint8_t cap_id, uint8_t cap_size); > > + > > +void pci_del_capability(PCIDevice *pci_dev, uint8_t cap_id, uint8_t cap_size); > > + > > +void pci_reserve_capability(PCIDevice *pci_dev, uint8_t offset, uint8_t size); > > + > > +uint8_t pci_find_capability(PCIDevice *pci_dev, uint8_t cap_id); > > + > > + > > uint32_t pci_default_read_config(PCIDevice *d, > > uint32_t address, int len); > > void pci_default_write_config(PCIDevice *d, > > -- > > 1.6.3.1.56.g79e1.dirty > > > > > >
On Wed, Jun 10, 2009 at 12:54:15PM +0300, Michael S. Tsirkin wrote: > On Tue, Jun 09, 2009 at 02:11:14PM -0300, Glauber Costa wrote: > > On Fri, Jun 05, 2009 at 01:23:15PM +0300, Michael S. Tsirkin wrote: > > > Add routines to manage PCI capability list. First user will be MSI-X. > > > > > > Signed-off-by: Michael S. Tsirkin <mst@redhat.com> > > > --- > > > hw/pci.c | 98 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++------ > > > hw/pci.h | 18 +++++++++++- > > > 2 files changed, 106 insertions(+), 10 deletions(-) > > > > > > diff --git a/hw/pci.c b/hw/pci.c > > > index 361d741..ed011b5 100644 > > > --- a/hw/pci.c > > > +++ b/hw/pci.c > > > @@ -130,12 +130,13 @@ void pci_device_save(PCIDevice *s, QEMUFile *f) > > > int version = s->cap_present ? 3 : 2; > > > int i; > > > > > > - qemu_put_be32(f, version); /* PCI device version */ > > > + /* PCI device version and capabilities */ > > > + qemu_put_be32(f, version); > > > + if (version >= 3) > > > + qemu_put_be32(f, s->cap_present); > > > qemu_put_buffer(f, s->config, 256); > > > for (i = 0; i < 4; i++) > > > qemu_put_be32(f, s->irq_state[i]); > > > - if (version >= 3) > > > - qemu_put_be32(f, s->cap_present); > > > } > > What is it doing here? > > You should just do it right in the first patch, instead of doing in > > one way there, and fixing here. > > > > > > > > int pci_device_load(PCIDevice *s, QEMUFile *f) > > > @@ -146,12 +147,6 @@ int pci_device_load(PCIDevice *s, QEMUFile *f) > > > version_id = qemu_get_be32(f); > > > if (version_id > 3) > > > return -EINVAL; > > > - qemu_get_buffer(f, s->config, 256); > > > - pci_update_mappings(s); > > > - > > > - if (version_id >= 2) > > > - for (i = 0; i < 4; i ++) > > > - s->irq_state[i] = qemu_get_be32(f); > > > if (version_id >= 3) > > > s->cap_present = qemu_get_be32(f); > > > else > > ditto. 
> > > @@ -160,6 +155,18 @@ int pci_device_load(PCIDevice *s, QEMUFile *f) > > > if (s->cap_present & ~s->cap_supported) > > > return -EINVAL; > > > > > > + qemu_get_buffer(f, s->config, 256); > > > + pci_update_mappings(s); > > > + > > > + if (version_id >= 2) > > > + for (i = 0; i < 4; i ++) > > > + s->irq_state[i] = qemu_get_be32(f); > > > + /* Clear wmask and used bits for capabilities. > > > + Must be restored separately, since capabilities can > > > + be placed anywhere in config space. */ > > > + memset(s->used, 0, PCI_CONFIG_SPACE_SIZE); > > > + for (i = PCI_CONFIG_HEADER_SIZE; i < PCI_CONFIG_SPACE_SIZE; ++i) > > > + s->wmask[i] = 0xff; > > > return 0; > > > } > > Sorry, I don't exactly understand it. Although it can be anywhere, what do we actually > > lose by keeping it at the same place in config space? > > We lose the ability to let user control the capabilities exposed > by the device. > > And generally, I dislike arbitrary limitations. The PCI spec says the > capability can be anywhere, implementing a linked list of caps is simple > enough to not invent abritrary restrictions. yes, but this is migration time, right? caps can be anywhere, but we don't expect it to change during machine execution lifetime. Or I am just confused by the name "pci_device_load" ? -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, Jun 10, 2009 at 11:55:40AM -0300, Glauber Costa wrote: > On Wed, Jun 10, 2009 at 12:54:15PM +0300, Michael S. Tsirkin wrote: > > On Tue, Jun 09, 2009 at 02:11:14PM -0300, Glauber Costa wrote: > > > On Fri, Jun 05, 2009 at 01:23:15PM +0300, Michael S. Tsirkin wrote: > > > > Add routines to manage PCI capability list. First user will be MSI-X. > > > > > > > > Signed-off-by: Michael S. Tsirkin <mst@redhat.com> > > > > --- > > > > hw/pci.c | 98 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++------ > > > > hw/pci.h | 18 +++++++++++- > > > > 2 files changed, 106 insertions(+), 10 deletions(-) > > > > > > > > diff --git a/hw/pci.c b/hw/pci.c > > > > index 361d741..ed011b5 100644 > > > > --- a/hw/pci.c > > > > +++ b/hw/pci.c > > > > @@ -130,12 +130,13 @@ void pci_device_save(PCIDevice *s, QEMUFile *f) > > > > int version = s->cap_present ? 3 : 2; > > > > int i; > > > > > > > > - qemu_put_be32(f, version); /* PCI device version */ > > > > + /* PCI device version and capabilities */ > > > > + qemu_put_be32(f, version); > > > > + if (version >= 3) > > > > + qemu_put_be32(f, s->cap_present); > > > > qemu_put_buffer(f, s->config, 256); > > > > for (i = 0; i < 4; i++) > > > > qemu_put_be32(f, s->irq_state[i]); > > > > - if (version >= 3) > > > > - qemu_put_be32(f, s->cap_present); > > > > } > > > What is it doing here? > > > You should just do it right in the first patch, instead of doing in > > > one way there, and fixing here. > > > > > > > > > > > int pci_device_load(PCIDevice *s, QEMUFile *f) > > > > @@ -146,12 +147,6 @@ int pci_device_load(PCIDevice *s, QEMUFile *f) > > > > version_id = qemu_get_be32(f); > > > > if (version_id > 3) > > > > return -EINVAL; > > > > - qemu_get_buffer(f, s->config, 256); > > > > - pci_update_mappings(s); > > > > - > > > > - if (version_id >= 2) > > > > - for (i = 0; i < 4; i ++) > > > > - s->irq_state[i] = qemu_get_be32(f); > > > > if (version_id >= 3) > > > > s->cap_present = qemu_get_be32(f); > > > > else > > > ditto. 
> > > > @@ -160,6 +155,18 @@ int pci_device_load(PCIDevice *s, QEMUFile *f) > > > > if (s->cap_present & ~s->cap_supported) > > > > return -EINVAL; > > > > > > > > + qemu_get_buffer(f, s->config, 256); > > > > + pci_update_mappings(s); > > > > + > > > > + if (version_id >= 2) > > > > + for (i = 0; i < 4; i ++) > > > > + s->irq_state[i] = qemu_get_be32(f); > > > > + /* Clear wmask and used bits for capabilities. > > > > + Must be restored separately, since capabilities can > > > > + be placed anywhere in config space. */ > > > > + memset(s->used, 0, PCI_CONFIG_SPACE_SIZE); > > > > + for (i = PCI_CONFIG_HEADER_SIZE; i < PCI_CONFIG_SPACE_SIZE; ++i) > > > > + s->wmask[i] = 0xff; > > > > return 0; > > > > } > > > Sorry, I don't exactly understand it. Although it can be anywhere, what do we actually > > > lose by keeping it at the same place in config space? > > > > We lose the ability to let user control the capabilities exposed > > by the device. > > > > And generally, I dislike arbitrary limitations. The PCI spec says the > > capability can be anywhere, implementing a linked list of caps is simple > > enough to not invent abritrary restrictions. > yes, but this is migration time, right? I think so, yes. > > caps can be anywhere, but we don't expect it to change during machine execution > lifetime. > > Or I am just confused by the name "pci_device_load" ? Right. So I want to load an image and it has capability X at offset Y. wmask has to match. I don't want to assume that we never change Y for the device without breaking old images, so I clear wmask here and set it up again after looking up capabilities that I loaded. Maybe this explanation should go into the comment above?
> > caps can be anywhere, but we don't expect it to change during machine > > execution lifetime. > > > > Or I am just confused by the name "pci_device_load" ? > > Right. So I want to load an image and it has capability X at offset Y. > wmask has to match. I don't want to assume that we never change Y > for the device without breaking old images, so I clear wmask here > and set it up again after looking up capabilities that I loaded. We should not be loading state into a different device (or a similar device with a different set of capabilities). If you want to provide backwards compatibility then you should do that by creating a device that is the same as the original. As I mentioned in my earlier mail, loading a snapshot should never do anything that can not be achieved through normal operation. Paul -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, Jun 10, 2009 at 04:24:28PM +0100, Paul Brook wrote: > > > caps can be anywhere, but we don't expect it to change during machine > > > execution lifetime. > > > > > > Or I am just confused by the name "pci_device_load" ? > > > > Right. So I want to load an image and it has capability X at offset Y. > > wmask has to match. I don't want to assume that we never change Y > > for the device without breaking old images, so I clear wmask here > > and set it up again after looking up capabilities that I loaded. > > We should not be loading state into a different device (or a similar device > with a different set of capabilities). > > If you want to provide backwards compatibility then you should do that by > creating a device that is the same as the original. As I mentioned in my > earlier mail, loading a snapshot should never do anything that can not be > achieved through normal operation. > > Paul Why shouldn't it? You don't load a snapshot while guest is running.
Paul Brook wrote: > > > caps can be anywhere, but we don't expect it to change during machine > > > execution lifetime. > > > > > > Or I am just confused by the name "pci_device_load" ? > > > > Right. So I want to load an image and it has capability X at offset Y. > > wmask has to match. I don't want to assume that we never change Y > > for the device without breaking old images, so I clear wmask here > > and set it up again after looking up capabilities that I loaded. > > We should not be loading state into a different device (or a similar device > with a different set of capabilities). > > If you want to provide backwards compatibility then you should do that by > creating a device that is the same as the original. As I mentioned in my > earlier mail, loading a snapshot should never do anything that can not be > achieved through normal operation. If you can create a machine by restoring a snapshot which you can't create by normally starting QEMU, then you'll soon have guests which work fine from their snapshots, but which cannot be booted without a snapshot because there's no way to boot the right machine for the guest. Someone might even have guests like that for years without noticing, because they always save and restore guest state using snapshots, then one day they simply want to boot the guest from its disk image and find there's no way to do it with any QEMU which runs on their host platform. I think the right long term answer to all this is a way to get QEMU to dump its current machine configuration in glorious detail as a file which can be reloaded as a machine configuration. -- Jamie -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, Jun 10, 2009 at 06:43:02PM +0100, Jamie Lokier wrote: > Paul Brook wrote: > > > > caps can be anywhere, but we don't expect it to change during machine > > > > execution lifetime. > > > > > > > > Or I am just confused by the name "pci_device_load" ? > > > > > > Right. So I want to load an image and it has capability X at offset Y. > > > wmask has to match. I don't want to assume that we never change Y > > > for the device without breaking old images, so I clear wmask here > > > and set it up again after looking up capabilities that I loaded. > > > > We should not be loading state into a different device (or a similar device > > with a different set of capabilities). > > > > If you want to provide backwards compatibility then you should do that by > > creating a device that is the same as the original. As I mentioned in my > > earlier mail, loading a snapshot should never do anything that can not be > > achieved through normal operation. > > If you can create a machine be restoring a snapshot which you can't > create by normally starting QEMU, then you'll soon have guests which > work fine from their snapshots, but which cannot be booted without a > snapshot because there's no way to boot the right machine for the guest. Yes. This clearly isn't what I'm building here. You *can* create a guest without msi-x support by passing an appropriate flag. > Ssomeone might even have guests like that for years without noticing, > because they always save and restore guest state using snapshots, then > one day they simply want to boot the guest from it's disk image and > find there's no way to do it with any QEMU which runs on their host > platform. > > I think the right long term answer to all this is a way to get QEMU to > dump it's current machine configuration in glorious detail as a file > which can be reloaded as a machine configuration. > > -- Jamie And then we'll have the same set of problems there.
Michael S. Tsirkin wrote: > > I think the right long term answer to all this is a way to get QEMU to > > dump it's current machine configuration in glorious detail as a file > > which can be reloaded as a machine configuration. > > And then we'll have the same set of problems there. We will, and the solution will be the same: options to create devices as they were in older versions of QEMU. It only needs to cover device features which matter to guests, not every bug fix. However with a machine configuration which is generated by QEMU, there's less worry about proliferation of obscure options, compared with the command line. You don't necessarily have to document every backward-compatibility option in any detail, you just have to make sure it's written and read properly, which is much the same thing as the snapshot code does. -- Jamie -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, 2009-06-10 at 20:27 +0100, Jamie Lokier wrote: > Michael S. Tsirkin wrote: > > > I think the right long term answer to all this is a way to get QEMU to > > > dump it's current machine configuration in glorious detail as a file > > > which can be reloaded as a machine configuration. > > > > And then we'll have the same set of problems there. > > We will, and the solution will be the same: options to create devices > as they were in older versions of QEMU. It only needs to cover device > features which matter to guests, not every bug fix. > > However with a machine configuration which is generated by QEMU, > there's less worry about proliferation of obscure options, compared > with the command line. You don't necessarily have to document every > backward-compatibility option in any detail, you just have to make > sure it's written and read properly, which is much the same thing as > the snapshot code does. This is a sensible plan, but I don't think we should mix these compat options in with the VM manager supplied configuration. There are two problems with that approach. 
= Problem 1 - VM manager needs to parse qemu config = Your proposal implies: - VM manager supplies a basic configuration to qemu - It then immediately asks qemu for a dump of the machine configuration in all its glorious detail and retains that config - If the VM manager wishes to add a new device it needs to parse the qemu config and add it, rather than just generate an entirely new config = Problem 2 - We can't predict the future = If a VM manager supplies a configuration which is missing any given option, qemu cannot tell the difference between: - This is a basic config, the VM manager wants whatever the default of the current qemu version - This is a complete config dumped using an old version of qemu, the VM manager wants the old default = Solution - Separate configuration from compat hints = As I suggested before: - Allow the VM manager to dump compat hints; this would be an opaque file format, more like the savevm format than a config file - Use defaults where compat hints are not available; e.g. if the VM manager specifies a device config, but no compat hints are supplied for it, then just use default values - Make the config override compat hints; e.g. if there are compat hints specified for a device not included in the machine config, just ignore those hints Cheers, Mark. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, Jun 12, 2009 at 09:43:29AM +0100, Mark McLoughlin wrote: > = Solution - Separate configuration from compat hints = > > As I suggested before: > > - Allow the VM manager to dump compat hints; this would be an opaque > file format, more like the savevm format than a config file Why make it "like the savevm" format then? If they are opaque anyway, compat hints could be part of savevm format. > - Use defaults where compat hints are not available; e.g. if the VM > manager specifies a device config, but no compat hints are > supplied for it, then just use default values > > - Make the config override compat hints; e.g. if there are compat > hints specified for a device not included in the machine config, > just ignore those hints > > Cheers, > Mark. If compat hints are opaque and only editable by qemu, we can get into a situation where one can't create a specific setup with a new qemu.
On Fri, 2009-06-12 at 16:59 +0300, Michael S. Tsirkin wrote: > On Fri, Jun 12, 2009 at 09:43:29AM +0100, Mark McLoughlin wrote: > > = Solution - Separate configuration from compat hints = > > > > As I suggested before: > > > > - Allow the VM manager to dump compat hints; this would be an opaque > > file format, more like the savevm format than a config file > > Why make it "like the savevm" format then? > If they are opaque anyway, compat hints could be part of savevm format. So a "savevm --only-compat-hints" command? It might make sense, since we would want the compat hints with savevm too. > > - Use defaults where compat hints are not available; e.g. if the VM > > manager specifies a device config, but no compat hints are > > supplied for it, then just use default values > > > > - Make the config override compat hints; e.g. if there are compat > > hints specified for a device not included in the machine config, > > just ignore those hints > > > > Cheers, > > Mark. > > If compat hints are opaque and only editable by qemu, we can get into a > situation where one can't create a specific setup with a new qemu. An example? Cheers, Mark. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Mark McLoughlin wrote: > On Wed, 2009-06-10 at 20:27 +0100, Jamie Lokier wrote: > >> Michael S. Tsirkin wrote: >> >>>> I think the right long term answer to all this is a way to get QEMU to >>>> dump it's current machine configuration in glorious detail as a file >>>> which can be reloaded as a machine configuration. >>>> >>> And then we'll have the same set of problems there. >>> >> We will, and the solution will be the same: options to create devices >> as they were in older versions of QEMU. It only needs to cover device >> features which matter to guests, not every bug fix. >> >> However with a machine configuration which is generated by QEMU, >> there's less worry about proliferation of obscure options, compared >> with the command line. You don't necessarily have to document every >> backward-compatibility option in any detail, you just have to make >> sure it's written and read properly, which is much the same thing as >> the snapshot code does. >> > > This is a sensible plan, but I don't think we should mix these compat > options in with the VM manager supplied configuration. > > There are two problems with that approach. > > = Problem 1 - VM manager needs to parse qemu config = > > Your proposal implies: > > - VM manager supplies a basic configuration to qemu > > - It then immediately asks qemu for a dump of the machine > configuration in all its glorious detail and retains that > config > > - If the VM manager wishes to add a new device it needs to parse the > qemu config and add it, rather than just generate an entirely new > config > What's the problem with parsing the device config and modifying it? Is it just complexity? If we provided a mechanism to simplify manipulating a device config, would that eliminate the concern here? Regards, Anthony Liguori -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Mark McLoughlin wrote: > On Wed, 2009-06-10 at 20:27 +0100, Jamie Lokier wrote: > > = Solution - Separate configuration from compat hints = > > As I suggested before: > > - Allow the VM manager to dump compat hints; this would be an opaque > file format, more like the savevm format than a config file > How is compat hints different from a device tree? In my mind, that's what compat hints is. I don't see another sane way to implement it. Regards, Anthony Liguori -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, 2009-06-12 at 09:51 -0500, Anthony Liguori wrote: > Mark McLoughlin wrote: > > On Wed, 2009-06-10 at 20:27 +0100, Jamie Lokier wrote: > > > >> Michael S. Tsirkin wrote: > >> > >>>> I think the right long term answer to all this is a way to get QEMU to > >>>> dump it's current machine configuration in glorious detail as a file > >>>> which can be reloaded as a machine configuration. > >>>> > >>> And then we'll have the same set of problems there. > >>> > >> We will, and the solution will be the same: options to create devices > >> as they were in older versions of QEMU. It only needs to cover device > >> features which matter to guests, not every bug fix. > >> > >> However with a machine configuration which is generated by QEMU, > >> there's less worry about proliferation of obscure options, compared > >> with the command line. You don't necessarily have to document every > >> backward-compatibility option in any detail, you just have to make > >> sure it's written and read properly, which is much the same thing as > >> the snapshot code does. > >> > > > > This is a sensible plan, but I don't think we should mix these compat > > options in with the VM manager supplied configuration. > > > > There are two problems with that approach. > > > > = Problem 1 - VM manager needs to parse qemu config = > > > > Your proposal implies: > > > > - VM manager supplies a basic configuration to qemu > > > > - It then immediately asks qemu for a dump of the machine > > configuration in all its glorious detail and retains that > > config > > > > - If the VM manager wishes to add a new device it needs to parse the > > qemu config and add it, rather than just generate an entirely new > > config > > > > What's the problem with parsing the device config and modifying it? Is > it just complexity? Yes, complexity is the issue. > If we provided a mechanism to simplify manipulating a device config, > would that eliminate the concern here? 
In libvirt's case, a lot of the complexity would come from needing to figure out what to change. i.e. libvirt produces a qemu configuration (currently a command line) from a guest XML's configuration; with this idea, libvirt would probably compare the old XML config to the new XML config, and then apply those differences to the qemu configuration. Cheers, Mark. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, 2009-06-12 at 09:55 -0500, Anthony Liguori wrote: > Mark McLoughlin wrote: > > On Wed, 2009-06-10 at 20:27 +0100, Jamie Lokier wrote: > > > > = Solution - Separate configuration from compat hints = > > > > As I suggested before: > > > > - Allow the VM manager to dump compat hints; this would be an opaque > > file format, more like the savevm format than a config file > > > > How is compat hints different from a device tree? > > In my mind, that's what compat hints is. I don't see another sane way > to implement it. A device tree with a different purpose than a config file. In its simplest form it could be a device tree with a version number for each device[1]. The other obvious piece to add to it would be PCI addresses, so that even if you remove a device, the addresses assigned to existing devices don't change. Cheers, Mark. [1] - Adding such a per-device version number to the config file would solve problem (2) -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Mark McLoughlin wrote: > On Fri, 2009-06-12 at 09:51 -0500, Anthony Liguori wrote: > >> Mark McLoughlin wrote: >> >>> On Wed, 2009-06-10 at 20:27 +0100, Jamie Lokier wrote: >>> >>> >>>> Michael S. Tsirkin wrote: >>>> >>>> >>>>>> I think the right long term answer to all this is a way to get QEMU to >>>>>> dump it's current machine configuration in glorious detail as a file >>>>>> which can be reloaded as a machine configuration. >>>>>> >>>>>> >>>>> And then we'll have the same set of problems there. >>>>> >>>>> >>>> We will, and the solution will be the same: options to create devices >>>> as they were in older versions of QEMU. It only needs to cover device >>>> features which matter to guests, not every bug fix. >>>> >>>> However with a machine configuration which is generated by QEMU, >>>> there's less worry about proliferation of obscure options, compared >>>> with the command line. You don't necessarily have to document every >>>> backward-compatibility option in any detail, you just have to make >>>> sure it's written and read properly, which is much the same thing as >>>> the snapshot code does. >>>> >>>> >>> This is a sensible plan, but I don't think we should mix these compat >>> options in with the VM manager supplied configuration. >>> >>> There are two problems with that approach. >>> >>> = Problem 1 - VM manager needs to parse qemu config = >>> >>> Your proposal implies: >>> >>> - VM manager supplies a basic configuration to qemu >>> >>> - It then immediately asks qemu for a dump of the machine >>> configuration in all its glorious detail and retains that >>> config >>> >>> - If the VM manager wishes to add a new device it needs to parse the >>> qemu config and add it, rather than just generate an entirely new >>> config >>> >>> >> What's the problem with parsing the device config and modifying it? Is >> it just complexity? >> > > Yes, complexity is the issue. 
> > >> If we provided a mechanism to simplify manipulating a device config, >> would that eliminate the concern here? >> > > In libvirt's case, a lot of the complexity would come from needing to > figure out what to change. > Right, libvirt wants to be able to easily say "add a scsi block device to this VM". The way I see this working is that there would be a default pc.dtc. We would still have a -drive file=foo.img,if=scsi option that would really just be a wrapper around first searching for an existing LSI controller, if one exists, attaching the lun, if not, create one, etc. libvirt could continue to use this sort of interface. However, as it wants to do more advanced things, it may have to dive into the device tree itself. On live migration, QEMU will save a copy of the device tree somewhere and libvirt needs to keep track of it. It can treat it as opaque. -M /path/to/foo.dtc -drive file=foo.img,if=scsi should continue working as expected IMHO. Regards, Anthony Liguori -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Mark McLoughlin wrote: > On Fri, 2009-06-12 at 09:55 -0500, Anthony Liguori wrote: > >> Mark McLoughlin wrote: >> >>> On Wed, 2009-06-10 at 20:27 +0100, Jamie Lokier wrote: >>> >>> = Solution - Separate configuration from compat hints = >>> >>> As I suggested before: >>> >>> - Allow the VM manager to dump compat hints; this would be an opaque >>> file format, more like the savevm format than a config file >>> >>> >> How is compat hints different from a device tree? >> >> In my mind, that's what compat hints is. I don't see another sane way >> to implement it. >> > > A device tree with a different purpose than a config file. > > In its simplest form it could be a device tree with a version number for > each device[1]. > I think the point is that you don't need version numbers if you have a proper device tree. NB the device tree contains no host configuration information. Regards, Anthony Liguori > The other obvious piece to add to it would be PCI addresses, so that > even if you remove a device, the addresses assigned to existing devices > don't change. > > Cheers, > Mark. > > [1] - Adding such a per-device version number to the config file would > solve problem (2) > > -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, 2009-06-12 at 11:11 -0500, Anthony Liguori wrote: > Mark McLoughlin wrote: > > On Fri, 2009-06-12 at 09:51 -0500, Anthony Liguori wrote: > > > >> Mark McLoughlin wrote: > >> > >>> On Wed, 2009-06-10 at 20:27 +0100, Jamie Lokier wrote: > >>> > >>> > >>>> Michael S. Tsirkin wrote: > >>>> > >>>> > >>>>>> I think the right long term answer to all this is a way to get QEMU to > >>>>>> dump it's current machine configuration in glorious detail as a file > >>>>>> which can be reloaded as a machine configuration. > >>>>>> > >>>>>> > >>>>> And then we'll have the same set of problems there. > >>>>> > >>>>> > >>>> We will, and the solution will be the same: options to create devices > >>>> as they were in older versions of QEMU. It only needs to cover device > >>>> features which matter to guests, not every bug fix. > >>>> > >>>> However with a machine configuration which is generated by QEMU, > >>>> there's less worry about proliferation of obscure options, compared > >>>> with the command line. You don't necessarily have to document every > >>>> backward-compatibility option in any detail, you just have to make > >>>> sure it's written and read properly, which is much the same thing as > >>>> the snapshot code does. > >>>> > >>>> > >>> This is a sensible plan, but I don't think we should mix these compat > >>> options in with the VM manager supplied configuration. > >>> > >>> There are two problems with that approach. 
> >>> > >>> = Problem 1 - VM manager needs to parse qemu config = > >>> > >>> Your proposal implies: > >>> > >>> - VM manager supplies a basic configuration to qemu > >>> > >>> - It then immediately asks qemu for a dump of the machine > >>> configuration in all its glorious detail and retains that > >>> config > >>> > >>> - If the VM manager wishes to add a new device it needs to parse the > >>> qemu config and add it, rather than just generate an entirely new > >>> config > >>> > >>> > >> What's the problem with parsing the device config and modifying it? Is > >> it just complexity? > >> > > > > Yes, complexity is the issue. > > > > > >> If we provided a mechanism to simplify manipulating a device config, > >> would that eliminate the concern here? > >> > > > > In libvirt's case, a lot of the complexity would come from needing to > > figure out what to change. > > > > Right, libvirt wants to be able to easily say "add a scsi block device > to this VM". The way I see this working is that there would be a > default pc.dtc. We would still have a -drive file=foo.img,if=scsi > option that would really just be a wrapper around first searching for an > existing LSI controller, if one exists, attaching the lun, if not, > create one, etc. > > libvirt could continue to use this sort of interface. However, as it > wants to do more advanced things, it may have to dive into the device > tree itself. > > On live migration, QEMU will save a copy of the device tree somewhere > and libvirt needs to keep track of it. It can treat it as opaque. -M > /path/to/foo.dtc -drive file=foo.img,if=scsi should continue working as > expected IMHO. So, when libvirt creates a guest for the first time, it makes a copy of the device tree and continues to use that even if qemu is upgraded. That's enough to ensure compat is retained for all built-in devices. However, in order to retain compat for that SCSI device (e.g. 
ensuring the PCI address doesn't change as other devices are added and removed), we're back to the same problem ... either: 1) Use '-drive file=foo.img,if=scsi,pci_addr=foo'; in order to figure out what address to use, libvirt would need to query qemu for what address was originally allocated to the device or it would do all the PCI address allocation itself ... or: 2) Don't use the command line, instead get a dump of the entire device tree (including the SCSI device) - if the device is to be removed or modified in future, libvirt would need to modify the device tree The basic problem would be that the command line config would have very limited ability to override the device tree config. Cheers, Mark. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, 2009-06-12 at 11:12 -0500, Anthony Liguori wrote: > Mark McLoughlin wrote: > > On Fri, 2009-06-12 at 09:55 -0500, Anthony Liguori wrote: > > > >> Mark McLoughlin wrote: > >> > >>> On Wed, 2009-06-10 at 20:27 +0100, Jamie Lokier wrote: > >>> > >>> = Solution - Separate configuration from compat hints = > >>> > >>> As I suggested before: > >>> > >>> - Allow the VM manager to dump compat hints; this would be an opaque > >>> file format, more like the savevm format than a config file > >>> > >>> > >> How is compat hints different from a device tree? > >> > >> In my mind, that's what compat hints is. I don't see another sane way > >> to implement it. > >> > > > > A device tree with a different purpose than a config file. > > > > In its simplest form it could be a device tree with a version number for > > each device[1]. > > > > I think the point is that you don't need version numbers if you have a > proper device tree. How do you add a new attribute to the device tree and, when a supplied device tree lacking said attribute, distinguish between a device tree from an old version of qemu (i.e. use the old default) and a partial device tree from the VM manager (i.e. use the new default) ? > NB the device tree contains no host configuration information. So, it wouldn't e.g. include the path to the image file for a block device? That would always be specified on the command line? Cheers, Mark. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Mark McLoughlin wrote: > So, when libvirt creates a guest for the first time, it makes a copy of > the device tree and continues to use that even if qemu is upgraded. > That's enough to ensure compat is retained for all built-in devices. > > However, in order to retain compat for that SCSI device (e.g. ensuring > the PCI address doesn't change as other devices are added an removed), > we're back to the same problem ... either: > > 1) Use '-drive file=foo.img,if=scsi,pci_addr=foo'; in order to figure > out what address to use, libvirt would need to query qemu for what > address was originally allocated to device or it would do all the > PCI address allocation itself ... or: > > 2) Don't use the command line, instead get a dump of the entire > device tree (including the SCSI device) - if the device is to be > removed or modified in future, libvirt would need to modify the > device tree > > The basic problem would be that the command line config would have very > limited ability to override the device tree config. > After libvirt has done -drive file=foo... it should dump the machine config and use that from then on. To combine into a single thread... > How do you add a new attribute to the device tree and, when a supplied > device tree lacking said attribute, distinguish between a device tree > from an old version of qemu (i.e. use the old default) and a partial > device tree from the VM manager (i.e. use the new default) ? > Please define "attribute". I don't follow what you're asking. >> NB the device tree contains no host configuration information. >> > > So, it wouldn't e.g. include the path to the image file for a block > device? That would always be specified on the command line? > No, the IDE definition would contain some sort of symbolic node name. A separate mechanism (either command line or host config file) would then link an image file to the symbolic name. 
libvirt should really never worry about the machine config file for normal things unless it needs to change what devices are exposed to a guest. Regards, Anthony Liguori -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, 2009-06-12 at 12:00 -0500, Anthony Liguori wrote: > Mark McLoughlin wrote: > > So, when libvirt creates a guest for the first time, it makes a copy of > > the device tree and continues to use that even if qemu is upgraded. > > That's enough to ensure compat is retained for all built-in devices. > > > > However, in order to retain compat for that SCSI device (e.g. ensuring > > the PCI address doesn't change as other devices are added an removed), > > we're back to the same problem ... either: > > > > 1) Use '-drive file=foo.img,if=scsi,pci_addr=foo'; in order to figure > > out what address to use, libvirt would need to query qemu for what > > address was originally allocated to device or it would do all the > > PCI address allocation itself ... or: > > > > 2) Don't use the command line, instead get a dump of the entire > > device tree (including the SCSI device) - if the device is to be > > removed or modified in future, libvirt would need to modify the > > device tree > > > > The basic problem would be that the command line config would have very > > limited ability to override the device tree config. > > > > After libvirt has done -drive file=foo... it should dump the machine > config and use that from then on. Right - libvirt then wouldn't be able to avoid the complexity of merging any future changes into the dumped machine config. > To combined to a single thread... > > How do you add a new attribute to the device tree and, when a supplied > > device tree lacking said attribute, distinguish between a device tree > > from an old version of qemu (i.e. use the old default) and a partial > > device tree from the VM manager (i.e. use the new default) ? > > > > Please define "attribute". I don't follow what you're asking. e.g. a per-device "enable MSI support" flag. If qemu is supplied with a device tree that lacks that flag, does it enable or disable MSI? 
Enable by default is bad - it could be a device tree dumped from an old version of qemu, so compat would be broken. Disable by default is bad - it could be a simple device tree supplied by the user, and the latest features are wanted. Maybe we want a per-device "this is a complete device description" flag and if anything is missing from a supposedly complete description, the old defaults would be used. A config dumped from qemu would have this flag set, a config generated by libvirt would not have the flag. > >> NB the device tree contains no host configuration information. > >> > > > > So, it wouldn't e.g. include the path to the image file for a block > > device? That would always be specified on the command line? > > > > No, the IDE definition would contain some sort of symbolic node name. A > separate mechanism (either command line or host config file) would then > link a image file to the symbolic name. Okay. > libvirt should really never worry about the machine config file for > normal things unless it needs to change what devices are exposed to a guest. But changing devices *is* normal ... e.g. removing a block device. Writing out a device tree is not a problem for libvirt (or any other management tools), it's the need to merge changes into an existing device tree that is where the real complexity would lie. Cheers, Mark. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 6/12/09, Mark McLoughlin <markmc@redhat.com> wrote: > On Fri, 2009-06-12 at 12:00 -0500, Anthony Liguori wrote: > > Mark McLoughlin wrote: > > > So, when libvirt creates a guest for the first time, it makes a copy of > > > the device tree and continues to use that even if qemu is upgraded. > > > That's enough to ensure compat is retained for all built-in devices. > > > > > > However, in order to retain compat for that SCSI device (e.g. ensuring > > > the PCI address doesn't change as other devices are added an removed), > > > we're back to the same problem ... either: > > > > > > 1) Use '-drive file=foo.img,if=scsi,pci_addr=foo'; in order to figure > > > out what address to use, libvirt would need to query qemu for what > > > address was originally allocated to device or it would do all the > > > PCI address allocation itself ... or: > > > > > > 2) Don't use the command line, instead get a dump of the entire > > > device tree (including the SCSI device) - if the device is to be > > > removed or modified in future, libvirt would need to modify the > > > device tree > > > > > > The basic problem would be that the command line config would have very > > > limited ability to override the device tree config. > > > > > > > After libvirt has done -drive file=foo... it should dump the machine > > config and use that from then on. > > Right - libvirt then wouldn't be able to avoid the complexity of merging > any future changes into the dumped machine config. > > > To combined to a single thread... > > > How do you add a new attribute to the device tree and, when a supplied > > > device tree lacking said attribute, distinguish between a device tree > > > from an old version of qemu (i.e. use the old default) and a partial > > > device tree from the VM manager (i.e. use the new default) ? > > > > > > > Please define "attribute". I don't follow what you're asking. > > e.g. a per-device "enable MSI support" flag. 
> > If qemu is supplied with a device tree that lacks that flag, does it > enable or disable MSI? > > Enable by default is bad - it could be a device tree dumped from an old > version of qemu, so compat would be broken. > > Disable by default is bad - it could be a simple device tree supplied by > the user, and the latest features are wanted. > > Maybe we want a per-device "this is a complete device description" flag > and if anything is missing from a supposedly complete description, the > old defaults would be used. A config dumped from qemu would have this > flag set, a config generated by libvirt would not have the flag. If the device has different behavior or different properties from guest perspective compared to the old device, it should get a new device type so that you could specify in the device tree either the old device or the new one. Flags won't help in the long term. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, 2009-06-12 at 20:44 +0300, Blue Swirl wrote: > If the device has different behavior or different properties from > guest perspective compared to the old device, it should get a new > device type so that you could specify in the device tree either the > old device or the new one. Yes, that works - it's analogous to a device (type, version) pair. Cheers, Mark. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Anthony Liguori wrote: > > What's the problem with parsing the device config and modifying it? > Is it just complexity? Two-way modification. Management wants to store the configuration in their database and tell the hypervisor what the machine looks like. If qemu also tells management what the machine looks like, we can easily get conflicts.
Mark McLoughlin wrote: >> I think the point is that you don't need version numbers if you have a >> proper device tree. >> > > How do you add a new attribute to the device tree and, when a supplied > device tree lacking said attribute, distinguish between a device tree > from an old version of qemu (i.e. use the old default) and a partial > device tree from the VM manager (i.e. use the new default) ? > -baseline 0.10 > >> NB the device tree contains no host configuration information. >> > > So, it wouldn't e.g. include the path to the image file for a block > device? That would always be specified on the command line? > Or in a different file. I agree splitting host and guest configuration is a must-have, this ensures portability of virtual machines across hosts and time.
On Fri, Jun 12, 2009 at 04:53:27PM +0100, Mark McLoughlin wrote: > On Fri, 2009-06-12 at 09:55 -0500, Anthony Liguori wrote: > > Mark McLoughlin wrote: > > > On Wed, 2009-06-10 at 20:27 +0100, Jamie Lokier wrote: > > > > > > = Solution - Separate configuration from compat hints = > > > > > > As I suggested before: > > > > > > - Allow the VM manager to dump compat hints; this would be an opaque > > > file format, more like the savevm format than a config file > > > > > > > How is compat hints different from a device tree? > > > > In my mind, that's what compat hints is. I don't see another sane way > > to implement it. > > A device tree with a different purpose than a config file. > > In its simplest form it could be a device tree with a version number for > each device[1]. > > The other obvious piece to add to it would be PCI addresses, so that > even if you remove a device, the addresses assigned to existing devices > don't change. Could you clarify this requirement please? If we want to remove a device from under a running guest, you need hotplug. So we can't just remove several lines from the config and hope that it'll work simply because the PCI address is stable. OTOH, if you reboot the guest, it's ok for addresses to change. > Cheers, > Mark. > > [1] - Adding such a per-device version number to the config file would > solve problem (2) > > -- > To unsubscribe from this list: send the line "unsubscribe kvm" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Michael S. Tsirkin wrote: > > If we want to remove a device from under a running guest, you need > hotplug. So we can't just remove several lines from the config and hope > that it'll work simply because the PCI address is stable. > Why not? > OTOH, if you reboot the guest, it's ok for addresses to change. > No, it's not. Some guests depend on addressing for their configuration (for example older Linux guests will swap eth0/eth1 if you swap their slots).
On Sun, Jun 14, 2009 at 12:37:13PM +0300, Avi Kivity wrote: > Michael S. Tsirkin wrote: >> >> If we want to remove a device from under a running guest, you need >> hotplug. So we can't just remove several lines from the config and hope >> that it'll work simply because the PCI address is stable. >> > > Why not? E.g. configuration cycles address a specific bus/slot. You need cooperation from the guest if you want to move a device. >> OTOH, if you reboot the guest, it's ok for addresses to change. >> > > No, it's not. Some guests depend on addressing for their configuration > (for example older Linux guests will swap eth0/eth1 if you swap their > slots). Ah, I misunderstood what's meant by the address. I agree that it's useful to be able to control a device's placement on the bus.
On Fri, Jun 12, 2009 at 05:48:23PM +0100, Mark McLoughlin wrote: > However, in order to retain compat for that SCSI device (e.g. ensuring > the PCI address doesn't change as other devices are added an removed), > we're back to the same problem ... either: > > 1) Use '-drive file=foo.img,if=scsi,pci_addr=foo'; in order to figure > out what address to use, libvirt would need to query qemu for what > address was originally allocated to device or it would do all the > PCI address allocation itself ... This last option makes sense to me: in the real world the user has control over where he places the device on the bus, so why not with qemu?
Avi Kivity <avi@redhat.com> writes: > Mark McLoughlin wrote: [...] >>> NB the device tree contains no host configuration information. >>> >> >> So, it wouldn't e.g. include the path to the image file for a block >> device? That would always be specified on the command line? >> > > Or in a different file. I agree splitting host and guest > configuration is a must-have, this ensures portability of virtual > machines across hosts and time. Splitting into two separate sections should suffice, they could live in the same file for convenience. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Sun, 2009-06-14 at 12:34 +0300, Michael S. Tsirkin wrote: > On Fri, Jun 12, 2009 at 04:53:27PM +0100, Mark McLoughlin wrote: > > The other obvious piece to add to it would be PCI addresses, so that > > even if you remove a device, the addresses assigned to existing devices > > don't change. > > Could you clarify this requirement please? Avi clarified, but I've written it up here too: https://fedoraproject.org/wiki/Features/KVM_Stable_PCI_Addresses Cheers, Mark. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Sun, 2009-06-14 at 12:50 +0300, Michael S. Tsirkin wrote: > On Fri, Jun 12, 2009 at 05:48:23PM +0100, Mark McLoughlin wrote: > > However, in order to retain compat for that SCSI device (e.g. ensuring > > the PCI address doesn't change as other devices are added an removed), > > we're back to the same problem ... either: > > > > 1) Use '-drive file=foo.img,if=scsi,pci_addr=foo'; in order to figure > > out what address to use, libvirt would need to query qemu for what > > address was originally allocated to device or it would do all the > > PCI address allocation itself ... > > This last option makes sense to me: in a real world the user has > control over where he places the device on the bus, so why > not with qemu? Yep, most people seem to agree that it makes sense to allow this, but some believe it should only be via a machine description file, not the command line. However, the first problem is that it isn't a solution to the guest ABI problem more generally. And the second problem is that for e.g. libvirt to use it, it would have to be possible to query qemu for what PCI slots were assigned to the devices - libvirt would need to be able to parse 'info pci' and match the devices listed with the devices specified on the command line. Again, details written up here: https://fedoraproject.org/wiki/Features/KVM_Stable_PCI_Addresses Cheers, Mark. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Sun, 2009-06-14 at 10:58 +0300, Avi Kivity wrote: > Mark McLoughlin wrote: > > > > >> I think the point is that you don't need version numbers if you have a > >> proper device tree. > >> > > > > How do you add a new attribute to the device tree and, when a supplied > > device tree lacking said attribute, distinguish between a device tree > > from an old version of qemu (i.e. use the old default) and a partial > > device tree from the VM manager (i.e. use the new default) ? > > > > -baseline 0.10 That's a version number :-) (I was responding to Anthony's "you don't need a version number") Cheers, Mark. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 06/15/2009 12:08 PM, Mark McLoughlin wrote: >> This last option makes sense to me: in a real world the user has >> control over where he places the device on the bus, so why >> not with qemu? >> > > Yep, most people seem to agree that it makes sense to allow this, but > some believe it should only be via a machine description file, not the > command line. > I don't understand this opposition. It's clear a machine config file is a long way in our future. It's also clear lack of stable PCI addresses hurts us now. > However, the first problem is that it isn't a solution to the guest ABI > problem more generally. > pci_addr was never meant to bring world peace, just stable PCI addresses. The other issues should be addressed separately. > And the second problem is that for e.g. libvirt to use it, it would have > to be possible to query qemu for what PCI slots were assigned to the > devices - libvirt would need to be able to parse 'info pci' and match > the devices listed with the devices specified on the command line. > If all devices (including vga, ide) are set up with pci_addr, then this is unneeded. You do need to export available slot numbers from qemu.
On 06/14/2009 12:47 PM, Michael S. Tsirkin wrote: > Michael S. Tsirkin wrote: >>> If we want to remove a device from under a running guest, you need >>> hotplug. So we can't just remove several lines from the config and hope >>> that it'll work simply because the PCI address is stable. >>> >>> >> Why not? >> > > E.g. configuration cycles address a specific bus/slot. > You need cooperation from guest if you want to move > a device. > By "remove several lines from the config" I understood the guest needs to be restarted. Of course if you don't restart the guest you need true hotplug.
On 06/14/2009 12:50 PM, Michael S. Tsirkin wrote: > On Fri, Jun 12, 2009 at 05:48:23PM +0100, Mark McLoughlin wrote: > >> However, in order to retain compat for that SCSI device (e.g. ensuring >> the PCI address doesn't change as other devices are added an removed), >> we're back to the same problem ... either: >> >> 1) Use '-drive file=foo.img,if=scsi,pci_addr=foo'; in order to figure >> out what address to use, libvirt would need to query qemu for what >> address was originally allocated to device or it would do all the >> PCI address allocation itself ... >> > > This last option makes sense to me: in a real world the user has > control over where he places the device on the bus, so why > not with qemu? > Yes, the user builds the machine using the command line and monitor (or, in 2017, the machine configuration file), then turns on the power. Command line options are the parts lying around when we start. btw, -drive needs to be separated: -controller type=lsi1234,pci_addr=foobar,name=blah -drive file=foo.img,controller=blah,index=0 -drive file=bar.img,controller=blah,index=1 Drives do not have pci addresses.
On Mon, Jun 15, 2009 at 12:43:48PM +0300, Avi Kivity wrote: > On 06/14/2009 12:50 PM, Michael S. Tsirkin wrote: >> On Fri, Jun 12, 2009 at 05:48:23PM +0100, Mark McLoughlin wrote: >> >>> However, in order to retain compat for that SCSI device (e.g. ensuring >>> the PCI address doesn't change as other devices are added an removed), >>> we're back to the same problem ... either: >>> >>> 1) Use '-drive file=foo.img,if=scsi,pci_addr=foo'; in order to figure >>> out what address to use, libvirt would need to query qemu for what >>> address was originally allocated to device or it would do all the >>> PCI address allocation itself ... >>> >> >> This last option makes sense to me: in a real world the user has >> control over where he places the device on the bus, so why >> not with qemu? >> > > Yes, the user build the machine using the command line and monitor (or, > in 2017, the machine configuration file), then turns on the power. > Command line options are the parts lying around when we start. > > btw, -drive needs to be separated: > > -controller type=lsi1234,pci_addr=foobar,name=blah > -drive file=foo.img,controller=blah,index=0 > -drive file=bar.img,controller=blah,index=1 > > Drives to not have pci addresses. Maybe we need a generic 'bus options' flag. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, Jun 15, 2009 at 12:27:08PM +0300, Avi Kivity wrote: > On 06/15/2009 12:08 PM, Mark McLoughlin wrote: >>> This last option makes sense to me: in a real world the user has >>> control over where he places the device on the bus, so why >>> not with qemu? >>> >> >> Yep, most people seem to agree that it makes sense to allow this, but >> some believe it should only be via a machine description file, not the >> command line. >> > > I don't understand this opposition. It's clear a machine config file is > a long way in our future. It's also clear lack of stable PCI addresses > hurts us now. > >> However, the first problem is that it isn't a solution to the guest ABI >> problem more generally. >> > > pci_addr was never meant to bring world peace, just stable PCI > addresses. The other issues should be addressed separately. > >> And the second problem is that for e.g. libvirt to use it, it would have >> to be possible to query qemu for what PCI slots were assigned to the >> devices - libvirt would need to be able to parse 'info pci' and match >> the devices listed with the devices specified on the command line. >> > > If all devices (including vga, ide) are set up with pci_addr, then this > is unneeded. Right. I think it could be an all or nothing at all approach. > You do need to export available slot numbers from qemu. Why would a slot be unavailable? -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, Jun 15, 2009 at 01:32:49PM +0300, Michael S. Tsirkin wrote: > > You do need to export available slot numbers from qemu. > > Why would a slot be unavailable? > Because it does not exist? -- Gleb. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, Jun 15, 2009 at 01:44:56PM +0300, Gleb Natapov wrote: > On Mon, Jun 15, 2009 at 01:32:49PM +0300, Michael S. Tsirkin wrote: > > > You do need to export available slot numbers from qemu. > > > > Why would a slot be unavailable? > > > Because it does not exist? We can create a slot with any number, can't we? -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, Jun 15, 2009 at 01:46:53PM +0300, Michael S. Tsirkin wrote: > On Mon, Jun 15, 2009 at 01:44:56PM +0300, Gleb Natapov wrote: > > On Mon, Jun 15, 2009 at 01:32:49PM +0300, Michael S. Tsirkin wrote: > > > > You do need to export available slot numbers from qemu. > > > > > > Why would a slot be unavailable? > > > > > Because it does not exist? > > We can create a slot with any number, can't we? What do you mean? If the mobo has 4 slots you can't create a fifth. KVM describes 32 slots in the BIOS. -- Gleb. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, Jun 15, 2009 at 01:52:13PM +0300, Gleb Natapov wrote: > On Mon, Jun 15, 2009 at 01:46:53PM +0300, Michael S. Tsirkin wrote: > > On Mon, Jun 15, 2009 at 01:44:56PM +0300, Gleb Natapov wrote: > > > On Mon, Jun 15, 2009 at 01:32:49PM +0300, Michael S. Tsirkin wrote: > > > > > You do need to export available slot numbers from qemu. > > > > > > > > Why would a slot be unavailable? > > > > > > > Because it does not exist? > > > > We can create a slot with any number, can't we? > What do you mean? If the mobo has 4 slots you can't create fifth. > KVM describes 32 slots in the BIOS. Do you mean the KVM kernel module here? I don't know much about the BIOS. Can't qemu control the number of slots declared?
On Mon, Jun 15, 2009 at 02:07:53PM +0300, Michael S. Tsirkin wrote: > On Mon, Jun 15, 2009 at 01:52:13PM +0300, Gleb Natapov wrote: > > On Mon, Jun 15, 2009 at 01:46:53PM +0300, Michael S. Tsirkin wrote: > > > On Mon, Jun 15, 2009 at 01:44:56PM +0300, Gleb Natapov wrote: > > > > On Mon, Jun 15, 2009 at 01:32:49PM +0300, Michael S. Tsirkin wrote: > > > > > > You do need to export available slot numbers from qemu. > > > > > > > > > > Why would a slot be unavailable? > > > > > > > > > Because it does not exist? > > > > > > We can create a slot with any number, can't we? > > What do you mean? If the mobo has 4 slots you can't create fifth. > > KVM describes 32 slots in the BIOS. > > Do you mean the KVM kernel module here? I don't know much about the No I don't mean KVM kernel module here. > BIOS. Can't qemu control the number of slots declared? > Qemu represents HW, BIOS drives this HW. They should be in sync on such important issues like pci slot configuration. Even if QEMU can control the number of slots declared (which it can't easily do), it will be able to do it only on startup (before BIOS runs). The way to have dynamic number of slots may be pci bridge emulation. Not sure what is needed from BIOS for that. -- Gleb. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 06/15/2009 01:32 PM, Michael S. Tsirkin wrote: >> You do need to export available slot numbers from qemu. >> > > Why would a slot be unavailable? > A slot needs to be configured in ACPI, and not be taken by onboard chips (piix takes slot 0, for example).
On 06/15/2009 12:09 PM, Mark McLoughlin wrote: >>>> I think the point is that you don't need version numbers if you have a >>>> proper device tree. >>>> >>>> >>> How do you add a new attribute to the device tree and, when a supplied >>> device tree lacking said attribute, distinguish between a device tree >>> from an old version of qemu (i.e. use the old default) and a partial >>> device tree from the VM manager (i.e. use the new default) ? >>> >>> >> -baseline 0.10 >> > > That's a version number :-) > > (I was responding to Anthony's "you don't need a version number") > If you want to prevent incompatibilities, you need to make everything new (potentially including bugfixes) non-default. Eventually the default configuration becomes increasingly unusable and you need a new baseline. You must still be able to fall back to the old baseline for older guests. I don't think games with configuration files can hide that.
On Mon, Jun 15, 2009 at 02:14:15PM +0300, Gleb Natapov wrote: > On Mon, Jun 15, 2009 at 02:07:53PM +0300, Michael S. Tsirkin wrote: > > On Mon, Jun 15, 2009 at 01:52:13PM +0300, Gleb Natapov wrote: > > > On Mon, Jun 15, 2009 at 01:46:53PM +0300, Michael S. Tsirkin wrote: > > > > On Mon, Jun 15, 2009 at 01:44:56PM +0300, Gleb Natapov wrote: > > > > > On Mon, Jun 15, 2009 at 01:32:49PM +0300, Michael S. Tsirkin wrote: > > > > > > > You do need to export available slot numbers from qemu. > > > > > > > > > > > > Why would a slot be unavailable? > > > > > > > > > > > Because it does not exist? > > > > > > > > We can create a slot with any number, can't we? > > > What do you mean? If the mobo has 4 slots you can't create fifth. > > > KVM describes 32 slots in the BIOS. > > > > Do you mean the KVM kernel module here? I don't know much about the > No I don't mean KVM kernel module here. > > > BIOS. Can't qemu control the number of slots declared? > > > Qemu represents HW, BIOS drives this HW. They should be in sync on such > important issues like pci slot configuration. As a simple solution, let's stick to 32 slots per bus. That's the maximum that the PCI spec allows, anyway. > Even if QEMU can control the number of slots declared (which it can't > easily do), it will be able to do it only on startup (before BIOS > runs). That's OK - this is when the machine description is read. > The way to have dynamic > number of slots may be pci bridge emulation. Not sure what is needed > from BIOS for that. Since bridge can be hot-plugged, probably nothing? But we don't necessarily need dynamic number of slots IMO. > > -- > Gleb.
Avi Kivity <avi@redhat.com> writes: > On 06/15/2009 12:08 PM, Mark McLoughlin wrote: >>> This last option makes sense to me: in a real world the user has >>> control over where he places the device on the bus, so why >>> not with qemu? >>> >> >> Yep, most people seem to agree that it makes sense to allow this, but >> some believe it should only be via a machine description file, not the >> command line. >> > > I don't understand this opposition. It's clear a machine config file > is a long way in our future. It's also clear lack of stable PCI > addresses hurts us now. Correct. >> However, the first problem is that it isn't a solution to the guest ABI >> problem more generally. >> > > pci_addr was never meant to bring world peace, just stable PCI > addresses. The other issues should be addressed separately. > >> And the second problem is that for e.g. libvirt to use it, it would have >> to be possible to query qemu for what PCI slots were assigned to the >> devices - libvirt would need to be able to parse 'info pci' and match >> the devices listed with the devices specified on the command line. >> > > If all devices (including vga, ide) are set up with pci_addr, then > this is unneeded. You do need to export available slot numbers from > qemu. Not really. QEMU gives just the host bridge a fixed slot[*]. All the other slots are available. The real problem is devices that get implicitly added, like the SCSI controller. Those devices get their slots auto-assigned, which can interfere with slot numbers chosen by the user. We need a way to avoid that, as you suggested elsewhere in this thread. [*] There's an exception or two for oddball targets. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
(adding cc) On 06/15/2009 02:35 PM, Markus Armbruster wrote: > Not really. QEMU gives just the host bridge a fixed slot[*]. All the > other slots are available. > qemu needs to export these two bits of information: the first free slot and the number of slots. More generally, which slots are open. We can assume 1:31, but that's unlovely. > The real problem is devices that get implicitly added, like the SCSI > controller. Those devices get their slots auto-assigned, which can > interfere with slot numbers chosen by the user. We need a way to avoid > that, as you suggested elsewhere in this thread. > Paul/Anthony, can we have -vga pci_addr=, -usb-controller pci_addr=, and -drive pci_addr= (and later, -disk-controller)? Stalling while waiting for the ultimate config file is only generating pain and out-of-tree patches. (I'd be quite happy constructing the entire machine config on the command line, but I realize it's just me)
On Mon, Jun 15, 2009 at 02:27:14PM +0300, Avi Kivity wrote: > On 06/15/2009 01:32 PM, Michael S. Tsirkin wrote: >>> You do need to export available slot numbers from qemu. >>> >> >> Why would a slot be unavailable? >> > > A slot needs to be configured in ACPI, Can we configure all possible 32 slots? > and not be taken by onboard chips > (piix takes slot 0, for example). piix is the root complex, isn't it? Are there other examples? If not, we could teach management about the root complex being special ... > -- > error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 06/15/2009 02:48 PM, Michael S. Tsirkin wrote: >> A slot needs to be configured in ACPI, >> > > Can we configure all possible 32 slots? > That's what we do. But one is always taken. In the future, perhaps more. >> and not be taken by onboard chips >> (piix takes slot 0, for example). >> > > piix is the root complex, isn't it? Are there other examples? If not, > we could teach management about the root complex being special ... > We should just tell the user which slots are open.
Avi Kivity wrote: > (I'd be quite happy constructing the entire machine config on the > command line, but I realize it's just me) > It is not just you. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Avi Kivity wrote: > On 06/15/2009 12:08 PM, Mark McLoughlin wrote: >>> This last option makes sense to me: in a real world the user has >>> control over where he places the device on the bus, so why >>> not with qemu? >>> >> >> Yep, most people seem to agree that it makes sense to allow this, but >> some believe it should only be via a machine description file, not the >> command line. >> > > I don't understand this opposition. It's clear a machine config file > is a long way in our future. It's also clear lack of stable PCI > addresses hurts us now. Is there opposition? I don't ever recall seeing a patch... I think it's a perfectly fine idea. Regards, Anthony Liguori -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Avi Kivity <avi@redhat.com> writes: > (adding cc) > > On 06/15/2009 02:35 PM, Markus Armbruster wrote: >> Not really. QEMU gives just the host bridge a fixed slot[*]. All the >> other slots are available. >> > > qemu needs to export these two bits of information: the first free > slot and the number of slots. > > More generally, which slots are open. We can assume 1:31, but that's > unlovely. Point. >> The real problem is devices that get implicitly added, like the SCSI >> controller. Those devices get their slots auto-assigned, which can >> interfere with slot numbers chosen by the user. We need a way to avoid >> that, as you suggested elsewhere in this thread. >> > > Paul/Anthony, can we have -vga pci_addr=, -usb-controller pci_addr=, > and -drive pci_addr= (and later, -disk-controller)? Stalling while > waiting for the ultimate config file is only generating pain and > out-of-tree patches. Yup. I got bit-rotten patches for pci_addr=, and I can unrot them if they're wanted. > (I'd be quite happy constructing the entire machine config on the > command line, but I realize it's just me) Ha, .bash_history as config file... -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, Jun 15, 2009 at 02:56:42PM +0300, Avi Kivity wrote: > On 06/15/2009 02:48 PM, Michael S. Tsirkin wrote: >>> A slot needs to be configured in ACPI, >>> >> >> Can we configure all possible 32 slots? >> > > That's what we do. But one is always taken. In the future, perhaps more. > >>> and not be taken by onboard chips >>> (piix takes slot 0, for example). >>> >> >> piix is the root complex, isn't it? Are there other examples? If not, >> we could teach management about the root complex being special ... >> > > We should just tell the user which slots are open. This might be tricky if the config is passed in with the command line flags. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Avi Kivity wrote: > On 06/14/2009 12:50 PM, Michael S. Tsirkin wrote: >> On Fri, Jun 12, 2009 at 05:48:23PM +0100, Mark McLoughlin wrote: >> >>> However, in order to retain compat for that SCSI device (e.g. ensuring >>> the PCI address doesn't change as other devices are added an removed), >>> we're back to the same problem ... either: >>> >>> 1) Use '-drive file=foo.img,if=scsi,pci_addr=foo'; in order to >>> figure >>> out what address to use, libvirt would need to query qemu for >>> what >>> address was originally allocated to device or it would do all the >>> PCI address allocation itself ... >>> >> >> This last option makes sense to me: in a real world the user has >> control over where he places the device on the bus, so why >> not with qemu? >> > > Yes, the user build the machine using the command line and monitor > (or, in 2017, the machine configuration file), Considering pbrook just posted a machine config for arm, I think it would be rather sad if pc wasn't converted to it by 2017... > then turns on the power. Command line options are the parts lying > around when we start. > > btw, -drive needs to be separated: > > -controller type=lsi1234,pci_addr=foobar,name=blah > -drive file=foo.img,controller=blah,index=0 > -drive file=bar.img,controller=blah,index=1 > > Drives to not have pci addresses. Drivers don't have indexes and buses but we specify it on the -drive line. -drive is convenient syntax. It stops being convenient when you force it to be two options. Regards, Anthony Liguori -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Avi Kivity wrote: > On 06/15/2009 12:09 PM, Mark McLoughlin wrote: >>>>> I think the point is that you don't need version numbers if you >>>>> have a >>>>> proper device tree. >>>>> >>>>> >>>> How do you add a new attribute to the device tree and, when a supplied >>>> device tree lacking said attribute, distinguish between a device tree >>>> from an old version of qemu (i.e. use the old default) and a partial >>>> device tree from the VM manager (i.e. use the new default) ? >>>> >>>> >>> -baseline 0.10 >>> >> >> That's a version number :-) >> >> (I was responding to Anthony's "you don't need a version number") >> > > If you want to prevent incompatibilities, you need to make everything > new (potentially including bugfixes) non-default. Eventually the > default configuration becomes increasingly unusable and you need a new > baseline. You must still be able to fall back to the old baseline for > older guests. I don't think games with configuration files can hide > that. -M pc1 -M pc2 etc. This is pretty easy to maintain with config files. Regards, Anthony Liguori -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 06/15/2009 03:41 PM, Michael S. Tsirkin wrote: > We should just tell the user which slots are open. > > > This might be tricky if the config is passed in with the command line > flags. > qemu -show-available-pci-slots (the qemu equivalent to KVM_CHECK_EXTENSION)
Markus Armbruster wrote: > Avi Kivity <avi@redhat.com> writes: > > >> Paul/Anthony, can we have -vga pci_addr=, -usb-controller pci_addr=, >> and -drive pci_addr= (and later, -disk-controller)? Stalling while >> waiting for the ultimate config file is only generating pain and >> out-of-tree patches. >> > > Yup. > > I got bit-rotten patches for pci_addr=, and I can unrot them if they're > wanted. > Yes, would be good to have patches on the list to discuss. In principle, I have no objection to this. Regards, Anthony Liguori -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Avi Kivity wrote: > On 06/15/2009 03:41 PM, Michael S. Tsirkin wrote: >> We should just tell the user which slots are open. >> >> This might be tricky if the config is passed in with the command line >> flags. >> > > qemu -show-available-pci-slots Why does the user care? Let QEMU allocate the PCI slot, then query it to see what slot it assigned and remember that. It's not a good idea to have management applications attempt to do PCI slot allocation. For instance, one day we may decide to make virtio devices multi-function. Regards, Anthony Liguori -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 06/15/2009 03:41 PM, Anthony Liguori wrote: >>> Yep, most people seem to agree that it makes sense to allow this, but >>> some believe it should only be via a machine description file, not the >>> command line. >> >> I don't understand this opposition. It's clear a machine config file >> is a long way in our future. It's also clear lack of stable PCI >> addresses hurts us now. > > > Is there opposition? I don't ever recall seeing a patch... Izik Eidus posted a patch (using a different syntax) in November 2007. > > I think it's a perfectly fine idea. Good.
On 06/15/2009 03:45 PM, Anthony Liguori wrote: >>> This last option makes sense to me: in a real world the user has >>> control over where he places the device on the bus, so why >>> not with qemu? >> >> Yes, the user build the machine using the command line and monitor >> (or, in 2017, the machine configuration file), > > > Considering pbrook just posted a machine config for arm, I think it > would be rather sad if pc wasn't converted to it by 2017... I'd be sad too, but not surprised. >> then turns on the power. Command line options are the parts lying >> around when we start. >> >> btw, -drive needs to be separated: >> >> -controller type=lsi1234,pci_addr=foobar,name=blah >> -drive file=foo.img,controller=blah,index=0 >> -drive file=bar.img,controller=blah,index=1 >> >> Drives to not have pci addresses. > > Drivers don't have indexes and buses but we specify it on the -drive > line. Drives do have indexes. On old parallel scsi drives you set the index by clicking a button on the back of the drive to cycle through scsi addresses 0-7. An IDE drive's index is determined by the cable (master/slave). A SATA drive's index is determined by which header on the motherboard the drive connects to. If by bus you mean the if= parameter, then drives certainly do have buses. Just try connecting the scsi drive from the previous paragraph to a USB port. > -drive is convenient syntax. It stops being convenient when you force > it to be two options. controller= defaults to some builtin thing which autoinstantiates when necessary, so the old -drive syntax works.
Anthony Liguori <anthony@codemonkey.ws> writes: > Avi Kivity wrote: >> On 06/15/2009 12:08 PM, Mark McLoughlin wrote: >>>> This last option makes sense to me: in a real world the user has >>>> control over where he places the device on the bus, so why >>>> not with qemu? >>>> >>> >>> Yep, most people seem to agree that it makes sense to allow this, but >>> some believe it should only be via a machine description file, not the >>> command line. >>> >> >> I don't understand this opposition. It's clear a machine config >> file is a long way in our future. It's also clear lack of stable >> PCI addresses hurts us now. > > Is there opposition? I don't ever recall seeing a patch... http://www.archivum.info/qemu-devel@nongnu.org/2009-01/msg01458.html > I think it's a perfectly fine idea. Off to dust off my patch series. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 06/15/2009 03:52 PM, Anthony Liguori wrote: > Avi Kivity wrote: >> On 06/15/2009 03:41 PM, Michael S. Tsirkin wrote: >>> We should just tell the user which slots are open. >>> This might be tricky if the config is passed in with the command line >>> flags. >> >> qemu -show-available-pci-slots > > Why does the user care? > > Let QEMU allocate the PCI slot, then query it to see what slot it > assigned and remember that. It's a roundabout way of doing things. As an example, if you try to fit too many devices into the machine, you have to try to add all devices and watch for a qemu error. If you know in advance how many slots you have, you never enter into that situation (and you need to show the limit to the user anyway). > > It's not a good idea to have management applications attempt to do PCI > slot allocation. For instance, one day we may decide to make virtio > devices multi-function. Non-virtio, as well. But we can't make that the default, so the user will have to specify this anyway. Given that you can't hotunplug individual functions, the user will have to specify exactly how functions are aggregated into devices. My recommendation would be for a GUI to allow the user to select a 'quad port virtio NIC' or 'dual port virtio scsi controller' rather than trying to do anything automatic.
On 06/15/2009 03:48 PM, Anthony Liguori wrote: >>>>> device tree lacking said attribute, distinguish between a device tree >>>>> from an old version of qemu (i.e. use the old default) and a partial >>>>> device tree from the VM manager (i.e. use the new default) ? >>>>> >>>> -baseline 0.10 >>> >>> That's a version number :-) >>> >>> (I was responding to Anthony's "you don't need a version number") >> >> If you want to prevent incompatibilities, you need to make everything >> new (potentially including bugfixes) non-default. Eventually the >> default configuration becomes increasingly unusable and you need a >> new baseline. You must still be able to fall back to the old >> baseline for older guests. I don't think games with configuration >> files can hide that. > How do you add a new attribute to the device tree and, when a supplied > > -M pc1 > -M pc2 Certainly preferable to -baseline. > This is pretty easy to maintain with config files. Let's not tie the two together.
Hi, >> Yes, the user build the machine using the command line and monitor >> (or, in 2017, the machine configuration file), > > Considering pbrook just posted a machine config for arm, I think it > would be rather sad if pc wasn't converted to it by 2017... It shouldn't last until 2017, but the process isn't that trivial. Some qemu code / control flow needs serious restructuring until it is in a state that creating the devices from a fdt can actually work. > Drivers don't have indexes and buses but we specify it on the -drive > line. -drive is convenient syntax. It stops being convenient when you > force it to be two options. One more issue: -drive also mixes host and guest configuration. These must be separated too. cheers, Gerd -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Avi Kivity wrote: > On 06/15/2009 03:45 PM, Anthony Liguori wrote: >>>> This last option makes sense to me: in a real world the user has >>>> control over where he places the device on the bus, so why >>>> not with qemu? >>> >>> Yes, the user build the machine using the command line and monitor >>> (or, in 2017, the machine configuration file), >> >> >> Considering pbrook just posted a machine config for arm, I think it >> would be rather sad if pc wasn't converted to it by 2017... > > I'd be sad too, but not surprised. > >>> then turns on the power. Command line options are the parts lying >>> around when we start. >>> >>> btw, -drive needs to be separated: >>> >>> -controller type=lsi1234,pci_addr=foobar,name=blah >>> -drive file=foo.img,controller=blah,index=0 >>> -drive file=bar.img,controller=blah,index=1 >>> >>> Drives to not have pci addresses. >> >> Drivers don't have indexes and buses but we specify it on the -drive >> line. > > Drives do have indexes. On old parallel scsi drives you set the index > by clicking a button on the back of the drive to cycle through scsi > addresses 0-7. An IDE drive's index is determined by the cable > (master/slave). A SATA drive's index is determined by which header on > the motherboard the drive connects to. It's not at all that simple. SCSI has a hierarchical address mechanism with 0-7 targets but then potentially multiple LUNs per target. Today, we always emulate a single LUN per target but if we ever wanted to support more than 7 disks on a SCSI controller, we would have to add multiple LUN support too. So the current linear unit= parameter is actually pretty broken for SCSI. For IDE, it's a combination of bus, slot, and master/slave. For virtio, it's just a PCI address. What we really need is something that is more opaque and controller specific. For instance, if we were going to do controllers... 
-controller type=lsi1234,pci_addr=foobar,name=blah -controller-disk controller=blah,target=0,lun=1,name=sda -controller type=ide,pci_addr=barfoo,name=ide -controller-disk controller=ide,slot=secondary,cable=slave,name=hdd -drive file=foo.img,controller-disk=sda -drive file=bar.img,controller-disk=hdd And having "-hdd file=foo.img" be short-hand for "-drive file=%s,controller-disk=%s". > > > If by bus you mean the if= parameter, then drives certainly do have > buses. Just try connecting the scsi drive from the previous paragraph > to a USB port. No, I meant drive file=foo.img,bus=3. If that doesn't seem obvious what it should do to you that's because it isn't at all obvious :-) It ends up skipping a predefined number of locations in the drive table. This is pretty broken fundamentally because it assumes controllers always support a fixed number of devices. Nothing really respects bus_id though so in practice, I assume it's almost universally broken. Regards, Anthony Liguori -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Avi Kivity wrote: > On 06/15/2009 03:52 PM, Anthony Liguori wrote: >> Avi Kivity wrote: >>> On 06/15/2009 03:41 PM, Michael S. Tsirkin wrote: >>>> We should just tell the user which slots are open. >>>> This might be tricky if the config is passed in with the command >>>> line >>>> flags. >>> >>> qemu -show-available-pci-slots >> >> Why does the user care? >> >> Let QEMU allocate the PCI slot, then query it to see what slot it >> assigned and remember that. > > It's a roundabout way of doing things. Having libvirt do PCI slot allocation scares me. It assumes we can return a whitelist of available slots, and then let libvirt just randomly assign things. There's knowledge though in slot assignment that's board-specific. For instance, depending on how many LNK lines you have, you may want to put things in slots in such a way to optimize interrupt balancing or something like that. Some platforms have quirks about expecting a particular slot to have a particular device. It's still an optional device but it has to be in that slot. You can't really express that via an available slot list. > Non-virtio, as well. But we can't make that the default, so the user > will have to specify this anyway. > > Given that you can't hotunplug individual functions, the user will > have to specify exactly how functions are aggregated into devices. My > recommendation would be for a GUI to allow the user to select a 'quad > port virtio NIC' or 'dual port virtio scsi controller' rather than > trying to do anything automatic. Yeah, I haven't thought much about that. Regards, Anthony Liguori -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Avi Kivity wrote: > > Certainly preferable to -baseline. > >> This is pretty easy to maintain with config files. > > Let's not tie the two together. I mentioned it because it suggests a good transition. We at least have to think through how things map to the post-config file world regardless of whether that's a few months from now or a decade :-) Regards, Anthony Liguori -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 06/15/2009 04:20 PM, Anthony Liguori wrote: >> >>>> then turns on the power. Command line options are the parts lying >>>> around when we start. >>>> >>>> btw, -drive needs to be separated: >>>> >>>> -controller type=lsi1234,pci_addr=foobar,name=blah >>>> -drive file=foo.img,controller=blah,index=0 >>>> -drive file=bar.img,controller=blah,index=1 >>>> >>>> Drives to not have pci addresses. >>> >>> Drivers don't have indexes and buses but we specify it on the -drive >>> line. >> >> Drives do have indexes. On old parallel scsi drives you set the >> index by clicking a button on the back of the drive to cycle through >> scsi addresses 0-7. An IDE drive's index is determined by the cable >> (master/slave). A SATA drive's index is determined by which header >> on the motherboard the drive connects to. > > > It's not at all that simple. SCSI has a hierarchical address > mechanism with 0-7 targets but then potentially multiple LUNs per > target. Today, we always emulate a single LUN per target but if we > ever wanted to support more than 7 disks on a SCSI controller, we > would have to add multiple LUN support too. So the current linear > unit= parameter is actually pretty broken for SCSI. Well, another level in the hierarchy, but I don't think it materially changes things. > > For IDE, it's a combination of bus, slot, and master/slave. For > virtio, it's just a PCI address. What we really need is something > that is more opaque and controller specific. virtio also has a bus (did you mean the pci bus for IDE?), master/slave is the index. virtio doesn't have index, but IMO that was a mistake and we should have designed it as a disk controller in the first place. > For instance, if we were going to do controllers... 
> > -controller type=lsi1234,pci_addr=foobar,name=blah > -controller-disk controller=blah,target=0,lun=1,name=sda > > -controller type=ide,pci_addr=barfoo,name=ide > -controller-disk controller=ide,slot=secondary,cable=slave,name=hdd > > -drive file=foo.img,controller-disk=sda > -drive file=bar.img,controller-disk=hdd > > And having "-hdd file=foo.img" be short-hand for "-drive > file=%s,controller-disk=%s". Yeah. >> >> >> If by bus you mean the if= parameter, then drives certainly do have >> buses. Just try connecting the scsi drive from the previous >> paragraph to a USB port. > > No, I meant drive file=foo.img,bus=3. If that doesn't seem obvious > what it should do to you that's because it isn't at all obvious :-) > It ends up skipping a predefined number of locations in the drive > table. This is pretty broken fundamentally because it assumes > controllers always support a fixed number of devices. Nothing really > respects bus_id though so in practice, I assume it's almost > universally broken. Isn't the drive table something totally internal? And how does bus= relate to it? Confused.
On 06/15/2009 04:23 PM, Anthony Liguori wrote: > Avi Kivity wrote: >> On 06/15/2009 03:52 PM, Anthony Liguori wrote: >>> Avi Kivity wrote: >>>> On 06/15/2009 03:41 PM, Michael S. Tsirkin wrote: >>>>> We should just tell the user which slots are open. >>>>> This might be tricky if the config is passed in with the command >>>>> line >>>>> flags. >>>> >>>> qemu -show-available-pci-slots >>> >>> Why does the user care? >>> >>> Let QEMU allocate the PCI slot, then query it to see what slot it >>> assigned and remember that. >> >> It's a roundabout way of doing things. > > Having libvirt do PCI slot allocation scares me. It assumes we can > return a whitelist of available slots, and then let libvirt just > randomly assign things. There's knowledge though in slot assignment > that's board-specific. For instance, depending on how many LNK lines > you have, you may want to put things in slots in such a way to > optimize interrupt balancing or something like that. How would qemu know which slots to optimize for? In practice, I don't see that as a real problem. We should (a) add an ioapic and four more pci links (b) recommend that slots be assigned in ascending order, and everything works. I don't see your concern about libvirt allocating slots. If a human can plug a card into a slot, so can libvirt. Doing an interactive back-and-forth (equivalent to plugging a card while blindfolded, then looking to see which slot we hit) is certainly more difficult. > Some platforms have quirks about expecting a particular slot to have a > particular device. It's still an optimal device but it has to be in > that slot. You can't really express that via an available slot list. I'll be surprised if we ever measure different dma speeds on different slots in the qemu virtual pci bus. If we do, we'll find a way to express them: $ qemu -print-pci slot 0:01: available 33MHz slot 0:02: available 33MHz slot 0:03: available 66MHz I feel a little silly typing this.
On 06/15/2009 04:24 PM, Anthony Liguori wrote: > Avi Kivity wrote: >> >> Certainly preferable to -baseline. >> >>> This is pretty easy to maintain with config files. >> >> Let's not tie the two together. > > I mentioned it because it suggests a good transition. We at least > have to think through how things map to the post-config file world > regardless of whether that's a few months from now or a decade :-) Sure, it's good both from the transitional point of view and in its own right.
Avi Kivity wrote: > On 06/15/2009 04:20 PM, Anthony Liguori wrote: >> It's not at all that simple. SCSI has a hierarchical address >> mechanism with 0-7 targets but then potentially multiple LUNs per >> target. Today, we always emulate a single LUN per target but if we >> ever wanted to support more than 7 disks on a SCSI controller, we >> would have to add multiple LUN support too. So the current linear >> unit= parameter is actually pretty broken for SCSI. > > Well, another level in the hierarchy, but I don't think it materially > changes things. Depends on whether you expect to say index=0,lun=3 or index=3. If you mean the later, then it's quite conceivable that each target supports less than the maximum number of LUNs. This makes things pretty confusing to the user because they have to know that in the current implementation, index=0 is valid, index=1 isn't, but index=8 is. >> No, I meant drive file=foo.img,bus=3. If that doesn't seem obvious >> what it should do to you that's because it isn't at all obvious :-) >> It ends up skipping a predefined number of locations in the drive >> table. This is pretty broken fundamentally because it assumes >> controllers always support a fixed number of devices. Nothing really >> respects bus_id though so in practice, I assume it's almost >> universally broken. > > Isn't the drive table something totally internal? And how does bus= > relate to it? The reality of unit=X,bus=Y,if=Z is that they expand to: drive_table_index=Y*max_devs[Z] + X Whereas max_devs = {"ide":4, "scsi": 7, *:0} How drive_table_index is interpreted is "if" specific. For if=scsi, each lsi device gets a base drive table index that starts at bus_index * 7. For virtio, the first empty spot in drive_table results in no more drives being created. It's broken by design. 
Regards, Anthony Liguori -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Avi Kivity wrote: > On 06/15/2009 04:23 PM, Anthony Liguori wrote: > > How would qemu know which slots to optimize for? > > In practice, I don't see that as a real problem. We should (a) add an > ioapic and four more pci links (b) recommend that slots be assigned in > ascending order, and everything works. > > I don't see your concern about libvirt allocating slots. If a human > can plug a card into a slot, so can libvirt. Doing an interactive > back-and-forth (equivalent to plugging a card while blindfolded, then > looking to see which slot we hit) is certainly more difficult. Let's take a concrete example because I think you missed my point. For the r2d board, if you have 1 on-board NIC, it has to go in slot 2. Additional NICs can go in any slot, but the primary on-board NIC is expected to live in slot 2. It's possible to not have that on-board NIC. If you let QEMU allocate which PCI slot a device goes in, we can hide this detail from libvirt. If you have libvirt do PCI slot allocation by default, it has to know about this restriction in the r2d board unless you have a clever way to express this sort of information. Once QEMU has allocated a device to a slot, libvirt can do a good job maintaining that relationship. Regards, Anthony Liguori -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 06/15/2009 04:45 PM, Anthony Liguori wrote: > Avi Kivity wrote: >> On 06/15/2009 04:20 PM, Anthony Liguori wrote: >>> It's not at all that simple. SCSI has a hierarchical address >>> mechanism with 0-7 targets but then potentially multiple LUNs per >>> target. Today, we always emulate a single LUN per target but if we >>> ever wanted to support more than 7 disks on a SCSI controller, we >>> would have to add multiple LUN support too. So the current linear >>> unit= parameter is actually pretty broken for SCSI. >> >> Well, another level in the hierarchy, but I don't think it materially >> changes things. > > Depends on whether you expect to say index=0,lun=3 or index=3. If you > mean the later, then it's quite conceivable that each target supports > less than the maximum number of LUNs. This makes things pretty > confusing to the user because they have to know that in the current > implementation, index=0 is valid, index=1 isn't, but index=8 is. I'd object to any implicit addressing rules. If we have to say target=2,lun=7,street=8,city=9,state=99,zip=12345 instead of index=8345345235 so be it. >>> No, I meant drive file=foo.img,bus=3. If that doesn't seem obvious >>> what it should do to you that's because it isn't at all obvious :-) >>> It ends up skipping a predefined number of locations in the drive >>> table. This is pretty broken fundamentally because it assumes >>> controllers always support a fixed number of devices. Nothing >>> really respects bus_id though so in practice, I assume it's almost >>> universally broken. >> >> Isn't the drive table something totally internal? And how does bus= >> relate to it? > > The reality of unit=X,bus=Y,if=Z is that they expand to: > > drive_table_index=Y*max_devs[Z] + X > > Whereas max_devs = {"ide":4, "scsi": 7, *:0} > > How drive_table_index is interpreted is "if" specific. For if=scsi, > each lsi device gets a base drive table index that starts at bus_index > * 7. 
For virtio, the first empty spot in drive_table results in no > more drives being created. > > It's broken by design. Agreed. Pity that it's exposed to the poor users.
On Mon, 2009-06-15 at 07:48 -0500, Anthony Liguori wrote: > Avi Kivity wrote: > > On 06/15/2009 12:09 PM, Mark McLoughlin wrote: > >>>>> I think the point is that you don't need version numbers if you > >>>>> have a > >>>>> proper device tree. > >>>>> > >>>>> > >>>> How do you add a new attribute to the device tree and, when a supplied > >>>> device tree lacking said attribute, distinguish between a device tree > >>>> from an old version of qemu (i.e. use the old default) and a partial > >>>> device tree from the VM manager (i.e. use the new default) ? > >>>> > >>>> > >>> -baseline 0.10 > >>> > >> > >> That's a version number :-) > >> > >> (I was responding to Anthony's "you don't need a version number") > >> > > > > If you want to prevent incompatibilities, you need to make everything > > new (potentially including bugfixes) non-default. No need to punish new guests in order to maintain compatibility for old guests. > > Eventually the > > default configuration becomes increasingly unusable and you need a new > > baseline. You must still be able to fall back to the old baseline for > > older guests. I don't think games with configuration files can hide > > that. > > -M pc1 > -M pc2 > > etc. > > This is pretty easy to maintain with config files. I think this would be reasonable, but it is essentially just a version number which you objected to on the basis that it would make cherry-picking harder for distros. One thing that would be nice with this '-M pc1' thing would be to retain '-M pc' as a symlink to the latest version. We'd also need a way to read the symlink too, so that you can query what the current latest version is and use that in future. How would this machine type version relate to e.g. changing the default PCI class of virtio-blk? Would we bump the version number of all machine types that can use virtio-blk? A per-device version number is a workable alternative, but only with a saveabi type file IMHO. 
I've tried to summarise the options here: https://fedoraproject.org/wiki/Features/KVM_Stable_Guest_ABI Cheers, Mark. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Anthony Liguori wrote: > Avi Kivity wrote: >> On 06/15/2009 04:23 PM, Anthony Liguori wrote: >> >> How would qemu know which slots to optimize for? >> >> In practice, I don't see that as a real problem. We should (a) add >> an ioapic and four more pci links (b) recommend that slots be >> assigned in ascending order, and everything works. >> >> I don't see your concern about libvirt allocating slots. If a human >> can plug a card into a slot, so can libvirt. Doing an interactive >> back-and-forth (equivalent to plugging a card while blindfolded, then >> looking to see which slot we hit) is certainly more difficult. > > Let's take a concrete example because I think you missed my point. > For the r2d board, if you have 1 on-board NIC, it has to go in slot > 2. Additional NICs can go in any slot, but the primary on-board NIC > is expected to live in slot 2. It's possible to not have that > on-board NIC. Libvirt does not support r2d. I hope it won't start to support it. We can have default values for these types of devices or something like pci_addr=auto. > > If you let QEMU allocate which PCI slot a device goes in, we can hide > this detail from libvirt. If you have libvirt do PCI slot allocation > by default, it has to know about this restriction in the r2d board > unless you have a clever way to express this sort of information. > > Once QEMU has allocated a device to a slot, libvirt can do a good job > maintaining that relationship. > The end user should have a mechanism to control device slot positioning. For example, if you have several pci devices, some get high rate of interrupts and some not, if you want to optimize your guest you should isolate the high rate 'interesting' devices. This is something the user will need to do. I agree that the default behavior might be 'auto'. Also, while moving from one qemu version to another, you'll need to represent the older behavior. 
-qemu-0.10 is not good enough since there will be multiple versions in the future with multiple distributions setting their defaults. > Regards, > > Anthony Liguori > > -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Mark McLoughlin wrote: > On Mon, 2009-06-15 at 07:48 -0500, Anthony Liguori wrote: > >>> Eventually the >>> default configuration becomes increasingly unusable and you need a new >>> baseline. You must still be able to fall back to the old baseline for >>> older guests. I don't think games with configuration files can hide >>> that. >>> >> -M pc1 >> -M pc2 >> >> etc. >> >> This is pretty easy to maintain with config files. >> > > I think this would be reasonable, but it is essentially just a version > number which you objected to on the basis that it would make > cherry-picking harder for distros. > It doesn't have to be pc1, pc2. It could be pc-with-usb or pc-with-balloon. If a distro cherry picks in such a way that their pc is not a standard QEMU pc, they would add a new PC type that's specific to their distro. > One thing that would be nice with this '-M pc1' thing would be to retain > '-M pc' as a symlink to the latest version. We'd also need a way to read > the symlink too, so that you can query what the current latest version > is and use that in future. > Another option is an explicit -M default which always uses the default machine for the architecture. Likewise, we would need a way to query what the default machine was for an architecture. > How would this machine type version relate to e.g. changing the default > PCI class of virtio-blk? Would we bump the version number of all machine > types can use virtio-blk? > You would introduce a new machine type. For instance, pc-virtio-class-other. The names don't have to look like that, I'm just doing that to make a point. This may mean that you end up with dozens of machine types but you preserve compatibility, which is a good thing. Of course, the flip side is that you make preserving the machine config the duty of the user and we don't maintain compatible machine types. This won't work without a proper config file though so for now, we're stuck maintaining machine types. 
Regards, Anthony Liguori -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, Jun 15, 2009 at 6:43 AM, Avi Kivity<avi@redhat.com> wrote: > (I'd be quite happy constructing the entire machine config on the command > line, but I realize it's just me) as a user-only (well, i'm a developer, but don't meddle in kernel affairs since 0.99pl9); I also like that kvm is totally CLI-managed. but migration-wise, i think it could be nicer if the 'origin' process could send the config to the 'target' one. IOW: the -incoming flag shouldn't need any other parameter, and the 'migrate' command should send the whole hardware description before the CPU state, and fail with a 'can't comply' message if the target complains. of course, that's a simplification. for example, the 'target' process should be able to respect some parameters, mostly the 'external' descriptions, like storage pathnames, or '-net tap' ones.
Dor Laor wrote: > Libvirt does not support r2d. I hope it won't start to support it. It supports mips, sparc, and ppc machines now. I don't see why it wouldn't support r2d. For ppcemb, I expect this same problem to occur. This sort of restriction is going to be common with embedded boards. > We can have default values for these types of devices or something > like pci_addr=auto. Why wouldn't libvirt always use pci_addr=auto? If the only argument for having libvirt do pci slot allocation is error messages, can't we find a nice way to allow libvirt to create friendly error messages when QEMU fails? >> If you let QEMU allocate which PCI slot a device goes in, we can hide >> this detail from libvirt. If you have libvirt do PCI slot allocation >> by default, it has to know about this restriction in the r2d board >> unless you have a clever way to express this sort of information. >> >> Once QEMU has allocated a device to a slot, libvirt can do a good job >> maintaining that relationship. >> > > The end user should have a mechanism to control device slot > positioning. For example, if you have several pci devices, some > get high rate of interrupts and some not, if you want to optimize you > guest you should isolate the high rate 'interesting' devices. > This is something the user will need to do. I agree that the default > behavior might be 'auto' I'm not at all arguing against pci_addr. I'm arguing about how libvirt should use it with respect to the "genesis" use-case where libvirt has no specific reason to choose one PCI slot over another. In that case, I'm merely advocating that we want to let QEMU make the decision. Regards, Anthony Liguori -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, Jun 15, 2009 at 09:20:00AM -0500, Anthony Liguori wrote: > Mark McLoughlin wrote: >> On Mon, 2009-06-15 at 07:48 -0500, Anthony Liguori wrote: >> >>>> Eventually the default configuration becomes increasingly unusable >>>> and you need a new baseline. You must still be able to fall back >>>> to the old baseline for older guests. I don't think games with >>>> configuration files can hide that. >>>> >>> -M pc1 >>> -M pc2 >>> >>> etc. >>> >>> This is pretty easy to maintain with config files. >>> >> >> I think this would be reasonable, but it is essentially just a version >> number which you objected to on the basis that it would make >> cherry-picking harder for distros. >> > > It doesn't have to be pc1, pc2. It could be pc-with-usb or > pc-with-balloon. If a distro cherry picks in such a way that their pc > is not a standard QEMU pc, they would add a new PC type that's specific > to their distro. > >> One thing that would be nice with this '-M pc1' thing would be to retain >> '-M pc' as a symlink to the latest version. We'd also need a way to read >> the symlink too, so that you can query what the current latest version >> is and use that in future. >> > > Another option is an explicit -M default which always uses the default > machine for the architecture. Likewise, we would need a way to query > what the default machine was for an architecture. > >> How would this machine type version relate to e.g. changing the default >> PCI class of virtio-blk? Would we bump the version number of all machine >> types can use virtio-blk? >> > You would introduce a new machine type. For instance, > pc-virtio-class-other. The names don't have to look like that, I'm just > doing that to make a point. This may mean that you end up with dozens > of machine types but you preserve compatibility, which is a good thing. And then pc-virtio-class-other-with-balloon-without-usb? 
Wouldn't it be more straightforward to have capability bits which can be switched on and off independently rather than trying to fit unrelated features into a machine type? IMO it only seems more work at first, and QA gets a bit nervous that they can't exhaustively test all options. But in the long run it simplifies things as you don't have to set policy and invent silly names. > Of course, the flip side is that you make preserving the machine config > the duty of the user and we don't maintain compatible machine types. > This won't work without a proper config file though so for now, we're > stuck maintaining machine types. > > Regards, > > Anthony Liguori -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, Jun 15, 2009 at 09:24:32AM -0500, Anthony Liguori wrote: > Dor Laor wrote: >> Libvirt does not support r2d. I hope it won't start to support it. > > It supports mips, sparc, and ppc machines now. I don't see why it > wouldn't support r2d. For ppcemb, I expect this same problem to occur. > This sort of restriction is going to be common with embedded boards. > >> We can have default values for these types of devices or something >> like pci_addr=auto. > > Why wouldn't libvirt always use pci_addr=auto? If the only argument for > having libvirt do pci slot allocation is error messages, can't we find a > nice way to allow libvirt to create friendly error messages when QEMU > fails? > >>> If you let QEMU allocate which PCI slot a device goes in, we can hide >>> this detail from libvirt. If you have libvirt do PCI slot allocation >>> by default, it has to know about this restriction in the r2d board >>> unless you have a clever way to express this sort of information. >>> >>> Once QEMU has allocated a device to a slot, libvirt can do a good job >>> maintaining that relationship. >>> >> >> The end user should have a mechanism to control device slot >> positioning. For example, if you have several pci devices, some >> get high rate of interrupts and some not, if you want to optimize you >> guest you should isolate the high rate 'interesting' devices. >> This is something the user will need to do. I agree that the default >> behavior might be 'auto' > > I'm not at all arguing against pci_addr. I'm arguing about how libvirt > should use it with respect to the "genesis" use-case where libvirt has > no specific reason to choose one PCI slot over another. In that case, > I'm merely advocating that we want to let QEMU make the decision. The allocation code could be moved out into a library, and libvirt could link with it (ducks). 
> Regards, > > Anthony Liguori -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Michael S. Tsirkin wrote: >> I'm not at all arguing against pci_addr. I'm arguing about how libvirt >> should use it with respect to the "genesis" use-case where libvirt has >> no specific reason to choose one PCI slot over another. In that case, >> I'm merely advocating that we want to let QEMU make the decision. >> > > The allocation code could be moved out into a library, and libvirt could > link with it (ducks). > Why does libvirt want to do allocation? Regards, Anthony Liguori -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 06/15/2009 05:24 PM, Anthony Liguori wrote: > Dor Laor wrote: >> Libvirt does not support r2d. I hope it won't start to support it. > > It supports mips, sparc, and ppc machines now. I don't see why it > wouldn't support r2d. For ppcemb, I expect this same problem to > occur. This sort of restriction is going to be common with embedded > boards. I expect these restrictions will have to be known by the management application. Otherwise the users will try invalid configurations only to receive errors when they launch them. GUIs exist to guide users, not as an inefficient means of trial-and-error. > >> We can have default values for these types of devices or something >> like pci_addr=auto. > > Why wouldn't libvirt always use pci_addr=auto? If the only argument > for having libvirt do pci slot allocation is error messages, can't we > find a nice way to allow libvirt to create friendly error messages > when QEMU fails? Error messages are not the only argument for pushing slot allocation to management. See my previous messages on the topic. >>> If you let QEMU allocate which PCI slot a device goes in, we can >>> hide this detail from libvirt. If you have libvirt do PCI slot >>> allocation by default, it has to know about this restriction in the >>> r2d board unless you have a clever way to express this sort of >>> information. >>> >>> Once QEMU has allocated a device to a slot, libvirt can do a good >>> job maintaining that relationship. >>> >> >> The end user should have a mechanism to control device slot >> positioning. For example, if you have several pci devices, some >> get high rate of interrupts and some not, if you want to optimize you >> guest you should isolate the high rate 'interesting' devices. >> This is something the user will need to do. I agree that the default >> behavior might be 'auto' > > I'm not at all arguing against pci_addr. 
I'm arguing about how > libvirt should use it with respect to the "genesis" use-case where > libvirt has no specific reason to choose one PCI slot over another. > In that case, I'm merely advocating that we want to let QEMU make the > decision. However this may end up, isn't it offtopic? Whatever we do we have to support both pci_addr= and default placement, so we can push this discussion to livirt-devel and bid them godspeed.
Avi Kivity wrote: > > I'd object to any implicit addressing rules. If we have to say > target=2,lun=7,street=8,city=9,state=99,zip=12345 instead of > index=8345345235 so be it. The next observation is that while we expand the SCSI addressing, the current propose flattens the PCI hierarchy (i.e. pci_addr=00:01.0). An alternative would be to either always expand or always flatten addressing. I think the later has a lot of merit. Consider: -controller type=lsi1234,addr=00:01,name=blah -controller-disk controller=blah,addr=00:01,name=sda -controller type=ide,addr=00.02,name=ide -controller-disk controller=ide,addr=3,name=hdd -drive file=foo.img,controller-disk=sda -drive file=bar.img,controller-disk=hdd This means that addr's format depends on the parent device node which is a bit less explicit than the previous example. However, it is much more consistent and easier to implement. Basically, when adding a device to it's parent, you hand the parent the "addr" field and that lets you say where you want to sit on the bus. Regards, Anthony Liguori -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, Jun 15, 2009 at 10:03:22AM -0500, Anthony Liguori wrote: > Michael S. Tsirkin wrote: > >>I'm not at all arguing against pci_addr. I'm arguing about how libvirt > >>should use it with respect to the "genesis" use-case where libvirt has > >>no specific reason to choose one PCI slot over another. In that case, > >>I'm merely advocating that we want to let QEMU make the decision. > >> > > > >The allocation code could be moved out into a library, and libvirt could > >link with it (ducks). > > > > Why does libvirt want to do allocation? It doesn't want to. As Mark said, libvirt just wants to be able to ensure a stable guest ABI, of which stable PCI addresses is one aspect. This does not imply libvirt wants to allocate the PCI addresses, just that it wants a way to keep them stable. All else being equal I'd rather libvirt wasn't in the PCI address allocation business. Regards, Daniel
Michael S. Tsirkin wrote: > And then pc-virtio-class-other-with-balloon-without-usb? Wouldn't it be > more straightforward to have capability bits which can be switched on > and off independently rather than trying to fit unrelated features into > a machine type? IMO it only seems more work at first, and QA gets a bit > nervous that they can't exhaustively test all options. But in the long > run it simplifies things as you don't have to set policy and invent > silly names. > We're strictly talking about default machine configs. That has nothing to do with capabilities. You still need to know what the default set of enabled capabilities was and keep track of that. All that I'm suggesting is that we use the machine name to collapse the default set of capabilities into something that libvirt can track. The advantage of using something more opaque like that is that it simplifies things for management tools as they don't have to keep track of "capabilities" that we're adding. Heck, you could even do: pc-00000034 Where "pc-%08x" % (capabilities) :-) Regards, Anthony Liguori -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 06/15/2009 06:07 PM, Anthony Liguori wrote: > Avi Kivity wrote: >> >> I'd object to any implicit addressing rules. If we have to say >> target=2,lun=7,street=8,city=9,state=99,zip=12345 instead of >> index=8345345235 so be it. > > The next observation is that while we expand the SCSI addressing, the > current propose flattens the PCI hierarchy (i.e. pci_addr=00:01.0). > > An alternative would be to either always expand or always flatten > addressing. I think the later has a lot of merit. Consider: > > -controller type=lsi1234,addr=00:01,name=blah > -controller-disk controller=blah,addr=00:01,name=sda > > -controller type=ide,addr=00.02,name=ide > -controller-disk controller=ide,addr=3,name=hdd > > -drive file=foo.img,controller-disk=sda > -drive file=bar.img,controller-disk=hdd > > This means that addr's format depends on the parent device node which > is a bit less explicit than the previous example. However, it is much > more consistent and easier to implement. Basically, when adding a > device to it's parent, you hand the parent the "addr" field and that > lets you say where you want to sit on the bus. I would prefer explicit names (pci_addr, lun, etc.) but would be okay with generic names too. There's value in sticking to well-understood names and address formats.
Avi Kivity wrote: > However this may end up, isn't it offtopic? Whatever we do we have to > support both pci_addr= and default placement, so we can push this > discussion to livirt-devel and bid them godspeed. I'm not sure how we got here but yeah, let's table this part of the discussion. Regards, Anthony Liguori -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Daniel P. Berrange wrote: > On Mon, Jun 15, 2009 at 10:03:22AM -0500, Anthony Liguori wrote: > >> Michael S. Tsirkin wrote: >> >>>> I'm not at all arguing against pci_addr. I'm arguing about how libvirt >>>> should use it with respect to the "genesis" use-case where libvirt has >>>> no specific reason to choose one PCI slot over another. In that case, >>>> I'm merely advocating that we want to let QEMU make the decision. >>>> >>>> >>> The allocation code could be moved out into a library, and libvirt could >>> link with it (ducks). >>> >>> >> Why does libvirt want to do allocation? >> > > It doesn't want to. As Mark said, libvirt just wants to be able to ensure > a stable guest ABI, of which stable PCI addresses is one aspect. This does > not imply libvirt wants to allocate the PCI addresses, just that it wants > a way to keep them stable. All else being equal I'd rather libvirt wasn't > in the PCI address allocation business. > It's not about what libvirt wants. It's about what will serve the end user the most. Apart for stable guest ABI, end users need to have the option to control the slot for their devices. Just like them have for physical machines. It's not theoretical discussion, limiting issues with shared irq is one real life example. Thanks, dor -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 06/15/2009 06:12 PM, Dor Laor wrote: >> It doesn't want to. As Mark said, libvirt just wants to be able to >> ensure >> a stable guest ABI, of which stable PCI addresses is one aspect. This >> does >> not imply libvirt wants to allocate the PCI addresses, just that it >> wants >> a way to keep them stable. All else being equal I'd rather libvirt >> wasn't >> in the PCI address allocation business. > > > It's not about what libvirt wants. It's about what will serve the end > user the most. > Apart for stable guest ABI, end users need to have the option to > control the slot for > their devices. Just like them have for physical machines. It's not > theoretical discussion, > limiting issues with shared irq is one real life example. > Another issue is enumeration. Guests will present their devices in the order they find them on the pci bus (of course enumeration is guest specific). So if I have 2 virtio controllers the only way I can distinguish between them is using their pci slots.
Avi Kivity wrote: > I would prefer explicit names (pci_addr, lun, etc.) but would be okay > with generic names too. I think having a generic address has a lot of value in terms of code implementation. Otherwise, the valid options for -drive become context-sensitive which is going to be annoying and error-prone. Some sanity could be added by using addressing prefixes like addr=pci:00:01.0 or addr=scsi:0.3 but I'll leave that up to whoever takes this on. Regards, Anthony Liguori -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 06/15/2009 06:20 PM, Anthony Liguori wrote: > Avi Kivity wrote: >> I would prefer explicit names (pci_addr, lun, etc.) but would be okay >> with generic names too. > > I think having a generic address has a lot of value in terms of code > implementation. Otherwise, the valid options for -drive become > context-sensitive which is going to be annoying and error-prone. Some > sanity could be added by using addressing prefixes like > addr=pci:00:01.0 or addr=scsi:0.3 but I'll leave that up to whoever > takes this on. The code problems are easily solved by adding another level of indirection. User confusion problems are only aggravated by additional abstraction, though ("what do I put in addr=, here?").
On Mon, 2009-06-15 at 18:05 +0300, Avi Kivity wrote: > On 06/15/2009 05:24 PM, Anthony Liguori wrote: > > Dor Laor wrote: > >> Libvirt does not support r2d. I hope it won't start to support it. > > > > It supports mips, sparc, and ppc machines now. I don't see why it > > wouldn't support r2d. For ppcemb, I expect this same problem to > > occur. This sort of restriction is going to be common with embedded > > boards. > > I expect these restrictions will have to be known by the management > application. Otherwise the users will try invalid configurations only > to receive errors when they launch them. GUIs exist to guide users, not > as an inefficient means of trial-and-error. So long as the restrictions would be known to the management app via some "what slots are available" mechanism in qemu, that sounds fine. > > I'm not at all arguing against pci_addr. I'm arguing about how > > libvirt should use it with respect to the "genesis" use-case where > > libvirt has no specific reason to choose one PCI slot over another. > > In that case, I'm merely advocating that we want to let QEMU make the > > decision. > > However this may end up, isn't it offtopic? Whatever we do we have to > support both pci_addr= and default placement, so we can push this > discussion to livirt-devel and bid them godspeed. Presumably you're not proposing that qemu-devel completely ignore the typical requirements of management apps? You can push the discussion to libvirt-devel, and the conclusion would most likely be: "We can do slot allocation if you provide us with a way to query free slots, or we can use qemu's default allocation if you provide us a way to query the allocation. We'd prefer the default allocation problem, but we don't really care. Both require about the same amount of work for us." libvirt was only mentioned in this thread as a concrete example of how the suggested solutions would actually be used by management apps. Cheers, Mark. 
-- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, 2009-06-15 at 18:12 +0300, Dor Laor wrote: > > It doesn't want to. As Mark said, libvirt just wants to be able to ensure > > a stable guest ABI, of which stable PCI addresses is one aspect. This does > > not imply libvirt wants to allocate the PCI addresses, just that it wants > > a way to keep them stable. All else being equal I'd rather libvirt wasn't > > in the PCI address allocation business. > > > > It's not about what libvirt wants. It's about what will serve the end > user the most. Absolutely. And not just about what most helps end users of libvirt based management apps, but also any app managing qemu. > Apart for stable guest ABI, end users need to have the option to > control the slot for their devices. Just like them have for physical > machines. It's not theoretical discussion, limiting issues with shared > irq is one real life example. Providing end users with the *option* to choose PCI slots sounds like a fine feature request for any management app. Requiring all management apps to force end users to explicitly choose PCI slots in order for slots to be stable is not so reasonable. Cheers, Mark. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Mark McLoughlin wrote: > So long as the restrictions would be known to the management app via > some "what slots are available" mechanism in qemu, that sounds fine. > I'm not sure a "what slots are available" mechanism is as straight forward as has been claimed. It doesn't matter though because it's orthogonal to the current proposal. >>> I'm not at all arguing against pci_addr. I'm arguing about how >>> libvirt should use it with respect to the "genesis" use-case where >>> libvirt has no specific reason to choose one PCI slot over another. >>> In that case, I'm merely advocating that we want to let QEMU make the >>> decision. >>> >> However this may end up, isn't it offtopic? Whatever we do we have to >> support both pci_addr= and default placement, so we can push this >> discussion to livirt-devel and bid them godspeed. >> > > Presumably you're not proposing that qemu-devel completely ignore the > typical requirements of management apps? > This is a happy case where the current proposals allow both usages to occur. Which one libvirt chooses is up to it. To summarize, I think we have: 1) Introduce addressing to all host device configurations - Either in the canonical form "pci_addr=bus:dev.fn or target=3,lun=1" or in flattened form "addr=bus:dev.fn or addr=target.lun". I prefer the latter form but I think either would be acceptable. 2) Whenever the default machine type changes in a guest-visible way, introduce a new machine type - Use explicit versions in name: pc-v1, pc-v2 or use more descriptive names pc-with-usb - Easily transitions to device config files Regards, Anthony Liguori -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 06/15/2009 09:12 PM, Anthony Liguori wrote: > > 2) Whenever the default machine type changes in a guest-visible way, > introduce a new machine type s/whenever/qemu stable release/ > - Use explicit versions in name: pc-v1, pc-v2 pc-qemu-0.10? This is similar to a hardware vendor's model number (though they tend to change components without changing model numbers, though naughty vendors) > or use more descriptive names pc-with-usb > - Easily transitions to device config files Combinatorial explosion. Just use -usb.
Avi Kivity wrote: > On 06/15/2009 09:12 PM, Anthony Liguori wrote: >> >> 2) Whenever the default machine type changes in a guest-visible way, >> introduce a new machine type > > s/whenever/qemu stable release/ > >> - Use explicit versions in name: pc-v1, pc-v2 > > pc-qemu-0.10? > > This is similar to a hardware vendor's model number (though they tend > to change components without changing model numbers, though naughty > vendors) Yup, that makes a whole lot of sense. Regards, Anthony Liguori -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 6/15/09, Avi Kivity <avi@redhat.com> wrote: > On 06/15/2009 09:12 PM, Anthony Liguori wrote: > > > > > 2) Whenever the default machine type changes in a guest-visible way, > introduce a new machine type > > > > s/whenever/qemu stable release/ > > > > - Use explicit versions in name: pc-v1, pc-v2 > > > > pc-qemu-0.10? pc-2009.06? Or given the hardware, should that be pc-1997? -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 06/15/2009 09:44 PM, Blue Swirl wrote: >> pc-qemu-0.10? >> > > pc-2009.06? Or given the hardware, should that be pc-1997? > pc-qemu-0.10 has the obvious benefit of allowing people to immediately know what's the oldest version of qemu that supports it.
On Mon, 2009-06-15 at 13:12 -0500, Anthony Liguori wrote: > Mark McLoughlin wrote: > > So long as the restrictions would be known to the management app via > > some "what slots are available" mechanism in qemu, that sounds fine. > > > > I'm not sure a "what slots are available" mechanism is as straight > forward as has been claimed. If qemu can't provide that information, then the management app does not have sufficient information to do the slot allocation itself. In which case, it must leave it up to qemu to do it. > It doesn't matter though because it's orthogonal to the current proposal. It is not orthogonal to solving the actual problem at hand, though - i.e. how to allow management apps to provide stable PCI addresses. > >>> I'm not at all arguing against pci_addr. I'm arguing about how > >>> libvirt should use it with respect to the "genesis" use-case where > >>> libvirt has no specific reason to choose one PCI slot over another. > >>> In that case, I'm merely advocating that we want to let QEMU make the > >>> decision. > >>> > >> However this may end up, isn't it offtopic? Whatever we do we have to > >> support both pci_addr= and default placement, so we can push this > >> discussion to livirt-devel and bid them godspeed. > >> > > > > Presumably you're not proposing that qemu-devel completely ignore the > > typical requirements of management apps? > > > > This is a happy case where the current proposals allow both usages to > occur. Which one libvirt chooses it up to it. > > To summarize, I think we have: > > 1) Introduce addressing to all host device configurations > - Either in the canonical form "pci_addr=bus:dev.fn or target=3,lun=1" > or in flattened form "addr=bus:dev.fn or addr=target.lun". I prefer the > later form but I think either would be acceptable. That helps, but it's not enough on its own. 
The management app needs to figure out what addresses to pass either by: a) Initially allowing qemu to do the address allocation, and thereafter using those addresses - this requires some way to query the addresses of devices or b) Doing the initial address allocation itself - this requires some way to query what slots are available. > 2) Whenever the default machine type changes in a guest-visible way, > introduce a new machine type > - Use explicit versions in name: pc-v1, pc-v2 or use more descriptive > names pc-with-usb > - Easily transitions to device config files To be clear - you're not proposing this is a solution to the "stable PCI addresses" problem, are you? The main requirement is for the addresses to stay stable even if the user adds/removes other devices. This is a fine solution to the "stable guest ABI" problem ... assuming there's some way of querying the current default machine type. Cheers, Mark. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 06/16/2009 03:14 PM, Mark McLoughlin wrote: > On Mon, 2009-06-15 at 13:12 -0500, Anthony Liguori wrote: > >> Mark McLoughlin wrote: >> >>> So long as the restrictions would be known to the management app via >>> some "what slots are available" mechanism in qemu, that sounds fine. >>> >>> >> I'm not sure a "what slots are available" mechanism is as straight >> forward as has been claimed. >> > > If qemu can't provide that information, then the management app does not > have sufficient information to do the slot allocation itself. In which > case, it must leave it up to qemu to do it. > A given -M machine will have well-known open slots (since it's an ABI), same as it has rtl8139 and ne2000 cards. Worst case we hardcode those numbers (gasp, faint). >> It doesn't matter though because it's orthogonal to the current proposal. >> > > It is not orthogonal to solving the actual problem at hand, though - > i.e. how to allow management apps to provide stable PCI addresses. > It's part of the solution, but hardly the most difficult part. > This is a fine solution to the "stable guest ABI" problem ... assuming > there's some way of querying the current default machine type. > $ qemu -print-default-machine or maybe $ qemu -show default-machine $ qemu -show pci-bus $ qemu -show me a way out
On Tue, 2009-06-16 at 15:28 +0300, Avi Kivity wrote: > On 06/16/2009 03:14 PM, Mark McLoughlin wrote: > > On Mon, 2009-06-15 at 13:12 -0500, Anthony Liguori wrote: > > > >> Mark McLoughlin wrote: > >> > >>> So long as the restrictions would be known to the management app via > >>> some "what slots are available" mechanism in qemu, that sounds fine. > >>> > >>> > >> I'm not sure a "what slots are available" mechanism is as straight > >> forward as has been claimed. > >> > > > > If qemu can't provide that information, then the management app does not > > have sufficient information to do the slot allocation itself. In which > > case, it must leave it up to qemu to do it. > > > > A given -M machine will have well-known open slots (since it's an ABI), > same as it has rtl8139 and ne2000 cards. If they're so obviously well-known, I don't see how the query mechanism would not be straightforward, which is the comment I was replying to. > Worst case we hardcode those numbers (gasp, faint). Maybe we can just add the open slots to the -help output. That'd be nice and clean. > >> It doesn't matter though because it's orthogonal to the current proposal. > >> > > > > It is not orthogonal to solving the actual problem at hand, though - > > i.e. how to allow management apps to provide stable PCI addresses. > > > > It's part of the solution, but hardly a difficult the most difficult part. Agree. > > This is a fine solution to the "stable guest ABI" problem ... assuming > > there's some way of querying the current default machine type. > > > > $ qemu -print-default-machine Or: $ readlink /usr/share/qemu/machine-types/pc.dt Cheers, Mark. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 06/16/2009 03:39 PM, Mark McLoughlin wrote: >> Worst case we hardcode those numbers (gasp, faint). >> > > Maybe we can just add the open slots to the -help output. That'd be nice > and clean. > Yeah, there's precedent too. > Or: > > $ readlink /usr/share/qemu/machine-types/pc.dt > > That works if you have exactly one qemu installed. It's best if qemu itself is the entry point (qemu -print-device-tree). Though I wouldn't want to inflict it upon the management application writers.
Avi Kivity wrote: > Another issue is enumeration. Guests will present their devices in the > order they find them on the pci bus (of course enumeration is guest > specific). So if I have 2 virtio controllers the only way I can > distinguish between them is using their pci slots. virtio controllers really should have a user-suppliable string or UUID to identify them to the guest. Don't they? -- Jamie -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Mark McLoughlin wrote: > > After libvirt has done -drive file=foo... it should dump the machine > > config and use that from then on. > > Right - libvirt then wouldn't be able to avoid the complexity of merging > any future changes into the dumped machine config. As long as qemu can accept a machine config _and_ -drive file=foo (and monitor commands to add/remove devices), libvirt could merge by simply calling qemu with whatever additional command line options or monitor commands modify the config, then dump the new config. That way, virtio would not have to deal with that complexity. It would be written in one place: qemu. Or better, a utility: qemu-machine-config. -- Jamie -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Mark McLoughlin wrote: > > Worst case we hardcode those numbers (gasp, faint). > > Maybe we can just add the open slots to the -help output. That'd be nice > and clean. Make them part of the machine configuration. After all, they are part of the machine configuration, and ACPI, BIOS etc. need to know about all the machine slots anyway. Having said that, I prefer the idea that slot allocation is handled either in Qemu, or in a separate utility called qemu-machine-config (for working with machine configs), or in a library libqemu-machine-config.so. I particularly don't like the idea of arcane machine-dependent slot allocation knowledge living in libvirt, because it needs to be in Qemu anyway for non-libvirt users. No point in having two implementations of something tricky and likely to have machine quirks, if one will do. -- Jamie -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 06/16/2009 09:32 PM, Jamie Lokier wrote: > Avi Kivity wrote: > >> Another issue is enumeration. Guests will present their devices in the >> order they find them on the pci bus (of course enumeration is guest >> specific). So if I have 2 virtio controllers the only way I can >> distinguish between them is using their pci slots. >> > > virtio controllers really should have a user-suppliable string or UUID > to identify them to the guest. Don't they? > virtio controllers don't exist. When they do, they may have a UUID or not, but in either case guest infrastructure is in place for reporting the PCI slot, not the UUID. virtio disks do have a UUID. I don't think older versions of Windows will use it though, so if you reorder your slots you'll see your drive letters change. Same with Linux if you don't use udev by-uuid rules.
On Tue, 2009-06-16 at 19:44 +0100, Jamie Lokier wrote: > Mark McLoughlin wrote: > > > Worst case we hardcode those numbers (gasp, faint). > > > > Maybe we can just add the open slots to the -help output. That'd be nice > > and clean. I was being sarcastic - libvirt currently must parse qemu -help, and even has some test infrastructure to check that it works with various versions of qemu. Extending this would not be nice and clean :-) > I particularly don't like the idea of arcane machine-dependent slot > allocation knowledge living in libvirt, because it needs to be in Qemu > anyway for non-libvirt users. No point in having two implementations > of something tricky and likely to have machine quirks, if one will do. Indeed. Cheers, Mark. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 06/17/2009 11:33 AM, Mark McLoughlin wrote: >> I particularly don't like the idea of arcane machine-dependent slot >> allocation knowledge living in libvirt, because it needs to be in Qemu >> anyway for non-libvirt users. No point in having two implementations >> of something tricky and likely to have machine quirks, if one will do. >> > > Indeed. > I don't understand this. Management already has to allocate MAC addresses, UUIDs, IDE interface and master/slave role, SCSI LUNs/targets/whatever. It has to understand NUMA (if not do actual allocation). Even if it doesn't allocate the slots, it has to be able to query them so it can tell the user which NIC or controller is connected where, or to do hotunplug. It has to understand that there is a limitation on the number of slots, and know what that limitation is (unless it feels that launching an overcommitted guest and showing an error to the user is preferable to not allowing the user to overcommit in the first place. If you'll review my patent application for pci slot allocation, you'll see the following line: slot_nr = nb_allocated_slots++; /* Allocate pci slot */ while there is a lot of complicated setup code before that (see the prior art section as well), I believe licensees could well implement the algorithm in two short months, including testing.
On Wed, 2009-06-17 at 12:03 +0300, Avi Kivity wrote: > On 06/17/2009 11:33 AM, Mark McLoughlin wrote: > >> I particularly don't like the idea of arcane machine-dependent slot > >> allocation knowledge living in libvirt, because it needs to be in Qemu > >> anyway for non-libvirt users. No point in having two implementations > >> of something tricky and likely to have machine quirks, if one will do. > > > > Indeed. > > I don't understand this. Take note of the "arcane machine-dependent slot allocation knowledge" bit. If the algorithm in for management apps is as simple as "query qemu for available slots and sequentially allocate slots", then that's perfectly fine. If management apps need to hard-code which slots are available on different targets and different qemu versions, or restrictions on which devices can use which slots, or knowledge that some devices can be multi-function, or ... anything like that is just lame. Cheers, Mark. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 06/17/2009 12:18 PM, Mark McLoughlin wrote: > On Wed, 2009-06-17 at 12:03 +0300, Avi Kivity wrote: > >> On 06/17/2009 11:33 AM, Mark McLoughlin wrote: >> >>>> I particularly don't like the idea of arcane machine-dependent slot >>>> allocation knowledge living in libvirt, because it needs to be in Qemu >>>> anyway for non-libvirt users. No point in having two implementations >>>> of something tricky and likely to have machine quirks, if one will do. >>>> >>> Indeed. >>> >> I don't understand this. >> > > Take note of the "arcane machine-dependent slot allocation knowledge" > bit. > > If the algorithm in for management apps is as simple as "query qemu for > available slots and sequentially allocate slots", then that's perfectly > fine. > That's the thinking. > If management apps need to hard-code which slots are available on > different targets and different qemu versions, or restrictions on which > devices can use which slots, or knowledge that some devices can be > multi-function, or ... anything like that is just lame. > You can't abstract these things away. If you can't put a NIC in slot 4, and you have 7 slots, then you cannot have 7 NICs. Having qemu allocate the slot numbers does not absolve management from knowing this limitation and preventing the user from creating a machine with 7 slots. Likewise, management will have to know which devices are multi-function, since that affects their hotpluggability. Ditto if some slot if faster than others, if you want to make use of this information you have to let the upper layers know. It could be done using an elaborate machine description that qemu exposes to management coupled with a constraint solver that optimizes the machine layout according to user specifications and hardware limitations. Or we could take the view that real life is not perfect (especially where computers are involved), add some machine specific knowledge, and spend the rest of the summer at the beach.
Avi Kivity wrote: > On 06/16/2009 09:32 PM, Jamie Lokier wrote: > >Avi Kivity wrote: > > > >>Another issue is enumeration. Guests will present their devices in the > >>order they find them on the pci bus (of course enumeration is guest > >>specific). So if I have 2 virtio controllers the only way I can > >>distinguish between them is using their pci slots. > > > >virtio controllers really should have a user-suppliable string or UUID > >to identify them to the guest. Don't they? > > virtio controllers don't exist. When they do, they may have a UUID or > not, but in either case guest infrastructure is in place for reporting > the PCI slot, not the UUID. > > virtio disks do have a UUID. I don't think older versions of Windows > will use it though, so if you reorder your slots you'll see your drive > letters change. Same with Linux if you don't use udev by-uuid rules. I guess I meant virtio disks, so that's ok. -- Jamie -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Avi Kivity wrote: > >If management apps need to hard-code which slots are available on > >different targets and different qemu versions, or restrictions on which > >devices can use which slots, or knowledge that some devices can be > >multi-function, or ... anything like that is just lame. > > > > You can't abstract these things away. If you can't put a NIC in slot 4, > and you have 7 slots, then you cannot have 7 NICs. Having qemu allocate > the slot numbers does not absolve management from knowing this > limitation and preventing the user from creating a machine with 7 slots. > > Likewise, management will have to know which devices are multi-function, > since that affects their hotpluggability. Ditto if some slot if faster > than others, if you want to make use of this information you have to let > the upper layers know. > > It could be done using an elaborate machine description that qemu > exposes to management coupled with a constraint solver that optimizes > the machine layout according to user specifications and hardware > limitations. Or we could take the view that real life is not perfect > (especially where computers are involved), add some machine specific > knowledge, and spend the rest of the summer at the beach. To be honest, an elaborate machine description is probably fine... A fancy constraint solver is not required. A simple one strikes me as about as simple as what you'd hard-code anyway, but with fewer special cases. Note that the result can fail due to things like insufficient address space for all the device BARs even when they _are_ in the right slots. Especially if there are lots of slots, or bridges which can provide unlimited slots. That is arcane: device-dependent, CPU-dependent, machine-dependent, RAM-size dependent (in a non-linear way), device-option-dependent and probably QEMU-version-dependent too. 
It would be nice if libvirt (et al) would prevent the user from creating a VM with insufficient BAR space for that machine, but I'm not sure how to do it sanely, without arcane knowledge getting about. Maybe that idea of a .so shared by qemu and libvirt, to manipulate device configurations, is a sane one after all. -- Jamie -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/hw/pci.c b/hw/pci.c index 361d741..ed011b5 100644 --- a/hw/pci.c +++ b/hw/pci.c @@ -130,12 +130,13 @@ void pci_device_save(PCIDevice *s, QEMUFile *f) int version = s->cap_present ? 3 : 2; int i; - qemu_put_be32(f, version); /* PCI device version */ + /* PCI device version and capabilities */ + qemu_put_be32(f, version); + if (version >= 3) + qemu_put_be32(f, s->cap_present); qemu_put_buffer(f, s->config, 256); for (i = 0; i < 4; i++) qemu_put_be32(f, s->irq_state[i]); - if (version >= 3) - qemu_put_be32(f, s->cap_present); } int pci_device_load(PCIDevice *s, QEMUFile *f) @@ -146,12 +147,6 @@ int pci_device_load(PCIDevice *s, QEMUFile *f) version_id = qemu_get_be32(f); if (version_id > 3) return -EINVAL; - qemu_get_buffer(f, s->config, 256); - pci_update_mappings(s); - - if (version_id >= 2) - for (i = 0; i < 4; i ++) - s->irq_state[i] = qemu_get_be32(f); if (version_id >= 3) s->cap_present = qemu_get_be32(f); else @@ -160,6 +155,18 @@ int pci_device_load(PCIDevice *s, QEMUFile *f) if (s->cap_present & ~s->cap_supported) return -EINVAL; + qemu_get_buffer(f, s->config, 256); + pci_update_mappings(s); + + if (version_id >= 2) + for (i = 0; i < 4; i ++) + s->irq_state[i] = qemu_get_be32(f); + /* Clear wmask and used bits for capabilities. + Must be restored separately, since capabilities can + be placed anywhere in config space. 
*/ + memset(s->used, 0, PCI_CONFIG_SPACE_SIZE); + for (i = PCI_CONFIG_HEADER_SIZE; i < PCI_CONFIG_SPACE_SIZE; ++i) + s->wmask[i] = 0xff; return 0; } @@ -870,3 +877,76 @@ PCIDevice *pci_create_simple(PCIBus *bus, int devfn, const char *name) return (PCIDevice *)dev; } + +static int pci_find_space(PCIDevice *pdev, uint8_t size) +{ + int offset = PCI_CONFIG_HEADER_SIZE; + int i; + for (i = PCI_CONFIG_HEADER_SIZE; i < PCI_CONFIG_SPACE_SIZE; ++i) + if (pdev->used[i]) + offset = i + 1; + else if (i - offset + 1 == size) + return offset; + return 0; +} + +static uint8_t pci_find_capability_list(PCIDevice *pdev, uint8_t cap_id, + uint8_t *prev_p) +{ + uint8_t next, prev; + + if (!(pdev->config[PCI_STATUS] & PCI_STATUS_CAP_LIST)) + return 0; + + for (prev = PCI_CAPABILITY_LIST; (next = pdev->config[prev]); + prev = next + PCI_CAP_LIST_NEXT) + if (pdev->config[next + PCI_CAP_LIST_ID] == cap_id) + break; + + *prev_p = prev; + return next; +} + +/* Reserve space and add capability to the linked list in pci config space */ +int pci_add_capability(PCIDevice *pdev, uint8_t cap_id, uint8_t size) +{ + uint8_t offset = pci_find_space(pdev, size); + uint8_t *config = pdev->config + offset; + if (!offset) + return -ENOSPC; + config[PCI_CAP_LIST_ID] = cap_id; + config[PCI_CAP_LIST_NEXT] = pdev->config[PCI_CAPABILITY_LIST]; + pdev->config[PCI_CAPABILITY_LIST] = offset; + pdev->config[PCI_STATUS] |= PCI_STATUS_CAP_LIST; + memset(pdev->used + offset, 0xFF, size); + /* Make capability read-only by default */ + memset(pdev->wmask + offset, 0, size); + return offset; +} + +/* Unlink capability from the pci config space. 
*/ +void pci_del_capability(PCIDevice *pdev, uint8_t cap_id, uint8_t size) +{ + uint8_t prev, offset = pci_find_capability_list(pdev, cap_id, &prev); + if (!offset) + return; + pdev->config[prev] = pdev->config[offset + PCI_CAP_LIST_NEXT]; + /* Make capability writeable again */ + memset(pdev->wmask + offset, 0xff, size); + memset(pdev->used + offset, 0, size); + + if (!pdev->config[PCI_CAPABILITY_LIST]) + pdev->config[PCI_STATUS] &= ~PCI_STATUS_CAP_LIST; +} + +/* Reserve space for capability at a known offset (to call after load). */ +void pci_reserve_capability(PCIDevice *pdev, uint8_t offset, uint8_t size) +{ + memset(pdev->used + offset, 0xff, size); +} + +uint8_t pci_find_capability(PCIDevice *pdev, uint8_t cap_id) +{ + uint8_t prev; + return pci_find_capability_list(pdev, cap_id, &prev); +} diff --git a/hw/pci.h b/hw/pci.h index 6f0803f..4838c59 100644 --- a/hw/pci.h +++ b/hw/pci.h @@ -123,6 +123,10 @@ typedef struct PCIIORegion { #define PCI_MIN_GNT 0x3e /* 8 bits */ #define PCI_MAX_LAT 0x3f /* 8 bits */ +/* Capability lists */ +#define PCI_CAP_LIST_ID 0 /* Capability ID */ +#define PCI_CAP_LIST_NEXT 1 /* Next capability in the list */ + #define PCI_REVISION 0x08 /* obsolete, use PCI_REVISION_ID */ #define PCI_SUBVENDOR_ID 0x2c /* obsolete, use PCI_SUBSYSTEM_VENDOR_ID */ #define PCI_SUBDEVICE_ID 0x2e /* obsolete, use PCI_SUBSYSTEM_ID */ @@ -130,7 +134,7 @@ typedef struct PCIIORegion { /* Bits in the PCI Status Register (PCI 2.3 spec) */ #define PCI_STATUS_RESERVED1 0x007 #define PCI_STATUS_INT_STATUS 0x008 -#define PCI_STATUS_CAPABILITIES 0x010 +#define PCI_STATUS_CAP_LIST 0x010 #define PCI_STATUS_66MHZ 0x020 #define PCI_STATUS_RESERVED2 0x040 #define PCI_STATUS_FAST_BACK 0x080 @@ -160,6 +164,9 @@ struct PCIDevice { /* Used to implement R/W bytes */ uint8_t wmask[PCI_CONFIG_SPACE_SIZE]; + /* Used to allocate config space for capabilities. 
*/ + uint8_t used[PCI_CONFIG_SPACE_SIZE]; + /* the following fields are read only */ PCIBus *bus; int devfn; @@ -194,6 +201,15 @@ void pci_register_io_region(PCIDevice *pci_dev, int region_num, uint32_t size, int type, PCIMapIORegionFunc *map_func); +int pci_add_capability(PCIDevice *pci_dev, uint8_t cap_id, uint8_t cap_size); + +void pci_del_capability(PCIDevice *pci_dev, uint8_t cap_id, uint8_t cap_size); + +void pci_reserve_capability(PCIDevice *pci_dev, uint8_t offset, uint8_t size); + +uint8_t pci_find_capability(PCIDevice *pci_dev, uint8_t cap_id); + + uint32_t pci_default_read_config(PCIDevice *d, uint32_t address, int len); void pci_default_write_config(PCIDevice *d,
Add routines to manage PCI capability list. First user will be MSI-X. Signed-off-by: Michael S. Tsirkin <mst@redhat.com> --- hw/pci.c | 98 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++------ hw/pci.h | 18 +++++++++++- 2 files changed, 106 insertions(+), 10 deletions(-)