diff mbox

[PATCHv3,03/13] qemu: add routines to manage PCI capabilities

Message ID 20090605102315.GD26770@redhat.com (mailing list archive)
State New, archived
Headers show

Commit Message

Michael S. Tsirkin June 5, 2009, 10:23 a.m. UTC
Add routines to manage PCI capability list. First user will be MSI-X.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/pci.c |   98 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++------
 hw/pci.h |   18 +++++++++++-
 2 files changed, 106 insertions(+), 10 deletions(-)

Comments

Glauber Costa June 9, 2009, 5:11 p.m. UTC | #1
On Fri, Jun 05, 2009 at 01:23:15PM +0300, Michael S. Tsirkin wrote:
> Add routines to manage PCI capability list. First user will be MSI-X.
> 
> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> ---
>  hw/pci.c |   98 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++------
>  hw/pci.h |   18 +++++++++++-
>  2 files changed, 106 insertions(+), 10 deletions(-)
> 
> diff --git a/hw/pci.c b/hw/pci.c
> index 361d741..ed011b5 100644
> --- a/hw/pci.c
> +++ b/hw/pci.c
> @@ -130,12 +130,13 @@ void pci_device_save(PCIDevice *s, QEMUFile *f)
>      int version = s->cap_present ? 3 : 2;
>      int i;
>  
> -    qemu_put_be32(f, version); /* PCI device version */
> +    /* PCI device version and capabilities */
> +    qemu_put_be32(f, version);
> +    if (version >= 3)
> +        qemu_put_be32(f, s->cap_present);
>      qemu_put_buffer(f, s->config, 256);
>      for (i = 0; i < 4; i++)
>          qemu_put_be32(f, s->irq_state[i]);
> -    if (version >= 3)
> -        qemu_put_be32(f, s->cap_present);
>  }
What is it doing here?
You should just do it right in the first patch, instead of doing in
one way there, and fixing here.

>  
>  int pci_device_load(PCIDevice *s, QEMUFile *f)
> @@ -146,12 +147,6 @@ int pci_device_load(PCIDevice *s, QEMUFile *f)
>      version_id = qemu_get_be32(f);
>      if (version_id > 3)
>          return -EINVAL;
> -    qemu_get_buffer(f, s->config, 256);
> -    pci_update_mappings(s);
> -
> -    if (version_id >= 2)
> -        for (i = 0; i < 4; i ++)
> -            s->irq_state[i] = qemu_get_be32(f);
>      if (version_id >= 3)
>          s->cap_present = qemu_get_be32(f);
>      else
ditto.

> @@ -160,6 +155,18 @@ int pci_device_load(PCIDevice *s, QEMUFile *f)
>      if (s->cap_present & ~s->cap_supported)
>          return -EINVAL;
>  
> +    qemu_get_buffer(f, s->config, 256);
> +    pci_update_mappings(s);
> +
> +    if (version_id >= 2)
> +        for (i = 0; i < 4; i ++)
> +            s->irq_state[i] = qemu_get_be32(f);
> +    /* Clear wmask and used bits for capabilities.
> +       Must be restored separately, since capabilities can
> +       be placed anywhere in config space. */
> +    memset(s->used, 0, PCI_CONFIG_SPACE_SIZE);
> +    for (i = PCI_CONFIG_HEADER_SIZE; i < PCI_CONFIG_SPACE_SIZE; ++i)
> +        s->wmask[i] = 0xff;
>      return 0;
>  }
Sorry, I don't exactly understand it. Although it can be anywhere, what do we actually
lose by keeping it at the same place in config space?

>  
> @@ -870,3 +877,76 @@ PCIDevice *pci_create_simple(PCIBus *bus, int devfn, const char *name)
>  
>      return (PCIDevice *)dev;
>  }
> +
> +static int pci_find_space(PCIDevice *pdev, uint8_t size)
> +{
> +    int offset = PCI_CONFIG_HEADER_SIZE;
> +    int i;
> +    for (i = PCI_CONFIG_HEADER_SIZE; i < PCI_CONFIG_SPACE_SIZE; ++i)
> +        if (pdev->used[i])
> +            offset = i + 1;
> +        else if (i - offset + 1 == size)
> +            return offset;
> +    return 0;
> +}
> +
> +static uint8_t pci_find_capability_list(PCIDevice *pdev, uint8_t cap_id,
> +                                        uint8_t *prev_p)
> +{
> +    uint8_t next, prev;
> +
> +    if (!(pdev->config[PCI_STATUS] & PCI_STATUS_CAP_LIST))
> +        return 0;
> +
> +    for (prev = PCI_CAPABILITY_LIST; (next = pdev->config[prev]);
> +         prev = next + PCI_CAP_LIST_NEXT)
> +        if (pdev->config[next + PCI_CAP_LIST_ID] == cap_id)
> +            break;
> +
> +    *prev_p = prev;
> +    return next;
> +}
I'd prefer to do:
	if (prev_p)
		*prev_p = prev;
so we don't have to always pass a prev_p pointer. You have yourself a user
where you don't need it in this very patch.

> +
> +/* Reserve space and add capability to the linked list in pci config space */
> +int pci_add_capability(PCIDevice *pdev, uint8_t cap_id, uint8_t size)
> +{
> +    uint8_t offset = pci_find_space(pdev, size);
> +    uint8_t *config = pdev->config + offset;
> +    if (!offset)
> +        return -ENOSPC;
> +    config[PCI_CAP_LIST_ID] = cap_id;
> +    config[PCI_CAP_LIST_NEXT] = pdev->config[PCI_CAPABILITY_LIST];
> +    pdev->config[PCI_CAPABILITY_LIST] = offset;
> +    pdev->config[PCI_STATUS] |= PCI_STATUS_CAP_LIST;
> +    memset(pdev->used + offset, 0xFF, size);
> +    /* Make capability read-only by default */
> +    memset(pdev->wmask + offset, 0, size);
> +    return offset;
> +}
> +
> +/* Unlink capability from the pci config space. */
> +void pci_del_capability(PCIDevice *pdev, uint8_t cap_id, uint8_t size)
> +{
> +    uint8_t prev, offset = pci_find_capability_list(pdev, cap_id, &prev);
> +    if (!offset)
> +        return;
> +    pdev->config[prev] = pdev->config[offset + PCI_CAP_LIST_NEXT];
> +    /* Make capability writeable again */
> +    memset(pdev->wmask + offset, 0xff, size);
> +    memset(pdev->used + offset, 0, size);
> +
> +    if (!pdev->config[PCI_CAPABILITY_LIST])
> +        pdev->config[PCI_STATUS] &= ~PCI_STATUS_CAP_LIST;
> +}
> +
> +/* Reserve space for capability at a known offset (to call after load). */
> +void pci_reserve_capability(PCIDevice *pdev, uint8_t offset, uint8_t size)
> +{
> +    memset(pdev->used + offset, 0xff, size);
> +}
> +
> +uint8_t pci_find_capability(PCIDevice *pdev, uint8_t cap_id)
> +{
> +    uint8_t prev;
> +    return pci_find_capability_list(pdev, cap_id, &prev);
> +}
> diff --git a/hw/pci.h b/hw/pci.h
> index 6f0803f..4838c59 100644
> --- a/hw/pci.h
> +++ b/hw/pci.h
> @@ -123,6 +123,10 @@ typedef struct PCIIORegion {
>  #define PCI_MIN_GNT		0x3e	/* 8 bits */
>  #define PCI_MAX_LAT		0x3f	/* 8 bits */
>  
> +/* Capability lists */
> +#define PCI_CAP_LIST_ID		0	/* Capability ID */
> +#define PCI_CAP_LIST_NEXT	1	/* Next capability in the list */
> +
>  #define PCI_REVISION            0x08    /* obsolete, use PCI_REVISION_ID */
>  #define PCI_SUBVENDOR_ID        0x2c    /* obsolete, use PCI_SUBSYSTEM_VENDOR_ID */
>  #define PCI_SUBDEVICE_ID        0x2e    /* obsolete, use PCI_SUBSYSTEM_ID */
> @@ -130,7 +134,7 @@ typedef struct PCIIORegion {
>  /* Bits in the PCI Status Register (PCI 2.3 spec) */
>  #define PCI_STATUS_RESERVED1	0x007
>  #define PCI_STATUS_INT_STATUS	0x008
> -#define PCI_STATUS_CAPABILITIES	0x010
> +#define PCI_STATUS_CAP_LIST	0x010
>  #define PCI_STATUS_66MHZ	0x020
>  #define PCI_STATUS_RESERVED2	0x040
>  #define PCI_STATUS_FAST_BACK	0x080
> @@ -160,6 +164,9 @@ struct PCIDevice {
>      /* Used to implement R/W bytes */
>      uint8_t wmask[PCI_CONFIG_SPACE_SIZE];
>  
> +    /* Used to allocate config space for capabilities. */
> +    uint8_t used[PCI_CONFIG_SPACE_SIZE];
> +
>      /* the following fields are read only */
>      PCIBus *bus;
>      int devfn;
> @@ -194,6 +201,15 @@ void pci_register_io_region(PCIDevice *pci_dev, int region_num,
>                              uint32_t size, int type,
>                              PCIMapIORegionFunc *map_func);
>  
> +int pci_add_capability(PCIDevice *pci_dev, uint8_t cap_id, uint8_t cap_size);
> +
> +void pci_del_capability(PCIDevice *pci_dev, uint8_t cap_id, uint8_t cap_size);
> +
> +void pci_reserve_capability(PCIDevice *pci_dev, uint8_t offset, uint8_t size);
> +
> +uint8_t pci_find_capability(PCIDevice *pci_dev, uint8_t cap_id);
> +
> +
>  uint32_t pci_default_read_config(PCIDevice *d,
>                                   uint32_t address, int len);
>  void pci_default_write_config(PCIDevice *d,
> -- 
> 1.6.3.1.56.g79e1.dirty
> 
> 
> 
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael S. Tsirkin June 10, 2009, 9:54 a.m. UTC | #2
On Tue, Jun 09, 2009 at 02:11:14PM -0300, Glauber Costa wrote:
> On Fri, Jun 05, 2009 at 01:23:15PM +0300, Michael S. Tsirkin wrote:
> > Add routines to manage PCI capability list. First user will be MSI-X.
> > 
> > Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> > ---
> >  hw/pci.c |   98 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++------
> >  hw/pci.h |   18 +++++++++++-
> >  2 files changed, 106 insertions(+), 10 deletions(-)
> > 
> > diff --git a/hw/pci.c b/hw/pci.c
> > index 361d741..ed011b5 100644
> > --- a/hw/pci.c
> > +++ b/hw/pci.c
> > @@ -130,12 +130,13 @@ void pci_device_save(PCIDevice *s, QEMUFile *f)
> >      int version = s->cap_present ? 3 : 2;
> >      int i;
> >  
> > -    qemu_put_be32(f, version); /* PCI device version */
> > +    /* PCI device version and capabilities */
> > +    qemu_put_be32(f, version);
> > +    if (version >= 3)
> > +        qemu_put_be32(f, s->cap_present);
> >      qemu_put_buffer(f, s->config, 256);
> >      for (i = 0; i < 4; i++)
> >          qemu_put_be32(f, s->irq_state[i]);
> > -    if (version >= 3)
> > -        qemu_put_be32(f, s->cap_present);
> >  }
> What is it doing here?
> You should just do it right in the first patch, instead of doing in
> one way there, and fixing here.
> 
> >  
> >  int pci_device_load(PCIDevice *s, QEMUFile *f)
> > @@ -146,12 +147,6 @@ int pci_device_load(PCIDevice *s, QEMUFile *f)
> >      version_id = qemu_get_be32(f);
> >      if (version_id > 3)
> >          return -EINVAL;
> > -    qemu_get_buffer(f, s->config, 256);
> > -    pci_update_mappings(s);
> > -
> > -    if (version_id >= 2)
> > -        for (i = 0; i < 4; i ++)
> > -            s->irq_state[i] = qemu_get_be32(f);
> >      if (version_id >= 3)
> >          s->cap_present = qemu_get_be32(f);
> >      else
> ditto.
> > @@ -160,6 +155,18 @@ int pci_device_load(PCIDevice *s, QEMUFile *f)
> >      if (s->cap_present & ~s->cap_supported)
> >          return -EINVAL;
> >  
> > +    qemu_get_buffer(f, s->config, 256);
> > +    pci_update_mappings(s);
> > +
> > +    if (version_id >= 2)
> > +        for (i = 0; i < 4; i ++)
> > +            s->irq_state[i] = qemu_get_be32(f);
> > +    /* Clear wmask and used bits for capabilities.
> > +       Must be restored separately, since capabilities can
> > +       be placed anywhere in config space. */
> > +    memset(s->used, 0, PCI_CONFIG_SPACE_SIZE);
> > +    for (i = PCI_CONFIG_HEADER_SIZE; i < PCI_CONFIG_SPACE_SIZE; ++i)
> > +        s->wmask[i] = 0xff;
> >      return 0;
> >  }
> Sorry, I don't exactly understand it. Although it can be anywhere, what do we actually
> lose by keeping it at the same place in config space?

We lose the ability to let user control the capabilities exposed
by the device.

And generally, I dislike arbitrary limitations. The PCI spec says the
capability can be anywhere, implementing a linked list of caps is simple
enough to not invent arbitrary restrictions.

> >  
> > @@ -870,3 +877,76 @@ PCIDevice *pci_create_simple(PCIBus *bus, int devfn, const char *name)
> >  
> >      return (PCIDevice *)dev;
> >  }
> > +
> > +static int pci_find_space(PCIDevice *pdev, uint8_t size)
> > +{
> > +    int offset = PCI_CONFIG_HEADER_SIZE;
> > +    int i;
> > +    for (i = PCI_CONFIG_HEADER_SIZE; i < PCI_CONFIG_SPACE_SIZE; ++i)
> > +        if (pdev->used[i])
> > +            offset = i + 1;
> > +        else if (i - offset + 1 == size)
> > +            return offset;
> > +    return 0;
> > +}
> > +
> > +static uint8_t pci_find_capability_list(PCIDevice *pdev, uint8_t cap_id,
> > +                                        uint8_t *prev_p)
> > +{
> > +    uint8_t next, prev;
> > +
> > +    if (!(pdev->config[PCI_STATUS] & PCI_STATUS_CAP_LIST))
> > +        return 0;
> > +
> > +    for (prev = PCI_CAPABILITY_LIST; (next = pdev->config[prev]);
> > +         prev = next + PCI_CAP_LIST_NEXT)
> > +        if (pdev->config[next + PCI_CAP_LIST_ID] == cap_id)
> > +            break;
> > +
> > +    *prev_p = prev;
> > +    return next;
> > +}
> I'd prefer to do:
> 	if (prev_p)
> 		*prev_p = prev;
> so we don't have to always pass a prev_p pointer. You have yourself a user
> where you don't need it in this very patch.


Good idea.

> > +
> > +/* Reserve space and add capability to the linked list in pci config space */
> > +int pci_add_capability(PCIDevice *pdev, uint8_t cap_id, uint8_t size)
> > +{
> > +    uint8_t offset = pci_find_space(pdev, size);
> > +    uint8_t *config = pdev->config + offset;
> > +    if (!offset)
> > +        return -ENOSPC;
> > +    config[PCI_CAP_LIST_ID] = cap_id;
> > +    config[PCI_CAP_LIST_NEXT] = pdev->config[PCI_CAPABILITY_LIST];
> > +    pdev->config[PCI_CAPABILITY_LIST] = offset;
> > +    pdev->config[PCI_STATUS] |= PCI_STATUS_CAP_LIST;
> > +    memset(pdev->used + offset, 0xFF, size);
> > +    /* Make capability read-only by default */
> > +    memset(pdev->wmask + offset, 0, size);
> > +    return offset;
> > +}
> > +
> > +/* Unlink capability from the pci config space. */
> > +void pci_del_capability(PCIDevice *pdev, uint8_t cap_id, uint8_t size)
> > +{
> > +    uint8_t prev, offset = pci_find_capability_list(pdev, cap_id, &prev);
> > +    if (!offset)
> > +        return;
> > +    pdev->config[prev] = pdev->config[offset + PCI_CAP_LIST_NEXT];
> > +    /* Make capability writeable again */
> > +    memset(pdev->wmask + offset, 0xff, size);
> > +    memset(pdev->used + offset, 0, size);
> > +
> > +    if (!pdev->config[PCI_CAPABILITY_LIST])
> > +        pdev->config[PCI_STATUS] &= ~PCI_STATUS_CAP_LIST;
> > +}
> > +
> > +/* Reserve space for capability at a known offset (to call after load). */
> > +void pci_reserve_capability(PCIDevice *pdev, uint8_t offset, uint8_t size)
> > +{
> > +    memset(pdev->used + offset, 0xff, size);
> > +}
> > +
> > +uint8_t pci_find_capability(PCIDevice *pdev, uint8_t cap_id)
> > +{
> > +    uint8_t prev;
> > +    return pci_find_capability_list(pdev, cap_id, &prev);
> > +}
> > diff --git a/hw/pci.h b/hw/pci.h
> > index 6f0803f..4838c59 100644
> > --- a/hw/pci.h
> > +++ b/hw/pci.h
> > @@ -123,6 +123,10 @@ typedef struct PCIIORegion {
> >  #define PCI_MIN_GNT		0x3e	/* 8 bits */
> >  #define PCI_MAX_LAT		0x3f	/* 8 bits */
> >  
> > +/* Capability lists */
> > +#define PCI_CAP_LIST_ID		0	/* Capability ID */
> > +#define PCI_CAP_LIST_NEXT	1	/* Next capability in the list */
> > +
> >  #define PCI_REVISION            0x08    /* obsolete, use PCI_REVISION_ID */
> >  #define PCI_SUBVENDOR_ID        0x2c    /* obsolete, use PCI_SUBSYSTEM_VENDOR_ID */
> >  #define PCI_SUBDEVICE_ID        0x2e    /* obsolete, use PCI_SUBSYSTEM_ID */
> > @@ -130,7 +134,7 @@ typedef struct PCIIORegion {
> >  /* Bits in the PCI Status Register (PCI 2.3 spec) */
> >  #define PCI_STATUS_RESERVED1	0x007
> >  #define PCI_STATUS_INT_STATUS	0x008
> > -#define PCI_STATUS_CAPABILITIES	0x010
> > +#define PCI_STATUS_CAP_LIST	0x010
> >  #define PCI_STATUS_66MHZ	0x020
> >  #define PCI_STATUS_RESERVED2	0x040
> >  #define PCI_STATUS_FAST_BACK	0x080
> > @@ -160,6 +164,9 @@ struct PCIDevice {
> >      /* Used to implement R/W bytes */
> >      uint8_t wmask[PCI_CONFIG_SPACE_SIZE];
> >  
> > +    /* Used to allocate config space for capabilities. */
> > +    uint8_t used[PCI_CONFIG_SPACE_SIZE];
> > +
> >      /* the following fields are read only */
> >      PCIBus *bus;
> >      int devfn;
> > @@ -194,6 +201,15 @@ void pci_register_io_region(PCIDevice *pci_dev, int region_num,
> >                              uint32_t size, int type,
> >                              PCIMapIORegionFunc *map_func);
> >  
> > +int pci_add_capability(PCIDevice *pci_dev, uint8_t cap_id, uint8_t cap_size);
> > +
> > +void pci_del_capability(PCIDevice *pci_dev, uint8_t cap_id, uint8_t cap_size);
> > +
> > +void pci_reserve_capability(PCIDevice *pci_dev, uint8_t offset, uint8_t size);
> > +
> > +uint8_t pci_find_capability(PCIDevice *pci_dev, uint8_t cap_id);
> > +
> > +
> >  uint32_t pci_default_read_config(PCIDevice *d,
> >                                   uint32_t address, int len);
> >  void pci_default_write_config(PCIDevice *d,
> > -- 
> > 1.6.3.1.56.g79e1.dirty
> > 
> > 
> >
Glauber Costa June 10, 2009, 2:55 p.m. UTC | #3
On Wed, Jun 10, 2009 at 12:54:15PM +0300, Michael S. Tsirkin wrote:
> On Tue, Jun 09, 2009 at 02:11:14PM -0300, Glauber Costa wrote:
> > On Fri, Jun 05, 2009 at 01:23:15PM +0300, Michael S. Tsirkin wrote:
> > > Add routines to manage PCI capability list. First user will be MSI-X.
> > > 
> > > Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> > > ---
> > >  hw/pci.c |   98 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++------
> > >  hw/pci.h |   18 +++++++++++-
> > >  2 files changed, 106 insertions(+), 10 deletions(-)
> > > 
> > > diff --git a/hw/pci.c b/hw/pci.c
> > > index 361d741..ed011b5 100644
> > > --- a/hw/pci.c
> > > +++ b/hw/pci.c
> > > @@ -130,12 +130,13 @@ void pci_device_save(PCIDevice *s, QEMUFile *f)
> > >      int version = s->cap_present ? 3 : 2;
> > >      int i;
> > >  
> > > -    qemu_put_be32(f, version); /* PCI device version */
> > > +    /* PCI device version and capabilities */
> > > +    qemu_put_be32(f, version);
> > > +    if (version >= 3)
> > > +        qemu_put_be32(f, s->cap_present);
> > >      qemu_put_buffer(f, s->config, 256);
> > >      for (i = 0; i < 4; i++)
> > >          qemu_put_be32(f, s->irq_state[i]);
> > > -    if (version >= 3)
> > > -        qemu_put_be32(f, s->cap_present);
> > >  }
> > What is it doing here?
> > You should just do it right in the first patch, instead of doing in
> > one way there, and fixing here.
> > 
> > >  
> > >  int pci_device_load(PCIDevice *s, QEMUFile *f)
> > > @@ -146,12 +147,6 @@ int pci_device_load(PCIDevice *s, QEMUFile *f)
> > >      version_id = qemu_get_be32(f);
> > >      if (version_id > 3)
> > >          return -EINVAL;
> > > -    qemu_get_buffer(f, s->config, 256);
> > > -    pci_update_mappings(s);
> > > -
> > > -    if (version_id >= 2)
> > > -        for (i = 0; i < 4; i ++)
> > > -            s->irq_state[i] = qemu_get_be32(f);
> > >      if (version_id >= 3)
> > >          s->cap_present = qemu_get_be32(f);
> > >      else
> > ditto.
> > > @@ -160,6 +155,18 @@ int pci_device_load(PCIDevice *s, QEMUFile *f)
> > >      if (s->cap_present & ~s->cap_supported)
> > >          return -EINVAL;
> > >  
> > > +    qemu_get_buffer(f, s->config, 256);
> > > +    pci_update_mappings(s);
> > > +
> > > +    if (version_id >= 2)
> > > +        for (i = 0; i < 4; i ++)
> > > +            s->irq_state[i] = qemu_get_be32(f);
> > > +    /* Clear wmask and used bits for capabilities.
> > > +       Must be restored separately, since capabilities can
> > > +       be placed anywhere in config space. */
> > > +    memset(s->used, 0, PCI_CONFIG_SPACE_SIZE);
> > > +    for (i = PCI_CONFIG_HEADER_SIZE; i < PCI_CONFIG_SPACE_SIZE; ++i)
> > > +        s->wmask[i] = 0xff;
> > >      return 0;
> > >  }
> > Sorry, I don't exactly understand it. Although it can be anywhere, what do we actually
> > lose by keeping it at the same place in config space?
> 
> We lose the ability to let user control the capabilities exposed
> by the device.
> 
> And generally, I dislike arbitrary limitations. The PCI spec says the
> capability can be anywhere, implementing a linked list of caps is simple
> enough to not invent arbitrary restrictions.
yes, but this is migration time, right?

caps can be anywhere, but we don't expect it to change during machine execution
lifetime.

Or I am just confused by the name "pci_device_load" ?

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael S. Tsirkin June 10, 2009, 3:01 p.m. UTC | #4
On Wed, Jun 10, 2009 at 11:55:40AM -0300, Glauber Costa wrote:
> On Wed, Jun 10, 2009 at 12:54:15PM +0300, Michael S. Tsirkin wrote:
> > On Tue, Jun 09, 2009 at 02:11:14PM -0300, Glauber Costa wrote:
> > > On Fri, Jun 05, 2009 at 01:23:15PM +0300, Michael S. Tsirkin wrote:
> > > > Add routines to manage PCI capability list. First user will be MSI-X.
> > > > 
> > > > Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> > > > ---
> > > >  hw/pci.c |   98 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++------
> > > >  hw/pci.h |   18 +++++++++++-
> > > >  2 files changed, 106 insertions(+), 10 deletions(-)
> > > > 
> > > > diff --git a/hw/pci.c b/hw/pci.c
> > > > index 361d741..ed011b5 100644
> > > > --- a/hw/pci.c
> > > > +++ b/hw/pci.c
> > > > @@ -130,12 +130,13 @@ void pci_device_save(PCIDevice *s, QEMUFile *f)
> > > >      int version = s->cap_present ? 3 : 2;
> > > >      int i;
> > > >  
> > > > -    qemu_put_be32(f, version); /* PCI device version */
> > > > +    /* PCI device version and capabilities */
> > > > +    qemu_put_be32(f, version);
> > > > +    if (version >= 3)
> > > > +        qemu_put_be32(f, s->cap_present);
> > > >      qemu_put_buffer(f, s->config, 256);
> > > >      for (i = 0; i < 4; i++)
> > > >          qemu_put_be32(f, s->irq_state[i]);
> > > > -    if (version >= 3)
> > > > -        qemu_put_be32(f, s->cap_present);
> > > >  }
> > > What is it doing here?
> > > You should just do it right in the first patch, instead of doing in
> > > one way there, and fixing here.
> > > 
> > > >  
> > > >  int pci_device_load(PCIDevice *s, QEMUFile *f)
> > > > @@ -146,12 +147,6 @@ int pci_device_load(PCIDevice *s, QEMUFile *f)
> > > >      version_id = qemu_get_be32(f);
> > > >      if (version_id > 3)
> > > >          return -EINVAL;
> > > > -    qemu_get_buffer(f, s->config, 256);
> > > > -    pci_update_mappings(s);
> > > > -
> > > > -    if (version_id >= 2)
> > > > -        for (i = 0; i < 4; i ++)
> > > > -            s->irq_state[i] = qemu_get_be32(f);
> > > >      if (version_id >= 3)
> > > >          s->cap_present = qemu_get_be32(f);
> > > >      else
> > > ditto.
> > > > @@ -160,6 +155,18 @@ int pci_device_load(PCIDevice *s, QEMUFile *f)
> > > >      if (s->cap_present & ~s->cap_supported)
> > > >          return -EINVAL;
> > > >  
> > > > +    qemu_get_buffer(f, s->config, 256);
> > > > +    pci_update_mappings(s);
> > > > +
> > > > +    if (version_id >= 2)
> > > > +        for (i = 0; i < 4; i ++)
> > > > +            s->irq_state[i] = qemu_get_be32(f);
> > > > +    /* Clear wmask and used bits for capabilities.
> > > > +       Must be restored separately, since capabilities can
> > > > +       be placed anywhere in config space. */
> > > > +    memset(s->used, 0, PCI_CONFIG_SPACE_SIZE);
> > > > +    for (i = PCI_CONFIG_HEADER_SIZE; i < PCI_CONFIG_SPACE_SIZE; ++i)
> > > > +        s->wmask[i] = 0xff;
> > > >      return 0;
> > > >  }
> > > Sorry, I don't exactly understand it. Although it can be anywhere, what do we actually
> > > lose by keeping it at the same place in config space?
> > 
> > We lose the ability to let user control the capabilities exposed
> > by the device.
> > 
> > And generally, I dislike arbitrary limitations. The PCI spec says the
> > capability can be anywhere, implementing a linked list of caps is simple
> > enough to not invent abritrary restrictions.
> yes, but this is migration time, right?

I think so, yes.

> 
> caps can be anywhere, but we don't expect it to change during machine execution
> lifetime.
> 
> Or I am just confused by the name "pci_device_load" ?

Right. So I want to load an image and it has capability X at offset Y.
wmask has to match. I don't want to assume that we never change Y
for the device without breaking old images, so I clear wmask here
and set it up again after looking up capabilities that I loaded.

Maybe this explanation should go into the comment above?
Paul Brook June 10, 2009, 3:24 p.m. UTC | #5
> > caps can be anywhere, but we don't expect it to change during machine
> > execution lifetime.
> >
> > Or I am just confused by the name "pci_device_load" ?
>
> Right. So I want to load an image and it has capability X at offset Y.
> wmask has to match. I don't want to assume that we never change Y
> for the device without breaking old images, so I clear wmask here
> and set it up again after looking up capabilities that I loaded.

We should not be loading state into a different device (or a similar device 
with a different set of capabilities).

If you want to provide backwards compatibility then you should do that by 
creating a device that is the same as the original.  As I mentioned in my 
earlier mail, loading a snapshot should never do anything that can not be 
achieved through normal operation.

Paul

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael S. Tsirkin June 10, 2009, 3:50 p.m. UTC | #6
On Wed, Jun 10, 2009 at 04:24:28PM +0100, Paul Brook wrote:
> > > caps can be anywhere, but we don't expect it to change during machine
> > > execution lifetime.
> > >
> > > Or I am just confused by the name "pci_device_load" ?
> >
> > Right. So I want to load an image and it has capability X at offset Y.
> > wmask has to match. I don't want to assume that we never change Y
> > for the device without breaking old images, so I clear wmask here
> > and set it up again after looking up capabilities that I loaded.
> 
> We should not be loading state into a different device (or a similar device 
> with a different set of capabilities).
> 
> If you want to provide backwards compatibility then you should do that by 
> creating a device that is the same as the original.  As I mentioned in my 
> earlier mail, loading a snapshot should never do anything that can not be 
> achieved through normal operation.
> 
> Paul

Why shouldn't it? You don't load a snapshot while guest is running.
Jamie Lokier June 10, 2009, 5:43 p.m. UTC | #7
Paul Brook wrote:
> > > caps can be anywhere, but we don't expect it to change during machine
> > > execution lifetime.
> > >
> > > Or I am just confused by the name "pci_device_load" ?
> >
> > Right. So I want to load an image and it has capability X at offset Y.
> > wmask has to match. I don't want to assume that we never change Y
> > for the device without breaking old images, so I clear wmask here
> > and set it up again after looking up capabilities that I loaded.
> 
> We should not be loading state into a different device (or a similar device 
> with a different set of capabilities).
> 
> If you want to provide backwards compatibility then you should do that by 
> creating a device that is the same as the original.  As I mentioned in my 
> earlier mail, loading a snapshot should never do anything that can not be 
> achieved through normal operation.

If you can create a machine by restoring a snapshot which you can't
create by normally starting QEMU, then you'll soon have guests which
work fine from their snapshots, but which cannot be booted without a
snapshot because there's no way to boot the right machine for the guest.

Someone might even have guests like that for years without noticing,
because they always save and restore guest state using snapshots, then
one day they simply want to boot the guest from its disk image and
find there's no way to do it with any QEMU which runs on their host
platform.

I think the right long term answer to all this is a way to get QEMU to
dump its current machine configuration in glorious detail as a file
which can be reloaded as a machine configuration.

-- Jamie
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael S. Tsirkin June 10, 2009, 6:22 p.m. UTC | #8
On Wed, Jun 10, 2009 at 06:43:02PM +0100, Jamie Lokier wrote:
> Paul Brook wrote:
> > > > caps can be anywhere, but we don't expect it to change during machine
> > > > execution lifetime.
> > > >
> > > > Or I am just confused by the name "pci_device_load" ?
> > >
> > > Right. So I want to load an image and it has capability X at offset Y.
> > > wmask has to match. I don't want to assume that we never change Y
> > > for the device without breaking old images, so I clear wmask here
> > > and set it up again after looking up capabilities that I loaded.
> > 
> > We should not be loading state into a different device (or a similar device 
> > with a different set of capabilities).
> > 
> > If you want to provide backwards compatibility then you should do that by 
> > creating a device that is the same as the original.  As I mentioned in my 
> > earlier mail, loading a snapshot should never do anything that can not be 
> > achieved through normal operation.
> 
> If you can create a machine by restoring a snapshot which you can't
> create by normally starting QEMU, then you'll soon have guests which
> work fine from their snapshots, but which cannot be booted without a
> snapshot because there's no way to boot the right machine for the guest.

Yes. This clearly isn't what I'm building here. You *can* create a guest
without msi-x support by passing an appropriate flag.

> Someone might even have guests like that for years without noticing,
> because they always save and restore guest state using snapshots, then
> one day they simply want to boot the guest from its disk image and
> find there's no way to do it with any QEMU which runs on their host
> platform.
> 
> I think the right long term answer to all this is a way to get QEMU to
> dump its current machine configuration in glorious detail as a file
> which can be reloaded as a machine configuration.
> 
> -- Jamie

And then we'll have the same set of problems there.
Jamie Lokier June 10, 2009, 7:27 p.m. UTC | #9
Michael S. Tsirkin wrote:
> > > I think the right long term answer to all this is a way to get QEMU to
> > > dump its current machine configuration in glorious detail as a file
> > > which can be reloaded as a machine configuration.
> 
> And then we'll have the same set of problems there.

We will, and the solution will be the same: options to create devices
as they were in older versions of QEMU.  It only needs to cover device
features which matter to guests, not every bug fix.

However with a machine configuration which is generated by QEMU,
there's less worry about proliferation of obscure options, compared
with the command line.  You don't necessarily have to document every
backward-compatibility option in any detail, you just have to make
sure it's written and read properly, which is much the same thing as
the snapshot code does.

-- Jamie
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Mark McLoughlin June 12, 2009, 8:43 a.m. UTC | #10
On Wed, 2009-06-10 at 20:27 +0100, Jamie Lokier wrote:
> Michael S. Tsirkin wrote:
> > > I think the right long term answer to all this is a way to get QEMU to
> > > dump its current machine configuration in glorious detail as a file
> > > which can be reloaded as a machine configuration.
> > 
> > And then we'll have the same set of problems there.
> 
> We will, and the solution will be the same: options to create devices
> as they were in older versions of QEMU.  It only needs to cover device
> features which matter to guests, not every bug fix.
> 
> However with a machine configuration which is generated by QEMU,
> there's less worry about proliferation of obscure options, compared
> with the command line.  You don't necessarily have to document every
> backward-compatibility option in any detail, you just have to make
> sure it's written and read properly, which is much the same thing as
> the snapshot code does.

This is a sensible plan, but I don't think we should mix these compat
options in with the VM manager supplied configuration.

There are two problems with that approach.

= Problem 1 - VM manager needs to parse qemu config =

Your proposal implies:

  - VM manager supplies a basic configuration to qemu

  - It then immediately asks qemu for a dump of the machine 
    configuration in all its glorious detail and retains that
    config

  - If the VM manager wishes to add a new device it needs to parse the 
    qemu config and add it, rather than just generate an entirely new 
    config

= Problem 2 - We can't predict the future =

If a VM manager supplies a configuration which is missing any given
option, qemu cannot tell the difference between:

  - This is a basic config, the VM manager wants whatever the default 
    of the current qemu version

  - This is a complete config dumped using an old version of qemu, the 
    VM manager wants the old default

= Solution - Separate configuration from compat hints =

As I suggested before:

  - Allow the VM manager to dump compat hints; this would be an opaque 
    file format, more like the savevm format than a config file

  - Use defaults where compat hints are not available; e.g. if the VM 
    manager specifies a device config, but no compat hints are 
    supplied for it, then just use default values

  - Make the config override compat hints; e.g. if there are compat 
    hints specified for a device not included in the machine config, 
    just ignore those hints

Cheers,
Mark.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael S. Tsirkin June 12, 2009, 1:59 p.m. UTC | #11
On Fri, Jun 12, 2009 at 09:43:29AM +0100, Mark McLoughlin wrote:
> = Solution - Separate configuration from compat hints =
> 
> As I suggested before:
> 
>   - Allow the VM manager to dump compat hints; this would be an opaque 
>     file format, more like the savevm format than a config file

Why make it "like the savevm" format then?
If they are opaque anyway, compat hints could be part of savevm format.

>   - Use defaults where compat hints are not available; e.g. if the VM 
>     manager specifies a device config, but no compat hints are 
>     supplied for it, then just use default values
> 
>   - Make the config override compat hints; e.g. if there are compat 
>     hints specified for a device not included in the machine config, 
>     just ignore those hints
> 
> Cheers,
> Mark.

If compat hints are opaque and only editable by qemu, we can get into a
situation where one can't create a specific setup with a new qemu.
Mark McLoughlin June 12, 2009, 2:48 p.m. UTC | #12
On Fri, 2009-06-12 at 16:59 +0300, Michael S. Tsirkin wrote:
> On Fri, Jun 12, 2009 at 09:43:29AM +0100, Mark McLoughlin wrote:
> > = Solution - Separate configuration from compat hints =
> > 
> > As I suggested before:
> > 
> >   - Allow the VM manager to dump compat hints; this would be an opaque 
> >     file format, more like the savevm format than a config file
> 
> Why make it "like the savevm" format then?
> If they are opaque anyway, compat hints could be part of savevm format.

So a "savevm --only-compat-hints" command? It might make sense, since we
would want the compat hints with savevm too.

> >   - Use defaults where compat hints are not available; e.g. if the VM 
> >     manager specifies a device config, but no compat hints are 
> >     supplied for it, then just use default values
> > 
> >   - Make the config override compat hints; e.g. if there are compat 
> >     hints specified for a device not included in the machine config, 
> >     just ignore those hints
> > 
> > Cheers,
> > Mark.
> 
> If compat hints are opaque and only editable by qemu, we can get into a
> situation where one can't create a specific setup with a new qemu.

An example?

Cheers,
Mark.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Anthony Liguori June 12, 2009, 2:51 p.m. UTC | #13
Mark McLoughlin wrote:
> On Wed, 2009-06-10 at 20:27 +0100, Jamie Lokier wrote:
>   
>> Michael S. Tsirkin wrote:
>>     
>>>> I think the right long term answer to all this is a way to get QEMU to
>>>> dump it's current machine configuration in glorious detail as a file
>>>> which can be reloaded as a machine configuration.
>>>>         
>>> And then we'll have the same set of problems there.
>>>       
>> We will, and the solution will be the same: options to create devices
>> as they were in older versions of QEMU.  It only needs to cover device
>> features which matter to guests, not every bug fix.
>>
>> However with a machine configuration which is generated by QEMU,
>> there's less worry about proliferation of obscure options, compared
>> with the command line.  You don't necessarily have to document every
>> backward-compatibility option in any detail, you just have to make
>> sure it's written and read properly, which is much the same thing as
>> the snapshot code does.
>>     
>
> This is a sensible plan, but I don't think we should mix these compat
> options in with the VM manager supplied configuration.
>
> There are two problems with that approach.
>
> = Problem 1 - VM manager needs to parse qemu config =
>
> Your proposal implies:
>
>   - VM manager supplies a basic configuration to qemu
>
>   - It then immediately asks qemu for a dump of the machine 
>     configuration in all its glorious detail and retains that
>     config
>
>   - If the VM manager wishes to add a new device it needs to parse the 
>     qemu config and add it, rather than just generate an entirely new 
>     config
>   

What's the problem with parsing the device config and modifying it?  Is 
it just complexity?

If we provided a mechanism to simplify manipulating a device config, 
would that eliminate the concern here?

Regards,

Anthony Liguori


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Anthony Liguori June 12, 2009, 2:55 p.m. UTC | #14
Mark McLoughlin wrote:
> On Wed, 2009-06-10 at 20:27 +0100, Jamie Lokier wrote:
>   
> = Solution - Separate configuration from compat hints =
>
> As I suggested before:
>
>   - Allow the VM manager to dump compat hints; this would be an opaque 
>     file format, more like the savevm format than a config file
>   

How is compat hints different from a device tree?

In my mind, that's what compat hints is.  I don't see another sane way 
to implement it.

Regards,

Anthony Liguori
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Mark McLoughlin June 12, 2009, 3:41 p.m. UTC | #15
On Fri, 2009-06-12 at 09:51 -0500, Anthony Liguori wrote:
> Mark McLoughlin wrote:
> > On Wed, 2009-06-10 at 20:27 +0100, Jamie Lokier wrote:
> >   
> >> Michael S. Tsirkin wrote:
> >>     
> >>>> I think the right long term answer to all this is a way to get QEMU to
> >>>> dump it's current machine configuration in glorious detail as a file
> >>>> which can be reloaded as a machine configuration.
> >>>>         
> >>> And then we'll have the same set of problems there.
> >>>       
> >> We will, and the solution will be the same: options to create devices
> >> as they were in older versions of QEMU.  It only needs to cover device
> >> features which matter to guests, not every bug fix.
> >>
> >> However with a machine configuration which is generated by QEMU,
> >> there's less worry about proliferation of obscure options, compared
> >> with the command line.  You don't necessarily have to document every
> >> backward-compatibility option in any detail, you just have to make
> >> sure it's written and read properly, which is much the same thing as
> >> the snapshot code does.
> >>     
> >
> > This is a sensible plan, but I don't think we should mix these compat
> > options in with the VM manager supplied configuration.
> >
> > There are two problems with that approach.
> >
> > = Problem 1 - VM manager needs to parse qemu config =
> >
> > Your proposal implies:
> >
> >   - VM manager supplies a basic configuration to qemu
> >
> >   - It then immediately asks qemu for a dump of the machine 
> >     configuration in all its glorious detail and retains that
> >     config
> >
> >   - If the VM manager wishes to add a new device it needs to parse the 
> >     qemu config and add it, rather than just generate an entirely new 
> >     config
> >   
> 
> What's the problem with parsing the device config and modifying it?  Is 
> it just complexity?

Yes, complexity is the issue.

> If we provided a mechanism to simplify manipulating a device config, 
> would that eliminate the concern here?

In libvirt's case, a lot of the complexity would come from needing to
figure out what to change.

i.e. libvirt produces a qemu configuration (currently a command line)
from a guest's XML configuration; with this idea, libvirt would probably
compare the old XML config to the new XML config, and then apply those
differences to the qemu configuration.

Cheers,
Mark.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Mark McLoughlin June 12, 2009, 3:53 p.m. UTC | #16
On Fri, 2009-06-12 at 09:55 -0500, Anthony Liguori wrote:
> Mark McLoughlin wrote:
> > On Wed, 2009-06-10 at 20:27 +0100, Jamie Lokier wrote:
> >   
> > = Solution - Separate configuration from compat hints =
> >
> > As I suggested before:
> >
> >   - Allow the VM manager to dump compat hints; this would be an opaque 
> >     file format, more like the savevm format than a config file
> >   
> 
> How is compat hints different from a device tree?
> 
> In my mind, that's what compat hints is.  I don't see another sane way 
> to implement it.

A device tree with a different purpose than a config file.

In its simplest form it could be a device tree with a version number for
each device[1]. 

The other obvious piece to add to it would be PCI addresses, so that
even if you remove a device, the addresses assigned to existing devices
don't change.

Cheers,
Mark.

[1] - Adding such a per-device version number to the config file would
solve problem (2)

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Anthony Liguori June 12, 2009, 4:11 p.m. UTC | #17
Mark McLoughlin wrote:
> On Fri, 2009-06-12 at 09:51 -0500, Anthony Liguori wrote:
>   
>> Mark McLoughlin wrote:
>>     
>>> On Wed, 2009-06-10 at 20:27 +0100, Jamie Lokier wrote:
>>>   
>>>       
>>>> Michael S. Tsirkin wrote:
>>>>     
>>>>         
>>>>>> I think the right long term answer to all this is a way to get QEMU to
>>>>>> dump it's current machine configuration in glorious detail as a file
>>>>>> which can be reloaded as a machine configuration.
>>>>>>         
>>>>>>             
>>>>> And then we'll have the same set of problems there.
>>>>>       
>>>>>           
>>>> We will, and the solution will be the same: options to create devices
>>>> as they were in older versions of QEMU.  It only needs to cover device
>>>> features which matter to guests, not every bug fix.
>>>>
>>>> However with a machine configuration which is generated by QEMU,
>>>> there's less worry about proliferation of obscure options, compared
>>>> with the command line.  You don't necessarily have to document every
>>>> backward-compatibility option in any detail, you just have to make
>>>> sure it's written and read properly, which is much the same thing as
>>>> the snapshot code does.
>>>>     
>>>>         
>>> This is a sensible plan, but I don't think we should mix these compat
>>> options in with the VM manager supplied configuration.
>>>
>>> There are two problems with that approach.
>>>
>>> = Problem 1 - VM manager needs to parse qemu config =
>>>
>>> Your proposal implies:
>>>
>>>   - VM manager supplies a basic configuration to qemu
>>>
>>>   - It then immediately asks qemu for a dump of the machine 
>>>     configuration in all its glorious detail and retains that
>>>     config
>>>
>>>   - If the VM manager wishes to add a new device it needs to parse the 
>>>     qemu config and add it, rather than just generate an entirely new 
>>>     config
>>>   
>>>       
>> What's the problem with parsing the device config and modifying it?  Is 
>> it just complexity?
>>     
>
> Yes, complexity is the issue.
>
>   
>> If we provided a mechanism to simplify manipulating a device config, 
>> would that eliminate the concern here?
>>     
>
> In libvirt's case, a lot of the complexity would come from needing to
> figure out what to change.
>   

Right, libvirt wants to be able to easily say "add a scsi block device 
to this VM".  The way I see this working is that there would be a 
default pc.dtc.  We would still have a -drive file=foo.img,if=scsi 
option that would really just be a wrapper around first searching for an 
existing LSI controller, if one exists, attaching the lun, if not, 
create one, etc.

libvirt could continue to use this sort of interface.  However, as it 
wants to do more advanced things, it may have to dive into the device 
tree itself.

On live migration, QEMU will save a copy of the device tree somewhere 
and libvirt needs to keep track of it.  It can treat it as opaque.  -M 
/path/to/foo.dtc -drive file=foo.img,if=scsi should continue working as 
expected IMHO.

Regards,

Anthony Liguori
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Anthony Liguori June 12, 2009, 4:12 p.m. UTC | #18
Mark McLoughlin wrote:
> On Fri, 2009-06-12 at 09:55 -0500, Anthony Liguori wrote:
>   
>> Mark McLoughlin wrote:
>>     
>>> On Wed, 2009-06-10 at 20:27 +0100, Jamie Lokier wrote:
>>>   
>>> = Solution - Separate configuration from compat hints =
>>>
>>> As I suggested before:
>>>
>>>   - Allow the VM manager to dump compat hints; this would be an opaque 
>>>     file format, more like the savevm format than a config file
>>>   
>>>       
>> How is compat hints different from a device tree?
>>
>> In my mind, that's what compat hints is.  I don't see another sane way 
>> to implement it.
>>     
>
> A device tree with a different purpose than a config file.
>
> In its simplest form it could be a device tree with a version number for
> each device[1]. 
>   

I think the point is that you don't need version numbers if you have a 
proper device tree.  NB the device tree contains no host configuration 
information.

Regards,

Anthony Liguori

> The other obvious piece to add to it would be PCI addresses, so that
> even if you remove a device, the addresses assigned to existing devices
> don't change.
>
> Cheers,
> Mark.
>
> [1] - Adding such a per-device version number to the config file would
> solve problem (2)
>
>   

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Mark McLoughlin June 12, 2009, 4:48 p.m. UTC | #19
On Fri, 2009-06-12 at 11:11 -0500, Anthony Liguori wrote:
> Mark McLoughlin wrote:
> > On Fri, 2009-06-12 at 09:51 -0500, Anthony Liguori wrote:
> >   
> >> Mark McLoughlin wrote:
> >>     
> >>> On Wed, 2009-06-10 at 20:27 +0100, Jamie Lokier wrote:
> >>>   
> >>>       
> >>>> Michael S. Tsirkin wrote:
> >>>>     
> >>>>         
> >>>>>> I think the right long term answer to all this is a way to get QEMU to
> >>>>>> dump it's current machine configuration in glorious detail as a file
> >>>>>> which can be reloaded as a machine configuration.
> >>>>>>         
> >>>>>>             
> >>>>> And then we'll have the same set of problems there.
> >>>>>       
> >>>>>           
> >>>> We will, and the solution will be the same: options to create devices
> >>>> as they were in older versions of QEMU.  It only needs to cover device
> >>>> features which matter to guests, not every bug fix.
> >>>>
> >>>> However with a machine configuration which is generated by QEMU,
> >>>> there's less worry about proliferation of obscure options, compared
> >>>> with the command line.  You don't necessarily have to document every
> >>>> backward-compatibility option in any detail, you just have to make
> >>>> sure it's written and read properly, which is much the same thing as
> >>>> the snapshot code does.
> >>>>     
> >>>>         
> >>> This is a sensible plan, but I don't think we should mix these compat
> >>> options in with the VM manager supplied configuration.
> >>>
> >>> There are two problems with that approach.
> >>>
> >>> = Problem 1 - VM manager needs to parse qemu config =
> >>>
> >>> Your proposal implies:
> >>>
> >>>   - VM manager supplies a basic configuration to qemu
> >>>
> >>>   - It then immediately asks qemu for a dump of the machine 
> >>>     configuration in all its glorious detail and retains that
> >>>     config
> >>>
> >>>   - If the VM manager wishes to add a new device it needs to parse the 
> >>>     qemu config and add it, rather than just generate an entirely new 
> >>>     config
> >>>   
> >>>       
> >> What's the problem with parsing the device config and modifying it?  Is 
> >> it just complexity?
> >>     
> >
> > Yes, complexity is the issue.
> >
> >   
> >> If we provided a mechanism to simplify manipulating a device config, 
> >> would that eliminate the concern here?
> >>     
> >
> > In libvirt's case, a lot of the complexity would come from needing to
> > figure out what to change.
> >   
> 
> Right, libvirt wants to be able to easily say "add a scsi block device 
> to this VM".  The way I see this working is that there would be a 
> default pc.dtc.  We would still have a -drive file=foo.img,if=scsi 
> option that would really just be a wrapper around first searching for an 
> existing LSI controller, if one exists, attaching the lun, if not, 
> create one, etc.
> 
> libvirt could continue to use this sort of interface.  However, as it 
> wants to do more advanced things, it may have to dive into the device 
> tree itself.
> 
> On live migration, QEMU will save a copy of the device tree somewhere 
> and libvirt needs to keep track of it.  It can treat it as opaque.  -M 
> /path/to/foo.dtc -drive file=foo.img,if=scsi should continue working as 
> expected IMHO.

So, when libvirt creates a guest for the first time, it makes a copy of
the device tree and continues to use that even if qemu is upgraded.
That's enough to ensure compat is retained for all built-in devices.

However, in order to retain compat for that SCSI device (e.g. ensuring
the PCI address doesn't change as other devices are added and removed),
we're back to the same problem ... either:

  1) Use '-drive file=foo.img,if=scsi,pci_addr=foo'; in order to figure 
     out what address to use, libvirt would need to query qemu for what 
     address was originally allocated to device or it would do all the 
     PCI address allocation itself ... or:

  2) Don't use the command line, instead get a dump of the entire 
     device tree (including the SCSI device) - if the device is to be 
     removed or modified in future, libvirt would need to modify the 
     device tree

The basic problem would be that the command line config would have very
limited ability to override the device tree config.

Cheers,
Mark.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Mark McLoughlin June 12, 2009, 4:48 p.m. UTC | #20
On Fri, 2009-06-12 at 11:12 -0500, Anthony Liguori wrote:
> Mark McLoughlin wrote:
> > On Fri, 2009-06-12 at 09:55 -0500, Anthony Liguori wrote:
> >   
> >> Mark McLoughlin wrote:
> >>     
> >>> On Wed, 2009-06-10 at 20:27 +0100, Jamie Lokier wrote:
> >>>   
> >>> = Solution - Separate configuration from compat hints =
> >>>
> >>> As I suggested before:
> >>>
> >>>   - Allow the VM manager to dump compat hints; this would be an opaque 
> >>>     file format, more like the savevm format than a config file
> >>>   
> >>>       
> >> How is compat hints different from a device tree?
> >>
> >> In my mind, that's what compat hints is.  I don't see another sane way 
> >> to implement it.
> >>     
> >
> > A device tree with a different purpose than a config file.
> >
> > In its simplest form it could be a device tree with a version number for
> > each device[1]. 
> >   
> 
> I think the point is that you don't need version numbers if you have a 
> proper device tree.

How do you add a new attribute to the device tree and, when supplied a
device tree lacking said attribute, distinguish between a device tree
from an old version of qemu (i.e. use the old default) and a partial
device tree from the VM manager (i.e. use the new default) ?

> NB the device tree contains no host configuration information.

So, it wouldn't e.g. include the path to the image file for a block
device? That would always be specified on the command line?

Cheers,
Mark.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Anthony Liguori June 12, 2009, 5 p.m. UTC | #21
Mark McLoughlin wrote:
> So, when libvirt creates a guest for the first time, it makes a copy of
> the device tree and continues to use that even if qemu is upgraded.
> That's enough to ensure compat is retained for all built-in devices.
>
> However, in order to retain compat for that SCSI device (e.g. ensuring
> the PCI address doesn't change as other devices are added an removed),
> we're back to the same problem ... either:
>
>   1) Use '-drive file=foo.img,if=scsi,pci_addr=foo'; in order to figure 
>      out what address to use, libvirt would need to query qemu for what 
>      address was originally allocated to device or it would do all the 
>      PCI address allocation itself ... or:
>
>   2) Don't use the command line, instead get a dump of the entire 
>      device tree (including the SCSI device) - if the device is to be 
>      removed or modified in future, libvirt would need to modify the 
>      device tree
>
> The basic problem would be that the command line config would have very
> limited ability to override the device tree config.
>   

After libvirt has done -drive file=foo... it should dump the machine 
config and use that from then on.

To combined to a single thread...
> How do you add a new attribute to the device tree and, when a supplied
> device tree lacking said attribute, distinguish between a device tree
> from an old version of qemu (i.e. use the old default) and a partial
> device tree from the VM manager (i.e. use the new default) ?
>   

Please define "attribute".  I don't follow what you're asking.

>> NB the device tree contains no host configuration information.
>>     
>
> So, it wouldn't e.g. include the path to the image file for a block
> device? That would always be specified on the command line?
>   

No, the IDE definition would contain some sort of symbolic node name.  A 
separate mechanism (either command line or host config file) would then 
link a image file to the symbolic name.

libvirt should really never worry about the machine config file for 
normal things unless it needs to change what devices are exposed to a guest.

Regards,

Anthony Liguori
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Mark McLoughlin June 12, 2009, 5:31 p.m. UTC | #22
On Fri, 2009-06-12 at 12:00 -0500, Anthony Liguori wrote:
> Mark McLoughlin wrote:
> > So, when libvirt creates a guest for the first time, it makes a copy of
> > the device tree and continues to use that even if qemu is upgraded.
> > That's enough to ensure compat is retained for all built-in devices.
> >
> > However, in order to retain compat for that SCSI device (e.g. ensuring
> > the PCI address doesn't change as other devices are added an removed),
> > we're back to the same problem ... either:
> >
> >   1) Use '-drive file=foo.img,if=scsi,pci_addr=foo'; in order to figure 
> >      out what address to use, libvirt would need to query qemu for what 
> >      address was originally allocated to device or it would do all the 
> >      PCI address allocation itself ... or:
> >
> >   2) Don't use the command line, instead get a dump of the entire 
> >      device tree (including the SCSI device) - if the device is to be 
> >      removed or modified in future, libvirt would need to modify the 
> >      device tree
> >
> > The basic problem would be that the command line config would have very
> > limited ability to override the device tree config.
> >   
> 
> After libvirt has done -drive file=foo... it should dump the machine 
> config and use that from then on.

Right - libvirt then wouldn't be able to avoid the complexity of merging
any future changes into the dumped machine config.

> To combined to a single thread...
> > How do you add a new attribute to the device tree and, when a supplied
> > device tree lacking said attribute, distinguish between a device tree
> > from an old version of qemu (i.e. use the old default) and a partial
> > device tree from the VM manager (i.e. use the new default) ?
> >   
> 
> Please define "attribute".  I don't follow what you're asking.

e.g. a per-device "enable MSI support" flag.

If qemu is supplied with a device tree that lacks that flag, does it
enable or disable MSI?

Enable by default is bad - it could be a device tree dumped from an old
version of qemu, so compat would be broken.

Disable by default is bad - it could be a simple device tree supplied by
the user, and the latest features are wanted.

Maybe we want a per-device "this is a complete device description" flag
and if anything is missing from a supposedly complete description, the
old defaults would be used. A config dumped from qemu would have this
flag set, a config generated by libvirt would not have the flag.

> >> NB the device tree contains no host configuration information.
> >>     
> >
> > So, it wouldn't e.g. include the path to the image file for a block
> > device? That would always be specified on the command line?
> >   
> 
> No, the IDE definition would contain some sort of symbolic node name.  A 
> separate mechanism (either command line or host config file) would then 
> link a image file to the symbolic name.

Okay.

> libvirt should really never worry about the machine config file for 
> normal things unless it needs to change what devices are exposed to a guest.

But changing devices *is* normal ... e.g. removing a block device.

Writing out a device tree is not a problem for libvirt (or any other
management tools); the real complexity would lie in the need to merge
changes into an existing device tree.

Cheers,
Mark.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Blue Swirl June 12, 2009, 5:44 p.m. UTC | #23
On 6/12/09, Mark McLoughlin <markmc@redhat.com> wrote:
> On Fri, 2009-06-12 at 12:00 -0500, Anthony Liguori wrote:
>  > Mark McLoughlin wrote:
>  > > So, when libvirt creates a guest for the first time, it makes a copy of
>  > > the device tree and continues to use that even if qemu is upgraded.
>  > > That's enough to ensure compat is retained for all built-in devices.
>  > >
>  > > However, in order to retain compat for that SCSI device (e.g. ensuring
>  > > the PCI address doesn't change as other devices are added an removed),
>  > > we're back to the same problem ... either:
>  > >
>  > >   1) Use '-drive file=foo.img,if=scsi,pci_addr=foo'; in order to figure
>  > >      out what address to use, libvirt would need to query qemu for what
>  > >      address was originally allocated to device or it would do all the
>  > >      PCI address allocation itself ... or:
>  > >
>  > >   2) Don't use the command line, instead get a dump of the entire
>  > >      device tree (including the SCSI device) - if the device is to be
>  > >      removed or modified in future, libvirt would need to modify the
>  > >      device tree
>  > >
>  > > The basic problem would be that the command line config would have very
>  > > limited ability to override the device tree config.
>  > >
>  >
>  > After libvirt has done -drive file=foo... it should dump the machine
>  > config and use that from then on.
>
>  Right - libvirt then wouldn't be able to avoid the complexity of merging
>  any future changes into the dumped machine config.
>
>  > To combined to a single thread...
>  > > How do you add a new attribute to the device tree and, when a supplied
>  > > device tree lacking said attribute, distinguish between a device tree
>  > > from an old version of qemu (i.e. use the old default) and a partial
>  > > device tree from the VM manager (i.e. use the new default) ?
>  > >
>  >
>  > Please define "attribute".  I don't follow what you're asking.
>
>  e.g. a per-device "enable MSI support" flag.
>
>  If qemu is supplied with a device tree that lacks that flag, does it
>  enable or disable MSI?
>
>  Enable by default is bad - it could be a device tree dumped from an old
>  version of qemu, so compat would be broken.
>
>  Disable by default is bad - it could be a simple device tree supplied by
>  the user, and the latest features are wanted.
>
>  Maybe we want a per-device "this is a complete device description" flag
>  and if anything is missing from a supposedly complete description, the
>  old defaults would be used. A config dumped from qemu would have this
>  flag set, a config generated by libvirt would not have the flag.

If the device has different behavior or different properties from
guest perspective compared to the old device, it should get a new
device type so that you could specify in the device tree either the
old device or the new one. Flags won't help in the long term.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Mark McLoughlin June 12, 2009, 5:55 p.m. UTC | #24
On Fri, 2009-06-12 at 20:44 +0300, Blue Swirl wrote:

> If the device has different behavior or different properties from
> guest perspective compared to the old device, it should get a new
> device type so that you could specify in the device tree either the
> old device or the new one.

Yes, that works - it's analogous to a device (type, version) pair.

Cheers,
Mark.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity June 14, 2009, 7:55 a.m. UTC | #25
Anthony Liguori wrote:
>
> What's the problem with parsing the device config and modifying it?  
> Is it just complexity?

Two-way modification.  Management wants to store the configuration in 
their database and tell the hypervisor what the machine looks like.  If 
qemu also tells management what the machine looks like, we can easily 
get conflicts.
Avi Kivity June 14, 2009, 7:58 a.m. UTC | #26
Mark McLoughlin wrote:

  

>> I think the point is that you don't need version numbers if you have a 
>> proper device tree.
>>     
>
> How do you add a new attribute to the device tree and, when a supplied
> device tree lacking said attribute, distinguish between a device tree
> from an old version of qemu (i.e. use the old default) and a partial
> device tree from the VM manager (i.e. use the new default) ?
>   

-baseline 0.10

>   
>> NB the device tree contains no host configuration information.
>>     
>
> So, it wouldn't e.g. include the path to the image file for a block
> device? That would always be specified on the command line?
>   

Or in a different file.  I agree splitting host and guest configuration 
is a must-have, this ensures portability of virtual machines across 
hosts and time.
Michael S. Tsirkin June 14, 2009, 9:34 a.m. UTC | #27
On Fri, Jun 12, 2009 at 04:53:27PM +0100, Mark McLoughlin wrote:
> On Fri, 2009-06-12 at 09:55 -0500, Anthony Liguori wrote:
> > Mark McLoughlin wrote:
> > > On Wed, 2009-06-10 at 20:27 +0100, Jamie Lokier wrote:
> > >   
> > > = Solution - Separate configuration from compat hints =
> > >
> > > As I suggested before:
> > >
> > >   - Allow the VM manager to dump compat hints; this would be an opaque 
> > >     file format, more like the savevm format than a config file
> > >   
> > 
> > How is compat hints different from a device tree?
> > 
> > In my mind, that's what compat hints is.  I don't see another sane way 
> > to implement it.
> 
> A device tree with a different purpose than a config file.
> 
> In its simplest form it could be a device tree with a version number for
> each device[1]. 
> 
> The other obvious piece to add to it would be PCI addresses, so that
> even if you remove a device, the addresses assigned to existing devices
> don't change.

Could you clarify this requirement please?

If we want to remove a device from under a running guest, you need
hotplug. So we can't just remove several lines from the config and hope
that it'll work simply because the PCI address is stable.

OTOH, if you reboot the guest, it's ok for addresses to change.


> Cheers,
> Mark.
> 
> [1] - Adding such a per-device version number to the config file would
> solve problem (2)
> 
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity June 14, 2009, 9:37 a.m. UTC | #28
Michael S. Tsirkin wrote:
>
> If we want to remove a device from under a running guest, you need
> hotplug. So we can't just remove several lines from the config and hope
> that it'll work simply because the PCI address is stable.
>   

Why not?

> OTOH, if you reboot the guest, it's ok for addresses to change.
>   

No, it's not.  Some guests depend on addressing for their configuration 
(for example older Linux guests will swap eth0/eth1 if you swap their 
slots).
Michael S. Tsirkin June 14, 2009, 9:47 a.m. UTC | #29
On Sun, Jun 14, 2009 at 12:37:13PM +0300, Avi Kivity wrote:
> Michael S. Tsirkin wrote:
>>
>> If we want to remove a device from under a running guest, you need
>> hotplug. So we can't just remove several lines from the config and hope
>> that it'll work simply because the PCI address is stable.
>>   
>
> Why not?

E.g. configuration cycles address a specific bus/slot.
You need cooperation from guest if you want to move
a device.

>> OTOH, if you reboot the guest, it's ok for addresses to change.
>>   
>
> No, it's not.  Some guests depend on addressing for their configuration  
> (for example older Linux guests will swap eth0/eth1 if you swap their  
> slots).

Ah, I misunderstood what's meant by the address. I agree that it's
useful to be able to control device's placement on the bus.
Michael S. Tsirkin June 14, 2009, 9:50 a.m. UTC | #30
On Fri, Jun 12, 2009 at 05:48:23PM +0100, Mark McLoughlin wrote:
> However, in order to retain compat for that SCSI device (e.g. ensuring
> the PCI address doesn't change as other devices are added an removed),
> we're back to the same problem ... either:
> 
>   1) Use '-drive file=foo.img,if=scsi,pci_addr=foo'; in order to figure 
>      out what address to use, libvirt would need to query qemu for what 
>      address was originally allocated to device or it would do all the 
>      PCI address allocation itself ... 

This last option makes sense to me: in a real world the user has
control over where he places the device on the bus, so why
not with qemu?
Markus Armbruster June 15, 2009, 5:32 a.m. UTC | #31
Avi Kivity <avi@redhat.com> writes:

> Mark McLoughlin wrote:
[...]
>>> NB the device tree contains no host configuration information.
>>>     
>>
>> So, it wouldn't e.g. include the path to the image file for a block
>> device? That would always be specified on the command line?
>>   
>
> Or in a different file.  I agree splitting host and guest
> configuration is a must-have, this ensures portability of virtual
> machines across hosts and time.

Splitting into two separate sections should suffice, they could live in
the same file for convenience.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Mark McLoughlin June 15, 2009, 9:02 a.m. UTC | #32
On Sun, 2009-06-14 at 12:34 +0300, Michael S. Tsirkin wrote:
> On Fri, Jun 12, 2009 at 04:53:27PM +0100, Mark McLoughlin wrote:
> > The other obvious piece to add to it would be PCI addresses, so that
> > even if you remove a device, the addresses assigned to existing devices
> > don't change.
> 
> Could you clarify this requirement please?

Avi clarified, but I've written it up here too:

  https://fedoraproject.org/wiki/Features/KVM_Stable_PCI_Addresses

Cheers,
Mark.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Mark McLoughlin June 15, 2009, 9:08 a.m. UTC | #33
On Sun, 2009-06-14 at 12:50 +0300, Michael S. Tsirkin wrote:
> On Fri, Jun 12, 2009 at 05:48:23PM +0100, Mark McLoughlin wrote:
> > However, in order to retain compat for that SCSI device (e.g. ensuring
> > the PCI address doesn't change as other devices are added an removed),
> > we're back to the same problem ... either:
> > 
> >   1) Use '-drive file=foo.img,if=scsi,pci_addr=foo'; in order to figure 
> >      out what address to use, libvirt would need to query qemu for what 
> >      address was originally allocated to device or it would do all the 
> >      PCI address allocation itself ... 
> 
> This last option makes sense to me: in a real world the user has
> control over where he places the device on the bus, so why
> not with qemu?

Yep, most people seem to agree that it makes sense to allow this, but
some believe it should only be via a machine description file, not the
command line.

However, the first problem is that it isn't a solution to the guest ABI
problem more generally.

And the second problem is that for e.g. libvirt to use it, it would have
to be possible to query qemu for what PCI slots were assigned to the
devices - libvirt would need to be able to parse 'info pci' and match
the devices listed with the devices specified on the command line.

Again, details written up here:

  https://fedoraproject.org/wiki/Features/KVM_Stable_PCI_Addresses

Cheers,
Mark.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Mark McLoughlin June 15, 2009, 9:09 a.m. UTC | #34
On Sun, 2009-06-14 at 10:58 +0300, Avi Kivity wrote:
> Mark McLoughlin wrote:
> 
>   
> 
> >> I think the point is that you don't need version numbers if you have a 
> >> proper device tree.
> >>     
> >
> > How do you add a new attribute to the device tree and, when a supplied
> > device tree lacking said attribute, distinguish between a device tree
> > from an old version of qemu (i.e. use the old default) and a partial
> > device tree from the VM manager (i.e. use the new default) ?
> >   
> 
> -baseline 0.10

That's a version number :-)

(I was responding to Anthony's "you don't need a version number")

Cheers,
Mark.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity June 15, 2009, 9:27 a.m. UTC | #35
On 06/15/2009 12:08 PM, Mark McLoughlin wrote:
>> This last option makes sense to me: in a real world the user has
>> control over where he places the device on the bus, so why
>> not with qemu?
>>      
>
> Yep, most people seem to agree that it makes sense to allow this, but
> some believe it should only be via a machine description file, not the
> command line.
>    

I don't understand this opposition.  It's clear a machine config file is 
a long way in our future.  It's also clear lack of stable PCI addresses 
hurts us now.

> However, the first problem is that it isn't a solution to the guest ABI
> problem more generally.
>    

pci_addr was never meant to bring world peace, just stable PCI 
addresses.  The other issues should be addressed separately.

> And the second problem is that for e.g. libvirt to use it, it would have
> to be possible to query qemu for what PCI slots were assigned to the
> devices - libvirt would need to be able to parse 'info pci' and match
> the devices listed with the devices specified on the command line.
>    

If all devices (including vga, ide) are set up with pci_addr, then this 
is unneeded.  You do need to export available slot numbers from qemu.
Avi Kivity June 15, 2009, 9:38 a.m. UTC | #36
On 06/14/2009 12:47 PM, Michael S. Tsirkin wrote:
> Michael S. Tsirkin wrote:
>>> If we want to remove a device from under a running guest, you need
>>> hotplug. So we can't just remove several lines from the config and hope
>>> that it'll work simply because the PCI address is stable.
>>>
>>>        
>> Why not?
>>      
>
> E.g. configuration cycles address a specific bus/slot.
> You need cooperation from guest if you want to move
> a device.
>    

By "remove several lines from the config" I understood the guest needs 
to be restarted.  Of course if you don't restart the guest you need true 
hotplug.
Avi Kivity June 15, 2009, 9:43 a.m. UTC | #37
On 06/14/2009 12:50 PM, Michael S. Tsirkin wrote:
> On Fri, Jun 12, 2009 at 05:48:23PM +0100, Mark McLoughlin wrote:
>    
>> However, in order to retain compat for that SCSI device (e.g. ensuring
>> the PCI address doesn't change as other devices are added an removed),
>> we're back to the same problem ... either:
>>
>>    1) Use '-drive file=foo.img,if=scsi,pci_addr=foo'; in order to figure
>>       out what address to use, libvirt would need to query qemu for what
>>       address was originally allocated to device or it would do all the
>>       PCI address allocation itself ...
>>      
>
> This last option makes sense to me: in a real world the user has
> control over where he places the device on the bus, so why
> not with qemu?
>    

Yes, the user builds the machine using the command line and monitor (or, 
in 2017, the machine configuration file), then turns on the power.  
Command line options are the parts lying around when we start.

btw, -drive needs to be separated:

   -controller type=lsi1234,pci_addr=foobar,name=blah
   -drive file=foo.img,controller=blah,index=0
   -drive file=bar.img,controller=blah,index=1

Drives do not have pci addresses.
Michael S. Tsirkin June 15, 2009, 10:29 a.m. UTC | #38
On Mon, Jun 15, 2009 at 12:43:48PM +0300, Avi Kivity wrote:
> On 06/14/2009 12:50 PM, Michael S. Tsirkin wrote:
>> On Fri, Jun 12, 2009 at 05:48:23PM +0100, Mark McLoughlin wrote:
>>    
>>> However, in order to retain compat for that SCSI device (e.g. ensuring
>>> the PCI address doesn't change as other devices are added an removed),
>>> we're back to the same problem ... either:
>>>
>>>    1) Use '-drive file=foo.img,if=scsi,pci_addr=foo'; in order to figure
>>>       out what address to use, libvirt would need to query qemu for what
>>>       address was originally allocated to device or it would do all the
>>>       PCI address allocation itself ...
>>>      
>>
>> This last option makes sense to me: in a real world the user has
>> control over where he places the device on the bus, so why
>> not with qemu?
>>    
>
> Yes, the user build the machine using the command line and monitor (or,  
> in 2017, the machine configuration file), then turns on the power.   
> Command line options are the parts lying around when we start.
>
> btw, -drive needs to be separated:
>
>   -controller type=lsi1234,pci_addr=foobar,name=blah
>   -drive file=foo.img,controller=blah,index=0
>   -drive file=bar.img,controller=blah,index=1
>
> Drives to not have pci addresses.

Maybe we need a generic 'bus options' flag.



--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael S. Tsirkin June 15, 2009, 10:32 a.m. UTC | #39
On Mon, Jun 15, 2009 at 12:27:08PM +0300, Avi Kivity wrote:
> On 06/15/2009 12:08 PM, Mark McLoughlin wrote:
>>> This last option makes sense to me: in a real world the user has
>>> control over where he places the device on the bus, so why
>>> not with qemu?
>>>      
>>
>> Yep, most people seem to agree that it makes sense to allow this, but
>> some believe it should only be via a machine description file, not the
>> command line.
>>    
>
> I don't understand this opposition.  It's clear a machine config file is  
> a long way in our future.  It's also clear lack of stable PCI addresses  
> hurts us now.
>
>> However, the first problem is that it isn't a solution to the guest ABI
>> problem more generally.
>>    
>
> pci_addr was never meant to bring world peace, just stable PCI  
> addresses.  The other issues should be addressed separately.
>
>> And the second problem is that for e.g. libvirt to use it, it would have
>> to be possible to query qemu for what PCI slots were assigned to the
>> devices - libvirt would need to be able to parse 'info pci' and match
>> the devices listed with the devices specified on the command line.
>>    
>
> If all devices (including vga, ide) are set up with pci_addr, then this  
> is unneeded.

Right. I think it could be an all or nothing at all approach.

> You do need to export available slot numbers from qemu.

Why would a slot be unavailable?

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gleb Natapov June 15, 2009, 10:44 a.m. UTC | #40
On Mon, Jun 15, 2009 at 01:32:49PM +0300, Michael S. Tsirkin wrote:
> > You do need to export available slot numbers from qemu.
> 
> Why would a slot be unavailable?
> 
Because it does not exist?

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael S. Tsirkin June 15, 2009, 10:46 a.m. UTC | #41
On Mon, Jun 15, 2009 at 01:44:56PM +0300, Gleb Natapov wrote:
> On Mon, Jun 15, 2009 at 01:32:49PM +0300, Michael S. Tsirkin wrote:
> > > You do need to export available slot numbers from qemu.
> > 
> > Why would a slot be unavailable?
> > 
> Because it does not exist?

We can create a slot with any number, can't we?

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gleb Natapov June 15, 2009, 10:52 a.m. UTC | #42
On Mon, Jun 15, 2009 at 01:46:53PM +0300, Michael S. Tsirkin wrote:
> On Mon, Jun 15, 2009 at 01:44:56PM +0300, Gleb Natapov wrote:
> > On Mon, Jun 15, 2009 at 01:32:49PM +0300, Michael S. Tsirkin wrote:
> > > > You do need to export available slot numbers from qemu.
> > > 
> > > Why would a slot be unavailable?
> > > 
> > Because it does not exist?
> 
> We can create a slot with any number, can't we?
What do you mean? If the mobo has 4 slots you can't create fifth.
KVM describes 32 slots in the BIOS.

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael S. Tsirkin June 15, 2009, 11:07 a.m. UTC | #43
On Mon, Jun 15, 2009 at 01:52:13PM +0300, Gleb Natapov wrote:
> On Mon, Jun 15, 2009 at 01:46:53PM +0300, Michael S. Tsirkin wrote:
> > On Mon, Jun 15, 2009 at 01:44:56PM +0300, Gleb Natapov wrote:
> > > On Mon, Jun 15, 2009 at 01:32:49PM +0300, Michael S. Tsirkin wrote:
> > > > > You do need to export available slot numbers from qemu.
> > > > 
> > > > Why would a slot be unavailable?
> > > > 
> > > Because it does not exist?
> > 
> > We can create a slot with any number, can't we?
> What do you mean? If the mobo has 4 slots you can't create fifth.
> KVM describes 32 slots in the BIOS.

Do you mean the KVM kernel module here? I don't know much about the
BIOS. Can't qemu control the number of slots declared?
Gleb Natapov June 15, 2009, 11:14 a.m. UTC | #44
On Mon, Jun 15, 2009 at 02:07:53PM +0300, Michael S. Tsirkin wrote:
> On Mon, Jun 15, 2009 at 01:52:13PM +0300, Gleb Natapov wrote:
> > On Mon, Jun 15, 2009 at 01:46:53PM +0300, Michael S. Tsirkin wrote:
> > > On Mon, Jun 15, 2009 at 01:44:56PM +0300, Gleb Natapov wrote:
> > > > On Mon, Jun 15, 2009 at 01:32:49PM +0300, Michael S. Tsirkin wrote:
> > > > > > You do need to export available slot numbers from qemu.
> > > > > 
> > > > > Why would a slot be unavailable?
> > > > > 
> > > > Because it does not exist?
> > > 
> > > We can create a slot with any number, can't we?
> > What do you mean? If the mobo has 4 slots you can't create fifth.
> > KVM describes 32 slots in the BIOS.
> 
> Do you mean the KVM kernel module here? I don't know much about the
No I don't mean KVM kernel module here.

> BIOS. Can't qemu control the number of slots declared?
> 
Qemu represents HW, BIOS drives this HW. They should be in sync on such
important issues like pci slot configuration. Even if QEMU can control
the number of slots declared (which it can't easily do), it will be able
to do it only on startup (before BIOS runs). The way to have dynamic
number of slots may be pci bridge emulation. Not sure what is needed
from BIOS for that.

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity June 15, 2009, 11:27 a.m. UTC | #45
On 06/15/2009 01:32 PM, Michael S. Tsirkin wrote:
>> You do need to export available slot numbers from qemu.
>>      
>
> Why would a slot be unavailable?
>    

A slot needs to be configured in ACPI, and not be taken by onboard chips 
(piix takes slot 0, for example).
Avi Kivity June 15, 2009, 11:32 a.m. UTC | #46
On 06/15/2009 12:09 PM, Mark McLoughlin wrote:
>>>> I think the point is that you don't need version numbers if you have a
>>>> proper device tree.
>>>>
>>>>          
>>> How do you add a new attribute to the device tree and, when a supplied
>>> device tree lacking said attribute, distinguish between a device tree
>>> from an old version of qemu (i.e. use the old default) and a partial
>>> device tree from the VM manager (i.e. use the new default) ?
>>>
>>>        
>> -baseline 0.10
>>      
>
> That's a version number :-)
>
> (I was responding to Anthony's "you don't need a version number")
>    

If you want to prevent incompatibilities, you need to make everything 
new (potentially including bugfixes) non-default.  Eventually the 
default configuration becomes increasingly unusable and you need a new 
baseline.  You must still be able to fall back to the old baseline for 
older guests.  I don't think games with configuration files can hide that.
Michael S. Tsirkin June 15, 2009, 11:34 a.m. UTC | #47
On Mon, Jun 15, 2009 at 02:14:15PM +0300, Gleb Natapov wrote:
> On Mon, Jun 15, 2009 at 02:07:53PM +0300, Michael S. Tsirkin wrote:
> > On Mon, Jun 15, 2009 at 01:52:13PM +0300, Gleb Natapov wrote:
> > > On Mon, Jun 15, 2009 at 01:46:53PM +0300, Michael S. Tsirkin wrote:
> > > > On Mon, Jun 15, 2009 at 01:44:56PM +0300, Gleb Natapov wrote:
> > > > > On Mon, Jun 15, 2009 at 01:32:49PM +0300, Michael S. Tsirkin wrote:
> > > > > > > You do need to export available slot numbers from qemu.
> > > > > > 
> > > > > > Why would a slot be unavailable?
> > > > > > 
> > > > > Because it does not exist?
> > > > 
> > > > We can create a slot with any number, can't we?
> > > What do you mean? If the mobo has 4 slots you can't create fifth.
> > > KVM describes 32 slots in the BIOS.
> > 
> > Do you mean the KVM kernel module here? I don't know much about the
> No I don't mean KVM kernel module here.
> 
> > BIOS. Can't qemu control the number of slots declared?
> > 
> Qemu represents HW, BIOS drives this HW. They should be in sync on such
> important issues like pci slot configuration.

As a simple solution, let's stick to 32 slots per bus. That's the
maximum that the PCI spec allows, anyway.

> Even if QEMU can control the number of slots declared (which it can't
> easily do), it will be able to do it only on startup (before BIOS
> runs).

That's OK - this is when the machine description is read.

> The way to have a dynamic
> number of slots may be pci bridge emulation. Not sure what is needed
> from the BIOS for that.

Since bridge can be hot-plugged, probably nothing?
But we don't necessarily need dynamic number of slots IMO.

> 
> --
> 			Gleb.
Markus Armbruster June 15, 2009, 11:35 a.m. UTC | #48
Avi Kivity <avi@redhat.com> writes:

> On 06/15/2009 12:08 PM, Mark McLoughlin wrote:
>>> This last option makes sense to me: in a real world the user has
>>> control over where he places the device on the bus, so why
>>> not with qemu?
>>>      
>>
>> Yep, most people seem to agree that it makes sense to allow this, but
>> some believe it should only be via a machine description file, not the
>> command line.
>>    
>
> I don't understand this opposition.  It's clear a machine config file
> is a long way in our future.  It's also clear lack of stable PCI
> addresses hurts us now.

Correct.

>> However, the first problem is that it isn't a solution to the guest ABI
>> problem more generally.
>>    
>
> pci_addr was never meant to bring world peace, just stable PCI
> addresses.  The other issues should be addressed separately.
>
>> And the second problem is that for e.g. libvirt to use it, it would have
>> to be possible to query qemu for what PCI slots were assigned to the
>> devices - libvirt would need to be able to parse 'info pci' and match
>> the devices listed with the devices specified on the command line.
>>    
>
> If all devices (including vga, ide) are set up with pci_addr, then
> this is unneeded.  You do need to export available slot numbers from
> qemu.

Not really.  QEMU gives just the host bridge a fixed slot[*].  All the
other slots are available.

The real problem is devices that get implicitly added, like the SCSI
controller.  Those devices get their slots auto-assigned, which can
interfere with slot numbers chosen by the user.  We need a way to avoid
that, as you suggested elsewhere in this thread.


[*] There's an exception or two for oddball targets.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity June 15, 2009, 11:43 a.m. UTC | #49
(adding cc)

On 06/15/2009 02:35 PM, Markus Armbruster wrote:
> Not really.  QEMU gives just the host bridge a fixed slot[*].  All the
> other slots are available.
>    

qemu needs to export these two bits of information: the first free slot 
and the number of slots.

More generally, which slots are open.  We can assume 1:31, but that's 
unlovely.

> The real problem is devices that get implicitly added, like the SCSI
> controller.  Those devices get their slots auto-assigned, which can
> interfere with slot numbers chosen by the user.  We need a way to avoid
> that, as you suggested elsewhere in this thread.
>    

Paul/Anthony, can we have -vga pci_addr=, -usb-controller pci_addr=, and 
-drive pci_addr= (and later, -disk-controller)?  Stalling while waiting 
for the ultimate config file is only generating pain and out-of-tree 
patches.

(I'd be quite happy constructing the entire machine config on the 
command line, but I realize it's just me)
Michael S. Tsirkin June 15, 2009, 11:48 a.m. UTC | #50
On Mon, Jun 15, 2009 at 02:27:14PM +0300, Avi Kivity wrote:
> On 06/15/2009 01:32 PM, Michael S. Tsirkin wrote:
>>> You do need to export available slot numbers from qemu.
>>>      
>>
>> Why would a slot be unavailable?
>>    
>
> A slot needs to be configured in ACPI,

Can we configure all possible 32 slots?

> and not be taken by onboard chips  
> (piix takes slot 0, for example).

piix is the root complex, isn't it? Are there other examples? If not,
we could teach management about the root complex being special ...


> -- 
> error compiling committee.c: too many arguments to function
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity June 15, 2009, 11:56 a.m. UTC | #51
On 06/15/2009 02:48 PM, Michael S. Tsirkin wrote:
>> A slot needs to be configured in ACPI,
>>      
>
> Can we configure all possible 32 slots?
>    

That's what we do.  But one is always taken.  In the future, perhaps more.

>> and not be taken by onboard chips
>> (piix takes slot 0, for example).
>>      
>
> piix is the root complex, isn't it? Are there other examples? If not,
> we could teach management about the root complex being special ...
>    

We should just tell the user which slots are open.
Stefano Stabellini June 15, 2009, 11:59 a.m. UTC | #52
Avi Kivity wrote:

> (I'd be quite happy constructing the entire machine config on the 
> command line, but I realize it's just me)
> 


It is not just you.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Anthony Liguori June 15, 2009, 12:41 p.m. UTC | #53
Avi Kivity wrote:
> On 06/15/2009 12:08 PM, Mark McLoughlin wrote:
>>> This last option makes sense to me: in a real world the user has
>>> control over where he places the device on the bus, so why
>>> not with qemu?
>>>      
>>
>> Yep, most people seem to agree that it makes sense to allow this, but
>> some believe it should only be via a machine description file, not the
>> command line.
>>    
>
> I don't understand this opposition.  It's clear a machine config file 
> is a long way in our future.  It's also clear lack of stable PCI 
> addresses hurts us now.

Is there opposition?  I don't ever recall seeing a patch...

I think it's a perfectly fine idea.

Regards,

Anthony Liguori
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Markus Armbruster June 15, 2009, 12:41 p.m. UTC | #54
Avi Kivity <avi@redhat.com> writes:

> (adding cc)
>
> On 06/15/2009 02:35 PM, Markus Armbruster wrote:
>> Not really.  QEMU gives just the host bridge a fixed slot[*].  All the
>> other slots are available.
>>    
>
> qemu needs to export these two bits of information: the first free
> slot and the number of slots.
>
> More generally, which slots are open.  We can assume 1:31, but that's
> unlovely.

Point.

>> The real problem is devices that get implicitly added, like the SCSI
>> controller.  Those devices get their slots auto-assigned, which can
>> interfere with slot numbers chosen by the user.  We need a way to avoid
>> that, as you suggested elsewhere in this thread.
>>    
>
> Paul/Anthony, can we have -vga pci_addr=, -usb-controller pci_addr=,
> and -drive pci_addr= (and later, -disk-controller)?  Stalling while
> waiting for the ultimate config file is only generating pain and
> out-of-tree patches.

Yup.

I got bit-rotten patches for pci_addr=, and I can unrot them if they're
wanted.

> (I'd be quite happy constructing the entire machine config on the
> command line, but I realize it's just me)

Ha, .bash_history as config file...
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael S. Tsirkin June 15, 2009, 12:41 p.m. UTC | #55
On Mon, Jun 15, 2009 at 02:56:42PM +0300, Avi Kivity wrote:
> On 06/15/2009 02:48 PM, Michael S. Tsirkin wrote:
>>> A slot needs to be configured in ACPI,
>>>      
>>
>> Can we configure all possible 32 slots?
>>    
>
> That's what we do.  But one is always taken.  In the future, perhaps more.
>
>>> and not be taken by onboard chips
>>> (piix takes slot 0, for example).
>>>      
>>
>> piix is the root complex, isn't it? Are there other examples? If not,
>> we could teach management about the root complex being special ...
>>    
>
> We should just tell the user which slots are open.

This might be tricky if the config is passed in with the command line
flags.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Anthony Liguori June 15, 2009, 12:45 p.m. UTC | #56
Avi Kivity wrote:
> On 06/14/2009 12:50 PM, Michael S. Tsirkin wrote:
>> On Fri, Jun 12, 2009 at 05:48:23PM +0100, Mark McLoughlin wrote:
>>   
>>> However, in order to retain compat for that SCSI device (e.g. ensuring
>>> the PCI address doesn't change as other devices are added an removed),
>>> we're back to the same problem ... either:
>>>
>>>    1) Use '-drive file=foo.img,if=scsi,pci_addr=foo'; in order to 
>>> figure
>>>       out what address to use, libvirt would need to query qemu for 
>>> what
>>>       address was originally allocated to device or it would do all the
>>>       PCI address allocation itself ...
>>>      
>>
>> This last option makes sense to me: in a real world the user has
>> control over where he places the device on the bus, so why
>> not with qemu?
>>    
>
> Yes, the user build the machine using the command line and monitor 
> (or, in 2017, the machine configuration file),

Considering pbrook just posted a machine config for arm, I think it 
would be rather sad if pc wasn't converted to it by 2017...

> then turns on the power.  Command line options are the parts lying 
> around when we start.
>
> btw, -drive needs to be separated:
>
>   -controller type=lsi1234,pci_addr=foobar,name=blah
>   -drive file=foo.img,controller=blah,index=0
>   -drive file=bar.img,controller=blah,index=1
>
> Drives to not have pci addresses.

Drives don't have indexes and buses either, but we specify them on the -drive 
line.  -drive is convenient syntax.  It stops being convenient when you 
force it to be two options.

Regards,

Anthony Liguori

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Anthony Liguori June 15, 2009, 12:48 p.m. UTC | #57
Avi Kivity wrote:
> On 06/15/2009 12:09 PM, Mark McLoughlin wrote:
>>>>> I think the point is that you don't need version numbers if you 
>>>>> have a
>>>>> proper device tree.
>>>>>
>>>>>          
>>>> How do you add a new attribute to the device tree and, when a supplied
>>>> device tree lacking said attribute, distinguish between a device tree
>>>> from an old version of qemu (i.e. use the old default) and a partial
>>>> device tree from the VM manager (i.e. use the new default) ?
>>>>
>>>>        
>>> -baseline 0.10
>>>      
>>
>> That's a version number :-)
>>
>> (I was responding to Anthony's "you don't need a version number")
>>    
>
> If you want to prevent incompatibilities, you need to make everything 
> new (potentially including bugfixes) non-default.  Eventually the 
> default configuration becomes increasingly unusable and you need a new 
> baseline.  You must still be able to fall back to the old baseline for 
> older guests.  I don't think games with configuration files can hide 
> that.

-M pc1
-M pc2

etc.

This is pretty easy to maintain with config files.

Regards,

Anthony Liguori


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity June 15, 2009, 12:50 p.m. UTC | #58
On 06/15/2009 03:41 PM, Michael S. Tsirkin wrote:
> We should just tell the user which slots are open.
>    
>
> This might be tricky if the config is passed in with the command line
> flags.
>    

qemu -show-available-pci-slots

(the qemu equivalent to KVM_CHECK_EXTENSION)
Anthony Liguori June 15, 2009, 12:50 p.m. UTC | #59
Markus Armbruster wrote:
> Avi Kivity <avi@redhat.com> writes:
>
>   
>> Paul/Anthony, can we have -vga pci_addr=, -usb-controller pci_addr=,
>> and -drive pci_addr= (and later, -disk-controller)?  Stalling while
>> waiting for the ultimate config file is only generating pain and
>> out-of-tree patches.
>>     
>
> Yup.
>
> I got bit-rotten patches for pci_addr=, and I can unrot them if they're
> wanted.
>   

Yes, would be good to have patches on the list to discuss.  In 
principle, I have no objection to this.

Regards,

Anthony Liguori

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Anthony Liguori June 15, 2009, 12:52 p.m. UTC | #60
Avi Kivity wrote:
> On 06/15/2009 03:41 PM, Michael S. Tsirkin wrote:
>> We should just tell the user which slots are open.
>>   
>> This might be tricky if the config is passed in with the command line
>> flags.
>>    
>
> qemu -show-available-pci-slots

Why does the user care?

Let QEMU allocate the PCI slot, then query it to see what slot it 
assigned and remember that.

It's not a good idea to have management applications attempt to do PCI 
slot allocation.  For instance, one day we may decide to make virtio 
devices multi-function.

Regards,

Anthony Liguori

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity June 15, 2009, 12:55 p.m. UTC | #61
On 06/15/2009 03:41 PM, Anthony Liguori wrote:
>>> Yep, most people seem to agree that it makes sense to allow this, but
>>> some believe it should only be via a machine description file, not the
>>> command line.
>>
>> I don't understand this opposition.  It's clear a machine config file 
>> is a long way in our future.  It's also clear lack of stable PCI 
>> addresses hurts us now.
>
>
> Is there opposition?  I don't ever recall seeing a patch...

Izik Eidus posted a patch (using a different syntax) in November 2007.

>
> I think it's a perfectly fine idea.

Good.
Avi Kivity June 15, 2009, 1:03 p.m. UTC | #62
On 06/15/2009 03:45 PM, Anthony Liguori wrote:
>>> This last option makes sense to me: in a real world the user has
>>> control over where he places the device on the bus, so why
>>> not with qemu?
>>
>> Yes, the user builds the machine using the command line and monitor 
>> (or, in 2017, the machine configuration file),
>
>
> Considering pbrook just posted a machine config for arm, I think it 
> would be rather sad if pc wasn't converted to it by 2017...

I'd be sad too, but not surprised.

>> then turns on the power.  Command line options are the parts lying 
>> around when we start.
>>
>> btw, -drive needs to be separated:
>>
>>   -controller type=lsi1234,pci_addr=foobar,name=blah
>>   -drive file=foo.img,controller=blah,index=0
>>   -drive file=bar.img,controller=blah,index=1
>>
>> Drives do not have pci addresses.
>
> Drivers don't have indexes and buses but we specify it on the -drive 
> line. 

Drives do have indexes.  On old parallel scsi drives you set the index 
by clicking a button on the back of the drive to cycle through scsi 
addresses 0-7.  An IDE drive's index is determined by the cable 
(master/slave).  A SATA drive's index is determined by which header on 
the motherboard the drive connects to.

If by bus you mean the if= parameter, then drives certainly do have 
buses.  Just try connecting the scsi drive from the previous paragraph 
to a USB port.

> -drive is convenient syntax.  It stops being convenient when you force 
> it to be two options.

controller= defaults to some builtin thing which autoinstantiates when 
necessary, so the old -drive syntax works.
Markus Armbruster June 15, 2009, 1:04 p.m. UTC | #63
Anthony Liguori <anthony@codemonkey.ws> writes:

> Avi Kivity wrote:
>> On 06/15/2009 12:08 PM, Mark McLoughlin wrote:
>>>> This last option makes sense to me: in a real world the user has
>>>> control over where he places the device on the bus, so why
>>>> not with qemu?
>>>>      
>>>
>>> Yep, most people seem to agree that it makes sense to allow this, but
>>> some believe it should only be via a machine description file, not the
>>> command line.
>>>    
>>
>> I don't understand this opposition.  It's clear a machine config
>> file is a long way in our future.  It's also clear lack of stable
>> PCI addresses hurts us now.
>
> Is there opposition?  I don't ever recall seeing a patch...

http://www.archivum.info/qemu-devel@nongnu.org/2009-01/msg01458.html

> I think it's a perfectly fine idea.

Off to dust off my patch series.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity June 15, 2009, 1:09 p.m. UTC | #64
On 06/15/2009 03:52 PM, Anthony Liguori wrote:
> Avi Kivity wrote:
>> On 06/15/2009 03:41 PM, Michael S. Tsirkin wrote:
>>> We should just tell the user which slots are open.
>>>   This might be tricky if the config is passed in with the command line
>>> flags.
>>
>> qemu -show-available-pci-slots
>
> Why does the user care?
>
> Let QEMU allocate the PCI slot, then query it to see what slot it 
> assigned and remember that.

It's a roundabout way of doing things.

As an example, if you try to fit too many devices into the machine, you 
have to try to add all devices and watch for a qemu error.  If you know 
in advance how many slots you have, you never enter into that situation 
(and you need to show the limit to the user anyway).

>
> It's not a good idea to have management applications attempt to do PCI 
> slot allocation.  For instance, one day we may decide to make virtio 
> devices multi-function.

Non-virtio, as well.  But we can't make that the default, so the user 
will have to specify this anyway.

Given that you can't hotunplug individual functions, the user will have 
to specify exactly how functions are aggregated into devices.  My 
recommendation would be for a GUI to allow the user to select a 'quad 
port virtio NIC' or 'dual port virtio scsi controller' rather than 
trying to do anything automatic.
Avi Kivity June 15, 2009, 1:12 p.m. UTC | #65
On 06/15/2009 03:48 PM, Anthony Liguori wrote:
>>>>> device tree lacking said attribute, distinguish between a device tree
>>>>> from an old version of qemu (i.e. use the old default) and a partial
>>>>> device tree from the VM manager (i.e. use the new default) ?
>>>>>
>>>> -baseline 0.10
>>>
>>> That's a version number :-)
>>>
>>> (I was responding to Anthony's "you don't need a version number")
>>
>> If you want to prevent incompatibilities, you need to make everything 
>> new (potentially including bugfixes) non-default.  Eventually the 
>> default configuration becomes increasingly unusable and you need a 
>> new baseline.  You must still be able to fall back to the old 
>> baseline for older guests.  I don't think games with configuration 
>> files can hide that.
> How do you add a new attribute to the device tree and, when a supplied
>
> -M pc1
> -M pc2

Certainly preferable to -baseline.

> This is pretty easy to maintain with config files.

Let's not tie the two together.
Gerd Hoffmann June 15, 2009, 1:17 p.m. UTC | #66
Hi,

>> Yes, the user build the machine using the command line and monitor
>> (or, in 2017, the machine configuration file),
>
> Considering pbrook just posted a machine config for arm, I think it
> would be rather sad if pc wasn't converted to it by 2017...

It shouldn't last until 2017, but the process isn't that trivial.
Some qemu code / control flow needs serious restructuring until it is in a
state that creating the devices from a fdt can actually work.

> Drivers don't have indexes and buses but we specify it on the -drive
> line. -drive is convenient syntax. It stops being convenient when you
> force it to be two options.

One more issue:  -drive also mixes host and guest configuration.  These 
must be separated too.

cheers,
   Gerd

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Anthony Liguori June 15, 2009, 1:20 p.m. UTC | #67
Avi Kivity wrote:
> On 06/15/2009 03:45 PM, Anthony Liguori wrote:
>>>> This last option makes sense to me: in a real world the user has
>>>> control over where he places the device on the bus, so why
>>>> not with qemu?
>>>
>>> Yes, the user build the machine using the command line and monitor 
>>> (or, in 2017, the machine configuration file),
>>
>>
>> Considering pbrook just posted a machine config for arm, I think it 
>> would be rather sad if pc wasn't converted to it by 2017...
>
> I'd be sad too, but not surprised.
>
>>> then turns on the power.  Command line options are the parts lying 
>>> around when we start.
>>>
>>> btw, -drive needs to be separated:
>>>
>>>   -controller type=lsi1234,pci_addr=foobar,name=blah
>>>   -drive file=foo.img,controller=blah,index=0
>>>   -drive file=bar.img,controller=blah,index=1
>>>
>>> Drives do not have pci addresses.
>>
>> Drivers don't have indexes and buses but we specify it on the -drive 
>> line. 
>
> Drives do have indexes.  On old parallel scsi drives you set the index 
> by clicking a button on the back of the drive to cycle through scsi 
> addresses 0-7.  An IDE drive's index is determined by the cable 
> (master/slave).  A SATA drive's index is determined by which header on 
> the motherboard the drive connects to.

It's not at all that simple.   SCSI has a hierarchical address mechanism 
with 0-7 targets but then potentially multiple LUNs per target.  Today, 
we always emulate a single LUN per target but if we ever wanted to 
support more than 7 disks on a SCSI controller, we would have to add 
multiple LUN support too.  So the current linear unit= parameter is 
actually pretty broken for SCSI.

For IDE, it's a combination of bus, slot, and master/slave.  For virtio, 
it's just a PCI address.  What we really need is something that is more 
opaque and controller specific.  For instance, if we were going to do 
controllers...

-controller type=lsi1234,pci_addr=foobar,name=blah
-controller-disk controller=blah,target=0,lun=1,name=sda

-controller type=ide,pci_addr=barfoo,name=ide
-controller-disk controller=ide,slot=secondary,cable=slave,name=hdd

-drive file=foo.img,controller-disk=sda
-drive file=bar.img,controller-disk=hdd

And having "-hdd file=foo.img" be short-hand for "-drive 
file=%s,controller-disk=%s".

>
>
> If by bus you mean the if= parameter, then drives certainly do have 
> buses.  Just try connecting the scsi drive from the previous paragraph 
> to a USB port.

No, I meant drive file=foo.img,bus=3.  If that doesn't seem obvious what 
it should do to you that's because it isn't at all obvious :-)  It ends 
up skipping a predefined number of locations in the drive table.  This 
is pretty broken fundamentally because it assumes controllers always 
support a fixed number of devices.  Nothing really respects bus_id 
though so in practice, I assume it's almost universally broken.

Regards,

Anthony Liguori

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Anthony Liguori June 15, 2009, 1:23 p.m. UTC | #68
Avi Kivity wrote:
> On 06/15/2009 03:52 PM, Anthony Liguori wrote:
>> Avi Kivity wrote:
>>> On 06/15/2009 03:41 PM, Michael S. Tsirkin wrote:
>>>> We should just tell the user which slots are open.
>>>>   This might be tricky if the config is passed in with the command 
>>>> line
>>>> flags.
>>>
>>> qemu -show-available-pci-slots
>>
>> Why does the user care?
>>
>> Let QEMU allocate the PCI slot, then query it to see what slot it 
>> assigned and remember that.
>
> It's a roundabout way of doing things.

Having libvirt do PCI slot allocation scares me.  It assumes we can 
return a whitelist of available slots, and then let libvirt just 
randomly assign things.  There's knowledge though in slot assignment 
that's board-specific.  For instance, depending on how many LNK lines 
you have, you may want to put things in slots in such a way to optimize 
interrupt balancing or something like that.

Some platforms have quirks about expecting a particular slot to have a 
particular device.  It's still an optional device but it has to be in 
that slot.  You can't really express that via an available slot list.

> Non-virtio, as well.  But we can't make that the default, so the user 
> will have to specify this anyway.
>
> Given that you can't hotunplug individual functions, the user will 
> have to specify exactly how functions are aggregated into devices.  My 
> recommendation would be for a GUI to allow the user to select a 'quad 
> port virtio NIC' or 'dual port virtio scsi controller' rather than 
> trying to do anything automatic.

Yeah, I haven't thought much about that.

Regards,

Anthony Liguori

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Anthony Liguori June 15, 2009, 1:24 p.m. UTC | #69
Avi Kivity wrote:
>
> Certainly preferable to -baseline.
>
>> This is pretty easy to maintain with config files.
>
> Let's not tie the two together.

I mentioned it because it suggests a good transition.  We at least have 
to think through how things map to the post-config file world regardless 
of whether that's a few months from now or a decade :-)

Regards,

Anthony Liguori

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity June 15, 2009, 1:35 p.m. UTC | #70
On 06/15/2009 04:20 PM, Anthony Liguori wrote:
>>
>>>> then turns on the power.  Command line options are the parts lying 
>>>> around when we start.
>>>>
>>>> btw, -drive needs to be separated:
>>>>
>>>>   -controller type=lsi1234,pci_addr=foobar,name=blah
>>>>   -drive file=foo.img,controller=blah,index=0
>>>>   -drive file=bar.img,controller=blah,index=1
>>>>
>>>> Drives do not have pci addresses.
>>>
>>> Drivers don't have indexes and buses but we specify it on the -drive 
>>> line. 
>>
>> Drives do have indexes.  On old parallel scsi drives you set the 
>> index by clicking a button on the back of the drive to cycle through 
>> scsi addresses 0-7.  An IDE drive's index is determined by the cable 
>> (master/slave).  A SATA drive's index is determined by which header 
>> on the motherboard the drive connects to.
>
>
> It's not at all that simple.   SCSI has a hierarchical address 
> mechanism with 0-7 targets but then potentially multiple LUNs per 
> target.  Today, we always emulate a single LUN per target but if we 
> ever wanted to support more than 7 disks on a SCSI controller, we 
> would have to add multiple LUN support too.  So the current linear 
> unit= parameter is actually pretty broken for SCSI.

Well, another level in the hierarchy, but I don't think it materially 
changes things.

>
> For IDE, it's a combination of bus, slot, and master/slave.  For 
> virtio, it's just a PCI address.  What we really need is something 
> that is more opaque and controller specific. 

virtio also has a bus (did you mean the pci bus for IDE?), master/slave 
is the index.  virtio doesn't have index, but IMO that was a mistake and 
we should have designed it as a disk controller in the first place.

> For instance, if we were going to do controllers...
>
> -controller type=lsi1234,pci_addr=foobar,name=blah
> -controller-disk controller=blah,target=0,lun=1,name=sda
>
> -controller type=ide,pci_addr=barfoo,name=ide
> -controller-disk controller=ide,slot=secondary,cable=slave,name=hdd
>
> -drive file=foo.img,controller-disk=sda
> -drive file=bar.img,controller-disk=hdd
>
> And having "-hdd file=foo.img" be short-hand for "-drive 
> file=%s,controller-disk=%s".

Yeah.

>>
>>
>> If by bus you mean the if= parameter, then drives certainly do have 
>> buses.  Just try connecting the scsi drive from the previous 
>> paragraph to a USB port.
>
> No, I meant drive file=foo.img,bus=3.  If that doesn't seem obvious 
> what it should do to you that's because it isn't at all obvious :-)  
> It ends up skipping a predefined number of locations in the drive 
> table.  This is pretty broken fundamentally because it assumes 
> controllers always support a fixed number of devices.  Nothing really 
> respects bus_id though so in practice, I assume it's almost 
> universally broken.

Isn't the drive table something totally internal?  And how does bus= 
relate to it?

Confused.
Avi Kivity June 15, 2009, 1:42 p.m. UTC | #71
On 06/15/2009 04:23 PM, Anthony Liguori wrote:
> Avi Kivity wrote:
>> On 06/15/2009 03:52 PM, Anthony Liguori wrote:
>>> Avi Kivity wrote:
>>>> On 06/15/2009 03:41 PM, Michael S. Tsirkin wrote:
>>>>> We should just tell the user which slots are open.
>>>>>   This might be tricky if the config is passed in with the command 
>>>>> line
>>>>> flags.
>>>>
>>>> qemu -show-available-pci-slots
>>>
>>> Why does the user care?
>>>
>>> Let QEMU allocate the PCI slot, then query it to see what slot it 
>>> assigned and remember that.
>>
>> It's a roundabout way of doing things.
>
> Having libvirt do PCI slot allocation scares me.  It assumes we can 
> return a whitelist of available slots, and then let libvirt just 
> randomly assign things.  There's knowledge though in slot assignment 
> that's board-specific.  For instance, depending on how many LNK lines 
> you have, you may want to put things in slots in such a way to 
> optimize interrupt balancing or something like that.

How would qemu know which slots to optimize for?

In practice, I don't see that as a real problem.  We should (a) add an 
ioapic and four more pci links (b) recommend that slots be assigned in 
ascending order, and everything works.

I don't see your concern about libvirt allocating slots.  If a human can 
plug a card into a slot, so can libvirt.  Doing an interactive 
back-and-forth (equivalent to plugging a card while blindfolded, then 
looking to see which slot we hit) is certainly more difficult.

> Some platforms have quirks about expecting a particular slot to have a 
> particular device.  It's still an optimal device but it has to be in 
> that slot.  You can't really express that via an available slot list.

I'll be surprised if we ever measure different dma speeds on different 
slots in the qemu virtual pci bus.  If we do, we'll find a way to 
express them:

   $ qemu -print-pci
   slot 0:01: available 33MHz
   slot 0:02: available 33MHz
   slot 0:03: available 66MHz

I feel a little silly typing this.
Avi Kivity June 15, 2009, 1:43 p.m. UTC | #72
On 06/15/2009 04:24 PM, Anthony Liguori wrote:
> Avi Kivity wrote:
>>
>> Certainly preferable to -baseline.
>>
>>> This is pretty easy to maintain with config files.
>>
>> Let's not tie the two together.
>
> I mentioned it because it suggests a good transition.  We at least 
> have to think through how things map to the post-config file world 
> regardless of whether that's a few months from now or a decade :-)

Sure, it's good both from the transitional point of view and in its own 
right.
Anthony Liguori June 15, 2009, 1:45 p.m. UTC | #73
Avi Kivity wrote:
> On 06/15/2009 04:20 PM, Anthony Liguori wrote:
>> It's not at all that simple.   SCSI has a hierarchical address 
>> mechanism with 0-7 targets but then potentially multiple LUNs per 
>> target.  Today, we always emulate a single LUN per target but if we 
>> ever wanted to support more than 7 disks on a SCSI controller, we 
>> would have to add multiple LUN support too.  So the current linear 
>> unit= parameter is actually pretty broken for SCSI.
>
> Well, another level in the hierarchy, but I don't think it materially 
> changes things.

Depends on whether you expect to say index=0,lun=3 or index=3.  If you 
mean the later, then it's quite conceivable that each target supports 
less than the maximum number of LUNs.  This makes things pretty 
confusing to the user because they have to know that in the current 
implementation, index=0 is valid, index=1 isn't, but index=8 is.

>> No, I meant drive file=foo.img,bus=3.  If that doesn't seem obvious 
>> what it should do to you that's because it isn't at all obvious :-)  
>> It ends up skipping a predefined number of locations in the drive 
>> table.  This is pretty broken fundamentally because it assumes 
>> controllers always support a fixed number of devices.  Nothing really 
>> respects bus_id though so in practice, I assume it's almost 
>> universally broken.
>
> Isn't the drive table something totally internal?  And how does bus= 
> relate to it?

The reality of unit=X,bus=Y,if=Z is that they expand to:

drive_table_index=Y*max_devs[Z] + X

Whereas max_devs = {"ide":4, "scsi": 7, *:0}

How drive_table_index is interpreted is "if" specific.  For if=scsi, 
each lsi device gets a base drive table index that starts at bus_index * 
7.  For virtio, the first empty spot in drive_table results in no more 
drives being created.

It's broken by design.

Regards,

Anthony Liguori
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Anthony Liguori June 15, 2009, 1:51 p.m. UTC | #74
Avi Kivity wrote:
> On 06/15/2009 04:23 PM, Anthony Liguori wrote:
>
> How would qemu know which slots to optimize for?
>
> In practice, I don't see that as a real problem.  We should (a) add an 
> ioapic and four more pci links (b) recommend that slots be assigned in 
> ascending order, and everything works.
>
> I don't see your concern about libvirt allocating slots.  If a human 
> can plug a card into a slot, so can libvirt.  Doing an interactive 
> back-and-forth (equivalent to plugging a card while blindfolded, then 
> looking to see which slot we hit) is certainly more difficult.

Let's take a concrete example because I think you missed my point.  For 
the r2d board, if you have 1 on-board NIC, it has to go in slot 2.  
Additional NICs can go in any slot, but the primary on-board NIC is 
expected to live in slot 2.  It's possible to not have that on-board NIC.

If you let QEMU allocate which PCI slot a device goes in, we can hide 
this detail from libvirt.  If you have libvirt do PCI slot allocation by 
default, it has to know about this restriction in the r2d board unless 
you have a clever way to express this sort of information.

Once QEMU has allocated a device to a slot, libvirt can do a good job 
maintaining that relationship.

Regards,

Anthony Liguori
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity June 15, 2009, 1:54 p.m. UTC | #75
On 06/15/2009 04:45 PM, Anthony Liguori wrote:
> Avi Kivity wrote:
>> On 06/15/2009 04:20 PM, Anthony Liguori wrote:
>>> It's not at all that simple.   SCSI has a hierarchical address 
>>> mechanism with 0-7 targets but then potentially multiple LUNs per 
>>> target.  Today, we always emulate a single LUN per target but if we 
>>> ever wanted to support more than 7 disks on a SCSI controller, we 
>>> would have to add multiple LUN support too.  So the current linear 
>>> unit= parameter is actually pretty broken for SCSI.
>>
>> Well, another level in the hierarchy, but I don't think it materially 
>> changes things.
>
> Depends on whether you expect to say index=0,lun=3 or index=3.  If you 
> mean the later, then it's quite conceivable that each target supports 
> less than the maximum number of LUNs.  This makes things pretty 
> confusing to the user because they have to know that in the current 
> implementation, index=0 is valid, index=1 isn't, but index=8 is.

I'd object to any implicit addressing rules.  If we have to say 
target=2,lun=7,street=8,city=9,state=99,zip=12345 instead of 
index=8345345235 so be it.

>>> No, I meant drive file=foo.img,bus=3.  If that doesn't seem obvious 
>>> what it should do to you that's because it isn't at all obvious :-)  
>>> It ends up skipping a predefined number of locations in the drive 
>>> table.  This is pretty broken fundamentally because it assumes 
>>> controllers always support a fixed number of devices.  Nothing 
>>> really respects bus_id though so in practice, I assume it's almost 
>>> universally broken.
>>
>> Isn't the drive table something totally internal?  And how does bus= 
>> relate to it?
>
> The reality of unit=X,bus=Y,if=Z is that they expand to:
>
> drive_table_index=Y*max_devs[Z] + X
>
> Whereas max_devs = {"ide":4, "scsi": 7, *:0}
>
> How drive_table_index is interpreted is "if" specific.  For if=scsi, 
> each lsi device gets a base drive table index that starts at bus_index 
> * 7.  For virtio, the first empty spot in drive_table results in no 
> more drives being created.
>
> It's broken by design.

Agreed.  Pity that it's exposed to the poor users.
Mark McLoughlin June 15, 2009, 2 p.m. UTC | #76
On Mon, 2009-06-15 at 07:48 -0500, Anthony Liguori wrote:
> Avi Kivity wrote:
> > On 06/15/2009 12:09 PM, Mark McLoughlin wrote:
> >>>>> I think the point is that you don't need version numbers if you 
> >>>>> have a
> >>>>> proper device tree.
> >>>>>
> >>>>>          
> >>>> How do you add a new attribute to the device tree and, when a supplied
> >>>> device tree lacking said attribute, distinguish between a device tree
> >>>> from an old version of qemu (i.e. use the old default) and a partial
> >>>> device tree from the VM manager (i.e. use the new default) ?
> >>>>
> >>>>        
> >>> -baseline 0.10
> >>>      
> >>
> >> That's a version number :-)
> >>
> >> (I was responding to Anthony's "you don't need a version number")
> >>    
> >
> > If you want to prevent incompatibilities, you need to make everything 
> > new (potentially including bugfixes) non-default.

No need to punish new guests in order to maintain compatibility for old
guests.

> > Eventually the 
> > default configuration becomes increasingly unusable and you need a new 
> > baseline.  You must still be able to fall back to the old baseline for 
> > older guests.  I don't think games with configuration files can hide 
> > that.
> 
> -M pc1
> -M pc2
> 
> etc.
> 
> This is pretty easy to maintain with config files.

I think this would be reasonable, but it is essentially just a version
number which you objected to on the basis that it would make
cherry-picking harder for distros.

One thing that would be nice with this '-M pc1' thing would be to retain
'-M pc' as a symlink to the latest version. We'd also need a way to read
the symlink too, so that you can query what the current latest version
is and use that in future.

How would this machine type version relate to e.g. changing the default
PCI class of virtio-blk? Would we bump the version number of all machine
types can use virtio-blk?

A per-device version number is workable alternative, but only with a
saveabi type file IMHO.

I've tried to summarise the options here:

  https://fedoraproject.org/wiki/Features/KVM_Stable_Guest_ABI

Cheers,
Mark.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dor Laor June 15, 2009, 2:06 p.m. UTC | #77
Anthony Liguori wrote:
> Avi Kivity wrote:
>> On 06/15/2009 04:23 PM, Anthony Liguori wrote:
>>
>> How would qemu know which slots to optimize for?
>>
>> In practice, I don't see that as a real problem.  We should (a) add 
>> an ioapic and four more pci links (b) recommend that slots be 
>> assigned in ascending order, and everything works.
>>
>> I don't see your concern about libvirt allocating slots.  If a human 
>> can plug a card into a slot, so can libvirt.  Doing an interactive 
>> back-and-forth (equivalent to plugging a card while blindfolded, then 
>> looking to see which slot we hit) is certainly more difficult.
>
> Let's take a concrete example because I think you missed my point.  
> For the r2d board, if you have 1 on-board NIC, it has to go in slot 
> 2.  Additional NICs can go in any slot, but the primary on-board NIC 
> is expected to live in slot 2.  It's possible to not have that 
> on-board NIC.
Libvirt does not support r2d. I hope it won't start to support it.
We can have default values for these types of devices or something like 
pci_addr=auto.

>
> If you let QEMU allocate which PCI slot a device goes in, we can hide 
> this detail from libvirt.  If you have libvirt do PCI slot allocation 
> by default, it has to know about this restriction in the r2d board 
> unless you have a clever way to express this sort of information.
>
> Once QEMU has allocated a device to a slot, libvirt can do a good job 
> maintaining that relationship.
>

The end user should have a mechanism to control device slot positioning. 
For example, if you have several pci devices, some
get high rate of interrupts and some not, if you want to optimize you 
guest you should isolate the high rate 'interesting' devices.
This is something the user will need to do. I agree that the default 
behavior might be 'auto'

Also, while moving from one qemu version to another, you'll need to 
represent the older behavior. -qemu-0.10 is not good enough
since there will be multiple versions in the future with multiple 
distributions setting their defaults.

> Regards,
>
> Anthony Liguori
>
>

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Anthony Liguori June 15, 2009, 2:20 p.m. UTC | #78
Mark McLoughlin wrote:
> On Mon, 2009-06-15 at 07:48 -0500, Anthony Liguori wrote:
>   
>>> Eventually the 
>>> default configuration becomes increasingly unusable and you need a new 
>>> baseline.  You must still be able to fall back to the old baseline for 
>>> older guests.  I don't think games with configuration files can hide 
>>> that.
>>>       
>> -M pc1
>> -M pc2
>>
>> etc.
>>
>> This is pretty easy to maintain with config files.
>>     
>
> I think this would be reasonable, but it is essentially just a version
> number which you objected to on the basis that it would make
> cherry-picking harder for distros.
>   

It doesn't have to be pc1, pc2.  It could be pc-with-usb or 
pc-with-balloon.  If a distro cherry picks in such a way that their pc 
is not a standard QEMU pc, they would add a new PC type that's specific 
to their distro.

> One thing that would be nice with this '-M pc1' thing would be to retain
> '-M pc' as a symlink to the latest version. We'd also need a way to read
> the symlink too, so that you can query what the current latest version
> is and use that in future.
>   

Another option is an explicit -M default which always uses the default 
machine for the architecture.  Likewise, we would need a way to query 
what the default machine was for an architecture.

> How would this machine type version relate to e.g. changing the default
> PCI class of virtio-blk? Would we bump the version number of all machine
> types that can use virtio-blk?
>   
You would introduce a new machine type.  For instance, 
pc-virtio-class-other.  The names don't have to look like that, I'm just 
doing that to make a point.  This may mean that you end up with dozens 
of machine types but you preserve compatibility, which is a good thing.

Of course, the flip side is that you make preserving the machine config 
the duty of the user and we don't maintain compatible machine types.  
This won't work without a proper config file though so for now, we're 
stuck maintaining machine types.

Regards,

Anthony Liguori
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Javier Guerra June 15, 2009, 2:23 p.m. UTC | #79
On Mon, Jun 15, 2009 at 6:43 AM, Avi Kivity<avi@redhat.com> wrote:
> (I'd be quite happy constructing the entire machine config on the command
> line, but I realize it's just me)

as a user-only (well, i'm a developer, but don't meddle in kernel
affairs since 0.99pl9); I also like that kvm is totally CLI-managed.

but migration-wise, i think it could be nicer if the 'origin' process
could send the config to the 'target' one.  IOW: the -incoming flag
shouldn't need any other parameter, and the 'migrate' command should
send the whole hardware description before the CPU state, and fail
with a 'can't comply' message if the target complains.

of course, that's a simplification.  for example, the 'target' process
should be able to respect some parameters, mostly the 'external'
descriptions, like storage pathnames, or '-net tap' ones.
Anthony Liguori June 15, 2009, 2:24 p.m. UTC | #80
Dor Laor wrote:
> Libvirt does not support r2d. I hope it won't start to support it.

It supports mips, sparc, and ppc machines now.  I don't see why it 
wouldn't support r2d.  For ppcemb, I expect this same problem to occur.  
This sort of restriction is going to be common with embedded boards.

> We can have default values for these types of devices or something 
> like pci_addr=auto.

Why wouldn't libvirt always use pci_addr=auto?  If the only argument for 
having libvirt do pci slot allocation is error messages, can't we find a 
nice way to allow libvirt to create friendly error messages when QEMU fails?

>> If you let QEMU allocate which PCI slot a device goes in, we can hide 
>> this detail from libvirt.  If you have libvirt do PCI slot allocation 
>> by default, it has to know about this restriction in the r2d board 
>> unless you have a clever way to express this sort of information.
>>
>> Once QEMU has allocated a device to a slot, libvirt can do a good job 
>> maintaining that relationship.
>>
>
> The end user should have a mechanism to control device slot 
> positioning. For example, if you have several pci devices, some
> get a high rate of interrupts and some do not. If you want to optimize your
> guest, you should isolate the high-rate 'interesting' devices.
> This is something the user will need to do. I agree that the default 
> behavior might be 'auto'

I'm not at all arguing against pci_addr.  I'm arguing about how libvirt 
should use it with respect to the "genesis" use-case where libvirt has 
no specific reason to choose one PCI slot over another.  In that case, 
I'm merely advocating that we want to let QEMU make the decision.

Regards,

Anthony Liguori
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael S. Tsirkin June 15, 2009, 2:34 p.m. UTC | #81
On Mon, Jun 15, 2009 at 09:20:00AM -0500, Anthony Liguori wrote:
> Mark McLoughlin wrote:
>> On Mon, 2009-06-15 at 07:48 -0500, Anthony Liguori wrote:
>>   
>>>> Eventually the default configuration becomes increasingly unusable 
>>>> and you need a new baseline.  You must still be able to fall back 
>>>> to the old baseline for older guests.  I don't think games with 
>>>> configuration files can hide that.
>>>>       
>>> -M pc1
>>> -M pc2
>>>
>>> etc.
>>>
>>> This is pretty easy to maintain with config files.
>>>     
>>
>> I think this would be reasonable, but it is essentially just a version
>> number which you objected to on the basis that it would make
>> cherry-picking harder for distros.
>>   
>
> It doesn't have to be pc1, pc2.  It could be pc-with-usb or  
> pc-with-balloon.  If a distro cherry picks in such a way that their pc  
> is not a standard QEMU pc, they would add a new PC type that's specific  
> to their distro.
>
>> One thing that would be nice with this '-M pc1' thing would be to retain
>> '-M pc' as a symlink to the latest version. We'd also need a way to read
>> the symlink too, so that you can query what the current latest version
>> is and use that in future.
>>   
>
> Another option is an explicit -M default which always uses the default  
> machine for the architecture.  Likewise, we would need a way to query  
> what the default machine was for an architecture.
>
>> How would this machine type version relate to e.g. changing the default
>> PCI class of virtio-blk? Would we bump the version number of all machine
>> types that can use virtio-blk?
>>   
> You would introduce a new machine type.  For instance,  
> pc-virtio-class-other.  The names don't have to look like that, I'm just  
> doing that to make a point.  This may mean that you end up with dozens  
> of machine types but you preserve compatibility, which is a good thing.

And then pc-virtio-class-other-with-balloon-without-usb? Wouldn't it be
more straightforward to have capability bits which can be switched on
and off independently rather than trying to fit unrelated features into
a machine type?  IMO it only seems more work at first, and QA gets a bit
nervous that they can't exhaustively test all options. But in the long
run it simplifies things as you don't have to set policy and invent
silly names.

> Of course, the flip side is that you make preserving the machine config  
> the duty of the user and we don't maintain compatible machine types.   
> This won't work without a proper config file though so for now, we're  
> stuck maintaining machine types.
>
> Regards,
>
> Anthony Liguori
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael S. Tsirkin June 15, 2009, 2:37 p.m. UTC | #82
On Mon, Jun 15, 2009 at 09:24:32AM -0500, Anthony Liguori wrote:
> Dor Laor wrote:
>> Libvirt does not support r2d. I hope it won't start to support it.
>
> It supports mips, sparc, and ppc machines now.  I don't see why it  
> wouldn't support r2d.  For ppcemb, I expect this same problem to occur.   
> This sort of restriction is going to be common with embedded boards.
>
>> We can have default values for these types of devices or something  
>> like pci_addr=auto.
>
> Why wouldn't libvirt always use pci_addr=auto?  If the only argument for  
> having libvirt do pci slot allocation is error messages, can't we find a  
> nice way to allow libvirt to create friendly error messages when QEMU 
> fails?
>
>>> If you let QEMU allocate which PCI slot a device goes in, we can hide 
>>> this detail from libvirt.  If you have libvirt do PCI slot allocation 
>>> by default, it has to know about this restriction in the r2d board  
>>> unless you have a clever way to express this sort of information.
>>>
>>> Once QEMU has allocated a device to a slot, libvirt can do a good job 
>>> maintaining that relationship.
>>>
>>
>> The end user should have a mechanism to control device slot  
>> positioning. For example, if you have several pci devices, some
>> get a high rate of interrupts and some do not. If you want to optimize your
>> guest, you should isolate the high-rate 'interesting' devices.
>> This is something the user will need to do. I agree that the default  
>> behavior might be 'auto'
>
> I'm not at all arguing against pci_addr.  I'm arguing about how libvirt  
> should use it with respect to the "genesis" use-case where libvirt has  
> no specific reason to choose one PCI slot over another.  In that case,  
> I'm merely advocating that we want to let QEMU make the decision.

The allocation code could be moved out into a library, and libvirt could
link with it (ducks).

> Regards,
>
> Anthony Liguori
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Anthony Liguori June 15, 2009, 3:03 p.m. UTC | #83
Michael S. Tsirkin wrote:
>> I'm not at all arguing against pci_addr.  I'm arguing about how libvirt  
>> should use it with respect to the "genesis" use-case where libvirt has  
>> no specific reason to choose one PCI slot over another.  In that case,  
>> I'm merely advocating that we want to let QEMU make the decision.
>>     
>
> The allocation code could be moved out into a library, and libvirt could
> link with it (ducks).
>   

Why does libvirt want to do allocation?

Regards,

Anthony Liguori


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity June 15, 2009, 3:05 p.m. UTC | #84
On 06/15/2009 05:24 PM, Anthony Liguori wrote:
> Dor Laor wrote:
>> Libvirt does not support r2d. I hope it won't start to support it.
>
> It supports mips, sparc, and ppc machines now.  I don't see why it 
> wouldn't support r2d.  For ppcemb, I expect this same problem to 
> occur.  This sort of restriction is going to be common with embedded 
> boards.

I expect these restrictions will have to be known by the management 
application.  Otherwise the users will try invalid configurations only 
to receive errors when they launch them.  GUIs exist to guide users, not 
as an inefficient means of trial-and-error.

>
>> We can have default values for these types of devices or something 
>> like pci_addr=auto.
>
> Why wouldn't libvirt always use pci_addr=auto?  If the only argument 
> for having libvirt do pci slot allocation is error messages, can't we 
> find a nice way to allow libvirt to create friendly error messages 
> when QEMU fails?

Error messages are not the only argument for pushing slot allocation to 
management.  See my previous messages on the topic.

>>> If you let QEMU allocate which PCI slot a device goes in, we can 
>>> hide this detail from libvirt.  If you have libvirt do PCI slot 
>>> allocation by default, it has to know about this restriction in the 
>>> r2d board unless you have a clever way to express this sort of 
>>> information.
>>>
>>> Once QEMU has allocated a device to a slot, libvirt can do a good 
>>> job maintaining that relationship.
>>>
>>
>> The end user should have a mechanism to control device slot 
>> positioning. For example, if you have several pci devices, some
>> get a high rate of interrupts and some do not. If you want to optimize your
>> guest, you should isolate the high-rate 'interesting' devices.
>> This is something the user will need to do. I agree that the default 
>> behavior might be 'auto'
>
> I'm not at all arguing against pci_addr.  I'm arguing about how 
> libvirt should use it with respect to the "genesis" use-case where 
> libvirt has no specific reason to choose one PCI slot over another.  
> In that case, I'm merely advocating that we want to let QEMU make the 
> decision.


However this may end up, isn't it offtopic?  Whatever we do we have to 
support both pci_addr= and default placement, so we can push this 
discussion to livirt-devel and bid them godspeed.
Anthony Liguori June 15, 2009, 3:07 p.m. UTC | #85
Avi Kivity wrote:
>
> I'd object to any implicit addressing rules.  If we have to say 
> target=2,lun=7,street=8,city=9,state=99,zip=12345 instead of 
> index=8345345235 so be it.

The next observation is that while we expand the SCSI addressing, the 
current propose flattens the PCI hierarchy (i.e. pci_addr=00:01.0).

An alternative would be to either always expand or always flatten 
addressing.  I think the later has a lot of merit.  Consider:

-controller type=lsi1234,addr=00:01,name=blah
-controller-disk controller=blah,addr=00:01,name=sda

-controller type=ide,addr=00.02,name=ide
-controller-disk controller=ide,addr=3,name=hdd

-drive file=foo.img,controller-disk=sda
-drive file=bar.img,controller-disk=hdd

This means that addr's format depends on the parent device node which is 
a bit less explicit than the previous example.  However, it is much more 
consistent and easier to implement.  Basically, when adding a device to 
it's parent, you hand the parent the "addr" field and that lets you say 
where you want to sit on the bus.

Regards,

Anthony Liguori
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Daniel P. Berrangé June 15, 2009, 3:08 p.m. UTC | #86
On Mon, Jun 15, 2009 at 10:03:22AM -0500, Anthony Liguori wrote:
> Michael S. Tsirkin wrote:
> >>I'm not at all arguing against pci_addr.  I'm arguing about how libvirt  
> >>should use it with respect to the "genesis" use-case where libvirt has  
> >>no specific reason to choose one PCI slot over another.  In that case,  
> >>I'm merely advocating that we want to let QEMU make the decision.
> >>    
> >
> >The allocation code could be moved out into a library, and libvirt could
> >link with it (ducks).
> >  
> 
> Why does libvirt want to do allocation?

It doesn't want to. As Mark said, libvirt just wants to be able to ensure
a stable guest ABI, of which stable PCI addresses is one aspect. This does
not imply libvirt wants to allocate the PCI addresses, just that it wants
a way to keep them stable. All else being equal I'd rather libvirt wasn't
in the PCI address allocation business.


Regards,
Daniel
Anthony Liguori June 15, 2009, 3:11 p.m. UTC | #87
Michael S. Tsirkin wrote:
> And then pc-virtio-class-other-with-balloon-without-usb? Wouldn't it be
> more straightforward to have capability bits which can be switched on
> and off independently rather than trying to fit unrelated features into
> a machine type?  IMO it only seems more work at first, and QA gets a bit
> nervious that they can't exhaustively test all options. But in the long
> run it simplifies things as you don't have to set policy and invent
> silly names.
>   

We're strictly talking about default machine configs.  That has nothing 
to do with capabilities.  You still need to know what the default set of 
enabled capabilities were and keep track of that.  All that I'm 
suggesting is that we use the machine name to collapse the default set 
of capabilities into something that libvirt can track.

The advantage of using something more opaque like that is that it 
simplifies things for management tools as they don't have to keep track 
of "capabilities" that we're adding.  Heck, you could even do:

pc-00000034

Where "pc-%08x" % (capabilities) :-)

Regards,

Anthony Liguori

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity June 15, 2009, 3:11 p.m. UTC | #88
On 06/15/2009 06:07 PM, Anthony Liguori wrote:
> Avi Kivity wrote:
>>
>> I'd object to any implicit addressing rules.  If we have to say 
>> target=2,lun=7,street=8,city=9,state=99,zip=12345 instead of 
>> index=8345345235 so be it.
>
> The next observation is that while we expand the SCSI addressing, the 
> current propose flattens the PCI hierarchy (i.e. pci_addr=00:01.0).
>
> An alternative would be to either always expand or always flatten 
> addressing.  I think the later has a lot of merit.  Consider:
>
> -controller type=lsi1234,addr=00:01,name=blah
> -controller-disk controller=blah,addr=00:01,name=sda
>
> -controller type=ide,addr=00.02,name=ide
> -controller-disk controller=ide,addr=3,name=hdd
>
> -drive file=foo.img,controller-disk=sda
> -drive file=bar.img,controller-disk=hdd
>
> This means that addr's format depends on the parent device node which 
> is a bit less explicit than the previous example.  However, it is much 
> more consistent and easier to implement.  Basically, when adding a 
> device to it's parent, you hand the parent the "addr" field and that 
> lets you say where you want to sit on the bus.

I would prefer explicit names (pci_addr, lun, etc.) but would be okay 
with generic names too.

There's value in sticking to well-understood names and address formats.
Anthony Liguori June 15, 2009, 3:11 p.m. UTC | #89
Avi Kivity wrote:
> However this may end up, isn't it offtopic?  Whatever we do we have to 
> support both pci_addr= and default placement, so we can push this 
> discussion to livirt-devel and bid them godspeed.

I'm not sure how we got here but yeah, let's table this part of the 
discussion.

Regards,

Anthony Liguori

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dor Laor June 15, 2009, 3:12 p.m. UTC | #90
Daniel P. Berrange wrote:
> On Mon, Jun 15, 2009 at 10:03:22AM -0500, Anthony Liguori wrote:
>   
>> Michael S. Tsirkin wrote:
>>     
>>>> I'm not at all arguing against pci_addr.  I'm arguing about how libvirt  
>>>> should use it with respect to the "genesis" use-case where libvirt has  
>>>> no specific reason to choose one PCI slot over another.  In that case,  
>>>> I'm merely advocating that we want to let QEMU make the decision.
>>>>    
>>>>         
>>> The allocation code could be moved out into a library, and libvirt could
>>> link with it (ducks).
>>>  
>>>       
>> Why does libvirt want to do allocation?
>>     
>
> It doesn't want to. As Mark said, libvirt just wants to be able to ensure
> a stable guest ABI, of which stable PCI addresses is one aspect. This does
> not imply libvirt wants to allocate the PCI addresses, just that it wants
> a way to keep them stable. All else being equal I'd rather libvirt wasn't
> in the PCI address allocation business.
>   

It's not about what libvirt wants. It's about what will serve the end 
user the most.
Apart from a stable guest ABI, end users need to have the option to control 
the slot for
their devices, just like they have for physical machines. It's not a 
theoretical discussion;
limiting issues with shared irq is one real-life example.

Thanks, dor

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity June 15, 2009, 3:15 p.m. UTC | #91
On 06/15/2009 06:12 PM, Dor Laor wrote:
>> It doesn't want to. As Mark said, libvirt just wants to be able to 
>> ensure
>> a stable guest ABI, of which stable PCI addresses is one aspect. This 
>> does
>> not imply libvirt wants to allocate the PCI addresses, just that it 
>> wants
>> a way to keep them stable. All else being equal I'd rather libvirt 
>> wasn't
>> in the PCI address allocation business.
>
>
> It's not about what libvirt wants. It's about what will serve the end 
> user the most.
> Apart from a stable guest ABI, end users need to have the option to 
> control the slot for
> their devices, just like they have for physical machines. It's not a 
> theoretical discussion;
> limiting issues with shared irq is one real-life example.
>

Another issue is enumeration.  Guests will present their devices in the 
order they find them on the pci bus (of course enumeration is guest 
specific).  So if I have 2 virtio controllers the only way I can 
distinguish between them is using their pci slots.
Anthony Liguori June 15, 2009, 3:20 p.m. UTC | #92
Avi Kivity wrote:
> I would prefer explicit names (pci_addr, lun, etc.) but would be okay 
> with generic names too.

I think having a generic address has a lot of value in terms of code 
implementation.  Otherwise, the valid options for -drive become 
context-sensitive which is going to be annoying and error-prone.  Some 
sanity could be added by using addressing prefixes like addr=pci:00:01.0 
or addr=scsi:0.3 but I'll leave that up to whoever takes this on.

Regards,

Anthony Liguori

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity June 15, 2009, 3:26 p.m. UTC | #93
On 06/15/2009 06:20 PM, Anthony Liguori wrote:
> Avi Kivity wrote:
>> I would prefer explicit names (pci_addr, lun, etc.) but would be okay 
>> with generic names too.
>
> I think having a generic address has a lot of value in terms of code 
> implementation.  Otherwise, the valid options for -drive become 
> context-sensitive which is going to be annoying and error-prone.  Some 
> sanity could be added by using addressing prefixes like 
> addr=pci:00:01.0 or addr=scsi:0.3 but I'll leave that up to whoever 
> takes this on.

The code problems are easily solved by adding another level of 
indirection.  User confusion problems are only aggravated by additional 
abstraction, though ("what do I put in addr=, here?").
Mark McLoughlin June 15, 2009, 4:27 p.m. UTC | #94
On Mon, 2009-06-15 at 18:05 +0300, Avi Kivity wrote:
> On 06/15/2009 05:24 PM, Anthony Liguori wrote:
> > Dor Laor wrote:
> >> Libvirt does not support r2d. I hope it won't start to support it.
> >
> > It supports mips, sparc, and ppc machines now.  I don't see why it 
> > wouldn't support r2d.  For ppcemb, I expect this same problem to 
> > occur.  This sort of restriction is going to be common with embedded 
> > boards.
> 
> I expect these restrictions will have to be known by the management 
> application.  Otherwise the users will try invalid configurations only 
> to receive errors when they launch them.  GUIs exist to guide users, not 
> as an inefficient means of trial-and-error.

So long as the restrictions would be known to the management app via
some "what slots are available" mechanism in qemu, that sounds fine.

> > I'm not at all arguing against pci_addr.  I'm arguing about how 
> > libvirt should use it with respect to the "genesis" use-case where 
> > libvirt has no specific reason to choose one PCI slot over another.  
> > In that case, I'm merely advocating that we want to let QEMU make the 
> > decision.
> 
> However this may end up, isn't it offtopic?  Whatever we do we have to 
> support both pci_addr= and default placement, so we can push this 
> discussion to livirt-devel and bid them godspeed.

Presumably you're not proposing that qemu-devel completely ignore the
typical requirements of management apps?

You can push the discussion to libvirt-devel, and the conclusion would
most likely be:

  "We can do slot allocation if you provide us with a way to query free 
   slots, or we can use qemu's default allocation if you provide us a
   way to query the allocation.

   We'd prefer the default allocation problem, but we don't really 
   care. Both require about the same amount of work for us."

libvirt was only mentioned in this thread as a concrete example of how
the suggested solutions would actually be used by management apps.

Cheers,
Mark.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Mark McLoughlin June 15, 2009, 4:27 p.m. UTC | #95
On Mon, 2009-06-15 at 18:12 +0300, Dor Laor wrote:
> > It doesn't want to. As Mark said, libvirt just wants to be able to ensure
> > a stable guest ABI, of which stable PCI addresses is one aspect. This does
> > not imply libvirt wants to allocate the PCI addresses, just that it wants
> > a way to keep them stable. All else being equal I'd rather libvirt wasn't
> > in the PCI address allocation business.
> >   
> 
> It's not about what libvirt wants. It's about what will serve the end 
> user the most.

Absolutely. And not just about what most helps end users of libvirt
based management apps, but also any app managing qemu.

> Apart from a stable guest ABI, end users need to have the option to
> control the slot for their devices, just like they have for physical
> machines. It's not a theoretical discussion; limiting issues with shared
> irq is one real-life example.

Providing end users with the *option* to choose PCI slots sounds like a
fine feature request for any management app.

Requiring all management apps to force end users to explicitly choose
PCI slots in order for slots to be stable is not so reasonable.

Cheers,
Mark.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Anthony Liguori June 15, 2009, 6:12 p.m. UTC | #96
Mark McLoughlin wrote:
> So long as the restrictions would be known to the management app via
> some "what slots are available" mechanism in qemu, that sounds fine.
>   

I'm not sure a "what slots are available" mechanism is as straight 
forward as has been claimed.  It doesn't matter though because it's 
orthogonal to the current proposal.

>>> I'm not at all arguing against pci_addr.  I'm arguing about how 
>>> libvirt should use it with respect to the "genesis" use-case where 
>>> libvirt has no specific reason to choose one PCI slot over another.  
>>> In that case, I'm merely advocating that we want to let QEMU make the 
>>> decision.
>>>       
>> However this may end up, isn't it offtopic?  Whatever we do we have to 
>> support both pci_addr= and default placement, so we can push this 
>> discussion to livirt-devel and bid them godspeed.
>>     
>
> Presumably you're not proposing that qemu-devel completely ignore the
> typical requirements of management apps?
>   

This is a happy case where the current proposals allow both usages to 
occur.  Which one libvirt chooses it up to it.

To summarize, I think we have:

1) Introduce addressing to all host device configurations
  - Either in the canonical form "pci_addr=bus:dev.fn or target=3,lun=1" 
or in flattened form "addr=bus:dev.fn or addr=target.lun".  I prefer the 
later form but I think either would be acceptable.

2) Whenever the default machine type changes in a guest-visible way, 
introduce a new machine type
  - Use explicit versions in name: pc-v1, pc-v2 or use more descriptive 
names pc-with-usb
  - Easily transitions to device config files

Regards,

Anthony Liguori
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity June 15, 2009, 6:21 p.m. UTC | #97
On 06/15/2009 09:12 PM, Anthony Liguori wrote:
>
> 2) Whenever the default machine type changes in a guest-visible way, 
> introduce a new machine type

s/whenever/qemu stable release/

>  - Use explicit versions in name: pc-v1, pc-v2

pc-qemu-0.10?

This is similar to a hardware vendor's model number (though they tend to 
change components without changing model numbers, the naughty vendors)

> or use more descriptive names pc-with-usb
>  - Easily transitions to device config files

Combinatorial explosion.  Just use -usb.
Anthony Liguori June 15, 2009, 6:24 p.m. UTC | #98
Avi Kivity wrote:
> On 06/15/2009 09:12 PM, Anthony Liguori wrote:
>>
>> 2) Whenever the default machine type changes in a guest-visible way, 
>> introduce a new machine type
>
> s/whenever/qemu stable release/
>
>>  - Use explicit versions in name: pc-v1, pc-v2
>
> pc-qemu-0.10?
>
> This is similar to a hardware vendor's model number (though they tend 
> to change components without changing model numbers, though naughty 
> vendors)

Yup, that makes a whole lot of sense.

Regards,

Anthony Liguori
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Blue Swirl June 15, 2009, 6:44 p.m. UTC | #99
On 6/15/09, Avi Kivity <avi@redhat.com> wrote:
> On 06/15/2009 09:12 PM, Anthony Liguori wrote:
>
> >
> > 2) Whenever the default machine type changes in a guest-visible way,
> introduce a new machine type
> >
>
>  s/whenever/qemu stable release/
>
>
> >  - Use explicit versions in name: pc-v1, pc-v2
> >
>
>  pc-qemu-0.10?

pc-2009.06? Or given the hardware, should that be pc-1997?
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity June 16, 2009, 8:56 a.m. UTC | #100
On 06/15/2009 09:44 PM, Blue Swirl wrote:
>>   pc-qemu-0.10?
>>      
>
> pc-2009.06? Or given the hardware, should that be pc-1997?
>    

pc-qemu-0.10 has the obvious benefit of allowing people to immediately 
know what's the oldest version of qemu that supports it.
Mark McLoughlin June 16, 2009, 12:14 p.m. UTC | #101
On Mon, 2009-06-15 at 13:12 -0500, Anthony Liguori wrote:
> Mark McLoughlin wrote:
> > So long as the restrictions would be known to the management app via
> > some "what slots are available" mechanism in qemu, that sounds fine.
> >   
> 
> I'm not sure a "what slots are available" mechanism is as straight 
> forward as has been claimed.

If qemu can't provide that information, then the management app does not
have sufficient information to do the slot allocation itself. In which
case, it must leave it up to qemu to do it.

> It doesn't matter though because it's orthogonal to the current proposal.

It is not orthogonal to solving the actual problem at hand, though -
i.e. how to allow management apps to provide stable PCI addresses.

> >>> I'm not at all arguing against pci_addr.  I'm arguing about how 
> >>> libvirt should use it with respect to the "genesis" use-case where 
> >>> libvirt has no specific reason to choose one PCI slot over another.  
> >>> In that case, I'm merely advocating that we want to let QEMU make the 
> >>> decision.
> >>>       
> >> However this may end up, isn't it offtopic?  Whatever we do we have to 
> >> support both pci_addr= and default placement, so we can push this 
> >> discussion to livirt-devel and bid them godspeed.
> >>     
> >
> > Presumably you're not proposing that qemu-devel completely ignore the
> > typical requirements of management apps?
> >   
> 
> This is a happy case where the current proposals allow both usages to 
> occur.  Which one libvirt chooses it up to it.
> 
> To summarize, I think we have:
> 
> 1) Introduce addressing to all host device configurations
>   - Either in the canonical form "pci_addr=bus:dev.fn or target=3,lun=1" 
> or in flattened form "addr=bus:dev.fn or addr=target.lun".  I prefer the 
> later form but I think either would be acceptable.

That helps, but it's not enough on its own.

The management app needs to figure out what addresses to pass either by:

   a) Initially allowing qemu to do the address allocation, and 
      thereafter using those addresses - this requires some way to query
      the addresses of devices

or b) Doing the initial address allocation itself - this requires some 
      way to query what slots are available.

> 2) Whenever the default machine type changes in a guest-visible way, 
> introduce a new machine type
>   - Use explicit versions in name: pc-v1, pc-v2 or use more descriptive 
> names pc-with-usb
>   - Easily transitions to device config files

To be clear - you're not proposing this is a solution to the "stable PCI
addresses" problem, are you? The main requirement is for the addresses
to stay stable even if the user adds/removes other devices.

This is a fine solution to the "stable guest ABI" problem ... assuming
there's some way of querying the current default machine type.

Cheers,
Mark.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity June 16, 2009, 12:28 p.m. UTC | #102
On 06/16/2009 03:14 PM, Mark McLoughlin wrote:
> On Mon, 2009-06-15 at 13:12 -0500, Anthony Liguori wrote:
>    
>> Mark McLoughlin wrote:
>>      
>>> So long as the restrictions would be known to the management app via
>>> some "what slots are available" mechanism in qemu, that sounds fine.
>>>
>>>        
>> I'm not sure a "what slots are available" mechanism is as straight
>> forward as has been claimed.
>>      
>
> If qemu can't provide that information, then the management app does not
> have sufficient information to do the slot allocation itself. In which
> case, it must leave it up to qemu to do it.
>    

A given -M machine will have well-known open slots (since it's an ABI), 
same as it has rtl8139 and ne2000 cards.  Worst case we hardcode those 
numbers (gasp, faint).

>> It doesn't matter though because it's orthogonal to the current proposal.
>>      
>
> It is not orthogonal to solving the actual problem at hand, though -
> i.e. how to allow management apps to provide stable PCI addresses.
>    

It's part of the solution, but hardly the most difficult part.

> This is a fine solution to the "stable guest ABI" problem ... assuming
> there's some way of querying the current default machine type.
>    

     $ qemu -print-default-machine

or maybe

     $ qemu -show default-machine
     $ qemu -show pci-bus
     $ qemu -show me a way out
Mark McLoughlin June 16, 2009, 12:39 p.m. UTC | #103
On Tue, 2009-06-16 at 15:28 +0300, Avi Kivity wrote:
> On 06/16/2009 03:14 PM, Mark McLoughlin wrote:
> > On Mon, 2009-06-15 at 13:12 -0500, Anthony Liguori wrote:
> >    
> >> Mark McLoughlin wrote:
> >>      
> >>> So long as the restrictions would be known to the management app via
> >>> some "what slots are available" mechanism in qemu, that sounds fine.
> >>>
> >>>        
> >> I'm not sure a "what slots are available" mechanism is as straight
> >> forward as has been claimed.
> >>      
> >
> > If qemu can't provide that information, then the management app does not
> > have sufficient information to do the slot allocation itself. In which
> > case, it must leave it up to qemu to do it.
> >    
> 
> A given -M machine will have well-known open slots (since it's an ABI), 
> same as it has rtl8139 and ne2000 cards.

If they're so obviously well-known, I don't see how the query mechanism
would not be straightforward, which is the comment I was replying to.

> Worst case we hardcode those numbers (gasp, faint).

Maybe we can just add the open slots to the -help output. That'd be nice
and clean.

> >> It doesn't matter though because it's orthogonal to the current proposal.
> >>      
> >
> > It is not orthogonal to solving the actual problem at hand, though -
> > i.e. how to allow management apps to provide stable PCI addresses.
> >    
> 
> It's part of the solution, but hardly a difficult the most difficult part.

Agree.

> > This is a fine solution to the "stable guest ABI" problem ... assuming
> > there's some way of querying the current default machine type.
> >    
> 
>      $ qemu -print-default-machine

Or:

  $ readlink /usr/share/qemu/machine-types/pc.dt

Cheers,
Mark.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity June 16, 2009, 12:51 p.m. UTC | #104
On 06/16/2009 03:39 PM, Mark McLoughlin wrote:

>> Worst case we hardcode those numbers (gasp, faint).
>>      
>
> Maybe we can just add the open slots to the -help output. That'd be nice
> and clean.
>    

Yeah, there's precedent too.

> Or:
>
>    $ readlink /usr/share/qemu/machine-types/pc.dt
>
>    

That works if you have exactly one qemu installed.  It's best if qemu 
itself is the entry point (qemu -print-device-tree).

Though I wouldn't want to inflict it upon the management application 
writers.
Jamie Lokier June 16, 2009, 6:32 p.m. UTC | #105
Avi Kivity wrote:
> Another issue is enumeration.  Guests will present their devices in the 
> order they find them on the pci bus (of course enumeration is guest 
> specific).  So if I have 2 virtio controllers the only way I can 
> distinguish between them is using their pci slots.

virtio controllers really should have a user-suppliable string or UUID
to identify them to the guest.  Don't they?

-- Jamie
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jamie Lokier June 16, 2009, 6:38 p.m. UTC | #106
Mark McLoughlin wrote:
> > After libvirt has done -drive file=foo... it should dump the machine 
> > config and use that from then on.
> 
> Right - libvirt then wouldn't be able to avoid the complexity of merging
> any future changes into the dumped machine config.

As long as qemu can accept a machine config _and_ -drive file=foo (and
monitor commands to add/remove devices), libvirt could merge by simply
calling qemu with whatever additional command line options or monitor
commands modify the config, then dump the new config.

That way, virtio would not have to deal with that complexity.  It
would be written in one place: qemu.

Or better, a utility: qemu-machine-config.

-- Jamie
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jamie Lokier June 16, 2009, 6:44 p.m. UTC | #107
Mark McLoughlin wrote:
> > Worst case we hardcode those numbers (gasp, faint).
> 
> Maybe we can just add the open slots to the -help output. That'd be nice
> and clean.

Make them part of the machine configuration.

After all, they are part of the machine configuration, and ACPI, BIOS
etc. need to know about all the machine slots anyway.

Having said that, I prefer the idea that slot allocation is handled
either in Qemu, or in a separate utility called qemu-machine-config
(for working with machine configs), or in a library
libqemu-machine-config.so.

I particularly don't like the idea of arcane machine-dependent slot
allocation knowledge living in libvirt, because it needs to be in Qemu
anyway for non-libvirt users.  No point in having two implementations
of something tricky and likely to have machine quirks, if one will do.

-- Jamie
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity June 17, 2009, 6:38 a.m. UTC | #108
On 06/16/2009 09:32 PM, Jamie Lokier wrote:
> Avi Kivity wrote:
>    
>> Another issue is enumeration.  Guests will present their devices in the
>> order they find them on the pci bus (of course enumeration is guest
>> specific).  So if I have 2 virtio controllers the only way I can
>> distinguish between them is using their pci slots.
>>      
>
> virtio controllers really should have a user-suppliable string or UUID
> to identify them to the guest.  Don't they?
>    

virtio controllers don't exist.  When they do, they may have a UUID or 
not, but in either case guest infrastructure is in place for reporting 
the PCI slot, not the UUID.

virtio disks do have a UUID.  I don't think older versions of Windows 
will use it though, so if you reorder your slots you'll see your drive 
letters change.  Same with Linux if you don't use udev by-uuid rules.
Mark McLoughlin June 17, 2009, 8:33 a.m. UTC | #109
On Tue, 2009-06-16 at 19:44 +0100, Jamie Lokier wrote:
> Mark McLoughlin wrote:
> > > Worst case we hardcode those numbers (gasp, faint).
> > 
> > Maybe we can just add the open slots to the -help output. That'd be nice
> > and clean.

I was being sarcastic - libvirt currently must parse qemu -help, and
even has some test infrastructure to check that it works with various
versions of qemu. Extending this would not be nice and clean :-)

> I particularly don't like the idea of arcane machine-dependent slot
> allocation knowledge living in libvirt, because it needs to be in Qemu
> anyway for non-libvirt users.  No point in having two implementations
> of something tricky and likely to have machine quirks, if one will do.

Indeed.

Cheers,
Mark.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity June 17, 2009, 9:03 a.m. UTC | #110
On 06/17/2009 11:33 AM, Mark McLoughlin wrote:
>> I particularly don't like the idea of arcane machine-dependent slot
>> allocation knowledge living in libvirt, because it needs to be in Qemu
>> anyway for non-libvirt users.  No point in having two implementations
>> of something tricky and likely to have machine quirks, if one will do.
>>      
>
> Indeed.
>    

I don't understand this.  Management already has to allocate MAC 
addresses, UUIDs, IDE interface and master/slave role, SCSI 
LUNs/targets/whatever.  It has to understand NUMA (if not do actual 
allocation).  Even if it doesn't allocate the slots, it has to be able 
to query them so it can tell the user which NIC or controller is 
connected where, or to do hotunplug.  It has to understand that there is 
a limitation on the number of slots, and know what that limitation is 
(unless it feels that launching an overcommitted guest and showing an 
error to the user is preferable to not allowing the user to overcommit 
in the first place).

If you'll review my patent application for pci slot allocation, you'll 
see the following line:

   slot_nr = nb_allocated_slots++; /* Allocate pci slot */

while there is a lot of complicated setup code before that (see the 
prior art section as well), I believe licensees could well implement the 
algorithm in two short months, including testing.
Mark McLoughlin June 17, 2009, 9:18 a.m. UTC | #111
On Wed, 2009-06-17 at 12:03 +0300, Avi Kivity wrote:
> On 06/17/2009 11:33 AM, Mark McLoughlin wrote:
> >> I particularly don't like the idea of arcane machine-dependent slot
> >> allocation knowledge living in libvirt, because it needs to be in Qemu
> >> anyway for non-libvirt users.  No point in having two implementations
> >> of something tricky and likely to have machine quirks, if one will do.
> >
> > Indeed.
> 
> I don't understand this.

Take note of the "arcane machine-dependent slot allocation knowledge"
bit.

If the algorithm for management apps is as simple as "query qemu for
available slots and sequentially allocate slots", then that's perfectly
fine.

If management apps need to hard-code which slots are available on
different targets and different qemu versions, or restrictions on which
devices can use which slots, or knowledge that some devices can be
multi-function, or ... anything like that is just lame.

Cheers,
Mark.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity June 17, 2009, 9:26 a.m. UTC | #112
On 06/17/2009 12:18 PM, Mark McLoughlin wrote:
> On Wed, 2009-06-17 at 12:03 +0300, Avi Kivity wrote:
>    
>> On 06/17/2009 11:33 AM, Mark McLoughlin wrote:
>>      
>>>> I particularly don't like the idea of arcane machine-dependent slot
>>>> allocation knowledge living in libvirt, because it needs to be in Qemu
>>>> anyway for non-libvirt users.  No point in having two implementations
>>>> of something tricky and likely to have machine quirks, if one will do.
>>>>          
>>> Indeed.
>>>        
>> I don't understand this.
>>      
>
> Take note of the "arcane machine-dependent slot allocation knowledge"
> bit.
>
> If the algorithm in for management apps is as simple as "query qemu for
> available slots and sequentially allocate slots", then that's perfectly
> fine.
>    

That's the thinking.

> If management apps need to hard-code which slots are available on
> different targets and different qemu versions, or restrictions on which
> devices can use which slots, or knowledge that some devices can be
> multi-function, or ... anything like that is just lame.
>    

You can't abstract these things away.  If you can't put a NIC in slot 4, 
and you have 7 slots, then you cannot have 7 NICs.  Having qemu allocate 
the slot numbers does not absolve management from knowing this 
limitation and preventing the user from creating a machine with 7 NICs.

Likewise, management will have to know which devices are multi-function, 
since that affects their hotpluggability.  Ditto if some slot is faster 
than others, if you want to make use of this information you have to let 
the upper layers know.

It could be done using an elaborate machine description that qemu 
exposes to management coupled with a constraint solver that optimizes 
the machine layout according to user specifications and hardware 
limitations.  Or we could take the view that real life is not perfect 
(especially where computers are involved), add some machine specific 
knowledge, and spend the rest of the summer at the beach.
Jamie Lokier June 17, 2009, 11:51 a.m. UTC | #113
Avi Kivity wrote:
> On 06/16/2009 09:32 PM, Jamie Lokier wrote:
> >Avi Kivity wrote:
> >   
> >>Another issue is enumeration.  Guests will present their devices in the
> >>order they find them on the pci bus (of course enumeration is guest
> >>specific).  So if I have 2 virtio controllers the only way I can
> >>distinguish between them is using their pci slots.
> >
> >virtio controllers really should have a user-suppliable string or UUID
> >to identify them to the guest.  Don't they?
> 
> virtio controllers don't exist.  When they do, they may have a UUID or 
> not, but in either case guest infrastructure is in place for reporting 
> the PCI slot, not the UUID.
> 
> virtio disks do have a UUID.  I don't think older versions of Windows 
> will use it though, so if you reorder your slots you'll see your drive 
> letters change.  Same with Linux if you don't use udev by-uuid rules.

I guess I meant virtio disks, so that's ok.

-- Jamie
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jamie Lokier June 17, 2009, 11:58 a.m. UTC | #114
Avi Kivity wrote:
> >If management apps need to hard-code which slots are available on
> >different targets and different qemu versions, or restrictions on which
> >devices can use which slots, or knowledge that some devices can be
> >multi-function, or ... anything like that is just lame.
> >   
> 
> You can't abstract these things away.  If you can't put a NIC in slot 4, 
> and you have 7 slots, then you cannot have 7 NICs.  Having qemu allocate 
> the slot numbers does not absolve management from knowing this 
> limitation and preventing the user from creating a machine with 7 slots.
> 
> Likewise, management will have to know which devices are multi-function, 
> since that affects their hotpluggability.  Ditto if some slot if faster 
> than others, if you want to make use of this information you have to let 
> the upper layers know.
> 
> It could be done using an elaborate machine description that qemu 
> exposes to management coupled with a constraint solver that optimizes 
> the machine layout according to user specifications and hardware 
> limitations.  Or we could take the view that real life is not perfect 
> (especially where computers are involved), add some machine specific 
> knowledge, and spend the rest of the summer at the beach.

To be honest, an elaborate machine description is probably fine...

A fancy constraint solver is not required.  A simple one strikes me as
about as simple as what you'd hard-code anyway, but with fewer special
cases.

Note that the result can fail due to things like insufficient address
space for all the device BARs even when they _are_ in the right slots.
Especially if there are lots of slots, or bridges which can provide
unlimited slots.

That is arcane: device-dependent, CPU-dependent, machine-dependent,
RAM-size dependent (in a non-linear way), device-option-dependent and
probably QEMU-version-dependent too.

It would be nice if libvirt (et al) would prevent the user from
creating a VM with insufficient BAR space for that machine, but I'm
not sure how to do it sanely, without arcane knowledge getting about.

Maybe that idea of a .so shared by qemu and libvirt, to manipulate
device configurations, is a sane one after all.

-- Jamie
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/hw/pci.c b/hw/pci.c
index 361d741..ed011b5 100644
--- a/hw/pci.c
+++ b/hw/pci.c
@@ -130,12 +130,13 @@  void pci_device_save(PCIDevice *s, QEMUFile *f)
     int version = s->cap_present ? 3 : 2;
     int i;
 
-    qemu_put_be32(f, version); /* PCI device version */
+    /* PCI device version and capabilities */
+    qemu_put_be32(f, version);
+    if (version >= 3)
+        qemu_put_be32(f, s->cap_present);
     qemu_put_buffer(f, s->config, 256);
     for (i = 0; i < 4; i++)
         qemu_put_be32(f, s->irq_state[i]);
-    if (version >= 3)
-        qemu_put_be32(f, s->cap_present);
 }
 
 int pci_device_load(PCIDevice *s, QEMUFile *f)
@@ -146,12 +147,6 @@  int pci_device_load(PCIDevice *s, QEMUFile *f)
     version_id = qemu_get_be32(f);
     if (version_id > 3)
         return -EINVAL;
-    qemu_get_buffer(f, s->config, 256);
-    pci_update_mappings(s);
-
-    if (version_id >= 2)
-        for (i = 0; i < 4; i ++)
-            s->irq_state[i] = qemu_get_be32(f);
     if (version_id >= 3)
         s->cap_present = qemu_get_be32(f);
     else
@@ -160,6 +155,18 @@  int pci_device_load(PCIDevice *s, QEMUFile *f)
     if (s->cap_present & ~s->cap_supported)
         return -EINVAL;
 
+    qemu_get_buffer(f, s->config, 256);
+    pci_update_mappings(s);
+
+    if (version_id >= 2)
+        for (i = 0; i < 4; i ++)
+            s->irq_state[i] = qemu_get_be32(f);
+    /* Clear wmask and used bits for capabilities.
+       Must be restored separately, since capabilities can
+       be placed anywhere in config space. */
+    memset(s->used, 0, PCI_CONFIG_SPACE_SIZE);
+    for (i = PCI_CONFIG_HEADER_SIZE; i < PCI_CONFIG_SPACE_SIZE; ++i)
+        s->wmask[i] = 0xff;
     return 0;
 }
 
@@ -870,3 +877,76 @@  PCIDevice *pci_create_simple(PCIBus *bus, int devfn, const char *name)
 
     return (PCIDevice *)dev;
 }
+
+static int pci_find_space(PCIDevice *pdev, uint8_t size)
+{
+    int offset = PCI_CONFIG_HEADER_SIZE;
+    int i;
+    for (i = PCI_CONFIG_HEADER_SIZE; i < PCI_CONFIG_SPACE_SIZE; ++i)
+        if (pdev->used[i])
+            offset = i + 1;
+        else if (i - offset + 1 == size)
+            return offset;
+    return 0;
+}
+
+static uint8_t pci_find_capability_list(PCIDevice *pdev, uint8_t cap_id,
+                                        uint8_t *prev_p)
+{
+    uint8_t next, prev;
+
+    if (!(pdev->config[PCI_STATUS] & PCI_STATUS_CAP_LIST))
+        return 0;
+
+    for (prev = PCI_CAPABILITY_LIST; (next = pdev->config[prev]);
+         prev = next + PCI_CAP_LIST_NEXT)
+        if (pdev->config[next + PCI_CAP_LIST_ID] == cap_id)
+            break;
+
+    *prev_p = prev;
+    return next;
+}
+
+/* Reserve space and add capability to the linked list in pci config space */
+int pci_add_capability(PCIDevice *pdev, uint8_t cap_id, uint8_t size)
+{
+    uint8_t offset = pci_find_space(pdev, size);
+    uint8_t *config = pdev->config + offset;
+    if (!offset)
+        return -ENOSPC;
+    config[PCI_CAP_LIST_ID] = cap_id;
+    config[PCI_CAP_LIST_NEXT] = pdev->config[PCI_CAPABILITY_LIST];
+    pdev->config[PCI_CAPABILITY_LIST] = offset;
+    pdev->config[PCI_STATUS] |= PCI_STATUS_CAP_LIST;
+    memset(pdev->used + offset, 0xFF, size);
+    /* Make capability read-only by default */
+    memset(pdev->wmask + offset, 0, size);
+    return offset;
+}
+
+/* Unlink capability from the pci config space. */
+void pci_del_capability(PCIDevice *pdev, uint8_t cap_id, uint8_t size)
+{
+    uint8_t prev, offset = pci_find_capability_list(pdev, cap_id, &prev);
+    if (!offset)
+        return;
+    pdev->config[prev] = pdev->config[offset + PCI_CAP_LIST_NEXT];
+    /* Make capability writeable again */
+    memset(pdev->wmask + offset, 0xff, size);
+    memset(pdev->used + offset, 0, size);
+
+    if (!pdev->config[PCI_CAPABILITY_LIST])
+        pdev->config[PCI_STATUS] &= ~PCI_STATUS_CAP_LIST;
+}
+
+/* Reserve space for capability at a known offset (to call after load). */
+void pci_reserve_capability(PCIDevice *pdev, uint8_t offset, uint8_t size)
+{
+    memset(pdev->used + offset, 0xff, size);
+}
+
+uint8_t pci_find_capability(PCIDevice *pdev, uint8_t cap_id)
+{
+    uint8_t prev;
+    return pci_find_capability_list(pdev, cap_id, &prev);
+}
diff --git a/hw/pci.h b/hw/pci.h
index 6f0803f..4838c59 100644
--- a/hw/pci.h
+++ b/hw/pci.h
@@ -123,6 +123,10 @@  typedef struct PCIIORegion {
 #define PCI_MIN_GNT		0x3e	/* 8 bits */
 #define PCI_MAX_LAT		0x3f	/* 8 bits */
 
+/* Capability lists */
+#define PCI_CAP_LIST_ID		0	/* Capability ID */
+#define PCI_CAP_LIST_NEXT	1	/* Next capability in the list */
+
 #define PCI_REVISION            0x08    /* obsolete, use PCI_REVISION_ID */
 #define PCI_SUBVENDOR_ID        0x2c    /* obsolete, use PCI_SUBSYSTEM_VENDOR_ID */
 #define PCI_SUBDEVICE_ID        0x2e    /* obsolete, use PCI_SUBSYSTEM_ID */
@@ -130,7 +134,7 @@  typedef struct PCIIORegion {
 /* Bits in the PCI Status Register (PCI 2.3 spec) */
 #define PCI_STATUS_RESERVED1	0x007
 #define PCI_STATUS_INT_STATUS	0x008
-#define PCI_STATUS_CAPABILITIES	0x010
+#define PCI_STATUS_CAP_LIST	0x010
 #define PCI_STATUS_66MHZ	0x020
 #define PCI_STATUS_RESERVED2	0x040
 #define PCI_STATUS_FAST_BACK	0x080
@@ -160,6 +164,9 @@  struct PCIDevice {
     /* Used to implement R/W bytes */
     uint8_t wmask[PCI_CONFIG_SPACE_SIZE];
 
+    /* Used to allocate config space for capabilities. */
+    uint8_t used[PCI_CONFIG_SPACE_SIZE];
+
     /* the following fields are read only */
     PCIBus *bus;
     int devfn;
@@ -194,6 +201,15 @@  void pci_register_io_region(PCIDevice *pci_dev, int region_num,
                             uint32_t size, int type,
                             PCIMapIORegionFunc *map_func);
 
+int pci_add_capability(PCIDevice *pci_dev, uint8_t cap_id, uint8_t cap_size);
+
+void pci_del_capability(PCIDevice *pci_dev, uint8_t cap_id, uint8_t cap_size);
+
+void pci_reserve_capability(PCIDevice *pci_dev, uint8_t offset, uint8_t size);
+
+uint8_t pci_find_capability(PCIDevice *pci_dev, uint8_t cap_id);
+
+
 uint32_t pci_default_read_config(PCIDevice *d,
                                  uint32_t address, int len);
 void pci_default_write_config(PCIDevice *d,