diff mbox

Graphics pass-through

Message ID 1296175554.2891.29.camel@x201 (mailing list archive)
State New, archived
Headers show

Commit Message

Alex Williamson Jan. 28, 2011, 12:45 a.m. UTC
None

Comments

Jan Kiszka May 5, 2011, 8:50 a.m. UTC | #1
Hi Alex,

On 2011-01-28 01:45, Alex Williamson wrote:
> On Thu, 2011-01-27 at 12:56 +0100, André Weidemann wrote:
>> Hi Alex,
>>
>> On 26.01.2011 06:12, Alex Williamson wrote:
>>
>>> So while your initial results are promising, my guess is that you're
>>> using card specific drivers and still need to consider some of the
>>> harder problems with generic support for vga assignment.  I hacked on
>>> this for a bit trying to see if I could get vga assignment working
>>> with the vfio driver.  Setting up the legacy access and preventing
>>> qemu from stealing it back should get you basic vga modes and might
>>> even allow the option rom to run to initialize the card for pre-boot.
>>> I was able to get this far on a similar ATI card.  I never had much
>>> luck with other cards though, and I was never able to get the vesa
>>> extensions working.  Thanks,
>>
>> Do you mind sharing these patches?
> 
> Attached.
> 

We are about to try some pass-through with an NVIDIA card. So I already
hacked on your vfio patch to make it build against current device
assignment code. Some questions arose while studying the code:

...

> --- /dev/null
> +++ b/hw/vfio-vga.c
> @@ -0,0 +1,291 @@
> +/*
> + * vfio VGA device assignment support
> + *
> + * Copyright Red Hat, Inc. 2010
> + *
> + * Authors:
> + *  Alex Williamson <alex.williamson@redhat.com>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2.  See
> + * the COPYING file in the top-level directory.
> + *
> + * Based on qemu-kvm device-assignment:
> + *  Adapted for KVM by Qumranet.
> + *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
> + *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
> + *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
> + *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
> + *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
> + */
> +
> +#include <stdio.h>
> +#include <unistd.h>
> +#include <sys/io.h>
> +#include <sys/mman.h>
> +#include <sys/types.h>
> +#include <sys/stat.h>
> +#include "event_notifier.h"
> +#include "hw.h"
> +#include "memory.h"
> +#include "monitor.h"
> +#include "pc.h"
> +#include "qemu-error.h"
> +#include "sysemu.h"
> +#include "vfio.h"
> +#include <pci/header.h>
> +#include <pci/types.h>
> +#include <linux/types.h>
> +#include "linux-vfio.h"
> +
> +//#define DEBUG_VFIO_VGA
> +#ifdef DEBUG_VFIO_VGA
> +#define DPRINTF(fmt, ...) \
> +    do { printf("vfio-vga: " fmt, ## __VA_ARGS__); } while (0)
> +#else
> +#define DPRINTF(fmt, ...) \
> +    do { } while (0)
> +#endif
> +
> +/*
> + * VGA setup
> + */
> +static void vfio_vga_write(VFIODevice *vdev, uint32_t addr,
> +                           uint32_t val, int len)
> +{
> +    DPRINTF("%s 0x%x %d - 0x%x\n", __func__, 0xa0000 + addr, len, val);
> +    switch (len) {
> +        case 1:
> +            *(uint8_t *)(vdev->vga_mmio + addr) = (uint8_t)val;
> +            break;
> +        case 2:
> +            *(uint16_t *)(vdev->vga_mmio + addr) = (uint16_t)val;
> +            break;
> +        case 4:
> +            *(uint32_t *)(vdev->vga_mmio + addr) = val;
> +            break;
> +    }
> +}
> +
> +static void vfio_vga_writeb(void *opaque, target_phys_addr_t addr, uint32_t val)
> +{
> +    vfio_vga_write(opaque, addr, val, 1);
> +}
> +
> +static void vfio_vga_writew(void *opaque, target_phys_addr_t addr, uint32_t val)
> +{
> +    vfio_vga_write(opaque, addr, val, 2);
> +}
> +
> +static void vfio_vga_writel(void *opaque, target_phys_addr_t addr, uint32_t val)
> +{
> +    vfio_vga_write(opaque, addr, val, 4);
> +}
> +
> +static CPUWriteMemoryFunc * const vfio_vga_writes[] = {
> +    &vfio_vga_writeb,
> +    &vfio_vga_writew,
> +    &vfio_vga_writel
> +};
> +
> +static uint32_t vfio_vga_read(VFIODevice *vdev, uint32_t addr, int len)
> +{
> +    uint32_t val = 0xffffffff;
> +    switch (len) {
> +        case 1:
> +            val = (uint32_t)*(uint8_t *)(vdev->vga_mmio + addr);
> +            break;
> +        case 2:
> +            val = (uint32_t)*(uint16_t *)(vdev->vga_mmio + addr);
> +            break;
> +        case 4:
> +            val = *(uint32_t *)(vdev->vga_mmio + addr);
> +            break;
> +    }
> +    DPRINTF("%s 0x%x %d = 0x%x\n", __func__, 0xa0000 + addr, len, val);
> +    return val;
> +}
> +
> +static uint32_t vfio_vga_readb(void *opaque, target_phys_addr_t addr)
> +{
> +    return vfio_vga_read(opaque, addr, 1);
> +}
> +
> +static uint32_t vfio_vga_readw(void *opaque, target_phys_addr_t addr)
> +{
> +    return vfio_vga_read(opaque, addr, 2);
> +}
> +
> +static uint32_t vfio_vga_readl(void *opaque, target_phys_addr_t addr)
> +{
> +    return vfio_vga_read(opaque, addr, 4);
> +}
> +
> +static CPUReadMemoryFunc * const vfio_vga_reads[] = {
> +    &vfio_vga_readb,
> +    &vfio_vga_readw,
> +    &vfio_vga_readl
> +};
> +
> +static void vfio_vga_out(VFIODevice *vdev, uint32_t addr, uint32_t val, int len)
> +{
> +    DPRINTF("%s 0x%x %d - 0x%x\n", __func__, addr, len, val);
> +    ioperm(0x3b0, 0x30, 1); /* XXX fix me */

Why do you have to re-establish the ioperms here on each access? Are we
just lacking the use of generic kvm ioperm management?

> +    switch (len) {
> +        case 1:
> +            outb(val, addr);
> +            break;
> +        case 2:
> +            outw(val, addr);
> +            break;
> +        case 4:
> +            outl(val, addr);
> +            break;
> +    }
> +}
> +
> +static void vfio_vga_outb(void *opaque, uint32_t addr, uint32_t val)
> +{
> +    vfio_vga_out(opaque, addr, val, 1);
> +}
> +
> +static void vfio_vga_outw(void *opaque, uint32_t addr, uint32_t val)
> +{
> +    vfio_vga_out(opaque, addr, val, 2);
> +}
> +
> +static void vfio_vga_outl(void *opaque, uint32_t addr, uint32_t val)
> +{
> +    vfio_vga_out(opaque, addr, val, 4);
> +}
> +
> +static uint32_t vfio_vga_in(VFIODevice *vdev, uint32_t addr, int len)
> +{
> +    uint32_t val = 0xffffffff;
> +    ioperm(0x3b0, 0x30, 1); /* XXX fix me */
> +    switch (len) {
> +        case 1:
> +            val = inb(addr);
> +            break;
> +        case 2:
> +            val = inw(addr);
> +            break;
> +        case 4:
> +            val = inl(addr);
> +            break;
> +    }
> +    DPRINTF("%s 0x%x, %d = 0x%x\n", __func__, addr, len, val);
> +    return val;
> +}
> +
> +static uint32_t vfio_vga_inb(void *opaque, uint32_t addr)
> +{
> +    return vfio_vga_in(opaque, addr, 1);
> +}
> +
> +static uint32_t vfio_vga_inw(void *opaque, uint32_t addr)
> +{
> +    return vfio_vga_in(opaque, addr, 2);
> +}
> +
> +static uint32_t vfio_vga_inl(void *opaque, uint32_t addr)
> +{
> +    return vfio_vga_in(opaque, addr, 4);
> +}
> +
> +int vfio_vga_setup(VFIODevice *vdev)
> +{
> +    char buf[256];
> +    int ret;
> +
> +    if (vga_interface_type != VGA_NONE) {
> +        fprintf(stderr,
> +                "VGA devie assigned without -vga none param, no ISA VGA\n");
> +        return -1;
> +    }
> +
> +    vdev->vga_fd = open("/dev/vga_arbiter", O_RDWR);
> +    if (vdev->vga_fd < 0) {
> +        fprintf(stderr, "%s - Failed to open vga arbiter (%s)\n",
> +                __func__, strerror(errno));
> +        return -1;
> +    }
> +    ret = read(vdev->vga_fd, buf, sizeof(buf));
> +    if (ret <= 0) {
> +        fprintf(stderr, "%s - Failed to read from vga arbiter (%s)\n",
> +                __func__, strerror(errno));
> +        close(vdev->vga_fd);
> +        return -1;
> +    }
> +    buf[ret - 1] = 0;
> +    vdev->vga_orig = qemu_strdup(buf);
> +
> +    snprintf(buf, sizeof(buf), "target PCI:%04x:%02x:%02x.%x",
> +             vdev->host.seg, vdev->host.bus, vdev->host.dev, vdev->host.func);
> +    ret = write(vdev->vga_fd, buf, strlen(buf));
> +    if (ret != strlen(buf)) {
> +        fprintf(stderr, "%s - Failed to write to vga arbiter (%s)\n",
> +                __func__, strerror(errno));
> +        close(vdev->vga_fd);
> +        return -1;
> +    }
> +    snprintf(buf, sizeof(buf), "decodes io+mem");
> +    ret = write(vdev->vga_fd, buf, strlen(buf));
> +    if (ret != strlen(buf)) {
> +        fprintf(stderr, "%s - Failed to write to vga arbiter (%s)\n",
> +                __func__, strerror(errno));
> +        close(vdev->vga_fd);
> +        return -1;
> +    }

OK, so we grab the assigned adapter and make it handle legacy io+mem. I
guess this approach only works with a single guest with an assigned
adapter. Would it be possible and not extremely costly to do some
on-demand grabbing of the range to share it with multiple VMs?

And what about the host? When does Linux release the legacy range?
Always or only when a specific (!=vga/vesa) framebuffer driver is loaded?

Is there some other way to pass the legacy accesses from the guest to a
specific adapter without going via the host's legacy area? I.e. do some
adapters allow remapping?

> +
> +    vdev->vga_mmio_fd = open("/dev/mem", O_RDWR);
> +    if (vdev->vga_mmio_fd < 0) {
> +        fprintf(stderr, "%s - Failed to open /dev/mem (%s)\n",
> +                __func__, strerror(errno));
> +        return -1;
> +    }
> +    vdev->vga_mmio = mmap(NULL, 0x40000, PROT_READ | PROT_WRITE,
> +                          MAP_SHARED, vdev->vga_mmio_fd, 0xa0000);
> +    if (vdev->vga_mmio == MAP_FAILED) {
> +        fprintf(stderr, "%s - mmap failed (%s)\n", __func__, strerror(errno));
> +        return -1;
> +    }
> +
> +#if 1
> +    vdev->vga_io = cpu_register_io_memory(vfio_vga_reads,
> +                                          vfio_vga_writes, vdev);
> +    cpu_register_physical_memory(0xa0000, 0x20000, vdev->vga_io);
> +    qemu_register_coalesced_mmio(0xa0000, 0x20000);
> +#else
> +    cpu_register_physical_memory(0xa0000, 0x20000, 
> +        qemu_ram_map(&vdev->pdev.qdev, "VGA", 0x20000, vdev->vga_mmio));
> +    qemu_register_coalesced_mmio(0xa0000, 0x20000);
> +#endif

To make the second case work, we would have to track the mode switches
of the guest via legacy VGA interfaces and switch the mapping on the
fly, right?

> +
> +    register_ioport_write(0x3b0, 0x30, 1, vfio_vga_outb, vdev);
> +    register_ioport_write(0x3b0, 0x30, 2, vfio_vga_outw, vdev);
> +    register_ioport_write(0x3b0, 0x30, 4, vfio_vga_outl, vdev);
> +    register_ioport_read(0x3b0, 0x30, 1, vfio_vga_inb, vdev);
> +    register_ioport_read(0x3b0, 0x30, 2, vfio_vga_inw, vdev);
> +    register_ioport_read(0x3b0, 0x30, 4, vfio_vga_inl, vdev);
> +    if (ioperm(0x3b0, 0x30, 1)) {
> +        fprintf(stderr, "%s - ioperm failed (%s)\n", __func__, strerror(errno));
> +        return -1;
> +    }
> +    return 0;
> +}
> +
> +void vfio_vga_exit(VFIODevice *vdev)
> +{
> +    if (!vdev->vga_io)
> +        return;
> +
> +    isa_unassign_ioport(0x3b0, 0x30);
> +    qemu_unregister_coalesced_mmio(0xa0000, 0x20000);
> +    cpu_register_physical_memory(0xa0000, 0x20000, IO_MEM_UNASSIGNED);
> +    cpu_unregister_io_memory(vdev->vga_io);
> +    munmap(vdev->vga_mmio, 0x40000);
> +    close(vdev->vga_mmio_fd);
> +    qemu_free(vdev->vga_orig);
> +    close(vdev->vga_fd);
> +}
> +

Thanks,
Jan
Alex Williamson May 5, 2011, 3:17 p.m. UTC | #2
Hi Jan,

On Thu, 2011-05-05 at 10:50 +0200, Jan Kiszka wrote:
> Hi Alex,
> 
> On 2011-01-28 01:45, Alex Williamson wrote:
> > On Thu, 2011-01-27 at 12:56 +0100, André Weidemann wrote:
> >> Hi Alex,
> >>
> >> On 26.01.2011 06:12, Alex Williamson wrote:
> >>
> >>> So while your initial results are promising, my guess is that you're
> >>> using card specific drivers and still need to consider some of the
> >>> harder problems with generic support for vga assignment.  I hacked on
> >>> this for a bit trying to see if I could get vga assignment working
> >>> with the vfio driver.  Setting up the legacy access and preventing
> >>> qemu from stealing it back should get you basic vga modes and might
> >>> even allow the option rom to run to initialize the card for pre-boot.
> >>> I was able to get this far on a similar ATI card.  I never had much
> >>> luck with other cards though, and I was never able to get the vesa
> >>> extensions working.  Thanks,
> >>
> >> Do you mind sharing these patches?
> > 
> > Attached.
> > 
> 
> We are about to try some pass-through with an NVIDIA card. So I already
> hacked on your vfio patch to make it build against current device
> assignment code. Some questions arose while studying the code:

Cool!

> > --- /dev/null
> > +++ b/hw/vfio-vga.c
> > @@ -0,0 +1,291 @@
> > +/*
> > + * vfio VGA device assignment support
> > + *
> > + * Copyright Red Hat, Inc. 2010
> > + *
> > + * Authors:
> > + *  Alex Williamson <alex.williamson@redhat.com>
> > + *
> > + * This work is licensed under the terms of the GNU GPL, version 2.  See
> > + * the COPYING file in the top-level directory.
> > + *
> > + * Based on qemu-kvm device-assignment:
> > + *  Adapted for KVM by Qumranet.
> > + *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
> > + *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
> > + *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
> > + *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
> > + *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
> > + */
> > +
> > +#include <stdio.h>
> > +#include <unistd.h>
> > +#include <sys/io.h>
> > +#include <sys/mman.h>
> > +#include <sys/types.h>
> > +#include <sys/stat.h>
> > +#include "event_notifier.h"
> > +#include "hw.h"
> > +#include "memory.h"
> > +#include "monitor.h"
> > +#include "pc.h"
> > +#include "qemu-error.h"
> > +#include "sysemu.h"
> > +#include "vfio.h"
> > +#include <pci/header.h>
> > +#include <pci/types.h>
> > +#include <linux/types.h>
> > +#include "linux-vfio.h"
> > +
> > +//#define DEBUG_VFIO_VGA
> > +#ifdef DEBUG_VFIO_VGA
> > +#define DPRINTF(fmt, ...) \
> > +    do { printf("vfio-vga: " fmt, ## __VA_ARGS__); } while (0)
> > +#else
> > +#define DPRINTF(fmt, ...) \
> > +    do { } while (0)
> > +#endif
> > +
> > +/*
> > + * VGA setup
> > + */
> > +static void vfio_vga_write(VFIODevice *vdev, uint32_t addr,
> > +                           uint32_t val, int len)
> > +{
> > +    DPRINTF("%s 0x%x %d - 0x%x\n", __func__, 0xa0000 + addr, len, val);
> > +    switch (len) {
> > +        case 1:
> > +            *(uint8_t *)(vdev->vga_mmio + addr) = (uint8_t)val;
> > +            break;
> > +        case 2:
> > +            *(uint16_t *)(vdev->vga_mmio + addr) = (uint16_t)val;
> > +            break;
> > +        case 4:
> > +            *(uint32_t *)(vdev->vga_mmio + addr) = val;
> > +            break;
> > +    }
> > +}
> > +
> > +static void vfio_vga_writeb(void *opaque, target_phys_addr_t addr, uint32_t val)
> > +{
> > +    vfio_vga_write(opaque, addr, val, 1);
> > +}
> > +
> > +static void vfio_vga_writew(void *opaque, target_phys_addr_t addr, uint32_t val)
> > +{
> > +    vfio_vga_write(opaque, addr, val, 2);
> > +}
> > +
> > +static void vfio_vga_writel(void *opaque, target_phys_addr_t addr, uint32_t val)
> > +{
> > +    vfio_vga_write(opaque, addr, val, 4);
> > +}
> > +
> > +static CPUWriteMemoryFunc * const vfio_vga_writes[] = {
> > +    &vfio_vga_writeb,
> > +    &vfio_vga_writew,
> > +    &vfio_vga_writel
> > +};
> > +
> > +static uint32_t vfio_vga_read(VFIODevice *vdev, uint32_t addr, int len)
> > +{
> > +    uint32_t val = 0xffffffff;
> > +    switch (len) {
> > +        case 1:
> > +            val = (uint32_t)*(uint8_t *)(vdev->vga_mmio + addr);
> > +            break;
> > +        case 2:
> > +            val = (uint32_t)*(uint16_t *)(vdev->vga_mmio + addr);
> > +            break;
> > +        case 4:
> > +            val = *(uint32_t *)(vdev->vga_mmio + addr);
> > +            break;
> > +    }
> > +    DPRINTF("%s 0x%x %d = 0x%x\n", __func__, 0xa0000 + addr, len, val);
> > +    return val;
> > +}
> > +
> > +static uint32_t vfio_vga_readb(void *opaque, target_phys_addr_t addr)
> > +{
> > +    return vfio_vga_read(opaque, addr, 1);
> > +}
> > +
> > +static uint32_t vfio_vga_readw(void *opaque, target_phys_addr_t addr)
> > +{
> > +    return vfio_vga_read(opaque, addr, 2);
> > +}
> > +
> > +static uint32_t vfio_vga_readl(void *opaque, target_phys_addr_t addr)
> > +{
> > +    return vfio_vga_read(opaque, addr, 4);
> > +}
> > +
> > +static CPUReadMemoryFunc * const vfio_vga_reads[] = {
> > +    &vfio_vga_readb,
> > +    &vfio_vga_readw,
> > +    &vfio_vga_readl
> > +};
> > +
> > +static void vfio_vga_out(VFIODevice *vdev, uint32_t addr, uint32_t val, int len)
> > +{
> > +    DPRINTF("%s 0x%x %d - 0x%x\n", __func__, addr, len, val);
> > +    ioperm(0x3b0, 0x30, 1); /* XXX fix me */
> 
> Why do you have to re-establish the ioperms here on each access? Are we
> just lacking the use of generic kvm ioperm management?

IIRC, setting it up initially wasn't sticking, so I put it here as just
a quick fix to make sure it was set before we used it.  I never fully
made it though debugging why it wasn't working when set earlier.

In general, legacy mmio and ioport needs a better solution.  I wish x86
implemented the legacy io feature of pci sysfs so we could do it that
way, which might also move vga arbitration and chipset vga routing into
the host kernel.

> > +    switch (len) {
> > +        case 1:
> > +            outb(val, addr);
> > +            break;
> > +        case 2:
> > +            outw(val, addr);
> > +            break;
> > +        case 4:
> > +            outl(val, addr);
> > +            break;
> > +    }
> > +}
> > +
> > +static void vfio_vga_outb(void *opaque, uint32_t addr, uint32_t val)
> > +{
> > +    vfio_vga_out(opaque, addr, val, 1);
> > +}
> > +
> > +static void vfio_vga_outw(void *opaque, uint32_t addr, uint32_t val)
> > +{
> > +    vfio_vga_out(opaque, addr, val, 2);
> > +}
> > +
> > +static void vfio_vga_outl(void *opaque, uint32_t addr, uint32_t val)
> > +{
> > +    vfio_vga_out(opaque, addr, val, 4);
> > +}
> > +
> > +static uint32_t vfio_vga_in(VFIODevice *vdev, uint32_t addr, int len)
> > +{
> > +    uint32_t val = 0xffffffff;
> > +    ioperm(0x3b0, 0x30, 1); /* XXX fix me */
> > +    switch (len) {
> > +        case 1:
> > +            val = inb(addr);
> > +            break;
> > +        case 2:
> > +            val = inw(addr);
> > +            break;
> > +        case 4:
> > +            val = inl(addr);
> > +            break;
> > +    }
> > +    DPRINTF("%s 0x%x, %d = 0x%x\n", __func__, addr, len, val);
> > +    return val;
> > +}
> > +
> > +static uint32_t vfio_vga_inb(void *opaque, uint32_t addr)
> > +{
> > +    return vfio_vga_in(opaque, addr, 1);
> > +}
> > +
> > +static uint32_t vfio_vga_inw(void *opaque, uint32_t addr)
> > +{
> > +    return vfio_vga_in(opaque, addr, 2);
> > +}
> > +
> > +static uint32_t vfio_vga_inl(void *opaque, uint32_t addr)
> > +{
> > +    return vfio_vga_in(opaque, addr, 4);
> > +}
> > +
> > +int vfio_vga_setup(VFIODevice *vdev)
> > +{
> > +    char buf[256];
> > +    int ret;
> > +
> > +    if (vga_interface_type != VGA_NONE) {
> > +        fprintf(stderr,
> > +                "VGA devie assigned without -vga none param, no ISA VGA\n");
> > +        return -1;
> > +    }
> > +
> > +    vdev->vga_fd = open("/dev/vga_arbiter", O_RDWR);
> > +    if (vdev->vga_fd < 0) {
> > +        fprintf(stderr, "%s - Failed to open vga arbiter (%s)\n",
> > +                __func__, strerror(errno));
> > +        return -1;
> > +    }
> > +    ret = read(vdev->vga_fd, buf, sizeof(buf));
> > +    if (ret <= 0) {
> > +        fprintf(stderr, "%s - Failed to read from vga arbiter (%s)\n",
> > +                __func__, strerror(errno));
> > +        close(vdev->vga_fd);
> > +        return -1;
> > +    }
> > +    buf[ret - 1] = 0;
> > +    vdev->vga_orig = qemu_strdup(buf);
> > +
> > +    snprintf(buf, sizeof(buf), "target PCI:%04x:%02x:%02x.%x",
> > +             vdev->host.seg, vdev->host.bus, vdev->host.dev, vdev->host.func);
> > +    ret = write(vdev->vga_fd, buf, strlen(buf));
> > +    if (ret != strlen(buf)) {
> > +        fprintf(stderr, "%s - Failed to write to vga arbiter (%s)\n",
> > +                __func__, strerror(errno));
> > +        close(vdev->vga_fd);
> > +        return -1;
> > +    }
> > +    snprintf(buf, sizeof(buf), "decodes io+mem");
> > +    ret = write(vdev->vga_fd, buf, strlen(buf));
> > +    if (ret != strlen(buf)) {
> > +        fprintf(stderr, "%s - Failed to write to vga arbiter (%s)\n",
> > +                __func__, strerror(errno));
> > +        close(vdev->vga_fd);
> > +        return -1;
> > +    }
> 
> OK, so we grab the assigned adapter and make it handle legacy io+mem. I
> guess this approach only works with a single guest with an assigned
> adapter. Would it be possible and not extremely costly to do some
> on-demand grabbing of the range to share it with multiple VMs?

Yes, and that was my intention but never got that far.  Each legacy io
access should switch the arbiter to the necessary device.  Unfortunately
the vga arbiter only works if everyone uses it, and so far it seems like
nobody does.  Obviously some pretty hefty performance implications with
switch on every read.  I'm not sure how that's going to play out.  I
expect once we bootstrap the VGA device and load a real driver, the
legacy areas are seldom used.

> And what about the host? When does Linux release the legacy range?
> Always or only when a specific (!=vga/vesa) framebuffer driver is loaded?

Well, that's where it'd be nice if the vga arbiter was actually in more
widespread use.  It currently seems to be nothing more than a shared
mutex, but it would actually be useful if it included backends to do the
chipset vga routing changes.  I think when I was testing this, I was
externally poking PCI bridge chipset to toggle the VGA_EN bit.

> Is there some other way to pass the legacy accesses from the guest to a
> specific adapter without going via the host's legacy area? I.e. do some
> adapters allow remapping?

Not that I know of on x86.  I wouldn't be surprised if some adapters
just re-route the legacy address ranges to standard PCI mappings, but I
don't know how to figure out if that's true and what the offsets would
be.  I've seen ia64 hardware that supports a _TRA offset such that each
PCI root bridge can support its own legacy io port space, but that
requires a whole different ioport model.

I believe X.org tries to tackle this by brute force, manually changing
VGA enabled bits on PCI bridges.  I think this is part of why it's
difficult to run multiple X servers on the same system.  Not sure if
that problem has gotten any better since I last looked.

> > +
> > +    vdev->vga_mmio_fd = open("/dev/mem", O_RDWR);
> > +    if (vdev->vga_mmio_fd < 0) {
> > +        fprintf(stderr, "%s - Failed to open /dev/mem (%s)\n",
> > +                __func__, strerror(errno));
> > +        return -1;
> > +    }
> > +    vdev->vga_mmio = mmap(NULL, 0x40000, PROT_READ | PROT_WRITE,
> > +                          MAP_SHARED, vdev->vga_mmio_fd, 0xa0000);
> > +    if (vdev->vga_mmio == MAP_FAILED) {
> > +        fprintf(stderr, "%s - mmap failed (%s)\n", __func__, strerror(errno));
> > +        return -1;
> > +    }
> > +
> > +#if 1
> > +    vdev->vga_io = cpu_register_io_memory(vfio_vga_reads,
> > +                                          vfio_vga_writes, vdev);
> > +    cpu_register_physical_memory(0xa0000, 0x20000, vdev->vga_io);
> > +    qemu_register_coalesced_mmio(0xa0000, 0x20000);
> > +#else
> > +    cpu_register_physical_memory(0xa0000, 0x20000, 
> > +        qemu_ram_map(&vdev->pdev.qdev, "VGA", 0x20000, vdev->vga_mmio));
> > +    qemu_register_coalesced_mmio(0xa0000, 0x20000);
> > +#endif
> 
> To make the second case work, we would have to track the mode switches
> of the guest via legacy VGA interfaces and switch the mapping on the
> fly, right?

Yeah, something like that.   IIRC, I was expecting the second case to
work since I'm doing a static switch of the legacy address space and I
can't recall if it wasn't working or if I used the read/write interface
just so I could add fprintfs to make sure something is happening.
Thanks,

Alex

> > +
> > +    register_ioport_write(0x3b0, 0x30, 1, vfio_vga_outb, vdev);
> > +    register_ioport_write(0x3b0, 0x30, 2, vfio_vga_outw, vdev);
> > +    register_ioport_write(0x3b0, 0x30, 4, vfio_vga_outl, vdev);
> > +    register_ioport_read(0x3b0, 0x30, 1, vfio_vga_inb, vdev);
> > +    register_ioport_read(0x3b0, 0x30, 2, vfio_vga_inw, vdev);
> > +    register_ioport_read(0x3b0, 0x30, 4, vfio_vga_inl, vdev);
> > +    if (ioperm(0x3b0, 0x30, 1)) {
> > +        fprintf(stderr, "%s - ioperm failed (%s)\n", __func__, strerror(errno));
> > +        return -1;
> > +    }
> > +    return 0;
> > +}
> > +
> > +void vfio_vga_exit(VFIODevice *vdev)
> > +{
> > +    if (!vdev->vga_io)
> > +        return;
> > +
> > +    isa_unassign_ioport(0x3b0, 0x30);
> > +    qemu_unregister_coalesced_mmio(0xa0000, 0x20000);
> > +    cpu_register_physical_memory(0xa0000, 0x20000, IO_MEM_UNASSIGNED);
> > +    cpu_unregister_io_memory(vdev->vga_io);
> > +    munmap(vdev->vga_mmio, 0x40000);
> > +    close(vdev->vga_mmio_fd);
> > +    qemu_free(vdev->vga_orig);
> > +    close(vdev->vga_fd);
> > +}
> > +
> 
> Thanks,
> Jan
> 



--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jan Kiszka May 9, 2011, 11:14 a.m. UTC | #3
On 2011-05-05 17:17, Alex Williamson wrote:
>> And what about the host? When does Linux release the legacy range?
>> Always or only when a specific (!=vga/vesa) framebuffer driver is loaded?
> 
> Well, that's where it'd be nice if the vga arbiter was actually in more
> widespread use.  It currently seems to be nothing more than a shared
> mutex, but it would actually be useful if it included backends to do the
> chipset vga routing changes.  I think when I was testing this, I was
> externally poking PCI bridge chipset to toggle the VGA_EN bit.

Right, we had to drop the approach to pass through the secondary card
for now, the arbiter was not switching properly. Haven't checked yet if
VGA_EN was properly set, though the kernel code looks like it should
take care of this.

Even with handing out the primary adapter, we had only mixed success so
far. The onboard adapter worked well (in VESA mode), but the NVIDIA is
not displaying early boot messages at all. Maybe a vgabios issue.
Windows was booting nevertheless - until we installed the NVIDIA
drivers. Then it ran into a blue screen.

BTW, what ATI adapter did you use precisely, and what did work, what not?

One thing I was wondering: Most modern adapters should be PCIe these
days. Our NVIDIA definitely is. But so far we are claiming to have it
attached to a PCI bus. That caps all the extended capabilities e.g.
Could this make some relevant difference?

Jan
Alex Williamson May 9, 2011, 2:29 p.m. UTC | #4
On Mon, 2011-05-09 at 13:14 +0200, Jan Kiszka wrote:
> On 2011-05-05 17:17, Alex Williamson wrote:
> >> And what about the host? When does Linux release the legacy range?
> >> Always or only when a specific (!=vga/vesa) framebuffer driver is loaded?
> > 
> > Well, that's where it'd be nice if the vga arbiter was actually in more
> > widespread use.  It currently seems to be nothing more than a shared
> > mutex, but it would actually be useful if it included backends to do the
> > chipset vga routing changes.  I think when I was testing this, I was
> > externally poking PCI bridge chipset to toggle the VGA_EN bit.
> 
> Right, we had to drop the approach to pass through the secondary card
> for now, the arbiter was not switching properly. Haven't checked yet if
> VGA_EN was properly set, though the kernel code looks like it should
> take care of this.
> 
> Even with handing out the primary adapter, we had only mixed success so
> far. The onboard adapter worked well (in VESA mode), but the NVIDIA is
> not displaying early boot messages at all. Maybe a vgabios issue.
> Windows was booting nevertheless - until we installed the NVIDIA
> drivers. Than it ran into a blue screen.

Interesting, IIRC I could never get VESA modes to work.  I believe I
only had a basic VGA16 mode running in a Windows guest too.

> BTW, what ATI adapter did you use precisely, and what did work, what not?

I have an old X550 (rv380?).  I also have an Nvidia gs8400, but ISTR the
ATI working better for me.

> One thing I was wondering: Most modern adapters should be PCIe these
> days. Our NVIDIA definitely is. But so far we are claiming to have it
> attached to a PCI bus. That caps all the extended capabilities e.g.
> Could this make some relevant difference?

The BIOS and early boot use shouldn't care too much about that, but I
could imagine the high performance drivers potentially failing.  Thanks,

Alex


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Prasad Joshi May 9, 2011, 2:55 p.m. UTC | #5
On Mon, May 9, 2011 at 12:14 PM, Jan Kiszka <jan.kiszka@siemens.com> wrote:
> On 2011-05-05 17:17, Alex Williamson wrote:
>>> And what about the host? When does Linux release the legacy range?
>>> Always or only when a specific (!=vga/vesa) framebuffer driver is loaded?
>>
>> Well, that's where it'd be nice if the vga arbiter was actually in more
>> widespread use.  It currently seems to be nothing more than a shared
>> mutex, but it would actually be useful if it included backends to do the
>> chipset vga routing changes.  I think when I was testing this, I was
>> externally poking PCI bridge chipset to toggle the VGA_EN bit.
>
> Right, we had to drop the approach to pass through the secondary card
> for now, the arbiter was not switching properly. Haven't checked yet if
> VGA_EN was properly set, though the kernel code looks like it should
> take care of this.
>
> Even with handing out the primary adapter, we had only mixed success so
> far. The onboard adapter worked well (in VESA mode), but the NVIDIA is
> not displaying early boot messages at all. Maybe a vgabios issue.
> Windows was booting nevertheless - until we installed the NVIDIA
> drivers. Then it ran into a blue screen.
>
> BTW, what ATI adapter did you use precisely, and what did work, what not?

Not hijacking the mail thread. Just wanted to provide some inputs.

Few days back I had tried passing through the secondary graphics card.
I could pass-through two graphics cards to virtual machine.

02:00.0 VGA compatible controller: ATI Technologies Inc Redwood
[Radeon HD 5670] (prog-if 00 [VGA controller])
	Subsystem: PC Partner Limited Device e151
	Flags: bus master, fast devsel, latency 0, IRQ 87
	Memory at d0000000 (64-bit, prefetchable) [size=256M]
	Memory at fe6e0000 (64-bit, non-prefetchable) [size=128K]
	I/O ports at b000 [size=256]
	Expansion ROM at fe6c0000 [disabled] [size=128K]
	Capabilities: <access denied>
	Kernel driver in use: radeon
	Kernel modules: radeon

07:00.0 VGA compatible controller: nVidia Corporation G86 [Quadro NVS
290] (rev a1) (prog-if 00 [VGA controller])
       Subsystem: nVidia Corporation Device 0492
       Control: I/O+ Mem+ BusMaster+ SpecCycle- MemWINV- VGASnoop-
ParErr-Stepping- SERR+ FastB2B- DisINTx-
       Status: Cap+ 66MHz- UDF- FastB2B- ParErr- DEVSEL=fast
>TAbort-<TAbort- <MAbort- >SERR- <PERR- INTx-
       Latency: 0, Cache Line Size: 64 bytes
       Interrupt: pin A routed to IRQ 24
       Region 0: Memory at fd000000 (32-bit, non-prefetchable) [size=16M]
       Region 1: Memory at d0000000 (64-bit, prefetchable) [size=256M]
       Region 3: Memory at fa000000 (64-bit, non-prefetchable) [size=32M]
       Region 5: I/O ports at ec00 [size=128]
       Expansion ROM at fe9e0000 [disabled] [size=128K]
       Capabilities: <access denied>
       Kernel driver in use: nouveau
       Kernel modules: nouveau, nvidiafb

Both of them are PCIe cards. I have one more ATI card and another
NVIDIA card which does not work.

One of the reasons the pass-through did not work is the limit that
SeaBIOS places on the amount of PCI memory space: a hard limit of
256MB or so. Thus, some of the VGA devices that need more memory never
worked for me.

SeaBIOS allows this memory region to be extended to some value near
512MB, but even then the range is not enough.

Another problem with SeaBIOS that limits the usable memory space is
that SeaBIOS allocates the BAR regions in the order they are
encountered. As far as I know, the BAR regions must be naturally
aligned, so this simple strategy results in heavy fragmentation.
Therefore, even after increasing the PCI memory space to 512MB, some
BAR regions remained unallocated.

I will send you the details of the other graphics cards which do not work.

Thanks and Regards,
Prasad

>
> One thing I was wondering: Most modern adapters should be PCIe these
> days. Our NVIDIA definitely is. But so far we are claiming to have it
> attached to a PCI bus. That caps all the extended capabilities e.g.
> Could this make some relevant difference?
>
> Jan
>
> --
> Siemens AG, Corporate Technology, CT T DE IT 1
> Corporate Competence Center Embedded Linux
>
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jan Kiszka May 9, 2011, 3:02 p.m. UTC | #6
On 2011-05-09 16:29, Alex Williamson wrote:
> On Mon, 2011-05-09 at 13:14 +0200, Jan Kiszka wrote:
>> On 2011-05-05 17:17, Alex Williamson wrote:
>>>> And what about the host? When does Linux release the legacy range?
>>>> Always or only when a specific (!=vga/vesa) framebuffer driver is loaded?
>>>
>>> Well, that's where it'd be nice if the vga arbiter was actually in more
>>> widespread use.  It currently seems to be nothing more than a shared
>>> mutex, but it would actually be useful if it included backends to do the
>>> chipset vga routing changes.  I think when I was testing this, I was
>>> externally poking PCI bridge chipset to toggle the VGA_EN bit.
>>
>> Right, we had to drop the approach to pass through the secondary card
>> for now, the arbiter was not switching properly. Haven't checked yet if
>> VGA_EN was properly set, though the kernel code looks like it should
>> take care of this.
>>
>> Even with handing out the primary adapter, we had only mixed success so
>> far. The onboard adapter worked well (in VESA mode), but the NVIDIA is
>> not displaying early boot messages at all. Maybe a vgabios issue.
>> Windows was booting nevertheless - until we installed the NVIDIA
>> drivers. Then it ran into a blue screen.
> 
> Interesting, IIRC I could never get VESA modes to work.  I believe I
> only had a basic VGA16 mode running in a Windows guest too.
> 
>> BTW, what ATI adapter did you use precisely, and what did work, what not?
> 
> I have an old X550 (rv380?).  I also have an Nvidia gs8400, but ISTR the
> ATI working better for me.

Is that Nvidia a PCIe adapter? Did it show BIOS / early boot messages
properly?

BTW, we are fighting with a Quadro FX 3800.

> 
>> One thing I was wondering: Most modern adapters should be PCIe these
>> days. Our NVIDIA definitely is. But so far we are claiming to have it
>> attached to a PCI bus. That caps all the extended capabilities e.g.
>> Could this make some relevant difference?
> 
> The BIOS and early boot use shouldn't care too much about that, but I
> could imagine the high performance drivers potentially failing.  Thanks,

Yeah, that was my thinking as well. But we will try to confirm this by
tracing the BIOS activities. It is said that some adapters do not
allow reading the true cold-boot ROM content at runtime, so booting
those adapters inside the guest may fail to some degree.

Anyway, I've hacked on the q35 patches until they allowed me to boot a
Linux guest with an assigned PCIe Atheros WLAN adapter - all caps were
suddenly visible. Those bits are now on their way to our test box. Let's
see if they are able to change the BSOD a bit...

Jan
Jan Kiszka May 9, 2011, 3:27 p.m. UTC | #7
On 2011-05-09 16:55, Prasad Joshi wrote:
> On Mon, May 9, 2011 at 12:14 PM, Jan Kiszka <jan.kiszka@siemens.com> wrote:
>> On 2011-05-05 17:17, Alex Williamson wrote:
>>>> And what about the host? When does Linux release the legacy range?
>>>> Always or only when a specific (!=vga/vesa) framebuffer driver is loaded?
>>>
>>> Well, that's where it'd be nice if the vga arbiter was actually in more
>>> widespread use.  It currently seems to be nothing more than a shared
>>> mutex, but it would actually be useful if it included backends to do the
>>> chipset vga routing changes.  I think when I was testing this, I was
>>> externally poking PCI bridge chipset to toggle the VGA_EN bit.
>>
>> Right, we had to drop the approach to pass through the secondary card
>> for now, the arbiter was not switching properly. Haven't checked yet if
>> VGA_EN was properly set, though the kernel code looks like it should
>> take care of this.
>>
>> Even with handing out the primary adapter, we had only mixed success so
>> far. The onboard adapter worked well (in VESA mode), but the NVIDIA is
>> not displaying early boot messages at all. Maybe a vgabios issue.
>> Windows was booting nevertheless - until we installed the NVIDIA
>> drivers. Then it ran into a blue screen.
>>
>> BTW, what ATI adapter did you use precisely, and what did work, what not?
> 
> Not hijacking the mail thread. Just wanted to provide some inputs.

Much appreciated in fact!

> 
> Few days back I had tried passing through the secondary graphics card.
> I could pass-through two graphics cards to virtual machine.
> 
> 02:00.0 VGA compatible controller: ATI Technologies Inc Redwood
> [Radeon HD 5670] (prog-if 00 [VGA controller])
> 	Subsystem: PC Partner Limited Device e151
> 	Flags: bus master, fast devsel, latency 0, IRQ 87
> 	Memory at d0000000 (64-bit, prefetchable) [size=256M]
> 	Memory at fe6e0000 (64-bit, non-prefetchable) [size=128K]
> 	I/O ports at b000 [size=256]
> 	Expansion ROM at fe6c0000 [disabled] [size=128K]
> 	Capabilities: <access denied>
> 	Kernel driver in use: radeon
> 	Kernel modules: radeon
> 
> 07:00.0 VGA compatible controller: nVidia Corporation G86 [Quadro NVS
> 290] (rev a1) (prog-if 00 [VGA controller])
>        Subsystem: nVidia Corporation Device 0492
>        Control: I/O+ Mem+ BusMaster+ SpecCycle- MemWINV- VGASnoop-
> ParErr-Stepping- SERR+ FastB2B- DisINTx-
>        Status: Cap+ 66MHz- UDF- FastB2B- ParErr- DEVSEL=fast
>> TAbort-<TAbort- <MAbort- >SERR- <PERR- INTx-
>        Latency: 0, Cache Line Size: 64 bytes
>        Interrupt: pin A routed to IRQ 24
>        Region 0: Memory at fd000000 (32-bit, non-prefetchable) [size=16M]
>        Region 1: Memory at d0000000 (64-bit, prefetchable) [size=256M]
>        Region 3: Memory at fa000000 (64-bit, non-prefetchable) [size=32M]
>        Region 5: I/O ports at ec00 [size=128]
>        Expansion ROM at fe9e0000 [disabled] [size=128K]
>        Capabilities: <access denied>
>        Kernel driver in use: nouveau
>        Kernel modules: nouveau, nvidiafb
> 
> Both of them are PCIe cards. I have one more ATI card and another
> NVIDIA card which does not work.

Interesting. That may rule out missing PCIe capabilities as source for
the NVIDIA driver indisposition.

Did you pass each of those cards as primary to the guest, or was the
guest seeing multiple adapters? I presume you only got output after
early boot was completed, right?

To avoid having to deal with legacy I/O forwarding, we started with a
dual-adapter setup in the hope of leaving the primary guest adapter as
the known-to-work cirrus-vga. But already in a native setup with
on-board primary + NVIDIA secondary, the NVIDIA Windows drivers
refused to talk to their hardware in this constellation.

> 
> One of the reason the pass-through did not work is because of the
> limit on amount of pci configuration memory by SeaBIOS. SeaBIOS places
> a hard limit of 256MB or so on the amount of PCI memory space. Thus,
> for some of the VGA device that need more memory never worked for me.
> 
> SeaBIOS allows this memory region to be extended to some value near
> 512MB, but even then the range is not enough.
> 
> Another problem with SeaBIOS which limits the amount of memory space
> is: SeaBIOS allocates the BAR regions as they are encountered. As far
> as I know, the BAR regions should be naturally aligned. Thus the
> simple strategy of the SeaBIOS results in large fragmentation.
> Therefore, even after increasing the PCI memory space to 512MB the BAR
> regions were unallocated.

That's an interesting trace! We'll check this here, but I bet it
contributes to the problems. Our FX 3800 has 1G memory...

> 
> I will confirm you the details of other graphics cards which do not work.

TiA,
Jan
Prasad Joshi May 9, 2011, 3:40 p.m. UTC | #8
On Mon, May 9, 2011 at 4:27 PM, Jan Kiszka <jan.kiszka@siemens.com> wrote:
> On 2011-05-09 16:55, Prasad Joshi wrote:
>> On Mon, May 9, 2011 at 12:14 PM, Jan Kiszka <jan.kiszka@siemens.com> wrote:
>>> On 2011-05-05 17:17, Alex Williamson wrote:
>>>>> And what about the host? When does Linux release the legacy range?
>>>>> Always or only when a specific (!=vga/vesa) framebuffer driver is loaded?
>>>>
>>>> Well, that's where it'd be nice if the vga arbiter was actually in more
>>>> widespread use.  It currently seems to be nothing more than a shared
>>>> mutex, but it would actually be useful if it included backends to do the
>>>> chipset vga routing changes.  I think when I was testing this, I was
>>>> externally poking PCI bridge chipset to toggle the VGA_EN bit.
>>>
>>> Right, we had to drop the approach to pass through the secondary card
>>> for now, the arbiter was not switching properly. Haven't checked yet if
>>> VGA_EN was properly set, though the kernel code looks like it should
>>> take care of this.
>>>
>>> Even with handing out the primary adapter, we had only mixed success so
>>> far. The onboard adapter worked well (in VESA mode), but the NVIDIA is
>>> not displaying early boot messages at all. Maybe a vgabios issue.
>>> Windows was booting nevertheless - until we installed the NVIDIA
>>> drivers. Then it ran into a blue screen.
>>>
>>> BTW, what ATI adapter did you use precisely, and what did work, what not?
>>
>> Not hijacking the mail thread. Just wanted to provide some inputs.
>
> Much appreciated in fact!
>
>>
>> Few days back I had tried passing through the secondary graphics card.
>> I could pass-through two graphics cards to virtual machine.
>>
>> 02:00.0 VGA compatible controller: ATI Technologies Inc Redwood
>> [Radeon HD 5670] (prog-if 00 [VGA controller])
>>       Subsystem: PC Partner Limited Device e151
>>       Flags: bus master, fast devsel, latency 0, IRQ 87
>>       Memory at d0000000 (64-bit, prefetchable) [size=256M]
>>       Memory at fe6e0000 (64-bit, non-prefetchable) [size=128K]
>>       I/O ports at b000 [size=256]
>>       Expansion ROM at fe6c0000 [disabled] [size=128K]
>>       Capabilities: <access denied>
>>       Kernel driver in use: radeon
>>       Kernel modules: radeon
>>
>> 07:00.0 VGA compatible controller: nVidia Corporation G86 [Quadro NVS
>> 290] (rev a1) (prog-if 00 [VGA controller])
>>        Subsystem: nVidia Corporation Device 0492
>>        Control: I/O+ Mem+ BusMaster+ SpecCycle- MemWINV- VGASnoop-
>> ParErr-Stepping- SERR+ FastB2B- DisINTx-
>>        Status: Cap+ 66MHz- UDF- FastB2B- ParErr- DEVSEL=fast
>>> TAbort-<TAbort- <MAbort- >SERR- <PERR- INTx-
>>        Latency: 0, Cache Line Size: 64 bytes
>>        Interrupt: pin A routed to IRQ 24
>>        Region 0: Memory at fd000000 (32-bit, non-prefetchable) [size=16M]
>>        Region 1: Memory at d0000000 (64-bit, prefetchable) [size=256M]
>>        Region 3: Memory at fa000000 (64-bit, non-prefetchable) [size=32M]
>>        Region 5: I/O ports at ec00 [size=128]
>>        Expansion ROM at fe9e0000 [disabled] [size=128K]
>>        Capabilities: <access denied>
>>        Kernel driver in use: nouveau
>>        Kernel modules: nouveau, nvidiafb
>>
>> Both of them are PCIe cards. I have one more ATI card and another
>> NVIDIA card which does not work.
>
> Interesting. That may rule out missing PCIe capabilities as source for
> the NVIDIA driver indisposition.
>
> Did you passed those cards each as primary to the guest, or was the
> guest seeing multiple adapters?

I passed the graphics device as a primary device to the guest virtual
machine, with -vga none parameter to disable the default vga device.

> I presume you only got output after
> early boot was completed, right?

Yes, you are correct. I got the display only after KMS was started.
The initial BIOS messages were not displayed.

>
> To avoid having to deal with legacy I/O forwarding, we started with a
> dual adapter setup in the hope to leave the primary guest adapter at
> know-to-work cirrus-vga. But already in a native setup with on-board
> primary + NVIDIA secondary, the NVIDIA Windows drivers refused to talk
> to its hardware in this constellation.
>

The Windows operating system never worked for me with either of the graphics cards.

>>
>> One of the reason the pass-through did not work is because of the
>> limit on amount of pci configuration memory by SeaBIOS. SeaBIOS places
>> a hard limit of 256MB or so on the amount of PCI memory space. Thus,
>> for some of the VGA device that need more memory never worked for me.
>>
>> SeaBIOS allows this memory region to be extended to some value near
>> 512MB, but even then the range is not enough.
>>
>> Another problem with SeaBIOS which limits the amount of memory space
>> is: SeaBIOS allocates the BAR regions as they are encountered. As far
>> as I know, the BAR regions should be naturally aligned. Thus the
>> simple strategy of the SeaBIOS results in large fragmentation.
>> Therefore, even after increasing the PCI memory space to 512MB the BAR
>> regions were unallocated.
>
> That's an interesting trace! We'll check this here, but I bet it
> contributes to the problems. Our FX 3800 has 1G memory...

Yes, it is one of the problems. I remember reading something about the
NVIDIA BIOS and FLR; those could be other interesting issues.

>
>>
>> I will confirm you the details of other graphics cards which do not work.
>
> TiA,
> Jan
>
> --
> Siemens AG, Corporate Technology, CT T DE IT 1
> Corporate Competence Center Embedded Linux
>
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity May 11, 2011, 11:23 a.m. UTC | #9
On 05/09/2011 06:27 PM, Jan Kiszka wrote:
> To avoid having to deal with legacy I/O forwarding, we started with a
> dual adapter setup in the hope to leave the primary guest adapter at
> know-to-work cirrus-vga. But already in a native setup with on-board
> primary + NVIDIA secondary, the NVIDIA Windows drivers refused to talk
> to its hardware in this constellation.

IIRC one issue with nvidia is that it uses non-BAR registers to move its 
PCI BAR around, which causes cpu writes to hit empty space.

One way to see if this is the problem is to trace mmio that misses both 
kvm internal devices and qemu devices.
Jan Kiszka May 11, 2011, 12:31 p.m. UTC | #10
On 2011-05-11 13:23, Avi Kivity wrote:
> On 05/09/2011 06:27 PM, Jan Kiszka wrote:
>> To avoid having to deal with legacy I/O forwarding, we started with a
>> dual adapter setup in the hope to leave the primary guest adapter at
>> know-to-work cirrus-vga. But already in a native setup with on-board
>> primary + NVIDIA secondary, the NVIDIA Windows drivers refused to talk
>> to its hardware in this constellation.
> 
> IIRC one issue with nvidia is that it uses non-BAR registers to move its 
> PCI BAR around, which causes cpu writes to hit empty space.

I wonder if that would still be "virtualization friendly" as the adapter
claims to be...

> 
> One way to see if this is the problem is to trace mmio that misses both 
> kvm internal devices and qemu devices.

We'll check.

Jan
diff mbox

Patch

commit 0313d97cf24177023cdb6f2e4c54d077c5a775c1
Author: Alex Williamson <alex.williamson@redhat.com>
Date:   Wed Sep 29 13:50:39 2010 -0600

vfio: VGA passthrough support(ish)

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---

diff --git a/Makefile.target b/Makefile.target
index c507dd2..cb0cea6 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -203,6 +203,7 @@  obj-i386-y += device-hotplug.o pci-hotplug.o smbios.o wdt_ib700.o
 obj-i386-y += debugcon.o multiboot.o
 obj-i386-y += pc_piix.o
 obj-i386-y += vfio.o
+obj-$(CONFIG_VFIO_VGA) += vfio-vga.o
 
 # shared objects
 obj-ppc-y = ppc.o
diff --git a/configure b/configure
index 3bfc5e9..b15e68f 100755
--- a/configure
+++ b/configure
@@ -322,6 +322,7 @@  user_pie="no"
 zero_malloc=""
 trace_backend="nop"
 trace_file="trace"
+vfio_vga="no"
 
 # OS specific
 if check_define __linux__ ; then
@@ -718,6 +719,8 @@  for opt do
   ;;
   --enable-vhost-net) vhost_net="yes"
   ;;
+  --enable-vfio-vga) vfio_vga="yes"
+  ;;
   --*dir)
   ;;
   *) echo "ERROR: unknown option $opt"; show_help="yes"
@@ -907,6 +910,7 @@  echo "  --disable-docs           disable documentation build"
 echo "  --disable-vhost-net      disable vhost-net acceleration support"
 echo "  --enable-vhost-net       enable vhost-net acceleration support"
 echo "  --trace-backend=B        Trace backend nop simple ust"
+echo "  --enable-vfio-vga        enable vfio VGA passthrough support"
 echo "  --trace-file=NAME        Full PATH,NAME of file to store traces"
 echo "                           Default:trace-<pid>"
 echo ""
@@ -2240,6 +2244,7 @@  echo "preadv support    $preadv"
 echo "fdatasync         $fdatasync"
 echo "uuid support      $uuid"
 echo "vhost-net support $vhost_net"
+echo "vfio-vga support  $vfio_vga"
 echo "Trace backend     $trace_backend"
 echo "Trace output file $trace_file-<pid>"
 
@@ -2762,6 +2767,9 @@  case "$target_arch2" in
     if test "$xen" = "yes" -a "$target_softmmu" = "yes" ; then
       echo "CONFIG_XEN=y" >> $config_target_mak
     fi
+    if test $vfio_vga = "yes" ; then
+      echo "CONFIG_VFIO_VGA=y" >> $config_host_mak
+    fi
 esac
 case "$target_arch2" in
   i386|x86_64|ppcemb|ppc|ppc64|s390x)
diff --git a/hw/vfio-vga.c b/hw/vfio-vga.c
new file mode 100644
index 0000000..5c1899c
--- /dev/null
+++ b/hw/vfio-vga.c
@@ -0,0 +1,291 @@ 
+/*
+ * vfio VGA device assignment support
+ *
+ * Copyright Red Hat, Inc. 2010
+ *
+ * Authors:
+ *  Alex Williamson <alex.williamson@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ * Based on qemu-kvm device-assignment:
+ *  Adapted for KVM by Qumranet.
+ *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
+ *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
+ *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
+ *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
+ *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
+ */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/io.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include "event_notifier.h"
+#include "hw.h"
+#include "memory.h"
+#include "monitor.h"
+#include "pc.h"
+#include "qemu-error.h"
+#include "sysemu.h"
+#include "vfio.h"
+#include <pci/header.h>
+#include <pci/types.h>
+#include <linux/types.h>
+#include "linux-vfio.h"
+
+//#define DEBUG_VFIO_VGA
+#ifdef DEBUG_VFIO_VGA
+#define DPRINTF(fmt, ...) \
+    do { printf("vfio-vga: " fmt, ## __VA_ARGS__); } while (0)
+#else
+#define DPRINTF(fmt, ...) \
+    do { } while (0)
+#endif
+
+/*
+ * VGA setup
+ */
+static void vfio_vga_write(VFIODevice *vdev, uint32_t addr,
+                           uint32_t val, int len)
+{
+    DPRINTF("%s 0x%x %d - 0x%x\n", __func__, 0xa0000 + addr, len, val);
+    switch (len) {
+        case 1:
+            *(uint8_t *)(vdev->vga_mmio + addr) = (uint8_t)val;
+            break;
+        case 2:
+            *(uint16_t *)(vdev->vga_mmio + addr) = (uint16_t)val;
+            break;
+        case 4:
+            *(uint32_t *)(vdev->vga_mmio + addr) = val;
+            break;
+    }
+}
+
+static void vfio_vga_writeb(void *opaque, target_phys_addr_t addr, uint32_t val)
+{
+    vfio_vga_write(opaque, addr, val, 1);
+}
+
+static void vfio_vga_writew(void *opaque, target_phys_addr_t addr, uint32_t val)
+{
+    vfio_vga_write(opaque, addr, val, 2);
+}
+
+static void vfio_vga_writel(void *opaque, target_phys_addr_t addr, uint32_t val)
+{
+    vfio_vga_write(opaque, addr, val, 4);
+}
+
+static CPUWriteMemoryFunc * const vfio_vga_writes[] = {
+    &vfio_vga_writeb,
+    &vfio_vga_writew,
+    &vfio_vga_writel
+};
+
+static uint32_t vfio_vga_read(VFIODevice *vdev, uint32_t addr, int len)
+{
+    uint32_t val = 0xffffffff;
+    switch (len) {
+        case 1:
+            val = (uint32_t)*(uint8_t *)(vdev->vga_mmio + addr);
+            break;
+        case 2:
+            val = (uint32_t)*(uint16_t *)(vdev->vga_mmio + addr);
+            break;
+        case 4:
+            val = *(uint32_t *)(vdev->vga_mmio + addr);
+            break;
+    }
+    DPRINTF("%s 0x%x %d = 0x%x\n", __func__, 0xa0000 + addr, len, val);
+    return val;
+}
+
+static uint32_t vfio_vga_readb(void *opaque, target_phys_addr_t addr)
+{
+    return vfio_vga_read(opaque, addr, 1);
+}
+
+static uint32_t vfio_vga_readw(void *opaque, target_phys_addr_t addr)
+{
+    return vfio_vga_read(opaque, addr, 2);
+}
+
+static uint32_t vfio_vga_readl(void *opaque, target_phys_addr_t addr)
+{
+    return vfio_vga_read(opaque, addr, 4);
+}
+
+static CPUReadMemoryFunc * const vfio_vga_reads[] = {
+    &vfio_vga_readb,
+    &vfio_vga_readw,
+    &vfio_vga_readl
+};
+
+static void vfio_vga_out(VFIODevice *vdev, uint32_t addr, uint32_t val, int len)
+{
+    DPRINTF("%s 0x%x %d - 0x%x\n", __func__, addr, len, val);
+    ioperm(0x3b0, 0x30, 1); /* XXX fix me */
+    switch (len) {
+        case 1:
+            outb(val, addr);
+            break;
+        case 2:
+            outw(val, addr);
+            break;
+        case 4:
+            outl(val, addr);
+            break;
+    }
+}
+
+static void vfio_vga_outb(void *opaque, uint32_t addr, uint32_t val)
+{
+    vfio_vga_out(opaque, addr, val, 1);
+}
+
+static void vfio_vga_outw(void *opaque, uint32_t addr, uint32_t val)
+{
+    vfio_vga_out(opaque, addr, val, 2);
+}
+
+static void vfio_vga_outl(void *opaque, uint32_t addr, uint32_t val)
+{
+    vfio_vga_out(opaque, addr, val, 4);
+}
+
+static uint32_t vfio_vga_in(VFIODevice *vdev, uint32_t addr, int len)
+{
+    uint32_t val = 0xffffffff;
+    ioperm(0x3b0, 0x30, 1); /* XXX fix me */
+    switch (len) {
+        case 1:
+            val = inb(addr);
+            break;
+        case 2:
+            val = inw(addr);
+            break;
+        case 4:
+            val = inl(addr);
+            break;
+    }
+    DPRINTF("%s 0x%x, %d = 0x%x\n", __func__, addr, len, val);
+    return val;
+}
+
+static uint32_t vfio_vga_inb(void *opaque, uint32_t addr)
+{
+    return vfio_vga_in(opaque, addr, 1);
+}
+
+static uint32_t vfio_vga_inw(void *opaque, uint32_t addr)
+{
+    return vfio_vga_in(opaque, addr, 2);
+}
+
+static uint32_t vfio_vga_inl(void *opaque, uint32_t addr)
+{
+    return vfio_vga_in(opaque, addr, 4);
+}
+
+int vfio_vga_setup(VFIODevice *vdev)
+{
+    char buf[256];
+    int ret;
+
+    if (vga_interface_type != VGA_NONE) {
+        fprintf(stderr,
+                "VGA devie assigned without -vga none param, no ISA VGA\n");
+        return -1;
+    }
+
+    vdev->vga_fd = open("/dev/vga_arbiter", O_RDWR);
+    if (vdev->vga_fd < 0) {
+        fprintf(stderr, "%s - Failed to open vga arbiter (%s)\n",
+                __func__, strerror(errno));
+        return -1;
+    }
+    ret = read(vdev->vga_fd, buf, sizeof(buf));
+    if (ret <= 0) {
+        fprintf(stderr, "%s - Failed to read from vga arbiter (%s)\n",
+                __func__, strerror(errno));
+        close(vdev->vga_fd);
+        return -1;
+    }
+    buf[ret - 1] = 0;
+    vdev->vga_orig = qemu_strdup(buf);
+
+    snprintf(buf, sizeof(buf), "target PCI:%04x:%02x:%02x.%x",
+             vdev->host.seg, vdev->host.bus, vdev->host.dev, vdev->host.func);
+    ret = write(vdev->vga_fd, buf, strlen(buf));
+    if (ret != strlen(buf)) {
+        fprintf(stderr, "%s - Failed to write to vga arbiter (%s)\n",
+                __func__, strerror(errno));
+        close(vdev->vga_fd);
+        return -1;
+    }
+    snprintf(buf, sizeof(buf), "decodes io+mem");
+    ret = write(vdev->vga_fd, buf, strlen(buf));
+    if (ret != strlen(buf)) {
+        fprintf(stderr, "%s - Failed to write to vga arbiter (%s)\n",
+                __func__, strerror(errno));
+        close(vdev->vga_fd);
+        return -1;
+    }
+
+    vdev->vga_mmio_fd = open("/dev/mem", O_RDWR);
+    if (vdev->vga_mmio_fd < 0) {
+        fprintf(stderr, "%s - Failed to open /dev/mem (%s)\n",
+                __func__, strerror(errno));
+        return -1;
+    }
+    vdev->vga_mmio = mmap(NULL, 0x40000, PROT_READ | PROT_WRITE,
+                          MAP_SHARED, vdev->vga_mmio_fd, 0xa0000);
+    if (vdev->vga_mmio == MAP_FAILED) {
+        fprintf(stderr, "%s - mmap failed (%s)\n", __func__, strerror(errno));
+        return -1;
+    }
+
+#if 1
+    vdev->vga_io = cpu_register_io_memory(vfio_vga_reads,
+                                          vfio_vga_writes, vdev);
+    cpu_register_physical_memory(0xa0000, 0x20000, vdev->vga_io);
+    qemu_register_coalesced_mmio(0xa0000, 0x20000);
+#else
+    cpu_register_physical_memory(0xa0000, 0x20000, 
+        qemu_ram_map(&vdev->pdev.qdev, "VGA", 0x20000, vdev->vga_mmio));
+    qemu_register_coalesced_mmio(0xa0000, 0x20000);
+#endif
+
+    register_ioport_write(0x3b0, 0x30, 1, vfio_vga_outb, vdev);
+    register_ioport_write(0x3b0, 0x30, 2, vfio_vga_outw, vdev);
+    register_ioport_write(0x3b0, 0x30, 4, vfio_vga_outl, vdev);
+    register_ioport_read(0x3b0, 0x30, 1, vfio_vga_inb, vdev);
+    register_ioport_read(0x3b0, 0x30, 2, vfio_vga_inw, vdev);
+    register_ioport_read(0x3b0, 0x30, 4, vfio_vga_inl, vdev);
+    if (ioperm(0x3b0, 0x30, 1)) {
+        fprintf(stderr, "%s - ioperm failed (%s)\n", __func__, strerror(errno));
+        return -1;
+    }
+    return 0;
+}
+
+void vfio_vga_exit(VFIODevice *vdev)
+{
+    if (!vdev->vga_io)
+        return;
+
+    isa_unassign_ioport(0x3b0, 0x30);
+    qemu_unregister_coalesced_mmio(0xa0000, 0x20000);
+    cpu_register_physical_memory(0xa0000, 0x20000, IO_MEM_UNASSIGNED);
+    cpu_unregister_io_memory(vdev->vga_io);
+    munmap(vdev->vga_mmio, 0x40000);
+    close(vdev->vga_mmio_fd);
+    qemu_free(vdev->vga_orig);
+    close(vdev->vga_fd);
+}
+
diff --git a/hw/vfio.c b/hw/vfio.c
index e2da724..f7c7a42 100644
--- a/hw/vfio.c
+++ b/hw/vfio.c
@@ -1268,8 +1268,22 @@  static int vfio_initfn(struct PCIDevice *pdev)
     if (vfio_enable_intx(vdev))
         goto out_unmap_iommu;
 
+#ifdef CONFIG_VFIO_VGA
+    {
+        uint16_t class;
+
+        class = vfio_pci_read_config(&vdev->pdev, PCI_CLASS_DEVICE, 2);
+        if (class == PCI_CLASS_DISPLAY_VGA && vfio_vga_setup(vdev))
+            goto out_vga_fail;
+    }
+#endif
+
     return 0;
 
+#ifdef CONFIG_VFIO_VGA
+out_vga_fail:
+    vfio_disable_intx(vdev);
+#endif
 out_unmap_iommu:
     vfio_unmap_iommu(vdev);
 out_unmap_resources:
@@ -1290,6 +1304,9 @@  static int vfio_exitfn(struct PCIDevice *pdev)
 {
     VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
     
+#ifdef CONFIG_VFIO_VGA
+    vfio_vga_exit(vdev);
+#endif
     vfio_disable_intx(vdev);
     vfio_disable_msi(vdev);
     vfio_disable_msix(vdev);
diff --git a/hw/vfio.h b/hw/vfio.h
index b5a0525..c7490b3 100644
--- a/hw/vfio.h
+++ b/hw/vfio.h
@@ -83,8 +83,20 @@  typedef struct VFIODevice {
     MSIX msix;
     int vfiofd;
     int uiommufd;
+#ifdef CONFIG_VFIO_VGA
+    int vga_io;
+    int vga_fd;
+    int vga_mmio_fd;
+    uint8_t *vga_mmio;
+    char *vga_orig;
+#endif
     char *vfiofd_name;
     char *uiommufd_name;
 } VFIODevice;
 
+#ifdef CONFIG_VFIO_VGA
+int vfio_vga_setup(VFIODevice *vdev);
+void vfio_vga_exit(VFIODevice *vdev);
+#endif
+
 #endif /* __VFIO_H__ */