diff mbox

[RFC,v3,5/6] kvm/ppc/mpic: in-kernel MPIC emulation

Message ID 1364954273-18196-6-git-send-email-scottwood@freescale.com (mailing list archive)
State New, archived
Headers show

Commit Message

Scott Wood April 3, 2013, 1:57 a.m. UTC
Hook the MPIC code up to the KVM interfaces, add locking, etc.

TODO: irqfd support, split up into multiple patches, KVM_IRQ_LINE
support

Signed-off-by: Scott Wood <scottwood@freescale.com>
---
v3: mpic_put -> kvmppc_mpic_put

 Documentation/virtual/kvm/devices/mpic.txt |   37 ++
 arch/powerpc/include/asm/kvm_host.h        |    8 +-
 arch/powerpc/include/asm/kvm_ppc.h         |    7 +
 arch/powerpc/kvm/Kconfig                   |    5 +
 arch/powerpc/kvm/Makefile                  |    2 +
 arch/powerpc/kvm/booke.c                   |   10 +-
 arch/powerpc/kvm/mpic.c                    |  814 +++++++++++++++++++++-------
 arch/powerpc/kvm/powerpc.c                 |   12 +-
 include/linux/kvm_host.h                   |    2 +
 include/uapi/linux/kvm.h                   |    9 +
 virt/kvm/kvm_main.c                        |    9 +
 11 files changed, 714 insertions(+), 201 deletions(-)
 create mode 100644 Documentation/virtual/kvm/devices/mpic.txt

Comments

Gleb Natapov April 3, 2013, 3:55 p.m. UTC | #1
On Tue, Apr 02, 2013 at 08:57:52PM -0500, Scott Wood wrote:
> Hook the MPIC code up to the KVM interfaces, add locking, etc.
> 
> TODO: irqfd support, split up into multiple patches, KVM_IRQ_LINE
> support
> 
> Signed-off-by: Scott Wood <scottwood@freescale.com>
> ---
[skip]

> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 20ce2d2..d8f44ef 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -927,6 +927,15 @@ struct kvm_device_attr {
>  	__u64	addr;		/* userspace address of attr data */
>  };
>  
> +#define KVM_DEV_TYPE_FSL_MPIC_20	1
> +#define KVM_DEV_TYPE_FSL_MPIC_42	2
> +
> +#define KVM_DEV_MPIC_GRP_MISC		1
> +#define   KVM_DEV_MPIC_BASE_ADDR	0	/* 64-bit */
> +
> +#define KVM_DEV_MPIC_GRP_REGISTER	2	/* 32-bit */
> +#define KVM_DEV_MPIC_GRP_IRQ_ACTIVE	3	/* 32-bit */
Why not put them in an arch-specific header?

> +
>  /* ioctl for vm fd */
>  #define KVM_CREATE_DEVICE	  _IOWR(KVMIO,  0xe0, struct kvm_create_device)
>  
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index ed033c0..e325f5d 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -2164,6 +2164,15 @@ static int kvm_ioctl_create_device(struct kvm *kvm,
>  	bool test = cd->flags & KVM_CREATE_DEVICE_TEST;
>  
>  	switch (cd->type) {
> +#ifdef CONFIG_KVM_MPIC
> +	case KVM_DEV_TYPE_FSL_MPIC_20:
> +	case KVM_DEV_TYPE_FSL_MPIC_42: {
> +		if (test)
> +			return 0;
> +
> +		return kvm_create_mpic(kvm, cd->type);
> +	}
> +#endif
>  	default:
>  		return -ENODEV;
>  	}
> -- 
> 1.7.9.5
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexander Graf April 3, 2013, 4:19 p.m. UTC | #2
On 03.04.2013, at 03:57, Scott Wood wrote:

> Hook the MPIC code up to the KVM interfaces, add locking, etc.
> 
> TODO: irqfd support, split up into multiple patches, KVM_IRQ_LINE
> support
> 
> Signed-off-by: Scott Wood <scottwood@freescale.com>
> ---
> v3: mpic_put -> kvmppc_mpic_put
> 
> 

[...]

> +void kvmppc_mpic_set_epr(struct kvm_vcpu *vcpu);
> +
> int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
> 			      struct kvm_config_tlb *cfg);
> int kvm_vcpu_ioctl_dirty_tlb(struct kvm_vcpu *vcpu,
> diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
> index 63c67ec..a87139b 100644
> --- a/arch/powerpc/kvm/Kconfig
> +++ b/arch/powerpc/kvm/Kconfig
> @@ -151,6 +151,11 @@ config KVM_E500MC
> 
> 	  If unsure, say N.
> 
> +config KVM_MPIC
> +	bool "KVM in-kernel MPIC emulation"
> +	depends on KVM

This should probably depend on FSL KVM for now, until someone adds support for other MPIC revisions.

> +
> +
> source drivers/vhost/Kconfig
> 
> endif # VIRTUALIZATION
> diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
> index b772ede..4a2277a 100644
> --- a/arch/powerpc/kvm/Makefile
> +++ b/arch/powerpc/kvm/Makefile
> @@ -103,6 +103,8 @@ kvm-book3s_32-objs := \
> 	book3s_32_mmu.o
> kvm-objs-$(CONFIG_KVM_BOOK3S_32) := $(kvm-book3s_32-objs)
> 
> +kvm-objs-$(CONFIG_KVM_MPIC) += mpic.o
> +
> kvm-objs := $(kvm-objs-m) $(kvm-objs-y)
> 
> obj-$(CONFIG_KVM_440) += kvm.o
> 

[...]

> struct irq_dest {
> +	struct kvm_vcpu *vcpu;
> +
> 	int32_t ctpr;		/* CPU current task priority */
> 	struct irq_queue raised;
> 	struct irq_queue servicing;
> -	qemu_irq *irqs;
> 
> 	/* Count of IRQ sources asserting on non-INT outputs */
> -	uint32_t outputs_active[OPENPIC_OUTPUT_NB];
> +	uint32_t outputs_active[NUM_OUTPUTS];
> };
> 
> +struct openpic;

Isn't this superfluous?

> +
> struct openpic {
> +	struct kvm *kvm;
> +	struct kvm_io_device mmio;
> +	struct list_head mmio_regions;
> +	atomic_t users;
> +	bool mmio_mapped;
> +
> +	gpa_t reg_base;
> +	spinlock_t lock;
> +
> 	/* Behavior control */
> 	struct fsl_mpic_info *fsl;
> 	uint32_t model;
> @@ -208,6 +231,47 @@ struct openpic {
> 	uint32_t irq_msi;
> };
> 
> 

[...]

> -static uint64_t openpic_gbl_read(void *opaque, gpa_t addr, unsigned len)
> +static int openpic_gbl_read(void *opaque, gpa_t addr, u32 *ptr)
> {
> 	struct openpic *opp = opaque;
> -	uint32_t retval;
> +	u32 retval;
> 
> -	pr_debug("%s: addr %#" HWADDR_PRIx "\n", __func__, addr);
> +	pr_debug("%s: addr %#llx\n", __func__, addr);
> 	retval = 0xFFFFFFFF;
> 	if (addr & 0xF)
> -		return retval;
> +		goto out;
> 
> 	switch (addr) {
> 	case 0x1000:		/* FRR */
> 		retval = opp->frr;
> +		retval |= (opp->nb_cpus - 1) << FRR_NCPU_SHIFT;
> 		break;
> 	case 0x1020:		/* GCR */
> 		retval = opp->gcr;
> @@ -731,8 +771,8 @@ static uint64_t openpic_gbl_read(void *opaque, gpa_t addr, unsigned len)
> 	case 0x90:
> 	case 0xA0:
> 	case 0xB0:
> -		retval =
> -		    openpic_cpu_read_internal(opp, addr, get_current_cpu());
> +		retval = openpic_cpu_read_internal(opp, addr,
> +			&retval, get_current_cpu());

This looks bogus. You're passing &retval and then overwriting it with the return value right after the function returns?

> 		break;
> 	case 0x10A0:		/* IPI_IVPR */
> 	case 0x10B0:
> @@ -750,28 +790,28 @@ static uint64_t openpic_gbl_read(void *opaque, gpa_t addr, unsigned len)
> 	default:
> 		break;
> 	}
> -	pr_debug("%s: => 0x%08x\n", __func__, retval);
> 
> -	return retval;
> +out:
> +	pr_debug("%s: => 0x%08x\n", __func__, retval);
> +	*ptr = retval;
> +	return 0;
> }
> 

[...]

> 
> +static int kvm_mpic_read(struct kvm_io_device *this, gpa_t addr,
> +			 int len, void *ptr)
> +{
> +	struct openpic *opp = container_of(this, struct openpic, mmio);
> +	int ret;
> +
> +	/*
> +	 * Technically only 32-bit accesses are allowed, but be nice to
> +	 * people dumping registers a byte at a time -- it works in real
> +	 * hardware (reads only, not writes).

Do 16-bit accesses work in real hardware?

> +	 */
> +	if (len == 4) {
> +		if (addr & 3) {
> +			pr_debug("%s: bad alignment %llx/%d\n",
> +				 __func__, addr, len);
> +			return -EINVAL;
> +		}

if (addr & (len - 1))

Then the read_internal call can be shared between the different access sizes, no?

> +
> +		spin_lock_irq(&opp->lock);
> +		ret = kvm_mpic_read_internal(opp, addr - opp->reg_base, ptr);
> +		spin_unlock_irq(&opp->lock);
> +
> +		pr_debug("%s: addr %llx ret %d len 4 val %x\n",
> +			 __func__, addr, ret, *(const u32 *)ptr);
> +	} else if (len == 1) {
> +		union {
> +			u32 val;
> +			u8 bytes[4];
> +		} u;
> +
> +		spin_lock_irq(&opp->lock);
> +		ret = kvm_mpic_read_internal(opp, addr - opp->reg_base, &u.val);
> +		spin_unlock_irq(&opp->lock);
> +
> +		*(u8 *)ptr = u.bytes[addr & 3];
> +
> +		pr_debug("%s: addr %llx ret %d len 1 val %x\n",
> +			 __func__, addr, ret, *(const u8 *)ptr);
> +	} else {
> +		pr_debug("%s: bad length %d\n", __func__, len);
> +		return -EINVAL;
> +	}
> +
> +	return ret;
> +}
> +

[...]

> 
> +static int mpic_set_attr(struct openpic *opp, struct kvm_device_attr *attr)
> +{
> +	u32 attr32;
> +
> +	switch (attr->group) {
> +	case KVM_DEV_MPIC_GRP_MISC:
> +		switch (attr->attr) {
> +		case KVM_DEV_MPIC_BASE_ADDR:
> +			return set_base_addr(opp, attr);
> +		}
> +
> +		break;
> +
> +	case KVM_DEV_MPIC_GRP_REGISTER:
> +		if (copy_from_user(&attr32, (u32 __user *)(long)attr->addr,
> +				   sizeof(u32)))

get_user?

> +			return -EFAULT;
> +
> +		return access_reg(opp, attr->attr, &attr32, ATTR_SET);
> +
> +	case KVM_DEV_MPIC_GRP_IRQ_ACTIVE:
> +		if (attr->attr > MAX_SRC)
> +			return -EINVAL;
> +
> +		if (copy_from_user(&attr32, (u32 __user *)(long)attr->addr,
> +				   sizeof(u32)))

same here

> +			return -EFAULT;
> +
> +		if (attr32 != 0 && attr32 != 1)
> +			return -EINVAL;
> +
> +		spin_lock_irq(&opp->lock);
> +		openpic_set_irq(opp, attr->attr, attr32);
> +		spin_unlock_irq(&opp->lock);
> +		return 0;
> +	}
> +
> +	return -ENXIO;
> +}
> +
> +static int mpic_get_attr(struct openpic *opp, struct kvm_device_attr *attr)
> +{
> +	u64 attr64;
> +	u32 attr32;
> +	int ret;
> +
> +	switch (attr->group) {
> +	case KVM_DEV_MPIC_GRP_MISC:
> +		switch (attr->attr) {
> +		case KVM_DEV_MPIC_BASE_ADDR:
> +			mutex_lock(&opp->kvm->slots_lock);
> +			attr64 = opp->reg_base;
> +			mutex_unlock(&opp->kvm->slots_lock);
> +
> +			if (copy_to_user((u64 __user *)(long)attr->addr,
> +					 &attr64, sizeof(u64)))

u64 is tricky with put_user on 32bit hosts, so here copy_to_user makes sense

> +				return -EFAULT;
> +
> +			return 0;
> +		}
> +
> +		break;
> +
> +	case KVM_DEV_MPIC_GRP_REGISTER:
> +		ret = access_reg(opp, attr->attr, &attr32, ATTR_GET);
> +		if (ret)
> +			return ret;
> +
> +		if (copy_to_user((u32 __user *)(long)attr->addr, &attr32,
> +				 sizeof(u32)))

put_user

> +			return -EFAULT;
> +
> +		return 0;
> +
> +	case KVM_DEV_MPIC_GRP_IRQ_ACTIVE:
> +		if (attr->attr > MAX_SRC)
> +			return -EINVAL;
> +
> +		attr32 = opp->src[attr->attr].pending;

Isn't this missing a lock?

> +
> +		if (copy_to_user((u32 __user *)(long)attr->addr, &attr32,
> +				 sizeof(u32)))
> +			return -EFAULT;
> +
> +		return 0;
> +	}
> +
> +	return -ENXIO;
> +}


Alex

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gleb Natapov April 4, 2013, 5:59 a.m. UTC | #3
On Wed, Apr 03, 2013 at 03:58:04PM -0500, Scott Wood wrote:
> On 04/03/2013 10:55:27 AM, Gleb Natapov wrote:
> >On Tue, Apr 02, 2013 at 08:57:52PM -0500, Scott Wood wrote:
> >> Hook the MPIC code up to the KVM interfaces, add locking, etc.
> >>
> >> TODO: irqfd support, split up into multiple patches, KVM_IRQ_LINE
> >> support
> >>
> >> Signed-off-by: Scott Wood <scottwood@freescale.com>
> >> ---
> >[skip]
> >
> >> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> >> index 20ce2d2..d8f44ef 100644
> >> --- a/include/uapi/linux/kvm.h
> >> +++ b/include/uapi/linux/kvm.h
> >> @@ -927,6 +927,15 @@ struct kvm_device_attr {
> >>  	__u64	addr;		/* userspace address of attr data */
> >>  };
> >>
> >> +#define KVM_DEV_TYPE_FSL_MPIC_20	1
> >> +#define KVM_DEV_TYPE_FSL_MPIC_42	2
> >> +
> >> +#define KVM_DEV_MPIC_GRP_MISC		1
> >> +#define   KVM_DEV_MPIC_BASE_ADDR	0	/* 64-bit */
> >> +
> >> +#define KVM_DEV_MPIC_GRP_REGISTER	2	/* 32-bit */
> >> +#define KVM_DEV_MPIC_GRP_IRQ_ACTIVE	3	/* 32-bit */
> >Why not put them in arch specific header?
> 
> KVM_DEV_TYPE_* is not an arch-specific enumeration -- this was
> discussed last time around.
> 
Yes, I am talking about KVM_DEV_MPIC_* only. KVM_DEV_TYPE_* are used by
common code, so they should stay here.

> KVM_DEV_MPIC_* could go elsewhere if you want to avoid cluttering
> the main kvm.h.  The arch header would be OK, since the non-arch
> header includes the arch header, and thus it wouldn't be visible to
> userspace where it is -- if there later is a need for MPIC (or
> whatever other device follows MPIC's example) on another
> architecture, it could be moved without breaking anything.  Or, we
> could just have a header for each device type.
> 
If a device is used by more than one arch, it will move into virt/kvm
and will have its own header, like ioapic.

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paul Mackerras April 8, 2013, 6:30 a.m. UTC | #4
On Tue, Apr 02, 2013 at 08:57:52PM -0500, Scott Wood wrote:
> Hook the MPIC code up to the KVM interfaces, add locking, etc.

[snip]

> @@ -2164,6 +2164,15 @@ static int kvm_ioctl_create_device(struct kvm *kvm,
>  	bool test = cd->flags & KVM_CREATE_DEVICE_TEST;
>  
>  	switch (cd->type) {
> +#ifdef CONFIG_KVM_MPIC
> +	case KVM_DEV_TYPE_FSL_MPIC_20:
> +	case KVM_DEV_TYPE_FSL_MPIC_42: {
> +		if (test)
> +			return 0;
> +
> +		return kvm_create_mpic(kvm, cd->type);
> +	}
> +#endif

I think this needs to be more like:

#ifdef CONFIG_KVM_MPIC
	case KVM_DEV_TYPE_FSL_MPIC_20:
	case KVM_DEV_TYPE_FSL_MPIC_42: {
		int fd;

		if (test)
			return 0;

		fd = kvm_create_mpic(kvm, cd->type);
		if (fd < 0)
			return fd;
		cd->fd = fd;
		return 0;
	}
#endif

Paul.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gleb Natapov April 8, 2013, 10:39 a.m. UTC | #5
On Thu, Apr 04, 2013 at 06:33:38PM -0500, Scott Wood wrote:
> On 04/04/2013 12:59:02 AM, Gleb Natapov wrote:
> >On Wed, Apr 03, 2013 at 03:58:04PM -0500, Scott Wood wrote:
> >> KVM_DEV_MPIC_* could go elsewhere if you want to avoid cluttering
> >> the main kvm.h.  The arch header would be OK, since the non-arch
> >> header includes the arch header, and thus it wouldn't be visible to
> >> userspace where it is -- if there later is a need for MPIC (or
> >> whatever other device follows MPIC's example) on another
> >> architecture, it could be moved without breaking anything.  Or, we
> >> could just have a header for each device type.
> >>
> >If device will be used by more then one arch it will move into
> >virt/kvm
> >and will have its own header, like ioapic.
> 
> virt/kvm/ioapic.h is not uapi.  The ioapic uapi component (e.g.
> struct kvm_ioapic_state) is duplicated between x86 and ia64, which
> is the sort of thing I'd like to avoid.  I'm OK with putting it in
> the PPC header if, upon a later need for multi-architecture support,
> it could move into either the main uapi header or a separate uapi
> header that the main uapi header includes (i.e. no userspace-visible
> change in which header needs to be included).
> 
Agreed, it makes sense to have a separate uapi header for a device that is
used by more than one arch.

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Scott Wood April 9, 2013, 12:49 a.m. UTC | #6
On 04/08/2013 01:30:42 AM, Paul Mackerras wrote:
> On Tue, Apr 02, 2013 at 08:57:52PM -0500, Scott Wood wrote:
> > Hook the MPIC code up to the KVM interfaces, add locking, etc.
> 
> [snip]
> 
> > @@ -2164,6 +2164,15 @@ static int kvm_ioctl_create_device(struct  
> kvm *kvm,
> >  	bool test = cd->flags & KVM_CREATE_DEVICE_TEST;
> >
> >  	switch (cd->type) {
> > +#ifdef CONFIG_KVM_MPIC
> > +	case KVM_DEV_TYPE_FSL_MPIC_20:
> > +	case KVM_DEV_TYPE_FSL_MPIC_42: {
> > +		if (test)
> > +			return 0;
> > +
> > +		return kvm_create_mpic(kvm, cd->type);
> > +	}
> > +#endif
> 
> I think this needs to be more like:
> 
> #ifdef CONFIG_KVM_MPIC
> 	case KVM_DEV_TYPE_FSL_MPIC_20:
> 	case KVM_DEV_TYPE_FSL_MPIC_42: {
> 		int fd;
> 
> 		if (test)
> 			return 0;
> 
> 		fd = kvm_create_mpic(kvm, cd->type);
> 		if (fd < 0)
> 			return fd;
> 		cd->fd = fd;
> 		return 0;
> 	}
> #endif

Right, thanks for spotting.  It didn't show up in my testing because I  
did the same thing on the QEMU side.

-Scott
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/Documentation/virtual/kvm/devices/mpic.txt b/Documentation/virtual/kvm/devices/mpic.txt
new file mode 100644
index 0000000..79e000a
--- /dev/null
+++ b/Documentation/virtual/kvm/devices/mpic.txt
@@ -0,0 +1,37 @@ 
+MPIC interrupt controller
+=========================
+
+Device types supported:
+  KVM_DEV_TYPE_FSL_MPIC_20     Freescale MPIC v2.0
+  KVM_DEV_TYPE_FSL_MPIC_42     Freescale MPIC v4.2
+
+Only one MPIC instance, of any type, may be instantiated.  The created
+MPIC will act as the system interrupt controller, connecting to each
+vcpu's interrupt inputs.
+
+Groups:
+  KVM_DEV_MPIC_GRP_MISC
+  Attributes:
+    KVM_DEV_MPIC_BASE_ADDR (rw, 64-bit)
+      Base address of the 256 KiB MPIC register space.  Must be
+      naturally aligned.  A value of zero disables the mapping.
+      Reset value is zero.
+
+  KVM_DEV_MPIC_GRP_REGISTER (rw, 32-bit)
+    Access an MPIC register, as if the access were made from the guest. 
+    "attr" is the byte offset into the MPIC register space.  Accesses
+    must be 4-byte aligned.
+
+    MSIs may be signaled by using this attribute group to write
+    to the relevant MSIIR.
+
+  KVM_DEV_MPIC_GRP_IRQ_ACTIVE (rw, 32-bit)
+    IRQ input line for each standard openpic source.  0 is inactive and 1
+    is active, regardless of interrupt sense.
+
+    For edge-triggered interrupts:  Writing 1 is considered an activating
+    edge, and writing 0 is ignored.  Reading returns 1 if a previously
+    signaled edge has not been acknowledged, and 0 otherwise.
+
+    "attr" is the IRQ number.  IRQ numbers for standard sources are the
+    byte offset of the relevant IVPR from EIVPR0, divided by 32.
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index e34f8fe..7e7aef9 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -359,6 +359,11 @@  struct kvmppc_slb {
 #define KVMPPC_BOOKE_MAX_IAC	4
 #define KVMPPC_BOOKE_MAX_DAC	2
 
+/* KVMPPC_EPR_USER takes precedence over KVMPPC_EPR_KERNEL */
+#define KVMPPC_EPR_NONE		0 /* EPR not supported */
+#define KVMPPC_EPR_USER		1 /* exit to userspace to fill EPR */
+#define KVMPPC_EPR_KERNEL	2 /* in-kernel irqchip */
+
 struct kvmppc_booke_debug_reg {
 	u32 dbcr0;
 	u32 dbcr1;
@@ -522,7 +527,7 @@  struct kvm_vcpu_arch {
 	u8 sane;
 	u8 cpu_type;
 	u8 hcall_needed;
-	u8 epr_enabled;
+	u8 epr_flags; /* KVMPPC_EPR_xxx */
 	u8 epr_needed;
 
 	u32 cpr0_cfgaddr; /* holds the last set cpr0_cfgaddr */
@@ -589,5 +594,6 @@  struct kvm_vcpu_arch {
 #define KVM_MMIO_REG_FQPR	0x0060
 
 #define __KVM_HAVE_ARCH_WQP
+#define __KVM_HAVE_CREATE_DEVICE
 
 #endif /* __POWERPC_KVM_HOST_H__ */
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index f589307..3b63b97 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -164,6 +164,8 @@  extern int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu);
 
 extern int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *);
 
+int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq);
+
 /*
  * Cuts out inst bits with ordering according to spec.
  * That means the leftmost bit is zero. All given bits are included.
@@ -245,6 +247,9 @@  int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *);
 
 void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid);
 
+struct openpic;
+void kvmppc_mpic_put(struct openpic *opp);
+
 #ifdef CONFIG_KVM_BOOK3S_64_HV
 static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr)
 {
@@ -270,6 +275,8 @@  static inline void kvmppc_set_epr(struct kvm_vcpu *vcpu, u32 epr)
 #endif
 }
 
+void kvmppc_mpic_set_epr(struct kvm_vcpu *vcpu);
+
 int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
 			      struct kvm_config_tlb *cfg);
 int kvm_vcpu_ioctl_dirty_tlb(struct kvm_vcpu *vcpu,
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index 63c67ec..a87139b 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -151,6 +151,11 @@  config KVM_E500MC
 
 	  If unsure, say N.
 
+config KVM_MPIC
+	bool "KVM in-kernel MPIC emulation"
+	depends on KVM
+
+
 source drivers/vhost/Kconfig
 
 endif # VIRTUALIZATION
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index b772ede..4a2277a 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -103,6 +103,8 @@  kvm-book3s_32-objs := \
 	book3s_32_mmu.o
 kvm-objs-$(CONFIG_KVM_BOOK3S_32) := $(kvm-book3s_32-objs)
 
+kvm-objs-$(CONFIG_KVM_MPIC) += mpic.o
+
 kvm-objs := $(kvm-objs-m) $(kvm-objs-y)
 
 obj-$(CONFIG_KVM_440) += kvm.o
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 58057d6..cddc6b3 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -346,7 +346,7 @@  static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
 		keep_irq = true;
 	}
 
-	if ((priority == BOOKE_IRQPRIO_EXTERNAL) && vcpu->arch.epr_enabled)
+	if ((priority == BOOKE_IRQPRIO_EXTERNAL) && vcpu->arch.epr_flags)
 		update_epr = true;
 
 	switch (priority) {
@@ -427,8 +427,12 @@  static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
 			set_guest_esr(vcpu, vcpu->arch.queued_esr);
 		if (update_dear == true)
 			set_guest_dear(vcpu, vcpu->arch.queued_dear);
-		if (update_epr == true)
-			kvm_make_request(KVM_REQ_EPR_EXIT, vcpu);
+		if (update_epr == true) {
+			if (vcpu->arch.epr_flags & KVMPPC_EPR_USER)
+				kvm_make_request(KVM_REQ_EPR_EXIT, vcpu);
+			else if (vcpu->arch.epr_flags & KVMPPC_EPR_KERNEL)
+				kvmppc_mpic_set_epr(vcpu);
+		}
 
 		new_msr &= msr_mask;
 #if defined(CONFIG_64BIT)
diff --git a/arch/powerpc/kvm/mpic.c b/arch/powerpc/kvm/mpic.c
index 1df67ae..8cda2fa 100644
--- a/arch/powerpc/kvm/mpic.c
+++ b/arch/powerpc/kvm/mpic.c
@@ -23,6 +23,19 @@ 
  * THE SOFTWARE.
  */
 
+#include <linux/slab.h>
+#include <linux/mutex.h>
+#include <linux/kvm_host.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/anon_inodes.h>
+#include <asm/uaccess.h>
+#include <asm/mpic.h>
+#include <asm/kvm_para.h>
+#include <asm/kvm_host.h>
+#include <asm/kvm_ppc.h>
+#include "iodev.h"
+
 #define MAX_CPU     32
 #define MAX_SRC     256
 #define MAX_TMR     4
@@ -36,6 +49,7 @@ 
 #define OPENPIC_FLAG_ILR          (2 << 0)
 
 /* OpenPIC address map */
+#define OPENPIC_REG_SIZE             0x40000
 #define OPENPIC_GLB_REG_START        0x0
 #define OPENPIC_GLB_REG_SIZE         0x10F0
 #define OPENPIC_TMR_REG_START        0x10F0
@@ -89,6 +103,7 @@  static struct fsl_mpic_info fsl_mpic_42 = {
 #define ILR_INTTGT_INT    0x00
 #define ILR_INTTGT_CINT   0x01	/* critical */
 #define ILR_INTTGT_MCP    0x02	/* machine check */
+#define NUM_OUTPUTS       3
 
 #define MSIIR_OFFSET       0x140
 #define MSIIR_SRS_SHIFT    29
@@ -98,18 +113,14 @@  static struct fsl_mpic_info fsl_mpic_42 = {
 
 static int get_current_cpu(void)
 {
-	CPUState *cpu_single_cpu;
-
-	if (!cpu_single_env)
-		return -1;
-
-	cpu_single_cpu = ENV_GET_CPU(cpu_single_env);
-	return cpu_single_cpu->cpu_index;
+	struct kvm_vcpu *vcpu = current->thread.kvm_vcpu;
+	return vcpu ? vcpu->vcpu_id : -1;
 }
 
-static uint32_t openpic_cpu_read_internal(void *opaque, gpa_t addr, int idx);
-static void openpic_cpu_write_internal(void *opaque, gpa_t addr,
-				       uint32_t val, int idx);
+static int openpic_cpu_write_internal(void *opaque, gpa_t addr,
+				      u32 val, int idx);
+static int openpic_cpu_read_internal(void *opaque, gpa_t addr,
+				     u32 *ptr, int idx);
 
 enum irq_type {
 	IRQ_TYPE_NORMAL = 0,
@@ -131,7 +142,7 @@  struct irq_source {
 	uint32_t idr;		/* IRQ destination register */
 	uint32_t destmask;	/* bitmap of CPU destinations */
 	int last_cpu;
-	int output;		/* IRQ level, e.g. OPENPIC_OUTPUT_INT */
+	int output;		/* IRQ level, e.g. ILR_INTTGT_INT */
 	int pending;		/* TRUE if IRQ is pending */
 	enum irq_type type;
 	bool level:1;		/* level-triggered */
@@ -158,16 +169,28 @@  struct irq_source {
 #define IDR_CI      0x40000000	/* critical interrupt */
 
 struct irq_dest {
+	struct kvm_vcpu *vcpu;
+
 	int32_t ctpr;		/* CPU current task priority */
 	struct irq_queue raised;
 	struct irq_queue servicing;
-	qemu_irq *irqs;
 
 	/* Count of IRQ sources asserting on non-INT outputs */
-	uint32_t outputs_active[OPENPIC_OUTPUT_NB];
+	uint32_t outputs_active[NUM_OUTPUTS];
 };
 
+struct openpic;
+
 struct openpic {
+	struct kvm *kvm;
+	struct kvm_io_device mmio;
+	struct list_head mmio_regions;
+	atomic_t users;
+	bool mmio_mapped;
+
+	gpa_t reg_base;
+	spinlock_t lock;
+
 	/* Behavior control */
 	struct fsl_mpic_info *fsl;
 	uint32_t model;
@@ -208,6 +231,47 @@  struct openpic {
 	uint32_t irq_msi;
 };
 
+
+static void mpic_irq_raise(struct openpic *opp, struct irq_dest *dst,
+			   int output)
+{
+	struct kvm_interrupt irq = {
+		.irq = KVM_INTERRUPT_SET_LEVEL,
+	};
+
+	if (!dst->vcpu) {
+		pr_debug("%s: destination cpu %d does not exist\n",
+			 __func__, dst - &opp->dst[0]);
+		return;
+	}
+
+	pr_debug("%s: cpu %d output %d\n", __func__, dst->vcpu->vcpu_id,
+		output);
+
+	if (output != ILR_INTTGT_INT)	/* TODO */
+		return;
+
+	kvm_vcpu_ioctl_interrupt(dst->vcpu, &irq);
+}
+
+static void mpic_irq_lower(struct openpic *opp, struct irq_dest *dst,
+			   int output)
+{
+	if (!dst->vcpu) {
+		pr_debug("%s: destination cpu %d does not exist\n",
+			 __func__, dst - &opp->dst[0]);
+		return;
+	}
+
+	pr_debug("%s: cpu %d output %d\n", __func__, dst->vcpu->vcpu_id,
+		output);
+
+	if (output != ILR_INTTGT_INT)	/* TODO */
+		return;
+
+	kvmppc_core_dequeue_external(dst->vcpu);
+}
+
 static inline void IRQ_setbit(struct irq_queue *q, int n_IRQ)
 {
 	set_bit(n_IRQ, q->queue);
@@ -268,7 +332,7 @@  static void IRQ_local_pipe(struct openpic *opp, int n_CPU, int n_IRQ,
 	pr_debug("%s: IRQ %d active %d was %d\n",
 		__func__, n_IRQ, active, was_active);
 
-	if (src->output != OPENPIC_OUTPUT_INT) {
+	if (src->output != ILR_INTTGT_INT) {
 		pr_debug("%s: output %d irq %d active %d was %d count %d\n",
 			__func__, src->output, n_IRQ, active, was_active,
 			dst->outputs_active[src->output]);
@@ -282,14 +346,14 @@  static void IRQ_local_pipe(struct openpic *opp, int n_CPU, int n_IRQ,
 			    dst->outputs_active[src->output]++ == 0) {
 				pr_debug("%s: Raise OpenPIC output %d cpu %d irq %d\n",
 					__func__, src->output, n_CPU, n_IRQ);
-				qemu_irq_raise(dst->irqs[src->output]);
+				mpic_irq_raise(opp, dst, src->output);
 			}
 		} else {
 			if (was_active &&
 			    --dst->outputs_active[src->output] == 0) {
 				pr_debug("%s: Lower OpenPIC output %d cpu %d irq %d\n",
 					__func__, src->output, n_CPU, n_IRQ);
-				qemu_irq_lower(dst->irqs[src->output]);
+				mpic_irq_lower(opp, dst, src->output);
 			}
 		}
 
@@ -322,8 +386,7 @@  static void IRQ_local_pipe(struct openpic *opp, int n_CPU, int n_IRQ,
 		} else {
 			pr_debug("%s: Raise OpenPIC INT output cpu %d irq %d/%d\n",
 				__func__, n_CPU, n_IRQ, dst->raised.next);
-			qemu_irq_raise(opp->dst[n_CPU].
-				       irqs[OPENPIC_OUTPUT_INT]);
+			mpic_irq_raise(opp, dst, ILR_INTTGT_INT);
 		}
 	} else {
 		IRQ_get_next(opp, &dst->servicing);
@@ -338,8 +401,7 @@  static void IRQ_local_pipe(struct openpic *opp, int n_CPU, int n_IRQ,
 			pr_debug("%s: IRQ %d inactive, current prio %d/%d, CPU %d\n",
 				__func__, n_IRQ, dst->ctpr,
 				dst->servicing.priority, n_CPU);
-			qemu_irq_lower(opp->dst[n_CPU].
-				       irqs[OPENPIC_OUTPUT_INT]);
+			mpic_irq_lower(opp, dst, ILR_INTTGT_INT);
 		}
 	}
 }
@@ -415,8 +477,8 @@  static void openpic_set_irq(void *opaque, int n_IRQ, int level)
 	struct irq_source *src;
 
 	if (n_IRQ >= MAX_IRQ) {
-		pr_err("%s: IRQ %d out of range\n", __func__, n_IRQ);
-		abort();
+		WARN_ONCE(1, "%s: IRQ %d out of range\n", __func__, n_IRQ);
+		return;
 	}
 
 	src = &opp->src[n_IRQ];
@@ -433,7 +495,7 @@  static void openpic_set_irq(void *opaque, int n_IRQ, int level)
 			openpic_update_irq(opp, n_IRQ);
 		}
 
-		if (src->output != OPENPIC_OUTPUT_INT) {
+		if (src->output != ILR_INTTGT_INT) {
 			/* Edge-triggered interrupts shouldn't be used
 			 * with non-INT delivery, but just in case,
 			 * try to make it do something sane rather than
@@ -446,15 +508,13 @@  static void openpic_set_irq(void *opaque, int n_IRQ, int level)
 	}
 }
 
-static void openpic_reset(DeviceState *d)
+static void openpic_reset(struct openpic *opp)
 {
-	struct openpic *opp = FROM_SYSBUS(typeof(*opp), SYS_BUS_DEVICE(d));
 	int i;
 
 	opp->gcr = GCR_RESET;
 	/* Initialise controller registers */
 	opp->frr = ((opp->nb_irqs - 1) << FRR_NIRQ_SHIFT) |
-	    ((opp->nb_cpus - 1) << FRR_NCPU_SHIFT) |
 	    (opp->vid << FRR_VID_SHIFT);
 
 	opp->pir = 0;
@@ -504,7 +564,7 @@  static inline uint32_t read_IRQreg_idr(struct openpic *opp, int n_IRQ)
 static inline uint32_t read_IRQreg_ilr(struct openpic *opp, int n_IRQ)
 {
 	if (opp->flags & OPENPIC_FLAG_ILR)
-		return output_to_inttgt(opp->src[n_IRQ].output);
+		return opp->src[n_IRQ].output;
 
 	return 0xffffffff;
 }
@@ -539,7 +599,7 @@  static inline void write_IRQreg_idr(struct openpic *opp, int n_IRQ,
 					__func__);
 			}
 
-			src->output = OPENPIC_OUTPUT_CINT;
+			src->output = ILR_INTTGT_CINT;
 			src->nomask = true;
 			src->destmask = 0;
 
@@ -550,7 +610,7 @@  static inline void write_IRQreg_idr(struct openpic *opp, int n_IRQ,
 					src->destmask |= 1UL << i;
 			}
 		} else {
-			src->output = OPENPIC_OUTPUT_INT;
+			src->output = ILR_INTTGT_INT;
 			src->nomask = false;
 			src->destmask = src->idr & normal_mask;
 		}
@@ -565,7 +625,7 @@  static inline void write_IRQreg_ilr(struct openpic *opp, int n_IRQ,
 	if (opp->flags & OPENPIC_FLAG_ILR) {
 		struct irq_source *src = &opp->src[n_IRQ];
 
-		src->output = inttgt_to_output(val & ILR_INTTGT_MASK);
+		src->output = val & ILR_INTTGT_MASK;
 		pr_debug("Set ILR %d to 0x%08x, output %d\n", n_IRQ, src->idr,
 			src->output);
 
@@ -614,34 +674,22 @@  static inline void write_IRQreg_ivpr(struct openpic *opp, int n_IRQ,
 
 static void openpic_gcr_write(struct openpic *opp, uint64_t val)
 {
-	bool mpic_proxy = false;
-
 	if (val & GCR_RESET) {
-		openpic_reset(&opp->busdev.qdev);
+		openpic_reset(opp);
 		return;
 	}
 
 	opp->gcr &= ~opp->mpic_mode_mask;
 	opp->gcr |= val & opp->mpic_mode_mask;
-
-	/* Set external proxy mode */
-	if ((val & opp->mpic_mode_mask) == GCR_MODE_PROXY)
-		mpic_proxy = true;
-
-	ppce500_set_mpic_proxy(mpic_proxy);
 }
 
-static void openpic_gbl_write(void *opaque, gpa_t addr, uint64_t val,
-			      unsigned len)
+static int openpic_gbl_write(void *opaque, gpa_t addr, u32 val)
 {
 	struct openpic *opp = opaque;
-	struct irq_dest *dst;
-	int idx;
 
-	pr_debug("%s: addr %#" HWADDR_PRIx " <= %08" PRIx64 "\n",
-		__func__, addr, val);
+	pr_debug("%s: addr %#llx <= %08x\n", __func__, addr, val);
 	if (addr & 0xF)
-		return;
+		return 0;
 
 	switch (addr) {
 	case 0x00:	/* Block Revision Register1 (BRR1) is Readonly */
@@ -664,22 +712,11 @@  static void openpic_gbl_write(void *opaque, gpa_t addr, uint64_t val,
 	case 0x1080:		/* VIR */
 		break;
 	case 0x1090:		/* PIR */
-		for (idx = 0; idx < opp->nb_cpus; idx++) {
-			if ((val & (1 << idx)) && !(opp->pir & (1 << idx))) {
-				pr_debug("Raise OpenPIC RESET output for CPU %d\n",
-					idx);
-				dst = &opp->dst[idx];
-				qemu_irq_raise(dst->irqs[OPENPIC_OUTPUT_RESET]);
-			} else if (!(val & (1 << idx)) &&
-				   (opp->pir & (1 << idx))) {
-				pr_debug("Lower OpenPIC RESET output for CPU %d\n",
-					idx);
-				dst = &opp->dst[idx];
-				qemu_irq_lower(dst->irqs[OPENPIC_OUTPUT_RESET]);
-			}
-		}
-		opp->pir = val;
-		break;
+		/*
+		 * This register is used to reset a CPU core --
+		 * let userspace handle it.
+		 */
+		return 1;
 	case 0x10A0:		/* IPI_IVPR */
 	case 0x10B0:
 	case 0x10C0:
@@ -695,21 +732,24 @@  static void openpic_gbl_write(void *opaque, gpa_t addr, uint64_t val,
 	default:
 		break;
 	}
+
+	return 0;
 }
 
-static uint64_t openpic_gbl_read(void *opaque, gpa_t addr, unsigned len)
+static int openpic_gbl_read(void *opaque, gpa_t addr, u32 *ptr)
 {
 	struct openpic *opp = opaque;
-	uint32_t retval;
+	u32 retval;
 
-	pr_debug("%s: addr %#" HWADDR_PRIx "\n", __func__, addr);
+	pr_debug("%s: addr %#llx\n", __func__, addr);
 	retval = 0xFFFFFFFF;
 	if (addr & 0xF)
-		return retval;
+		goto out;
 
 	switch (addr) {
 	case 0x1000:		/* FRR */
 		retval = opp->frr;
+		retval |= (opp->nb_cpus - 1) << FRR_NCPU_SHIFT;
 		break;
 	case 0x1020:		/* GCR */
 		retval = opp->gcr;
@@ -731,8 +771,8 @@  static uint64_t openpic_gbl_read(void *opaque, gpa_t addr, unsigned len)
 	case 0x90:
 	case 0xA0:
 	case 0xB0:
-		retval =
-		    openpic_cpu_read_internal(opp, addr, get_current_cpu());
+		retval = openpic_cpu_read_internal(opp, addr,
+			&retval, get_current_cpu());
 		break;
 	case 0x10A0:		/* IPI_IVPR */
 	case 0x10B0:
@@ -750,28 +790,28 @@  static uint64_t openpic_gbl_read(void *opaque, gpa_t addr, unsigned len)
 	default:
 		break;
 	}
-	pr_debug("%s: => 0x%08x\n", __func__, retval);
 
-	return retval;
+out:
+	pr_debug("%s: => 0x%08x\n", __func__, retval);
+	*ptr = retval;
+	return 0;
 }
 
-static void openpic_tmr_write(void *opaque, gpa_t addr, uint64_t val,
-			      unsigned len)
+static int openpic_tmr_write(void *opaque, gpa_t addr, u32 val)
 {
 	struct openpic *opp = opaque;
 	int idx;
 
 	addr += 0x10f0;
 
-	pr_debug("%s: addr %#" HWADDR_PRIx " <= %08" PRIx64 "\n",
-		__func__, addr, val);
+	pr_debug("%s: addr %#llx <= %08x\n", __func__, addr, val);
 	if (addr & 0xF)
-		return;
+		return 0;
 
 	if (addr == 0x10f0) {
 		/* TFRR */
 		opp->tfrr = val;
-		return;
+		return 0;
 	}
 
 	idx = (addr >> 6) & 0x3;
@@ -795,15 +835,17 @@  static void openpic_tmr_write(void *opaque, gpa_t addr, uint64_t val,
 		write_IRQreg_idr(opp, opp->irq_tim0 + idx, val);
 		break;
 	}
+
+	return 0;
 }
 
-static uint64_t openpic_tmr_read(void *opaque, gpa_t addr, unsigned len)
+static int openpic_tmr_read(void *opaque, gpa_t addr, u32 *ptr)
 {
 	struct openpic *opp = opaque;
 	uint32_t retval = -1;
 	int idx;
 
-	pr_debug("%s: addr %#" HWADDR_PRIx "\n", __func__, addr);
+	pr_debug("%s: addr %#llx\n", __func__, addr);
 	if (addr & 0xF)
 		goto out;
 
@@ -813,6 +855,7 @@  static uint64_t openpic_tmr_read(void *opaque, gpa_t addr, unsigned len)
 		retval = opp->tfrr;
 		goto out;
 	}
+
 	switch (addr & 0x30) {
 	case 0x00:		/* TCCR */
 		retval = opp->timers[idx].tccr;
@@ -830,18 +873,16 @@  static uint64_t openpic_tmr_read(void *opaque, gpa_t addr, unsigned len)
 
 out:
 	pr_debug("%s: => 0x%08x\n", __func__, retval);
-
-	return retval;
+	*ptr = retval;
+	return 0;
 }
 
-static void openpic_src_write(void *opaque, gpa_t addr, uint64_t val,
-			      unsigned len)
+static int openpic_src_write(void *opaque, gpa_t addr, u32 val)
 {
 	struct openpic *opp = opaque;
 	int idx;
 
-	pr_debug("%s: addr %#" HWADDR_PRIx " <= %08" PRIx64 "\n",
-		__func__, addr, val);
+	pr_debug("%s: addr %#llx <= %08x\n", __func__, addr, val);
 
 	addr = addr & 0xffff;
 	idx = addr >> 5;
@@ -857,15 +898,17 @@  static void openpic_src_write(void *opaque, gpa_t addr, uint64_t val,
 		write_IRQreg_ilr(opp, idx, val);
 		break;
 	}
+
+	return 0;
 }
 
-static uint64_t openpic_src_read(void *opaque, uint64_t addr, unsigned len)
+static int openpic_src_read(void *opaque, gpa_t addr, u32 *ptr)
 {
 	struct openpic *opp = opaque;
 	uint32_t retval;
 	int idx;
 
-	pr_debug("%s: addr %#" HWADDR_PRIx "\n", __func__, addr);
+	pr_debug("%s: addr %#llx\n", __func__, addr);
 	retval = 0xFFFFFFFF;
 
 	addr = addr & 0xffff;
@@ -884,20 +927,19 @@  static uint64_t openpic_src_read(void *opaque, uint64_t addr, unsigned len)
 	}
 
 	pr_debug("%s: => 0x%08x\n", __func__, retval);
-	return retval;
+	*ptr = retval;
+	return 0;
 }
 
-static void openpic_msi_write(void *opaque, gpa_t addr, uint64_t val,
-			      unsigned size)
+static int openpic_msi_write(void *opaque, gpa_t addr, u32 val)
 {
 	struct openpic *opp = opaque;
 	int idx = opp->irq_msi;
 	int srs, ibs;
 
-	pr_debug("%s: addr %#" HWADDR_PRIx " <= 0x%08" PRIx64 "\n",
-		__func__, addr, val);
+	pr_debug("%s: addr %#llx <= 0x%08x\n", __func__, addr, val);
 	if (addr & 0xF)
-		return;
+		return 0;
 
 	switch (addr) {
 	case MSIIR_OFFSET:
@@ -911,17 +953,19 @@  static void openpic_msi_write(void *opaque, gpa_t addr, uint64_t val,
 		/* most registers are read-only, thus ignored */
 		break;
 	}
+
+	return 0;
 }
 
-static uint64_t openpic_msi_read(void *opaque, gpa_t addr, unsigned size)
+static int openpic_msi_read(void *opaque, gpa_t addr, u32 *ptr)
 {
 	struct openpic *opp = opaque;
-	uint64_t r = 0;
+	uint32_t r = 0;
 	int i, srs;
 
-	pr_debug("%s: addr %#" HWADDR_PRIx "\n", __func__, addr);
+	pr_debug("%s: addr %#llx\n", __func__, addr);
 	if (addr & 0xF)
-		return -1;
+		return 1;
 
 	srs = addr >> 4;
 
@@ -945,45 +989,47 @@  static uint64_t openpic_msi_read(void *opaque, gpa_t addr, unsigned size)
 		break;
 	}
 
-	return r;
+	pr_debug("%s: => 0x%08x\n", __func__, r);
+	*ptr = r;
+	return 0;
 }
 
-static uint64_t openpic_summary_read(void *opaque, gpa_t addr, unsigned size)
+static int openpic_summary_read(void *opaque, gpa_t addr, u32 *ptr)
 {
-	uint64_t r = 0;
+	uint32_t r = 0;
 
-	pr_debug("%s: addr %#" HWADDR_PRIx "\n", __func__, addr);
+	pr_debug("%s: addr %#llx\n", __func__, addr);
 
 	/* TODO: EISR/EIMR */
 
-	return r;
+	*ptr = r;
+	return 0;
 }
 
-static void openpic_summary_write(void *opaque, gpa_t addr, uint64_t val,
-				  unsigned size)
+static int openpic_summary_write(void *opaque, gpa_t addr, u32 val)
 {
-	pr_debug("%s: addr %#" HWADDR_PRIx " <= 0x%08" PRIx64 "\n",
-		__func__, addr, val);
+	pr_debug("%s: addr %#llx <= 0x%08x\n", __func__, addr, val);
 
 	/* TODO: EISR/EIMR */
+	return 0;
 }
 
-static void openpic_cpu_write_internal(void *opaque, gpa_t addr,
-				       uint32_t val, int idx)
+static int openpic_cpu_write_internal(void *opaque, gpa_t addr,
+				      u32 val, int idx)
 {
 	struct openpic *opp = opaque;
 	struct irq_source *src;
 	struct irq_dest *dst;
 	int s_IRQ, n_IRQ;
 
-	pr_debug("%s: cpu %d addr %#" HWADDR_PRIx " <= 0x%08x\n", __func__, idx,
+	pr_debug("%s: cpu %d addr %#llx <= 0x%08x\n", __func__, idx,
 		addr, val);
 
 	if (idx < 0)
-		return;
+		return 0;
 
 	if (addr & 0xF)
-		return;
+		return 0;
 
 	dst = &opp->dst[idx];
 	addr &= 0xFF0;
@@ -1008,11 +1054,11 @@  static void openpic_cpu_write_internal(void *opaque, gpa_t addr,
 		if (dst->raised.priority <= dst->ctpr) {
 			pr_debug("%s: Lower OpenPIC INT output cpu %d due to ctpr\n",
 				__func__, idx);
-			qemu_irq_lower(dst->irqs[OPENPIC_OUTPUT_INT]);
+			mpic_irq_lower(opp, dst, ILR_INTTGT_INT);
 		} else if (dst->raised.priority > dst->servicing.priority) {
 			pr_debug("%s: Raise OpenPIC INT output cpu %d irq %d\n",
 				__func__, idx, dst->raised.next);
-			qemu_irq_raise(dst->irqs[OPENPIC_OUTPUT_INT]);
+			mpic_irq_raise(opp, dst, ILR_INTTGT_INT);
 		}
 
 		break;
@@ -1043,18 +1089,22 @@  static void openpic_cpu_write_internal(void *opaque, gpa_t addr,
 		     IVPR_PRIORITY(src->ivpr) > dst->servicing.priority)) {
 			pr_debug("Raise OpenPIC INT output cpu %d irq %d\n",
 				idx, n_IRQ);
-			qemu_irq_raise(opp->dst[idx].irqs[OPENPIC_OUTPUT_INT]);
+			mpic_irq_raise(opp, dst, ILR_INTTGT_INT);
 		}
 		break;
 	default:
 		break;
 	}
+
+	return 0;
 }
 
-static void openpic_cpu_write(void *opaque, gpa_t addr, uint64_t val,
-			      unsigned len)
+static int openpic_cpu_write(void *opaque, gpa_t addr, u32 val)
 {
-	openpic_cpu_write_internal(opaque, addr, val, (addr & 0x1f000) >> 12);
+	struct openpic *opp = opaque;
+
+	return openpic_cpu_write_internal(opp, addr, val,
+					 (addr & 0x1f000) >> 12);
 }
 
 static uint32_t openpic_iack(struct openpic *opp, struct irq_dest *dst,
@@ -1064,7 +1114,7 @@  static uint32_t openpic_iack(struct openpic *opp, struct irq_dest *dst,
 	int retval, irq;
 
 	pr_debug("Lower OpenPIC INT output\n");
-	qemu_irq_lower(dst->irqs[OPENPIC_OUTPUT_INT]);
+	mpic_irq_lower(opp, dst, ILR_INTTGT_INT);
 
 	irq = IRQ_get_next(opp, &dst->raised);
 	pr_debug("IACK: irq=%d\n", irq);
@@ -1107,20 +1157,35 @@  static uint32_t openpic_iack(struct openpic *opp, struct irq_dest *dst,
 	return retval;
 }
 
-static uint32_t openpic_cpu_read_internal(void *opaque, gpa_t addr, int idx)
+/*
+ * Load the vcpu's EPR from the MPIC when external proxy mode is active.
+ *
+ * If the mode bits of GCR select GCR_MODE_PROXY, the interrupt is
+ * acknowledged through openpic_iack() and the value it returns is passed
+ * to kvmppc_set_epr().  In any other mode nothing is done.  The check and
+ * the acknowledge both run under opp->lock with interrupt state saved.
+ */
+void kvmppc_mpic_set_epr(struct kvm_vcpu *vcpu)
+{
+	struct openpic *opp = vcpu->arch.irqchip_priv;
+	unsigned long irqflags;
+	int idx = vcpu->vcpu_id;
+	bool proxy;
+
+	spin_lock_irqsave(&opp->lock, irqflags);
+
+	proxy = (opp->gcr & opp->mpic_mode_mask) == GCR_MODE_PROXY;
+	if (proxy) {
+		struct irq_dest *dst = &opp->dst[idx];
+
+		kvmppc_set_epr(vcpu, openpic_iack(opp, dst, idx));
+	}
+
+	spin_unlock_irqrestore(&opp->lock, irqflags);
+}
+
+static int openpic_cpu_read_internal(void *opaque, gpa_t addr,
+				     u32 *ptr, int idx)
 {
 	struct openpic *opp = opaque;
 	struct irq_dest *dst;
 	uint32_t retval;
 
-	pr_debug("%s: cpu %d addr %#" HWADDR_PRIx "\n", __func__, idx, addr);
+	pr_debug("%s: cpu %d addr %#llx\n", __func__, idx, addr);
 	retval = 0xFFFFFFFF;
 
 	if (idx < 0)
-		return retval;
+		goto out;
 
 	if (addr & 0xF)
-		return retval;
+		goto out;
 
 	dst = &opp->dst[idx];
 	addr &= 0xFF0;
@@ -1142,49 +1207,67 @@  static uint32_t openpic_cpu_read_internal(void *opaque, gpa_t addr, int idx)
 	}
 	pr_debug("%s: => 0x%08x\n", __func__, retval);
 
-	return retval;
+out:
+	*ptr = retval;
+	return 0;
 }
 
-static uint64_t openpic_cpu_read(void *opaque, gpa_t addr, unsigned len)
+static int openpic_cpu_read(void *opaque, gpa_t addr, u32 *ptr)
 {
-	return openpic_cpu_read_internal(opaque, addr, (addr & 0x1f000) >> 12);
+	struct openpic *opp = opaque;
+
+	return openpic_cpu_read_internal(opp, addr, ptr,
+					 (addr & 0x1f000) >> 12);
 }
 
-static const struct kvm_io_device_ops openpic_glb_ops_be = {
+struct mem_reg {
+	struct list_head list;
+	int (*read)(void *opaque, gpa_t addr, u32 *ptr);
+	int (*write)(void *opaque, gpa_t addr, u32 val);
+	gpa_t start_addr;
+	int size;
+};
+
+static struct mem_reg openpic_gbl_mmio = {
 	.write = openpic_gbl_write,
 	.read = openpic_gbl_read,
+	.start_addr = OPENPIC_GLB_REG_START,
+	.size = OPENPIC_GLB_REG_SIZE,
 };
 
-static const struct kvm_io_device_ops openpic_tmr_ops_be = {
+static struct mem_reg openpic_tmr_mmio = {
 	.write = openpic_tmr_write,
 	.read = openpic_tmr_read,
+	.start_addr = OPENPIC_TMR_REG_START,
+	.size = OPENPIC_TMR_REG_SIZE,
 };
 
-static const struct kvm_io_device_ops openpic_cpu_ops_be = {
+static struct mem_reg openpic_cpu_mmio = {
 	.write = openpic_cpu_write,
 	.read = openpic_cpu_read,
+	.start_addr = OPENPIC_CPU_REG_START,
+	.size = OPENPIC_CPU_REG_SIZE,
 };
 
-static const struct kvm_io_device_ops openpic_src_ops_be = {
+static struct mem_reg openpic_src_mmio = {
 	.write = openpic_src_write,
 	.read = openpic_src_read,
+	.start_addr = OPENPIC_SRC_REG_START,
+	.size = OPENPIC_SRC_REG_SIZE,
 };
 
-static const struct kvm_io_device_ops openpic_msi_ops_be = {
+static struct mem_reg openpic_msi_mmio = {
 	.read = openpic_msi_read,
 	.write = openpic_msi_write,
+	.start_addr = OPENPIC_MSI_REG_START,
+	.size = OPENPIC_MSI_REG_SIZE,
 };
 
-static const struct kvm_io_device_ops openpic_summary_ops_be = {
+static struct mem_reg openpic_summary_mmio = {
 	.read = openpic_summary_read,
 	.write = openpic_summary_write,
-};
-
-struct mem_reg {
-	const char *name;
-	const struct kvm_io_device_ops *ops;
-	gpa_t start_addr;
-	int size;
+	.start_addr = OPENPIC_SUMMARY_REG_START,
+	.size = OPENPIC_SUMMARY_REG_SIZE,
 };
 
 static void fsl_common_init(struct openpic *opp)
@@ -1192,6 +1275,9 @@  static void fsl_common_init(struct openpic *opp)
 	int i;
 	int virq = MAX_SRC;
 
+	list_add(&openpic_msi_mmio.list, &opp->mmio_regions);
+	list_add(&openpic_summary_mmio.list, &opp->mmio_regions);
+
 	opp->vid = VID_REVISION_1_2;
 	opp->vir = VIR_GENERIC;
 	opp->vector_mask = 0xFFFF;
@@ -1205,11 +1291,10 @@  static void fsl_common_init(struct openpic *opp)
 	opp->irq_tim0 = virq;
 	virq += MAX_TMR;
 
-	assert(virq <= MAX_IRQ);
+	BUG_ON(virq > MAX_IRQ);
 
 	opp->irq_msi = 224;
 
-	msi_supported = true;
 	for (i = 0; i < opp->fsl->max_ext; i++)
 		opp->src[i].level = false;
 
@@ -1226,63 +1311,404 @@  static void fsl_common_init(struct openpic *opp)
 	}
 }
 
-static void map_list(struct openpic *opp, const struct mem_reg *list,
-		     int *count)
+static int kvm_mpic_read_internal(struct openpic *opp, gpa_t addr, u32 *ptr)
 {
-	while (list->name) {
-		assert(*count < ARRAY_SIZE(opp->sub_io_mem));
+	struct list_head *node;
 
-		memory_region_init_io(&opp->sub_io_mem[*count], list->ops, opp,
-				      list->name, list->size);
+	list_for_each(node, &opp->mmio_regions) {
+		struct mem_reg *mr = list_entry(node, struct mem_reg, list);
 
-		memory_region_add_subregion(&opp->mem, list->start_addr,
-					    &opp->sub_io_mem[*count]);
+		if (mr->start_addr > addr || addr >= mr->start_addr + mr->size)
+			continue;
 
-		(*count)++;
-		list++;
+		return mr->read(opp, addr - mr->start_addr, ptr);
 	}
+
+	return 1;
 }
 
-static int openpic_init(SysBusDevice *dev)
+static int kvm_mpic_write_internal(struct openpic *opp, gpa_t addr, u32 val)
 {
-	struct openpic *opp = FROM_SYSBUS(typeof(*opp), dev);
-	int i, j;
-	int list_count = 0;
-	static const struct mem_reg list_le[] = {
-		{"glb", &openpic_glb_ops_le,
-		 OPENPIC_GLB_REG_START, OPENPIC_GLB_REG_SIZE},
-		{"tmr", &openpic_tmr_ops_le,
-		 OPENPIC_TMR_REG_START, OPENPIC_TMR_REG_SIZE},
-		{"src", &openpic_src_ops_le,
-		 OPENPIC_SRC_REG_START, OPENPIC_SRC_REG_SIZE},
-		{"cpu", &openpic_cpu_ops_le,
-		 OPENPIC_CPU_REG_START, OPENPIC_CPU_REG_SIZE},
-		{NULL}
-	};
-	static const struct mem_reg list_be[] = {
-		{"glb", &openpic_glb_ops_be,
-		 OPENPIC_GLB_REG_START, OPENPIC_GLB_REG_SIZE},
-		{"tmr", &openpic_tmr_ops_be,
-		 OPENPIC_TMR_REG_START, OPENPIC_TMR_REG_SIZE},
-		{"src", &openpic_src_ops_be,
-		 OPENPIC_SRC_REG_START, OPENPIC_SRC_REG_SIZE},
-		{"cpu", &openpic_cpu_ops_be,
-		 OPENPIC_CPU_REG_START, OPENPIC_CPU_REG_SIZE},
-		{NULL}
-	};
-	static const struct mem_reg list_fsl[] = {
-		{"msi", &openpic_msi_ops_be,
-		 OPENPIC_MSI_REG_START, OPENPIC_MSI_REG_SIZE},
-		{"summary", &openpic_summary_ops_be,
-		 OPENPIC_SUMMARY_REG_START, OPENPIC_SUMMARY_REG_SIZE},
-		{NULL}
-	};
+	struct list_head *node;
 
-	memory_region_init(&opp->mem, "openpic", 0x40000);
+	list_for_each(node, &opp->mmio_regions) {
+		struct mem_reg *mr = list_entry(node, struct mem_reg, list);
 
-	switch (opp->model) {
-	case OPENPIC_MODEL_FSL_MPIC_20:
+		if (mr->start_addr > addr || addr >= mr->start_addr + mr->size)
+			continue;
+
+		return mr->write(opp, addr - mr->start_addr, val);
+	}
+
+	return 1;
+}
+
+/*
+ * KVM MMIO bus read handler for the MPIC register window.
+ *
+ * @this: io device embedded in struct openpic
+ * @addr: guest physical address of the access
+ * @len:  access size in bytes; only 4 and 1 are accepted
+ * @ptr:  destination buffer of @len bytes
+ *
+ * Returns -EINVAL for an unsupported length or a misaligned 32-bit
+ * access; otherwise returns the result of kvm_mpic_read_internal()
+ * (0 when a register region handled the read, non-zero when none did).
+ * @ptr is written only when the read was handled, so a failed read never
+ * copies uninitialised stack contents into the caller's buffer.
+ */
+static int kvm_mpic_read(struct kvm_io_device *this, gpa_t addr,
+			 int len, void *ptr)
+{
+	struct openpic *opp = container_of(this, struct openpic, mmio);
+	int ret;
+	union {
+		u32 val;
+		u8 bytes[4];
+	} u;
+
+	/*
+	 * Technically only 32-bit accesses are allowed, but be nice to
+	 * people dumping registers a byte at a time -- it works in real
+	 * hardware (reads only, not writes).
+	 */
+	if (len != 4 && len != 1) {
+		pr_debug("%s: bad length %d\n", __func__, len);
+		return -EINVAL;
+	}
+
+	/* For len == 1 the mask is 0, so only 32-bit accesses are checked. */
+	if (addr & (len - 1)) {
+		pr_debug("%s: bad alignment %llx/%d\n",
+			 __func__, addr, len);
+		return -EINVAL;
+	}
+
+	/* Defined starting value in case no handler fills it in. */
+	u.val = 0;
+
+	spin_lock_irq(&opp->lock);
+	ret = kvm_mpic_read_internal(opp, addr - opp->reg_base, &u.val);
+	spin_unlock_irq(&opp->lock);
+
+	if (ret) {
+		pr_debug("%s: addr %llx ret %d len %d\n",
+			 __func__, addr, ret, len);
+		return ret;
+	}
+
+	if (len == 4) {
+		*(u32 *)ptr = u.val;
+
+		pr_debug("%s: addr %llx ret %d len 4 val %x\n",
+			 __func__, addr, ret, u.val);
+	} else {
+		/* Pick the addressed byte out of the 32-bit register value. */
+		*(u8 *)ptr = u.bytes[addr & 3];
+
+		pr_debug("%s: addr %llx ret %d len 1 val %x\n",
+			 __func__, addr, ret, *(const u8 *)ptr);
+	}
+
+	return ret;
+}
+
+/*
+ * KVM MMIO bus write handler for the MPIC register window.
+ *
+ * Only naturally aligned 32-bit writes are accepted; anything else is
+ * rejected with -EOPNOTSUPP before the device is touched.  Accepted
+ * writes are dispatched to kvm_mpic_write_internal() under opp->lock,
+ * using the offset of @addr from opp->reg_base, and its result is
+ * returned unchanged.
+ */
+static int kvm_mpic_write(struct kvm_io_device *this, gpa_t addr,
+			  int len, const void *ptr)
+{
+	struct openpic *opp;
+	u32 val;
+	int rc;
+
+	if (len != 4) {
+		pr_debug("%s: bad length %d\n", __func__, len);
+		return -EOPNOTSUPP;
+	}
+
+	if (addr & 3) {
+		pr_debug("%s: bad alignment %llx/%d\n", __func__, addr, len);
+		return -EOPNOTSUPP;
+	}
+
+	opp = container_of(this, struct openpic, mmio);
+	val = *(const u32 *)ptr;
+
+	spin_lock_irq(&opp->lock);
+	rc = kvm_mpic_write_internal(opp, addr - opp->reg_base, val);
+	spin_unlock_irq(&opp->lock);
+
+	pr_debug("%s: addr %llx ret %d val %x\n", __func__, addr, rc, val);
+
+	return rc;
+}
+
+/*
+ * Destructor hook of the MPIC's kvm_io_device (installed through
+ * mpic_mmio_ops).  It only clears the mmio_mapped flag of the owning
+ * openpic; nothing is freed here.
+ */
+static void kvm_mpic_dtor(struct kvm_io_device *this)
+{
+	struct openpic *opp = container_of(this, struct openpic, mmio);
+
+	opp->mmio_mapped = false;
+}
+
+/* Callbacks handed to kvm_iodevice_init() for the MPIC register window. */
+static const struct kvm_io_device_ops mpic_mmio_ops = {
+	.read = kvm_mpic_read,
+	.write = kvm_mpic_write,
+	.destructor = kvm_mpic_dtor,
+};
+
+/*
+ * Register the MPIC register window on the VM's MMIO bus.
+ *
+ * The window covers OPENPIC_REG_SIZE bytes starting at opp->reg_base.
+ * Calling this while the window is already marked mapped is a BUG.
+ * The caller visible in this file (set_base_addr) holds kvm->slots_lock.
+ *
+ * NOTE(review): the return value of kvm_io_bus_register_dev() is
+ * discarded and mmio_mapped is set before the call, so a failed
+ * registration would leave the flag set -- confirm whether that needs
+ * handling.
+ */
+static void map_mmio(struct openpic *opp)
+{
+	BUG_ON(opp->mmio_mapped);
+	opp->mmio_mapped = true;
+
+	kvm_iodevice_init(&opp->mmio, &mpic_mmio_ops);
+
+	kvm_io_bus_register_dev(opp->kvm, KVM_MMIO_BUS,
+				opp->reg_base, OPENPIC_REG_SIZE,
+				&opp->mmio);
+}
+
+/*
+ * Remove the MPIC register window from the VM's MMIO bus, if mapped.
+ *
+ * This is a no-op when the window is not currently mapped.  That matters
+ * because set_base_addr() calls it unconditionally (including before the
+ * first mapping exists), while mpic_destroy() calls it only when
+ * mmio_mapped is set.  Asserting on the flag in either direction would
+ * therefore trip on one of the two callers.
+ */
+static void unmap_mmio(struct openpic *opp)
+{
+	if (opp->mmio_mapped) {
+		opp->mmio_mapped = false;
+		kvm_io_bus_unregister_dev(opp->kvm, KVM_MMIO_BUS, &opp->mmio);
+	}
+}
+
+/*
+ * Handle KVM_DEV_MPIC_BASE_ADDR: move (or remove) the MPIC MMIO window.
+ *
+ * @opp:  MPIC instance
+ * @attr: device attribute; attr->addr is a userspace pointer to a u64
+ *        holding the new guest physical base address
+ *
+ * The base must have its low 18 bits clear (mask 0x3ffff).  A base of 0
+ * unmaps the window without mapping a new one.  Setting the base that is
+ * already in effect is a no-op.
+ *
+ * Returns 0 on success, -EFAULT if the value cannot be read from
+ * userspace, -EINVAL if the base is misaligned.
+ */
+static int set_base_addr(struct openpic *opp, struct kvm_device_attr *attr)
+{
+	u64 base;
+
+	if (copy_from_user(&base, (u64 __user *)(long)attr->addr, sizeof(u64)))
+		return -EFAULT;
+
+	if (base & 0x3ffff) {
+		pr_debug("kvm mpic %s: KVM_DEV_MPIC_BASE_ADDR %08llx not aligned\n",
+			 __func__, base);
+		return -EINVAL;
+	}
+
+	if (base == opp->reg_base)
+		return 0;
+
+	mutex_lock(&opp->kvm->slots_lock);
+
+	unmap_mmio(opp);
+	opp->reg_base = base;
+
+	pr_debug("kvm mpic %s: KVM_DEV_MPIC_BASE_ADDR %08llx\n",
+		 __func__, base);
+
+	if (base == 0)
+		goto out;
+
+	map_mmio(opp);
+
+out:
+	/* Every path that took slots_lock must release it, base == 0 included. */
+	mutex_unlock(&opp->kvm->slots_lock);
+	return 0;
+}
+
+#define ATTR_SET		0
+#define ATTR_GET		1
+
+/*
+ * Read or write one MPIC register on behalf of the device-attr ioctls.
+ *
+ * @opp:  MPIC instance
+ * @addr: register offset within the MPIC window; must be 4-byte aligned
+ * @val:  value to write (ATTR_SET) or location to store the value read
+ *        (ATTR_GET)
+ * @type: ATTR_SET or ATTR_GET
+ *
+ * The access runs under opp->lock, the same lock the MMIO handlers take
+ * around kvm_mpic_{read,write}_internal(), so that it cannot race with
+ * guest accesses.
+ *
+ * Returns 0 on success.  Misaligned offsets yield -ENXIO.  A positive
+ * "not handled here" result from the internal accessors is also reported
+ * as -ENXIO, so it does not escape to userspace as a positive ioctl
+ * return value.
+ */
+static int access_reg(struct openpic *opp, gpa_t addr, u32 *val, int type)
+{
+	int ret;
+
+	if (addr & 3)
+		return -ENXIO;
+
+	spin_lock_irq(&opp->lock);
+
+	if (type == ATTR_SET)
+		ret = kvm_mpic_write_internal(opp, addr, *val);
+	else
+		ret = kvm_mpic_read_internal(opp, addr, val);
+
+	spin_unlock_irq(&opp->lock);
+
+	if (ret) {
+		/* *val may not have been filled in on a failed read. */
+		pr_debug("%s: type %d addr %llx failed, ret %d\n",
+			 __func__, type, addr, ret);
+		return ret < 0 ? ret : -ENXIO;
+	}
+
+	pr_debug("%s: type %d addr %llx val %x\n", __func__, type, addr, *val);
+
+	return 0;
+}
+
+/*
+ * KVM_SET_DEVICE_ATTR backend.
+ *
+ * KVM_DEV_MPIC_GRP_MISC:       only KVM_DEV_MPIC_BASE_ADDR is known and is
+ *                              forwarded to set_base_addr().
+ * KVM_DEV_MPIC_GRP_REGISTER:   attr->attr is the register offset; the
+ *                              32-bit value is fetched from attr->addr.
+ * KVM_DEV_MPIC_GRP_IRQ_ACTIVE: attr->attr is the source number; the
+ *                              32-bit value must be 0 or 1 and is passed
+ *                              to openpic_set_irq() under opp->lock.
+ *
+ * Returns -ENXIO for unknown groups/attributes, -EFAULT when userspace
+ * memory cannot be read, -EINVAL for an out-of-range source or value.
+ */
+static int mpic_set_attr(struct openpic *opp, struct kvm_device_attr *attr)
+{
+	u32 __user *uaddr = (u32 __user *)(long)attr->addr;
+	u32 value;
+
+	if (attr->group == KVM_DEV_MPIC_GRP_MISC) {
+		if (attr->attr == KVM_DEV_MPIC_BASE_ADDR)
+			return set_base_addr(opp, attr);
+
+		return -ENXIO;
+	}
+
+	if (attr->group == KVM_DEV_MPIC_GRP_REGISTER) {
+		if (copy_from_user(&value, uaddr, sizeof(u32)))
+			return -EFAULT;
+
+		return access_reg(opp, attr->attr, &value, ATTR_SET);
+	}
+
+	if (attr->group != KVM_DEV_MPIC_GRP_IRQ_ACTIVE)
+		return -ENXIO;
+
+	if (attr->attr > MAX_SRC)
+		return -EINVAL;
+
+	if (copy_from_user(&value, uaddr, sizeof(u32)))
+		return -EFAULT;
+
+	/* Only the two line states 0 and 1 are meaningful. */
+	if (value > 1)
+		return -EINVAL;
+
+	spin_lock_irq(&opp->lock);
+	openpic_set_irq(opp, attr->attr, value);
+	spin_unlock_irq(&opp->lock);
+
+	return 0;
+}
+
+/*
+ * KVM_GET_DEVICE_ATTR backend.
+ *
+ * KVM_DEV_MPIC_GRP_MISC / KVM_DEV_MPIC_BASE_ADDR: copies the current
+ *     64-bit register base to attr->addr (read under kvm->slots_lock).
+ * KVM_DEV_MPIC_GRP_REGISTER: reads the register at offset attr->attr via
+ *     access_reg() and copies the 32-bit value to attr->addr.
+ * KVM_DEV_MPIC_GRP_IRQ_ACTIVE: copies the pending state of source
+ *     attr->attr to attr->addr as a 32-bit value.
+ *
+ * Returns 0 on success, -ENXIO for unknown groups/attributes, -EINVAL for
+ * an out-of-range source, -EFAULT when userspace memory cannot be
+ * written, or the error reported by access_reg().
+ */
+static int mpic_get_attr(struct openpic *opp, struct kvm_device_attr *attr)
+{
+	u64 attr64;
+	u32 attr32;
+	int ret;
+
+	switch (attr->group) {
+	case KVM_DEV_MPIC_GRP_MISC:
+		switch (attr->attr) {
+		case KVM_DEV_MPIC_BASE_ADDR:
+			mutex_lock(&opp->kvm->slots_lock);
+			attr64 = opp->reg_base;
+			mutex_unlock(&opp->kvm->slots_lock);
+
+			if (copy_to_user((u64 __user *)(long)attr->addr,
+					 &attr64, sizeof(u64)))
+				return -EFAULT;
+
+			return 0;
+		}
+
+		break;
+
+	case KVM_DEV_MPIC_GRP_REGISTER:
+		ret = access_reg(opp, attr->attr, &attr32, ATTR_GET);
+		if (ret)
+			return ret;
+
+		if (copy_to_user((u32 __user *)(long)attr->addr, &attr32,
+				 sizeof(u32)))
+			return -EFAULT;
+
+		return 0;
+
+	case KVM_DEV_MPIC_GRP_IRQ_ACTIVE:
+		if (attr->attr > MAX_SRC)
+			return -EINVAL;
+
+		/*
+		 * Sample the source state under the same lock the set path
+		 * holds while calling openpic_set_irq().
+		 */
+		spin_lock_irq(&opp->lock);
+		attr32 = opp->src[attr->attr].pending;
+		spin_unlock_irq(&opp->lock);
+
+		if (copy_to_user((u32 __user *)(long)attr->addr, &attr32,
+				 sizeof(u32)))
+			return -EFAULT;
+
+		return 0;
+	}
+
+	return -ENXIO;
+}
+
+/*
+ * KVM_HAS_DEVICE_ATTR backend: report whether a group/attribute pair is
+ * recognised.  Returns 0 if it is, -ENXIO otherwise.  No device state is
+ * read or modified.
+ */
+static int mpic_has_attr(struct openpic *opp, struct kvm_device_attr *attr)
+{
+	bool known = false;
+
+	if (attr->group == KVM_DEV_MPIC_GRP_MISC)
+		known = attr->attr == KVM_DEV_MPIC_BASE_ADDR;
+	else if (attr->group == KVM_DEV_MPIC_GRP_REGISTER)
+		known = true;
+	else if (attr->group == KVM_DEV_MPIC_GRP_IRQ_ACTIVE)
+		known = attr->attr <= MAX_SRC;
+
+	return known ? 0 : -ENXIO;
+}
+
+/*
+ * ioctl entry point of the MPIC device file descriptor.
+ *
+ * Selects the set/get/has accessor from the ioctl number, copies the
+ * struct kvm_device_attr argument in from userspace and runs the
+ * accessor on the MPIC stored in filp->private_data.
+ *
+ * Returns -ENOTTY for any other ioctl number, -EFAULT if the argument
+ * cannot be copied in, otherwise the accessor's result.
+ */
+static long kvm_mpic_ioctl(struct file *filp, unsigned int ioctl,
+			   unsigned long arg)
+{
+	struct openpic *opp = filp->private_data;
+	struct kvm_device_attr attr;
+	int (*accessor)(struct openpic *opp, struct kvm_device_attr *attr);
+
+	switch (ioctl) {
+	case KVM_SET_DEVICE_ATTR:
+		accessor = mpic_set_attr;
+		break;
+	case KVM_GET_DEVICE_ATTR:
+		accessor = mpic_get_attr;
+		break;
+	case KVM_HAS_DEVICE_ATTR:
+		accessor = mpic_has_attr;
+		break;
 	default:
+		return -ENOTTY;
+	}
+
+	if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
+		return -EFAULT;
+
+	return accessor(opp, &attr);
+}
+
+/*
+ * Final teardown of an MPIC instance: unmap its MMIO window if it is
+ * still marked mapped, then free the structure.  Called from
+ * kvmppc_mpic_put() when the last user reference is dropped.
+ */
+static void mpic_destroy(struct openpic *opp)
+{
+	if (opp->mmio_mapped) {
+		/*
+		 * Normally we get unmapped by kvm_io_bus_destroy(),
+		 * which happens before the VCPUs release their references.
+		 *
+		 * Thus, we should only get here if no VCPUs took a reference
+		 * to us in the first place.
+		 */
+		WARN_ON(opp->nb_cpus != 0);
+		unmap_mmio(opp);
+	}
+
+	kfree(opp);
+}
+
+/*
+ * Drop one reference on the MPIC.  When the users count reaches zero the
+ * instance is destroyed via mpic_destroy(), so @opp must not be used by
+ * the caller afterwards.
+ */
+void kvmppc_mpic_put(struct openpic *opp)
+{
+	if (atomic_dec_and_test(&opp->users))
+		mpic_destroy(opp);
+}
+
+/*
+ * release() handler of the MPIC device file: drops the file's reference
+ * on the MPIC and then a reference on the owning VM.  Always returns 0.
+ */
+static int kvm_mpic_release(struct inode *inode, struct file *filp)
+{
+	struct openpic *opp = filp->private_data;
+	/* Fetch kvm first: the put below may free opp. */
+	struct kvm *kvm = opp->kvm;
+
+	kvmppc_mpic_put(opp);
+	kvm_put_kvm(kvm);
+	return 0;
+}
+
+/* File operations of the anonymous-inode fd created by kvm_create_mpic(). */
+static const struct file_operations kvm_mpic_fops = {
+	.unlocked_ioctl = kvm_mpic_ioctl,
+	.release = kvm_mpic_release,
+};
+
+int kvm_create_mpic(struct kvm *kvm, u32 type)
+{
+	struct openpic *opp;
+	int ret, fd;
+
+	opp = kzalloc(sizeof(struct openpic), GFP_KERNEL);
+	if (!opp)
+		return -ENOMEM;
+
+	fd = anon_inode_getfd("kvm-mpic", &kvm_mpic_fops, opp, O_RDWR);
+	if (fd < 0) {
+		ret = fd;
+		goto err;
+	}
+
+	opp->kvm = kvm;
+	opp->model = type;
+	atomic_set(&opp->users, 1);
+	spin_lock_init(&opp->lock);
+
+	INIT_LIST_HEAD(&opp->mmio_regions);
+	list_add(&openpic_gbl_mmio.list, &opp->mmio_regions);
+	list_add(&openpic_tmr_mmio.list, &opp->mmio_regions);
+	list_add(&openpic_src_mmio.list, &opp->mmio_regions);
+	list_add(&openpic_cpu_mmio.list, &opp->mmio_regions);
+
+	switch (opp->model) {
+	case KVM_DEV_TYPE_FSL_MPIC_20:
 		opp->fsl = &fsl_mpic_20;
 		opp->brr1 = 0x00400200;
 		opp->flags |= OPENPIC_FLAG_IDR_CRIT;
@@ -1290,12 +1716,10 @@  static int openpic_init(SysBusDevice *dev)
 		opp->mpic_mode_mask = GCR_MODE_MIXED;
 
 		fsl_common_init(opp);
-		map_list(opp, list_be, &list_count);
-		map_list(opp, list_fsl, &list_count);
 
 		break;
 
-	case OPENPIC_MODEL_FSL_MPIC_42:
+	case KVM_DEV_TYPE_FSL_MPIC_42:
 		opp->fsl = &fsl_mpic_42;
 		opp->brr1 = 0x00400402;
 		opp->flags |= OPENPIC_FLAG_ILR;
@@ -1303,11 +1727,19 @@  static int openpic_init(SysBusDevice *dev)
 		opp->mpic_mode_mask = GCR_MODE_PROXY;
 
 		fsl_common_init(opp);
-		map_list(opp, list_be, &list_count);
-		map_list(opp, list_fsl, &list_count);
 
 		break;
+
+	default:
+		ret = -ENODEV;
+		goto err;
 	}
 
-	return 0;
+	openpic_reset(opp);
+	kvm_get_kvm(kvm);
+	return fd;
+
+err:
+	kfree(opp);
+	return ret;
 }
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 16b4595..c9a2972 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -317,6 +317,7 @@  int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_ENABLE_CAP:
 	case KVM_CAP_ONE_REG:
 	case KVM_CAP_IOEVENTFD:
+	case KVM_CAP_DEVICE_CTRL:
 		r = 1;
 		break;
 #ifndef CONFIG_KVM_BOOK3S_64_HV
@@ -769,7 +770,10 @@  static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
 		break;
 	case KVM_CAP_PPC_EPR:
 		r = 0;
-		vcpu->arch.epr_enabled = cap->args[0];
+		if (cap->args[0])
+			vcpu->arch.epr_flags |= KVMPPC_EPR_USER;
+		else
+			vcpu->arch.epr_flags &= ~KVMPPC_EPR_USER;
 		break;
 #ifdef CONFIG_BOOKE
 	case KVM_CAP_PPC_BOOKE_WATCHDOG:
@@ -915,6 +919,7 @@  static int kvm_vm_ioctl_get_pvinfo(struct kvm_ppc_pvinfo *pvinfo)
 long kvm_arch_vm_ioctl(struct file *filp,
                        unsigned int ioctl, unsigned long arg)
 {
+	struct kvm *kvm __maybe_unused = filp->private_data;
 	void __user *argp = (void __user *)arg;
 	long r;
 
@@ -933,7 +938,6 @@  long kvm_arch_vm_ioctl(struct file *filp,
 #ifdef CONFIG_PPC_BOOK3S_64
 	case KVM_CREATE_SPAPR_TCE: {
 		struct kvm_create_spapr_tce create_tce;
-		struct kvm *kvm = filp->private_data;
 
 		r = -EFAULT;
 		if (copy_from_user(&create_tce, argp, sizeof(create_tce)))
@@ -945,7 +949,6 @@  long kvm_arch_vm_ioctl(struct file *filp,
 
 #ifdef CONFIG_KVM_BOOK3S_64_HV
 	case KVM_ALLOCATE_RMA: {
-		struct kvm *kvm = filp->private_data;
 		struct kvm_allocate_rma rma;
 
 		r = kvm_vm_ioctl_allocate_rma(kvm, &rma);
@@ -955,7 +958,6 @@  long kvm_arch_vm_ioctl(struct file *filp,
 	}
 
 	case KVM_PPC_ALLOCATE_HTAB: {
-		struct kvm *kvm = filp->private_data;
 		u32 htab_order;
 
 		r = -EFAULT;
@@ -972,7 +974,6 @@  long kvm_arch_vm_ioctl(struct file *filp,
 	}
 
 	case KVM_PPC_GET_HTAB_FD: {
-		struct kvm *kvm = filp->private_data;
 		struct kvm_get_htab_fd ghf;
 
 		r = -EFAULT;
@@ -985,7 +986,6 @@  long kvm_arch_vm_ioctl(struct file *filp,
 
 #ifdef CONFIG_PPC_BOOK3S_64
 	case KVM_PPC_GET_SMMU_INFO: {
-		struct kvm *kvm = filp->private_data;
 		struct kvm_ppc_smmu_info info;
 
 		memset(&info, 0, sizeof(info));
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 1c0be23..852a3a1 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1084,6 +1084,8 @@  static inline bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
 	return true;
 }
 
+int kvm_create_mpic(struct kvm *kvm, u32 type);
+
 #endif /* CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */
 #else
 static inline void __guest_enter(void) { return; }
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 20ce2d2..d8f44ef 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -927,6 +927,15 @@  struct kvm_device_attr {
 	__u64	addr;		/* userspace address of attr data */
 };
 
+#define KVM_DEV_TYPE_FSL_MPIC_20	1
+#define KVM_DEV_TYPE_FSL_MPIC_42	2
+
+#define KVM_DEV_MPIC_GRP_MISC		1
+#define   KVM_DEV_MPIC_BASE_ADDR	0	/* 64-bit */
+
+#define KVM_DEV_MPIC_GRP_REGISTER	2	/* 32-bit */
+#define KVM_DEV_MPIC_GRP_IRQ_ACTIVE	3	/* 32-bit */
+
 /* ioctl for vm fd */
 #define KVM_CREATE_DEVICE	  _IOWR(KVMIO,  0xe0, struct kvm_create_device)
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index ed033c0..e325f5d 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2164,6 +2164,15 @@  static int kvm_ioctl_create_device(struct kvm *kvm,
 	bool test = cd->flags & KVM_CREATE_DEVICE_TEST;
 
 	switch (cd->type) {
+#ifdef CONFIG_KVM_MPIC
+	case KVM_DEV_TYPE_FSL_MPIC_20:
+	case KVM_DEV_TYPE_FSL_MPIC_42: {
+		if (test)
+			return 0;
+
+		return kvm_create_mpic(kvm, cd->type);
+	}
+#endif
 	default:
 		return -ENODEV;
 	}