diff mbox

[v3,07/21] powerpc/powernv: Release PEs dynamically

Message ID 1430116673-22500-8-git-send-email-gwshan@linux.vnet.ibm.com (mailing list archive)
State New, archived
Delegated to: Bjorn Helgaas
Headers show

Commit Message

Gavin Shan April 27, 2015, 6:37 a.m. UTC
The original code doesn't support releasing PEs dynamically, meaning
that PE and the associated resources (IO, M32, M64 and DMA) can't
be released when unplugging a PCI adapter from one hotpluggable slot.

The patch takes object oriented methodology, introducs reference
count to PE, which is initialized to 1 and increased with 1 when a
new PCI device joins the PE. Once the last PCI device leaves the
PE, the PE is going to be release together with its associated
(IO, M32, M64, DMA) resources.

Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/pci-bridge.h     |   3 +
 arch/powerpc/kernel/pci-hotplug.c         |   5 +
 arch/powerpc/platforms/powernv/pci-ioda.c | 658 +++++++++++++++++++-----------
 arch/powerpc/platforms/powernv/pci.h      |   4 +-
 4 files changed, 432 insertions(+), 238 deletions(-)
diff mbox

Patch

diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
index 5367eb3..a6ad4b1 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -31,6 +31,9 @@  struct pci_controller_ops {
 	resource_size_t (*window_alignment)(struct pci_bus *, unsigned long type);
 	void		(*setup_bridge)(struct pci_bus *, unsigned long);
 	void		(*reset_secondary_bus)(struct pci_dev *dev);
+
+	/* Called when PCI device is released */
+	void		(*release_device)(struct pci_dev *);
 };
 
 /*
diff --git a/arch/powerpc/kernel/pci-hotplug.c b/arch/powerpc/kernel/pci-hotplug.c
index 7ed85a6..0040343 100644
--- a/arch/powerpc/kernel/pci-hotplug.c
+++ b/arch/powerpc/kernel/pci-hotplug.c
@@ -29,6 +29,11 @@ 
  */
 void pcibios_release_device(struct pci_dev *dev)
 {
+	struct pci_controller *hose = pci_bus_to_host(dev->bus);
+
+	if (hose->controller_ops.release_device)
+		hose->controller_ops.release_device(dev);
+
 	eeh_remove_device(dev);
 }
 
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 910fb67..ef8c216 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -12,6 +12,8 @@ 
 #undef DEBUG
 
 #include <linux/kernel.h>
+#include <linux/atomic.h>
+#include <linux/kref.h>
 #include <linux/pci.h>
 #include <linux/crash_dump.h>
 #include <linux/debugfs.h>
@@ -47,6 +49,8 @@ 
 /* 256M DMA window, 4K TCE pages, 8 bytes TCE */
 #define TCE32_TABLE_SIZE	((0x10000000 / 0x1000) * 8)
 
+static void pnv_ioda_release_pe(struct kref *kref);
+
 static void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
 			    const char *fmt, ...)
 {
@@ -123,25 +127,400 @@  static inline bool pnv_pci_is_mem_pref_64(unsigned long flags)
 		(IORESOURCE_MEM_64 | IORESOURCE_PREFETCH));
 }
 
-static void pnv_ioda_reserve_pe(struct pnv_phb *phb, int pe_no)
+static inline void pnv_ioda_pe_get(struct pnv_ioda_pe *pe)
 {
-	if (!(pe_no >= 0 && pe_no < phb->ioda.total_pe)) {
-		pr_warn("%s: Invalid PE %d on PHB#%x\n",
-			__func__, pe_no, phb->hose->global_number);
+	if (!pe)
+		return;
+
+	kref_get(&pe->kref);
+}
+
+static inline void pnv_ioda_pe_put(struct pnv_ioda_pe *pe)
+{
+	unsigned int count;
+
+	if (!pe)
 		return;
+
+	/*
+	 * The count is initialized to 1 and increased with 1 when
+	 * a new PCI device is bound with the PE. Once the last PCI
+	 * device is leaving from the PE, the PE is going to be
+	 * released.
+	 */
+	count = atomic_read(&pe->kref.refcount);
+	if (count == 2)
+		kref_sub(&pe->kref, 2, pnv_ioda_release_pe);
+	else
+		kref_put(&pe->kref, pnv_ioda_release_pe);
+}
+
+static void pnv_pci_release_device(struct pci_dev *pdev)
+{
+	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
+	struct pnv_phb *phb = hose->private_data;
+	struct pci_dn *pdn = pci_get_pdn(pdev);
+	struct pnv_ioda_pe *pe;
+
+	if (pdn && pdn->pe_number != IODA_INVALID_PE) {
+		pe = &phb->ioda.pe_array[pdn->pe_number];
+		pnv_ioda_pe_put(pe);
+		pdn->pe_number = IODA_INVALID_PE;
 	}
+}
 
-	if (test_and_set_bit(pe_no, phb->ioda.pe_alloc)) {
-		pr_warn("%s: PE %d was assigned on PHB#%x\n",
-			__func__, pe_no, phb->hose->global_number);
+static void pnv_ioda_release_pe_dma(struct pnv_ioda_pe *pe)
+{
+	struct pnv_phb *phb = pe->phb;
+	int index, count;
+	unsigned long tbl_addr, tbl_size;
+
+	/* No DMA capability for slave PEs */
+	if (pe->flags & PNV_IODA_PE_SLAVE)
+		return;
+
+	/* Bypass DMA window */
+	if (phb->type == PNV_PHB_IODA2 &&
+	    pe->tce_bypass_enabled &&
+	    pe->tce32_table &&
+	    pe->tce32_table->set_bypass)
+		pe->tce32_table->set_bypass(pe->tce32_table, false);
+
+	/* 32-bits DMA window */
+	count = pe->tce32_seg_end - pe->tce32_seg_start;
+	tbl_addr = pe->tce32_table->it_base;
+	if (!count)
 		return;
+
+	/* Free IOMMU table */
+	iommu_free_table(pe->tce32_table,
+			 of_node_full_name(phb->hose->dn));
+
+	/* Deconfigure TCE table */
+	switch (phb->type) {
+	case PNV_PHB_IODA1:
+		for (index = 0; index < count; index++)
+			opal_pci_map_pe_dma_window(phb->opal_id,
+						   pe->pe_number,
+						   pe->tce32_seg_start + index,
+						   1,
+						   __pa(tbl_addr) +
+						   index * TCE32_TABLE_SIZE,
+						   0,
+						   0x1000);
+		bitmap_clear(phb->ioda.tce32_segmap,
+			     pe->tce32_seg_start,
+			     count);
+		tbl_size = TCE32_TABLE_SIZE * count;
+		break;
+	case PNV_PHB_IODA2:
+		opal_pci_map_pe_dma_window(phb->opal_id,
+					   pe->pe_number,
+					   pe->pe_number << 1,
+					   1,
+					   __pa(tbl_addr),
+					   0,
+					   0x1000);
+		tbl_size = (1ul << ilog2(phb->ioda.m32_pci_base));
+		tbl_size = (tbl_size >> IOMMU_PAGE_SHIFT_4K) * 8;
+		break;
+	default:
+		pe_warn(pe, "Unsupported PHB type %d\n", phb->type);
+		return;
+	}
+
+	/* Free memory of IOMMU table */
+	free_pages(tbl_addr, get_order(tbl_size));
+	pe->tce32_table = NULL;
+	pe->tce32_seg_start = 0;
+	pe->tce32_seg_end = 0;
+}
+
+static void pnv_ioda_release_pe_seg(struct pnv_ioda_pe *pe)
+{
+	struct pnv_phb *phb = pe->phb;
+	unsigned long *segmap = NULL, *pe_segmap = NULL;
+	int i;
+	uint16_t win, win_type[] = { OPAL_IO_WINDOW_TYPE,
+				     OPAL_M32_WINDOW_TYPE,
+				     OPAL_M64_WINDOW_TYPE };
+
+	for (win = 0; win < ARRAY_SIZE(win_type); win++) {
+		switch (win_type[win]) {
+		case OPAL_IO_WINDOW_TYPE:
+			segmap = phb->ioda.io_segmap;
+			pe_segmap = pe->io_segmap;
+			break;
+		case OPAL_M32_WINDOW_TYPE:
+			segmap = phb->ioda.m32_segmap;
+			pe_segmap = pe->m32_segmap;
+			break;
+		case OPAL_M64_WINDOW_TYPE:
+			segmap = phb->ioda.m64_segmap;
+			pe_segmap = pe->m64_segmap;
+			break;
+		}
+		i = -1;
+		while ((i = find_next_bit(pe_segmap,
+			phb->ioda.total_pe, i + 1)) < phb->ioda.total_pe) {
+			if (win_type[win] == OPAL_IO_WINDOW_TYPE ||
+			    win_type[win] == OPAL_M32_WINDOW_TYPE)
+				opal_pci_map_pe_mmio_window(phb->opal_id,
+						phb->ioda.reserved_pe,
+						win_type[win], 0, i);
+			else if (phb->type == PNV_PHB_IODA1)
+				opal_pci_map_pe_mmio_window(phb->opal_id,
+						phb->ioda.reserved_pe,
+						win_type[win],
+						i / 8, i % 8);
+
+			clear_bit(i, pe_segmap);
+			clear_bit(i, segmap);
+		}
+	}
+}
+
+static int pnv_ioda_set_one_peltv(struct pnv_phb *phb,
+				  struct pnv_ioda_pe *parent,
+				  struct pnv_ioda_pe *child,
+				  bool is_add)
+{
+	const char *desc = is_add ? "adding" : "removing";
+	uint8_t op = is_add ? OPAL_ADD_PE_TO_DOMAIN :
+			      OPAL_REMOVE_PE_FROM_DOMAIN;
+	struct pnv_ioda_pe *slave;
+	long rc;
+
+	/* Parent PE affects child PE */
+	rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number,
+				child->pe_number, op);
+	if (rc != OPAL_SUCCESS) {
+		pe_warn(child, "OPAL error %ld %s to parent PELTV\n",
+			rc, desc);
+		return -ENXIO;
+	}
+
+	if (!(child->flags & PNV_IODA_PE_MASTER))
+		return 0;
+
+	/* Compound case: parent PE affects slave PEs */
+	list_for_each_entry(slave, &child->slaves, list) {
+		rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number,
+					slave->pe_number, op);
+		if (rc != OPAL_SUCCESS) {
+			pe_warn(slave, "OPAL error %ld %s to parent PELTV\n",
+				rc, desc);
+			return -ENXIO;
+		}
+	}
+
+	return 0;
+}
+
+static int pnv_ioda_set_peltv(struct pnv_ioda_pe *pe, bool is_add)
+{
+	struct pnv_phb *phb = pe->phb;
+	struct pnv_ioda_pe *slave;
+	struct pci_dev *pdev = NULL;
+	int ret;
+
+	/*
+	 * Clear PE frozen state. If it's master PE, we need
+	 * clear slave PE frozen state as well.
+	 */
+	opal_pci_eeh_freeze_clear(phb->opal_id,
+				  pe->pe_number,
+				  OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
+	if (pe->flags & PNV_IODA_PE_MASTER) {
+		list_for_each_entry(slave, &pe->slaves, list) {
+			opal_pci_eeh_freeze_clear(phb->opal_id,
+						  slave->pe_number,
+						  OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
+		}
+	}
+
+	/*
+	 * Associate PE in PELT. We need add the PE into the
+	 * corresponding PELT-V as well. Otherwise, the error
+	 * originated from the PE might contribute to other
+	 * PEs.
+	 */
+	ret = pnv_ioda_set_one_peltv(phb, pe, pe, is_add);
+	if (ret)
+		return ret;
+
+	/* For compound PEs, any one affects all of them */
+	if (pe->flags & PNV_IODA_PE_MASTER) {
+		list_for_each_entry(slave, &pe->slaves, list) {
+			ret = pnv_ioda_set_one_peltv(phb, slave, pe, is_add);
+			if (ret)
+				return ret;
+		}
+	}
+
+	if (pe->flags & (PNV_IODA_PE_BUS_ALL | PNV_IODA_PE_BUS))
+		pdev = pe->pbus->self;
+	else if (pe->flags & PNV_IODA_PE_DEV)
+		pdev = pe->pdev->bus->self;
+#ifdef CONFIG_PCI_IOV
+	else if (pe->flags & PNV_IODA_PE_VF)
+		pdev = pe->parent_dev->bus->self;
+#endif /* CONFIG_PCI_IOV */
+
+	while (pdev) {
+		struct pci_dn *pdn = pci_get_pdn(pdev);
+		struct pnv_ioda_pe *parent;
+
+		if (pdn && pdn->pe_number != IODA_INVALID_PE) {
+			parent = &phb->ioda.pe_array[pdn->pe_number];
+			ret = pnv_ioda_set_one_peltv(phb, parent, pe, is_add);
+			if (ret)
+				return ret;
+		}
+
+		pdev = pdev->bus->self;
+	}
+
+	return 0;
+}
+
+static void pnv_ioda_deconfigure_pe(struct pnv_ioda_pe *pe)
+{
+	struct pnv_phb *phb = pe->phb;
+	struct pci_dev *parent;
+	uint8_t bcomp, dcomp, fcomp;
+	long rid_end, rid;
+	int64_t rc;
+
+	/* Tear down MVE */
+	if (phb->type == PNV_PHB_IODA1 &&
+	    pe->mve_number != -1) {
+		rc = opal_pci_set_mve(phb->opal_id,
+				      pe->mve_number,
+				      phb->ioda.reserved_pe);
+		if (rc != OPAL_SUCCESS)
+			pe_warn(pe, "Error %lld unmapping MVE#%d\n",
+				rc, pe->mve_number);
+		rc = opal_pci_set_mve_enable(phb->opal_id,
+					     pe->mve_number,
+					     OPAL_DISABLE_MVE);
+		if (rc != OPAL_SUCCESS)
+			pe_warn(pe, "Error %lld disabling MVE#%d\n",
+				rc, pe->mve_number);
+		pe->mve_number = -1;
+	}
+
+	/* Unmapping PELTV */
+	pnv_ioda_set_peltv(pe, false);
+
+	/* To unmap PELTM */
+	if (pe->pbus) {
+		int count;
+
+		dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER;
+		fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER;
+		parent = pe->pbus->self;
+		if (pe->flags & PNV_IODA_PE_BUS_ALL)
+			count = pe->pbus->busn_res.end -
+				pe->pbus->busn_res.start + 1;
+		else
+			count = 1;
+
+		switch(count) {
+		case  1: bcomp = OpalPciBusAll;   break;
+		case  2: bcomp = OpalPciBus7Bits; break;
+		case  4: bcomp = OpalPciBus6Bits; break;
+		case  8: bcomp = OpalPciBus5Bits; break;
+		case 16: bcomp = OpalPciBus4Bits; break;
+		case 32: bcomp = OpalPciBus3Bits; break;
+		default:
+			/* Fail back to case of one bus */
+			pe_warn(pe, "Cannot support %d buses\n", count);
+			bcomp = OpalPciBusAll;
+		}
+		rid_end = pe->rid + (count << 8);
+	} else {
+#ifdef CONFIG_PCI_IOV
+		if (pe->flags & PNV_IODA_PE_VF)
+			parent = pe->parent_dev;
+		else
+#endif
+			parent = pe->pdev->bus->self;
+		bcomp = OpalPciBusAll;
+		dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER;
+		fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER;
+		rid_end = pe->rid + 1;
+	}
+
+	/* Clear RID mapping */
+	for (rid = pe->rid; rid < rid_end; rid++)
+		phb->ioda.pe_rmap[rid] = IODA_INVALID_PE;
+
+	/* Unmapping PELTM */
+	rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid,
+			     bcomp, dcomp, fcomp, OPAL_UNMAP_PE);
+	if (rc)
+		pe_warn(pe, "Error %ld unmapping PELTM\n", rc);
+}
+
+static void pnv_ioda_release_pe(struct kref *kref)
+{
+	struct pnv_ioda_pe *pe = container_of(kref, struct pnv_ioda_pe, kref);
+	struct pnv_ioda_pe *tmp, *slave;
+	struct pnv_phb *phb = pe->phb;
+
+	pnv_ioda_release_pe_dma(pe);
+	pnv_ioda_release_pe_seg(pe);
+	pnv_ioda_deconfigure_pe(pe);
+
+	/* Release slave PEs for compound PE */
+	if (pe->flags & PNV_IODA_PE_MASTER) {
+		list_for_each_entry_safe(slave, tmp, &pe->slaves, list)
+			pnv_ioda_pe_put(slave);
+	}
+
+	/* Remove the PE from various list. We need remove slave
+	 * PE from master's list.
+	 */
+	list_del(&pe->dma_link);
+	list_del(&pe->list);
+
+	/* Free PE number */
+	clear_bit(pe->pe_number, phb->ioda.pe_alloc);
+}
+
+static struct pnv_ioda_pe *pnv_ioda_init_pe(struct pnv_phb *phb,
+					    int pe_no)
+{
+	struct pnv_ioda_pe *pe = &phb->ioda.pe_array[pe_no];
+
+	kref_init(&pe->kref);
+	pe->phb = phb;
+	pe->pe_number = pe_no;
+	INIT_LIST_HEAD(&pe->dma_link);
+	INIT_LIST_HEAD(&pe->list);
+
+	return pe;
+}
+
+static struct pnv_ioda_pe *pnv_ioda_reserve_pe(struct pnv_phb *phb,
+					       int pe_no)
+{
+	if (!(pe_no >= 0 && pe_no < phb->ioda.total_pe)) {
+		pr_warn("%s: Invalid PE %d on PHB#%x\n",
+			__func__, pe_no, phb->hose->global_number);
+		return NULL;
 	}
 
-	phb->ioda.pe_array[pe_no].phb = phb;
-	phb->ioda.pe_array[pe_no].pe_number = pe_no;
+	/*
+	 * Same PE might be reserved for multiple times, which
+	 * is out of problem actually.
+	 */
+	set_bit(pe_no, phb->ioda.pe_alloc);
+	return pnv_ioda_init_pe(phb, pe_no);
 }
 
-static int pnv_ioda_alloc_pe(struct pnv_phb *phb)
+static struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb)
 {
 	unsigned long pe_no;
 	unsigned long limit = phb->ioda.total_pe - 1;
@@ -154,20 +533,10 @@  static int pnv_ioda_alloc_pe(struct pnv_phb *phb)
 			break;
 
 		if (--limit >= phb->ioda.total_pe)
-			return IODA_INVALID_PE;
+			return NULL;
 	} while(1);
 
-	phb->ioda.pe_array[pe_no].phb = phb;
-	phb->ioda.pe_array[pe_no].pe_number = pe_no;
-	return pe_no;
-}
-
-static void pnv_ioda_free_pe(struct pnv_phb *phb, int pe)
-{
-	WARN_ON(phb->ioda.pe_array[pe].pdev);
-
-	memset(&phb->ioda.pe_array[pe], 0, sizeof(struct pnv_ioda_pe));
-	clear_bit(pe, phb->ioda.pe_alloc);
+	return pnv_ioda_init_pe(phb, pe_no);
 }
 
 static int pnv_ioda1_init_m64(struct pnv_phb *phb)
@@ -382,8 +751,9 @@  static void pnv_ioda_reserve_m64_pe(struct pnv_phb *phb,
 	}
 }
 
-static int pnv_ioda_pick_m64_pe(struct pnv_phb *phb,
-				struct pci_bus *bus, int all)
+static struct pnv_ioda_pe *pnv_ioda_pick_m64_pe(struct pnv_phb *phb,
+						struct pci_bus *bus,
+						int all)
 {
 	resource_size_t segsz = phb->ioda.m64_segsize;
 	struct pci_dev *pdev;
@@ -394,14 +764,14 @@  static int pnv_ioda_pick_m64_pe(struct pnv_phb *phb,
 	int i;
 
 	if (!pnv_ioda_need_m64_pe(phb, bus))
-		return IODA_INVALID_PE;
+		return NULL;
 
         /* Allocate bitmap */
 	size = _ALIGN_UP(phb->ioda.total_pe / 8, sizeof(unsigned long));
 	pe_bitsmap = kzalloc(size, GFP_KERNEL);
 	if (!pe_bitsmap) {
 		pr_warn("%s: Out of memory !\n", __func__);
-		return IODA_INVALID_PE;
+		return NULL;
 	}
 
 	/* The bridge's M64 window might be extended to PHB's M64
@@ -438,7 +808,7 @@  static int pnv_ioda_pick_m64_pe(struct pnv_phb *phb,
 	/* No M64 window found ? */
 	if (bitmap_empty(pe_bitsmap, phb->ioda.total_pe)) {
 		kfree(pe_bitsmap);
-		return IODA_INVALID_PE;
+		return NULL;
 	}
 
 	/* Figure out the master PE and put all slave PEs
@@ -491,7 +861,7 @@  static int pnv_ioda_pick_m64_pe(struct pnv_phb *phb,
 	}
 
 	kfree(pe_bitsmap);
-	return master_pe->pe_number;
+	return master_pe;
 }
 
 static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb)
@@ -695,7 +1065,7 @@  static int pnv_ioda_get_pe_state(struct pnv_phb *phb, int pe_no)
  * but in the meantime, we need to protect them to avoid warnings
  */
 #ifdef CONFIG_PCI_MSI
-static struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev)
+static struct pnv_ioda_pe *pnv_ioda_pci_dev_to_pe(struct pci_dev *dev)
 {
 	struct pci_controller *hose = pci_bus_to_host(dev->bus);
 	struct pnv_phb *phb = hose->private_data;
@@ -709,191 +1079,6 @@  static struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev)
 }
 #endif /* CONFIG_PCI_MSI */
 
-static int pnv_ioda_set_one_peltv(struct pnv_phb *phb,
-				  struct pnv_ioda_pe *parent,
-				  struct pnv_ioda_pe *child,
-				  bool is_add)
-{
-	const char *desc = is_add ? "adding" : "removing";
-	uint8_t op = is_add ? OPAL_ADD_PE_TO_DOMAIN :
-			      OPAL_REMOVE_PE_FROM_DOMAIN;
-	struct pnv_ioda_pe *slave;
-	long rc;
-
-	/* Parent PE affects child PE */
-	rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number,
-				child->pe_number, op);
-	if (rc != OPAL_SUCCESS) {
-		pe_warn(child, "OPAL error %ld %s to parent PELTV\n",
-			rc, desc);
-		return -ENXIO;
-	}
-
-	if (!(child->flags & PNV_IODA_PE_MASTER))
-		return 0;
-
-	/* Compound case: parent PE affects slave PEs */
-	list_for_each_entry(slave, &child->slaves, list) {
-		rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number,
-					slave->pe_number, op);
-		if (rc != OPAL_SUCCESS) {
-			pe_warn(slave, "OPAL error %ld %s to parent PELTV\n",
-				rc, desc);
-			return -ENXIO;
-		}
-	}
-
-	return 0;
-}
-
-static int pnv_ioda_set_peltv(struct pnv_phb *phb,
-			      struct pnv_ioda_pe *pe,
-			      bool is_add)
-{
-	struct pnv_ioda_pe *slave;
-	struct pci_dev *pdev = NULL;
-	int ret;
-
-	/*
-	 * Clear PE frozen state. If it's master PE, we need
-	 * clear slave PE frozen state as well.
-	 */
-	if (is_add) {
-		opal_pci_eeh_freeze_clear(phb->opal_id, pe->pe_number,
-					  OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
-		if (pe->flags & PNV_IODA_PE_MASTER) {
-			list_for_each_entry(slave, &pe->slaves, list)
-				opal_pci_eeh_freeze_clear(phb->opal_id,
-							  slave->pe_number,
-							  OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
-		}
-	}
-
-	/*
-	 * Associate PE in PELT. We need add the PE into the
-	 * corresponding PELT-V as well. Otherwise, the error
-	 * originated from the PE might contribute to other
-	 * PEs.
-	 */
-	ret = pnv_ioda_set_one_peltv(phb, pe, pe, is_add);
-	if (ret)
-		return ret;
-
-	/* For compound PEs, any one affects all of them */
-	if (pe->flags & PNV_IODA_PE_MASTER) {
-		list_for_each_entry(slave, &pe->slaves, list) {
-			ret = pnv_ioda_set_one_peltv(phb, slave, pe, is_add);
-			if (ret)
-				return ret;
-		}
-	}
-
-	if (pe->flags & (PNV_IODA_PE_BUS_ALL | PNV_IODA_PE_BUS))
-		pdev = pe->pbus->self;
-	else if (pe->flags & PNV_IODA_PE_DEV)
-		pdev = pe->pdev->bus->self;
-#ifdef CONFIG_PCI_IOV
-	else if (pe->flags & PNV_IODA_PE_VF)
-		pdev = pe->parent_dev->bus->self;
-#endif /* CONFIG_PCI_IOV */
-	while (pdev) {
-		struct pci_dn *pdn = pci_get_pdn(pdev);
-		struct pnv_ioda_pe *parent;
-
-		if (pdn && pdn->pe_number != IODA_INVALID_PE) {
-			parent = &phb->ioda.pe_array[pdn->pe_number];
-			ret = pnv_ioda_set_one_peltv(phb, parent, pe, is_add);
-			if (ret)
-				return ret;
-		}
-
-		pdev = pdev->bus->self;
-	}
-
-	return 0;
-}
-
-#ifdef CONFIG_PCI_IOV
-static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
-{
-	struct pci_dev *parent;
-	uint8_t bcomp, dcomp, fcomp;
-	int64_t rc;
-	long rid_end, rid;
-
-	/* Currently, we just deconfigure VF PE. Bus PE will always there.*/
-	if (pe->pbus) {
-		int count;
-
-		dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER;
-		fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER;
-		parent = pe->pbus->self;
-		if (pe->flags & PNV_IODA_PE_BUS_ALL)
-			count = pe->pbus->busn_res.end - pe->pbus->busn_res.start + 1;
-		else
-			count = 1;
-
-		switch(count) {
-		case  1: bcomp = OpalPciBusAll;         break;
-		case  2: bcomp = OpalPciBus7Bits;       break;
-		case  4: bcomp = OpalPciBus6Bits;       break;
-		case  8: bcomp = OpalPciBus5Bits;       break;
-		case 16: bcomp = OpalPciBus4Bits;       break;
-		case 32: bcomp = OpalPciBus3Bits;       break;
-		default:
-			dev_err(&pe->pbus->dev, "Number of subordinate buses %d unsupported\n",
-			        count);
-			/* Do an exact match only */
-			bcomp = OpalPciBusAll;
-		}
-		rid_end = pe->rid + (count << 8);
-	} else {
-		if (pe->flags & PNV_IODA_PE_VF)
-			parent = pe->parent_dev;
-		else
-			parent = pe->pdev->bus->self;
-		bcomp = OpalPciBusAll;
-		dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER;
-		fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER;
-		rid_end = pe->rid + 1;
-	}
-
-	/* Clear the reverse map */
-	for (rid = pe->rid; rid < rid_end; rid++)
-		phb->ioda.pe_rmap[rid] = IODA_INVALID_PE;
-
-	/* Release from all parents PELT-V */
-	while (parent) {
-		struct pci_dn *pdn = pci_get_pdn(parent);
-		if (pdn && pdn->pe_number != IODA_INVALID_PE) {
-			rc = opal_pci_set_peltv(phb->opal_id, pdn->pe_number,
-						pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN);
-			/* XXX What to do in case of error ? */
-		}
-		parent = parent->bus->self;
-	}
-
-	opal_pci_eeh_freeze_set(phb->opal_id, pe->pe_number,
-				  OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
-
-	/* Disassociate PE in PELT */
-	rc = opal_pci_set_peltv(phb->opal_id, pe->pe_number,
-				pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN);
-	if (rc)
-		pe_warn(pe, "OPAL error %ld remove self from PELTV\n", rc);
-	rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid,
-			     bcomp, dcomp, fcomp, OPAL_UNMAP_PE);
-	if (rc)
-		pe_err(pe, "OPAL error %ld trying to setup PELT table\n", rc);
-
-	pe->pbus = NULL;
-	pe->pdev = NULL;
-	pe->parent_dev = NULL;
-
-	return 0;
-}
-#endif /* CONFIG_PCI_IOV */
-
 static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
 {
 	struct pci_dev *parent;
@@ -953,7 +1138,7 @@  static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
 	}
 
 	/* Configure PELTV */
-	pnv_ioda_set_peltv(phb, pe, true);
+	pnv_ioda_set_peltv(pe, true);
 
 	/* Setup reverse map */
 	for (rid = pe->rid; rid < rid_end; rid++)
@@ -1207,6 +1392,8 @@  static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe)
 		if (pdn->pe_number != IODA_INVALID_PE)
 			continue;
 
+		/* Increase reference count of the parent PE */
+		pnv_ioda_pe_get(pe);
 		pdn->pe_number = pe->pe_number;
 		pe->dma_weight += pnv_ioda_dev_dma_weight(dev);
 		if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
@@ -1224,7 +1411,7 @@  static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, int all)
 {
 	struct pci_controller *hose = pci_bus_to_host(bus);
 	struct pnv_phb *phb = hose->private_data;
-	struct pnv_ioda_pe *pe;
+	struct pnv_ioda_pe *pe = NULL;
 	int pe_num = IODA_INVALID_PE;
 
 	/* For partial hotplug case, the PE instance hasn't been destroyed
@@ -1240,24 +1427,24 @@  static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, int all)
 	}
 
 	/* PE number for root bus should have been reserved */
-	if (pci_is_root_bus(bus))
-		pe_num = phb->ioda.root_pe_no;
+	if (pci_is_root_bus(bus) &&
+	    phb->ioda.root_pe_no != IODA_INVALID_PE)
+		pe = &phb->ioda.pe_array[phb->ioda.root_pe_no];
 
 	/* Check if PE is determined by M64 */
-	if (pe_num == IODA_INVALID_PE && phb->pick_m64_pe)
-		pe_num = phb->pick_m64_pe(phb, bus, all);
+	if (!pe && phb->pick_m64_pe)
+		pe = phb->pick_m64_pe(phb, bus, all);
 
 	/* The PE number isn't pinned by M64 */
-	if (pe_num == IODA_INVALID_PE)
-		pe_num = pnv_ioda_alloc_pe(phb);
+	if (!pe)
+		pe = pnv_ioda_alloc_pe(phb);
 
-	if (pe_num == IODA_INVALID_PE) {
-		pr_warning("%s: Not enough PE# available for PCI bus %04x:%02x\n",
+	if (!pe) {
+		pr_warn("%s: No enough PE# available for PCI bus %04x:%02x\n",
 			__func__, pci_domain_nr(bus), bus->number);
 		return NULL;
 	}
 
-	pe = &phb->ioda.pe_array[pe_num];
 	pe->flags |= (all ? PNV_IODA_PE_BUS_ALL : PNV_IODA_PE_BUS);
 	pe->pbus = bus;
 	pe->pdev = NULL;
@@ -1274,14 +1461,12 @@  static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, int all)
 
 	if (pnv_ioda_configure_pe(phb, pe)) {
 		/* XXX What do we do here ? */
-		if (pe_num)
-			pnv_ioda_free_pe(phb, pe_num);
-		pe->pbus = NULL;
+		pnv_ioda_pe_put(pe);
 		return NULL;
 	}
 
 	pe->tce32_table = kzalloc_node(sizeof(struct iommu_table),
-			GFP_KERNEL, hose->node);
+				       GFP_KERNEL, hose->node);
 	pe->tce32_table->data = pe;
 
 	/* Associate it with all child devices */
@@ -1521,9 +1706,9 @@  static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs)
 		list_del(&pe->list);
 		mutex_unlock(&phb->ioda.pe_list_mutex);
 
-		pnv_ioda_deconfigure_pe(phb, pe);
+		pnv_ioda_deconfigure_pe(pe);
 
-		pnv_ioda_free_pe(phb, pe->pe_number);
+		pnv_ioda_pe_put(pe);
 	}
 }
 
@@ -1601,9 +1786,7 @@  static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
 
 		if (pnv_ioda_configure_pe(phb, pe)) {
 			/* XXX What do we do here ? */
-			if (pe_num)
-				pnv_ioda_free_pe(phb, pe_num);
-			pe->pdev = NULL;
+			pnv_ioda_pe_put(pe);
 			continue;
 		}
 
@@ -2263,7 +2446,7 @@  int pnv_phb_to_cxl_mode(struct pci_dev *dev, uint64_t mode)
 	struct pnv_ioda_pe *pe;
 	int rc;
 
-	pe = pnv_ioda_get_pe(dev);
+	pe = pnv_ioda_pci_dev_to_pe(dev);
 	if (!pe)
 		return -ENODEV;
 
@@ -2379,7 +2562,7 @@  int pnv_cxl_ioda_msi_setup(struct pci_dev *dev, unsigned int hwirq,
 	struct pnv_ioda_pe *pe;
 	int rc;
 
-	if (!(pe = pnv_ioda_get_pe(dev)))
+	if (!(pe = pnv_ioda_pci_dev_to_pe(dev)))
 		return -ENODEV;
 
 	/* Assign XIVE to PE */
@@ -2401,7 +2584,7 @@  static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev,
 				  unsigned int hwirq, unsigned int virq,
 				  unsigned int is_64, struct msi_msg *msg)
 {
-	struct pnv_ioda_pe *pe = pnv_ioda_get_pe(dev);
+	struct pnv_ioda_pe *pe = pnv_ioda_pci_dev_to_pe(dev);
 	unsigned int xive_num = hwirq - phb->msi_base;
 	__be32 data;
 	int rc;
@@ -3065,6 +3248,7 @@  static void __init pnv_pci_init_ioda_phb(struct device_node *np,
 	pnv_pci_controller_ops.setup_bridge = pnv_pci_setup_bridge;
 	pnv_pci_controller_ops.window_alignment = pnv_pci_window_alignment;
 	pnv_pci_controller_ops.reset_secondary_bus = pnv_pci_reset_secondary_bus;
+	pnv_pci_controller_ops.release_device = pnv_pci_release_device;
 	hose->controller_ops = pnv_pci_controller_ops;
 
 #ifdef CONFIG_PCI_IOV
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index 1bea3a8..8b10f01 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -28,6 +28,7 @@  enum pnv_phb_model {
 /* Data associated with a PE, including IOMMU tracking etc.. */
 struct pnv_phb;
 struct pnv_ioda_pe {
+	struct kref		kref;
 	unsigned long		flags;
 	struct pnv_phb		*phb;
 
@@ -120,7 +121,8 @@  struct pnv_phb {
 	void (*shutdown)(struct pnv_phb *phb);
 	int (*init_m64)(struct pnv_phb *phb);
 	void (*reserve_m64_pe)(struct pnv_phb *phb, struct pci_bus *bus);
-	int (*pick_m64_pe)(struct pnv_phb *phb, struct pci_bus *bus, int all);
+	struct pnv_ioda_pe *(*pick_m64_pe)(struct pnv_phb *phb,
+					   struct pci_bus *bus, int all);
 	int (*get_pe_state)(struct pnv_phb *phb, int pe_no);
 	void (*freeze_pe)(struct pnv_phb *phb, int pe_no);
 	int (*unfreeze_pe)(struct pnv_phb *phb, int pe_no, int opt);