Message ID | 1426820797-6267-18-git-send-email-weiyang@linux.vnet.ibm.com (mailing list archive) |
---|---|
State | New, archived |
Delegated to: | Bjorn Helgaas |
Headers | show |
On Fri, 2015-03-20 at 11:06 +0800, Wei Yang wrote: > On PowerNV platform, resource position in M64 BAR implies the PE# the > resource belongs to. In some cases, adjustment of a resource is necessary > to locate it to a correct position in M64 BAR . > > This patch adds pnv_pci_vf_resource_shift() to shift the 'real' PF IOV BAR > address according to an offset. > > Note: > > After doing so, there would be a "hole" in the /proc/iomem when offset > is a positive value. It looks like the device return some mmio back to > the system, which actually no one could use it. > > [bhelgaas: rework loops, rework overlap check, index resource[] > conventionally, remove pci_regs.h include, squashed with next patch] > Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com> > --- > arch/powerpc/include/asm/pci-bridge.h | 4 + > arch/powerpc/kernel/pci_dn.c | 13 + > arch/powerpc/platforms/powernv/pci-ioda.c | 524 ++++++++++++++++++++++++++++- > arch/powerpc/platforms/powernv/pci.c | 18 + > arch/powerpc/platforms/powernv/pci.h | 7 + > 5 files changed, 549 insertions(+), 17 deletions(-) > > diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h > index d0d1718..3c95097 100644 > --- a/arch/powerpc/include/asm/pci-bridge.h > +++ b/arch/powerpc/include/asm/pci-bridge.h > @@ -177,6 +177,10 @@ struct pci_dn { > int pe_number; > #ifdef CONFIG_PCI_IOV > u16 vfs_expanded; /* number of VFs IOV BAR expanded */ > + u16 num_vfs; /* number of VFs enabled*/ > + int offset; /* PE# for the first VF PE */ > +#define IODA_INVALID_M64 (-1) > + int m64_wins[PCI_SRIOV_NUM_BARS]; > #endif /* CONFIG_PCI_IOV */ > #endif > struct list_head child_list; > diff --git a/arch/powerpc/kernel/pci_dn.c b/arch/powerpc/kernel/pci_dn.c > index f3a1a81..93ed7b3 100644 > --- a/arch/powerpc/kernel/pci_dn.c > +++ b/arch/powerpc/kernel/pci_dn.c > @@ -217,6 +217,19 @@ void remove_dev_pci_info(struct pci_dev *pdev) > struct pci_dn *pdn, *tmp; > int i; > > + /* > + * VF and VF PE are 
created/released dynamically, so we need to > + * bind/unbind them. Otherwise the VF and VF PE would be mismatched > + * when re-enabling SR-IOV. > + */ > + if (pdev->is_virtfn) { > + pdn = pci_get_pdn(pdev); > +#ifdef CONFIG_PPC_POWERNV > + pdn->pe_number = IODA_INVALID_PE; > +#endif > + return; > + } > + > /* Only support IOV PF for now */ > if (!pdev->is_physfn) > return; > diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c > index 93ec16c..bd1b678 100644 > --- a/arch/powerpc/platforms/powernv/pci-ioda.c > +++ b/arch/powerpc/platforms/powernv/pci-ioda.c > @@ -44,6 +44,9 @@ > #include "powernv.h" > #include "pci.h" > > +/* 256M DMA window, 4K TCE pages, 8 bytes TCE */ > +#define TCE32_TABLE_SIZE ((0x10000000 / 0x1000) * 8) > + > static void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level, > const char *fmt, ...) > { > @@ -56,11 +59,18 @@ static void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level, > vaf.fmt = fmt; > vaf.va = &args; > > - if (pe->pdev) > + if (pe->flags & PNV_IODA_PE_DEV) > strlcpy(pfix, dev_name(&pe->pdev->dev), sizeof(pfix)); > - else > + else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) > sprintf(pfix, "%04x:%02x ", > pci_domain_nr(pe->pbus), pe->pbus->number); > +#ifdef CONFIG_PCI_IOV > + else if (pe->flags & PNV_IODA_PE_VF) > + sprintf(pfix, "%04x:%02x:%2x.%d", > + pci_domain_nr(pe->parent_dev->bus), > + (pe->rid & 0xff00) >> 8, > + PCI_SLOT(pe->rid), PCI_FUNC(pe->rid)); > +#endif /* CONFIG_PCI_IOV*/ > > printk("%spci %s: [PE# %.3d] %pV", > level, pfix, pe->pe_number, &vaf); > @@ -591,7 +601,7 @@ static int pnv_ioda_set_peltv(struct pnv_phb *phb, > bool is_add) > { > struct pnv_ioda_pe *slave; > - struct pci_dev *pdev; > + struct pci_dev *pdev = NULL; > int ret; > > /* > @@ -630,8 +640,12 @@ static int pnv_ioda_set_peltv(struct pnv_phb *phb, > > if (pe->flags & (PNV_IODA_PE_BUS_ALL | PNV_IODA_PE_BUS)) > pdev = pe->pbus->self; > - else > + else if 
(pe->flags & PNV_IODA_PE_DEV) > pdev = pe->pdev->bus->self; > +#ifdef CONFIG_PCI_IOV > + else if (pe->flags & PNV_IODA_PE_VF) > + pdev = pe->parent_dev->bus->self; > +#endif /* CONFIG_PCI_IOV */ > while (pdev) { > struct pci_dn *pdn = pci_get_pdn(pdev); > struct pnv_ioda_pe *parent; > @@ -649,6 +663,87 @@ static int pnv_ioda_set_peltv(struct pnv_phb *phb, > return 0; > } > > +#ifdef CONFIG_PCI_IOV > +static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) > +{ > + struct pci_dev *parent; > + uint8_t bcomp, dcomp, fcomp; > + int64_t rc; > + long rid_end, rid; > + > + /* Currently, we just deconfigure VF PE. Bus PE will always there.*/ > + if (pe->pbus) { > + int count; > + > + dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER; > + fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER; > + parent = pe->pbus->self; > + if (pe->flags & PNV_IODA_PE_BUS_ALL) > + count = pe->pbus->busn_res.end - pe->pbus->busn_res.start + 1; > + else > + count = 1; > + > + switch(count) { > + case 1: bcomp = OpalPciBusAll; break; > + case 2: bcomp = OpalPciBus7Bits; break; > + case 4: bcomp = OpalPciBus6Bits; break; > + case 8: bcomp = OpalPciBus5Bits; break; > + case 16: bcomp = OpalPciBus4Bits; break; > + case 32: bcomp = OpalPciBus3Bits; break; > + default: > + dev_err(&pe->pbus->dev, "Number of subordinate buses %d unsupported\n", > + count); > + /* Do an exact match only */ > + bcomp = OpalPciBusAll; > + } > + rid_end = pe->rid + (count << 8); > + } else { > + if (pe->flags & PNV_IODA_PE_VF) > + parent = pe->parent_dev; > + else > + parent = pe->pdev->bus->self; > + bcomp = OpalPciBusAll; > + dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER; > + fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER; > + rid_end = pe->rid + 1; > + } > + > + /* Clear the reverse map */ > + for (rid = pe->rid; rid < rid_end; rid++) > + phb->ioda.pe_rmap[rid] = 0; > + > + /* Release from all parents PELT-V */ > + while (parent) { > + struct pci_dn *pdn = pci_get_pdn(parent); > + if (pdn && pdn->pe_number != IODA_INVALID_PE) { > 
+ rc = opal_pci_set_peltv(phb->opal_id, pdn->pe_number, > + pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN); > + /* XXX What to do in case of error ? */ > + } > + parent = parent->bus->self; > + } > + > + opal_pci_eeh_freeze_set(phb->opal_id, pe->pe_number, > + OPAL_EEH_ACTION_CLEAR_FREEZE_ALL); > + > + /* Disassociate PE in PELT */ > + rc = opal_pci_set_peltv(phb->opal_id, pe->pe_number, > + pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN); > + if (rc) > + pe_warn(pe, "OPAL error %ld remove self from PELTV\n", rc); > + rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid, > + bcomp, dcomp, fcomp, OPAL_UNMAP_PE); > + if (rc) > + pe_err(pe, "OPAL error %ld trying to setup PELT table\n", rc); > + > + pe->pbus = NULL; > + pe->pdev = NULL; > + pe->parent_dev = NULL; > + > + return 0; > +} > +#endif /* CONFIG_PCI_IOV */ > + > static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) > { > struct pci_dev *parent; > @@ -675,15 +770,19 @@ static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) > case 16: bcomp = OpalPciBus4Bits; break; > case 32: bcomp = OpalPciBus3Bits; break; > default: > - pr_err("%s: Number of subordinate busses %d" > - " unsupported\n", > - pci_name(pe->pbus->self), count); > + dev_err(&pe->pbus->dev, "Number of subordinate buses %d unsupported\n", > + count); > /* Do an exact match only */ > bcomp = OpalPciBusAll; > } > rid_end = pe->rid + (count << 8); > } else { > - parent = pe->pdev->bus->self; > +#ifdef CONFIG_PCI_IOV > + if (pe->flags & PNV_IODA_PE_VF) > + parent = pe->parent_dev; > + else > +#endif /* CONFIG_PCI_IOV */ > + parent = pe->pdev->bus->self; > bcomp = OpalPciBusAll; > dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER; > fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER; > @@ -774,6 +873,78 @@ static unsigned int pnv_ioda_dma_weight(struct pci_dev *dev) > return 10; > } > > +#ifdef CONFIG_PCI_IOV > +static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset) > +{ > + struct pci_dn *pdn = pci_get_pdn(dev); > + 
int i; > + struct resource *res, res2; > + resource_size_t size; > + u16 num_vfs; > + > + if (!dev->is_physfn) > + return -EINVAL; > + > + /* > + * "offset" is in VFs. The M64 windows are sized so that when they > + * are segmented, each segment is the same size as the IOV BAR. > + * Each segment is in a separate PE, and the high order bits of the > + * address are the PE number. Therefore, each VF's BAR is in a > + * separate PE, and changing the IOV BAR start address changes the > + * range of PEs the VFs are in. > + */ > + num_vfs = pdn->num_vfs; > + for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { > + res = &dev->resource[i + PCI_IOV_RESOURCES]; > + if (!res->flags || !res->parent) > + continue; > + > + if (!pnv_pci_is_mem_pref_64(res->flags)) > + continue; > + > + /* > + * The actual IOV BAR range is determined by the start address > + * and the actual size for num_vfs VFs BAR. This check is to > + * make sure that after shifting, the range will not overlap > + * with another device. > + */ > + size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES); > + res2.flags = res->flags; > + res2.start = res->start + (size * offset); > + res2.end = res2.start + (size * num_vfs) - 1; > + > + if (res2.end > res->end) { > + dev_err(&dev->dev, "VF BAR%d: %pR would extend past %pR (trying to enable %d VFs shifted by %d)\n", > + i, &res2, res, num_vfs, offset); > + return -EBUSY; > + } > + } > + > + /* > + * After doing so, there would be a "hole" in the /proc/iomem when > + * offset is a positive value. It looks like the device return some > + * mmio back to the system, which actually no one could use it. 
> + */ > + for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { > + res = &dev->resource[i + PCI_IOV_RESOURCES]; > + if (!res->flags || !res->parent) > + continue; > + > + if (!pnv_pci_is_mem_pref_64(res->flags)) > + continue; > + > + size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES); > + res2 = *res; > + res->start += size * offset; > + > + dev_info(&dev->dev, "VF BAR%d: %pR shifted to %pR (enabling %d VFs shifted by %d)\n", > + i, &res2, res, num_vfs, offset); > + pci_update_resource(dev, i + PCI_IOV_RESOURCES); > + } > + return 0; > +} > +#endif /* CONFIG_PCI_IOV */ > + > #if 0 > static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev) > { > @@ -979,8 +1150,312 @@ static void pnv_pci_ioda_setup_PEs(void) > } > > #ifdef CONFIG_PCI_IOV > +static int pnv_pci_vf_release_m64(struct pci_dev *pdev) > +{ > + struct pci_bus *bus; > + struct pci_controller *hose; > + struct pnv_phb *phb; > + struct pci_dn *pdn; > + int i; > + > + bus = pdev->bus; > + hose = pci_bus_to_host(bus); > + phb = hose->private_data; > + pdn = pci_get_pdn(pdev); > + > + for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { > + if (pdn->m64_wins[i] == IODA_INVALID_M64) > + continue; > + opal_pci_phb_mmio_enable(phb->opal_id, > + OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i], 0); > + clear_bit(pdn->m64_wins[i], &phb->ioda.m64_bar_alloc); > + pdn->m64_wins[i] = IODA_INVALID_M64; > + } > + > + return 0; > +} > + > +static int pnv_pci_vf_assign_m64(struct pci_dev *pdev) > +{ > + struct pci_bus *bus; > + struct pci_controller *hose; > + struct pnv_phb *phb; > + struct pci_dn *pdn; > + unsigned int win; > + struct resource *res; > + int i; > + int64_t rc; > + > + bus = pdev->bus; > + hose = pci_bus_to_host(bus); > + phb = hose->private_data; > + pdn = pci_get_pdn(pdev); > + > + /* Initialize the m64_wins to IODA_INVALID_M64 */ > + for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) > + pdn->m64_wins[i] = IODA_INVALID_M64; > + > + for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { > + res = &pdev->resource[i + PCI_IOV_RESOURCES]; > 
+ if (!res->flags || !res->parent) > + continue; > + > + if (!pnv_pci_is_mem_pref_64(res->flags)) > + continue; > + > + do { > + win = find_next_zero_bit(&phb->ioda.m64_bar_alloc, > + phb->ioda.m64_bar_idx + 1, 0); > + > + if (win >= phb->ioda.m64_bar_idx + 1) > + goto m64_failed; > + } while (test_and_set_bit(win, &phb->ioda.m64_bar_alloc)); > + > + pdn->m64_wins[i] = win; > + > + /* Map the M64 here */ > + rc = opal_pci_set_phb_mem_window(phb->opal_id, > + OPAL_M64_WINDOW_TYPE, > + pdn->m64_wins[i], > + res->start, > + 0, /* unused */ > + resource_size(res)); > + if (rc != OPAL_SUCCESS) { > + dev_err(&pdev->dev, "Failed to map M64 window #%d: %lld\n", > + win, rc); > + goto m64_failed; > + } > + > + rc = opal_pci_phb_mmio_enable(phb->opal_id, > + OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i], 1); > + if (rc != OPAL_SUCCESS) { > + dev_err(&pdev->dev, "Failed to enable M64 window #%d: %llx\n", > + win, rc); > + goto m64_failed; > + } > + } > + return 0; > + > +m64_failed: > + pnv_pci_vf_release_m64(pdev); > + return -EBUSY; > +} > + > +static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe *pe) > +{ > + struct pci_bus *bus; > + struct pci_controller *hose; > + struct pnv_phb *phb; > + struct iommu_table *tbl; > + unsigned long addr; > + int64_t rc; > + > + bus = dev->bus; > + hose = pci_bus_to_host(bus); > + phb = hose->private_data; > + tbl = pe->tce32_table; > + addr = tbl->it_base; > + > + opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number, > + pe->pe_number << 1, 1, __pa(addr), > + 0, 0x1000); > + > + rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id, > + pe->pe_number, > + (pe->pe_number << 1) + 1, > + pe->tce_bypass_base, > + 0); > + if (rc) > + pe_warn(pe, "OPAL error %ld release DMA window\n", rc); > + > + iommu_free_table(tbl, of_node_full_name(dev->dev.of_node)); > + free_pages(addr, get_order(TCE32_TABLE_SIZE)); > + pe->tce32_table = NULL; > +} > + > +static void pnv_ioda_release_vf_PE(struct pci_dev *pdev) > +{ > + struct 
pci_bus *bus; > + struct pci_controller *hose; > + struct pnv_phb *phb; > + struct pnv_ioda_pe *pe, *pe_n; > + struct pci_dn *pdn; > + > + bus = pdev->bus; > + hose = pci_bus_to_host(bus); > + phb = hose->private_data; > + > + if (!pdev->is_physfn) > + return; > + > + pdn = pci_get_pdn(pdev); > + list_for_each_entry_safe(pe, pe_n, &phb->ioda.pe_list, list) { > + if (pe->parent_dev != pdev) > + continue; > + > + pnv_pci_ioda2_release_dma_pe(pdev, pe); > + > + /* Remove from list */ > + mutex_lock(&phb->ioda.pe_list_mutex); > + list_del(&pe->list); > + mutex_unlock(&phb->ioda.pe_list_mutex); > + > + pnv_ioda_deconfigure_pe(phb, pe); > + > + pnv_ioda_free_pe(phb, pe->pe_number); > + } > +} > + > +void pnv_pci_sriov_disable(struct pci_dev *pdev) > +{ > + struct pci_bus *bus; > + struct pci_controller *hose; > + struct pnv_phb *phb; > + struct pci_dn *pdn; > + struct pci_sriov *iov; > + u16 num_vfs; > + > + bus = pdev->bus; > + hose = pci_bus_to_host(bus); > + phb = hose->private_data; > + pdn = pci_get_pdn(pdev); > + iov = pdev->sriov; > + num_vfs = pdn->num_vfs; > + > + /* Release VF PEs */ > + pnv_ioda_release_vf_PE(pdev); > + > + if (phb->type == PNV_PHB_IODA2) { > + pnv_pci_vf_resource_shift(pdev, -pdn->offset); > + > + /* Release M64 windows */ > + pnv_pci_vf_release_m64(pdev); > + > + /* Release PE numbers */ > + bitmap_clear(phb->ioda.pe_alloc, pdn->offset, num_vfs); > + pdn->offset = 0; > + } > +} > + > +static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, > + struct pnv_ioda_pe *pe); > +static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs) > +{ > + struct pci_bus *bus; > + struct pci_controller *hose; > + struct pnv_phb *phb; > + struct pnv_ioda_pe *pe; > + int pe_num; > + u16 vf_index; > + struct pci_dn *pdn; > + > + bus = pdev->bus; > + hose = pci_bus_to_host(bus); > + phb = hose->private_data; > + pdn = pci_get_pdn(pdev); > + > + if (!pdev->is_physfn) > + return; > + > + /* Reserve PE for each VF */ > + for (vf_index = 0; vf_index < 
num_vfs; vf_index++) { > + pe_num = pdn->offset + vf_index; > + > + pe = &phb->ioda.pe_array[pe_num]; > + pe->pe_number = pe_num; > + pe->phb = phb; > + pe->flags = PNV_IODA_PE_VF; > + pe->pbus = NULL; > + pe->parent_dev = pdev; > + pe->tce32_seg = -1; > + pe->mve_number = -1; > + pe->rid = (pci_iov_virtfn_bus(pdev, vf_index) << 8) | > + pci_iov_virtfn_devfn(pdev, vf_index); > + > + pe_info(pe, "VF %04d:%02d:%02d.%d associated with PE#%d\n", > + hose->global_number, pdev->bus->number, > + PCI_SLOT(pci_iov_virtfn_devfn(pdev, vf_index)), > + PCI_FUNC(pci_iov_virtfn_devfn(pdev, vf_index)), pe_num); > + > + if (pnv_ioda_configure_pe(phb, pe)) { > + /* XXX What do we do here ? */ > + if (pe_num) > + pnv_ioda_free_pe(phb, pe_num); > + pe->pdev = NULL; > + continue; > + } > + > + pe->tce32_table = kzalloc_node(sizeof(struct iommu_table), > + GFP_KERNEL, hose->node); > + pe->tce32_table->data = pe; > + > + /* Put PE to the list */ > + mutex_lock(&phb->ioda.pe_list_mutex); > + list_add_tail(&pe->list, &phb->ioda.pe_list); > + mutex_unlock(&phb->ioda.pe_list_mutex); > + > + pnv_pci_ioda2_setup_dma_pe(phb, pe); > + } > +} > + > +int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs) > +{ > + struct pci_bus *bus; > + struct pci_controller *hose; > + struct pnv_phb *phb; > + struct pci_dn *pdn; > + int ret; > + > + bus = pdev->bus; > + hose = pci_bus_to_host(bus); > + phb = hose->private_data; > + pdn = pci_get_pdn(pdev); > + > + if (phb->type == PNV_PHB_IODA2) { > + /* Calculate available PE for required VFs */ > + mutex_lock(&phb->ioda.pe_alloc_mutex); > + pdn->offset = bitmap_find_next_zero_area( > + phb->ioda.pe_alloc, phb->ioda.total_pe, > + 0, num_vfs, 0); > + if (pdn->offset >= phb->ioda.total_pe) { > + mutex_unlock(&phb->ioda.pe_alloc_mutex); > + dev_info(&pdev->dev, "Failed to enable VF%d\n", num_vfs); > + pdn->offset = 0; > + return -EBUSY; > + } > + bitmap_set(phb->ioda.pe_alloc, pdn->offset, num_vfs); > + pdn->num_vfs = num_vfs; > + 
mutex_unlock(&phb->ioda.pe_alloc_mutex); > + > + /* Assign M64 window accordingly */ > + ret = pnv_pci_vf_assign_m64(pdev); > + if (ret) { > + dev_info(&pdev->dev, "Not enough M64 window resources\n"); > + goto m64_failed; > + } > + > + /* Do some magic shift */ > + ret = pnv_pci_vf_resource_shift(pdev, pdn->offset); Given that you're already doing a version 15, would it be possible to include a more informative comment than "Do some magic shift"? Perhaps some of the information from your commit message? Regards, Daniel > + if (ret) > + goto m64_failed; > + } > + > + /* Setup VF PEs */ > + pnv_ioda_setup_vf_PE(pdev, num_vfs); > + > + return 0; > + > +m64_failed: > + bitmap_clear(phb->ioda.pe_alloc, pdn->offset, num_vfs); > + pdn->offset = 0; > + > + return ret; > +} > + > int pcibios_sriov_disable(struct pci_dev *pdev) > { > + pnv_pci_sriov_disable(pdev); > + > /* Release firmware data */ > remove_dev_pci_info(pdev); > return 0; > @@ -990,6 +1465,8 @@ int pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs) > { > /* Allocate firmware data */ > add_dev_pci_info(pdev); > + > + pnv_pci_sriov_enable(pdev, num_vfs); > return 0; > } > #endif /* CONFIG_PCI_IOV */ > @@ -1186,9 +1663,6 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, > int64_t rc; > void *addr; > > - /* 256M DMA window, 4K TCE pages, 8 bytes TCE */ > -#define TCE32_TABLE_SIZE ((0x10000000 / 0x1000) * 8) > - > /* XXX FIXME: Handle 64-bit only DMA devices */ > /* XXX FIXME: Provide 64-bit DMA facilities & non-4K TCE tables etc.. 
*/ > /* XXX FIXME: Allocate multi-level tables on PHB3 */ > @@ -1251,12 +1725,19 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, > TCE_PCI_SWINV_PAIR); > } > iommu_init_table(tbl, phb->hose->node); > - iommu_register_group(tbl, phb->hose->global_number, pe->pe_number); > > - if (pe->pdev) > + if (pe->flags & PNV_IODA_PE_DEV) { > + iommu_register_group(tbl, phb->hose->global_number, > + pe->pe_number); > set_iommu_table_base_and_group(&pe->pdev->dev, tbl); > - else > + } else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) { > + iommu_register_group(tbl, phb->hose->global_number, > + pe->pe_number); > pnv_ioda_setup_bus_dma(pe, pe->pbus, true); > + } else if (pe->flags & PNV_IODA_PE_VF) { > + iommu_register_group(tbl, phb->hose->global_number, > + pe->pe_number); > + } > > return; > fail: > @@ -1383,12 +1864,19 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, > tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE); > } > iommu_init_table(tbl, phb->hose->node); > - iommu_register_group(tbl, phb->hose->global_number, pe->pe_number); > > - if (pe->pdev) > + if (pe->flags & PNV_IODA_PE_DEV) { > + iommu_register_group(tbl, phb->hose->global_number, > + pe->pe_number); > set_iommu_table_base_and_group(&pe->pdev->dev, tbl); > - else > + } else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) { > + iommu_register_group(tbl, phb->hose->global_number, > + pe->pe_number); > pnv_ioda_setup_bus_dma(pe, pe->pbus, true); > + } else if (pe->flags & PNV_IODA_PE_VF) { > + iommu_register_group(tbl, phb->hose->global_number, > + pe->pe_number); > + } > > /* Also create a bypass window */ > if (!pnv_iommu_bypass_disabled) > @@ -2068,6 +2556,7 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, > phb->hub_id = hub_id; > phb->opal_id = phb_id; > phb->type = ioda_type; > + mutex_init(&phb->ioda.pe_alloc_mutex); > > /* Detect specific models for error handling */ > if (of_device_is_compatible(np, "ibm,p7ioc-pciex")) > @@ 
-2127,6 +2616,7 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, > > INIT_LIST_HEAD(&phb->ioda.pe_dma_list); > INIT_LIST_HEAD(&phb->ioda.pe_list); > + mutex_init(&phb->ioda.pe_list_mutex); > > /* Calculate how many 32-bit TCE segments we have */ > phb->ioda.tce32_count = phb->ioda.m32_pci_base >> 28; > diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c > index 6c20d6e..a88f915 100644 > --- a/arch/powerpc/platforms/powernv/pci.c > +++ b/arch/powerpc/platforms/powernv/pci.c > @@ -714,6 +714,24 @@ static void pnv_pci_dma_dev_setup(struct pci_dev *pdev) > { > struct pci_controller *hose = pci_bus_to_host(pdev->bus); > struct pnv_phb *phb = hose->private_data; > +#ifdef CONFIG_PCI_IOV > + struct pnv_ioda_pe *pe; > + struct pci_dn *pdn; > + > + /* Fix the VF pdn PE number */ > + if (pdev->is_virtfn) { > + pdn = pci_get_pdn(pdev); > + WARN_ON(pdn->pe_number != IODA_INVALID_PE); > + list_for_each_entry(pe, &phb->ioda.pe_list, list) { > + if (pe->rid == ((pdev->bus->number << 8) | > + (pdev->devfn & 0xff))) { > + pdn->pe_number = pe->pe_number; > + pe->pdev = pdev; > + break; > + } > + } > + } > +#endif /* CONFIG_PCI_IOV */ > > /* If we have no phb structure, try to setup a fallback based on > * the device-tree (RTAS PCI for example) > diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h > index 7317777..39d42f2 100644 > --- a/arch/powerpc/platforms/powernv/pci.h > +++ b/arch/powerpc/platforms/powernv/pci.h > @@ -23,6 +23,7 @@ enum pnv_phb_model { > #define PNV_IODA_PE_BUS_ALL (1 << 2) /* PE has subordinate buses */ > #define PNV_IODA_PE_MASTER (1 << 3) /* Master PE in compound case */ > #define PNV_IODA_PE_SLAVE (1 << 4) /* Slave PE in compound case */ > +#define PNV_IODA_PE_VF (1 << 5) /* PE for one VF */ > > /* Data associated with a PE, including IOMMU tracking etc.. */ > struct pnv_phb; > @@ -34,6 +35,9 @@ struct pnv_ioda_pe { > * entire bus (& children). 
In the former case, pdev > * is populated, in the later case, pbus is. > */ > +#ifdef CONFIG_PCI_IOV > + struct pci_dev *parent_dev; > +#endif > struct pci_dev *pdev; > struct pci_bus *pbus; > > @@ -165,6 +169,8 @@ struct pnv_phb { > > /* PE allocation bitmap */ > unsigned long *pe_alloc; > + /* PE allocation mutex */ > + struct mutex pe_alloc_mutex; > > /* M32 & IO segment maps */ > unsigned int *m32_segmap; > @@ -179,6 +185,7 @@ struct pnv_phb { > * on the sequence of creation > */ > struct list_head pe_list; > + struct mutex pe_list_mutex; > > /* Reverse map of PEs, will have to extend if > * we are to support more than 256 PEs, indexed
On Tue, Mar 24, 2015 at 10:22:26AM +1100, Daniel Axtens wrote: >> + >> + /* Do some magic shift */ >> + ret = pnv_pci_vf_resource_shift(pdev, pdn->offset); > > >Given that you're already doing a version 15, would it be possible to >include a more informative comment than "Do some magic shift"? Perhaps >some of the information from your commit message? > >Regards, >Daniel > Daniel, Thanks for your comment. I will add a more informative comment at this place.
diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h index d0d1718..3c95097 100644 --- a/arch/powerpc/include/asm/pci-bridge.h +++ b/arch/powerpc/include/asm/pci-bridge.h @@ -177,6 +177,10 @@ struct pci_dn { int pe_number; #ifdef CONFIG_PCI_IOV u16 vfs_expanded; /* number of VFs IOV BAR expanded */ + u16 num_vfs; /* number of VFs enabled*/ + int offset; /* PE# for the first VF PE */ +#define IODA_INVALID_M64 (-1) + int m64_wins[PCI_SRIOV_NUM_BARS]; #endif /* CONFIG_PCI_IOV */ #endif struct list_head child_list; diff --git a/arch/powerpc/kernel/pci_dn.c b/arch/powerpc/kernel/pci_dn.c index f3a1a81..93ed7b3 100644 --- a/arch/powerpc/kernel/pci_dn.c +++ b/arch/powerpc/kernel/pci_dn.c @@ -217,6 +217,19 @@ void remove_dev_pci_info(struct pci_dev *pdev) struct pci_dn *pdn, *tmp; int i; + /* + * VF and VF PE are created/released dynamically, so we need to + * bind/unbind them. Otherwise the VF and VF PE would be mismatched + * when re-enabling SR-IOV. + */ + if (pdev->is_virtfn) { + pdn = pci_get_pdn(pdev); +#ifdef CONFIG_PPC_POWERNV + pdn->pe_number = IODA_INVALID_PE; +#endif + return; + } + /* Only support IOV PF for now */ if (!pdev->is_physfn) return; diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 93ec16c..bd1b678 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -44,6 +44,9 @@ #include "powernv.h" #include "pci.h" +/* 256M DMA window, 4K TCE pages, 8 bytes TCE */ +#define TCE32_TABLE_SIZE ((0x10000000 / 0x1000) * 8) + static void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level, const char *fmt, ...) 
{ @@ -56,11 +59,18 @@ static void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level, vaf.fmt = fmt; vaf.va = &args; - if (pe->pdev) + if (pe->flags & PNV_IODA_PE_DEV) strlcpy(pfix, dev_name(&pe->pdev->dev), sizeof(pfix)); - else + else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) sprintf(pfix, "%04x:%02x ", pci_domain_nr(pe->pbus), pe->pbus->number); +#ifdef CONFIG_PCI_IOV + else if (pe->flags & PNV_IODA_PE_VF) + sprintf(pfix, "%04x:%02x:%2x.%d", + pci_domain_nr(pe->parent_dev->bus), + (pe->rid & 0xff00) >> 8, + PCI_SLOT(pe->rid), PCI_FUNC(pe->rid)); +#endif /* CONFIG_PCI_IOV*/ printk("%spci %s: [PE# %.3d] %pV", level, pfix, pe->pe_number, &vaf); @@ -591,7 +601,7 @@ static int pnv_ioda_set_peltv(struct pnv_phb *phb, bool is_add) { struct pnv_ioda_pe *slave; - struct pci_dev *pdev; + struct pci_dev *pdev = NULL; int ret; /* @@ -630,8 +640,12 @@ static int pnv_ioda_set_peltv(struct pnv_phb *phb, if (pe->flags & (PNV_IODA_PE_BUS_ALL | PNV_IODA_PE_BUS)) pdev = pe->pbus->self; - else + else if (pe->flags & PNV_IODA_PE_DEV) pdev = pe->pdev->bus->self; +#ifdef CONFIG_PCI_IOV + else if (pe->flags & PNV_IODA_PE_VF) + pdev = pe->parent_dev->bus->self; +#endif /* CONFIG_PCI_IOV */ while (pdev) { struct pci_dn *pdn = pci_get_pdn(pdev); struct pnv_ioda_pe *parent; @@ -649,6 +663,87 @@ static int pnv_ioda_set_peltv(struct pnv_phb *phb, return 0; } +#ifdef CONFIG_PCI_IOV +static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) +{ + struct pci_dev *parent; + uint8_t bcomp, dcomp, fcomp; + int64_t rc; + long rid_end, rid; + + /* Currently, we just deconfigure VF PE. 
Bus PE will always there.*/ + if (pe->pbus) { + int count; + + dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER; + fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER; + parent = pe->pbus->self; + if (pe->flags & PNV_IODA_PE_BUS_ALL) + count = pe->pbus->busn_res.end - pe->pbus->busn_res.start + 1; + else + count = 1; + + switch(count) { + case 1: bcomp = OpalPciBusAll; break; + case 2: bcomp = OpalPciBus7Bits; break; + case 4: bcomp = OpalPciBus6Bits; break; + case 8: bcomp = OpalPciBus5Bits; break; + case 16: bcomp = OpalPciBus4Bits; break; + case 32: bcomp = OpalPciBus3Bits; break; + default: + dev_err(&pe->pbus->dev, "Number of subordinate buses %d unsupported\n", + count); + /* Do an exact match only */ + bcomp = OpalPciBusAll; + } + rid_end = pe->rid + (count << 8); + } else { + if (pe->flags & PNV_IODA_PE_VF) + parent = pe->parent_dev; + else + parent = pe->pdev->bus->self; + bcomp = OpalPciBusAll; + dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER; + fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER; + rid_end = pe->rid + 1; + } + + /* Clear the reverse map */ + for (rid = pe->rid; rid < rid_end; rid++) + phb->ioda.pe_rmap[rid] = 0; + + /* Release from all parents PELT-V */ + while (parent) { + struct pci_dn *pdn = pci_get_pdn(parent); + if (pdn && pdn->pe_number != IODA_INVALID_PE) { + rc = opal_pci_set_peltv(phb->opal_id, pdn->pe_number, + pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN); + /* XXX What to do in case of error ? 
*/ + } + parent = parent->bus->self; + } + + opal_pci_eeh_freeze_set(phb->opal_id, pe->pe_number, + OPAL_EEH_ACTION_CLEAR_FREEZE_ALL); + + /* Disassociate PE in PELT */ + rc = opal_pci_set_peltv(phb->opal_id, pe->pe_number, + pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN); + if (rc) + pe_warn(pe, "OPAL error %ld remove self from PELTV\n", rc); + rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid, + bcomp, dcomp, fcomp, OPAL_UNMAP_PE); + if (rc) + pe_err(pe, "OPAL error %ld trying to setup PELT table\n", rc); + + pe->pbus = NULL; + pe->pdev = NULL; + pe->parent_dev = NULL; + + return 0; +} +#endif /* CONFIG_PCI_IOV */ + static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) { struct pci_dev *parent; @@ -675,15 +770,19 @@ static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) case 16: bcomp = OpalPciBus4Bits; break; case 32: bcomp = OpalPciBus3Bits; break; default: - pr_err("%s: Number of subordinate busses %d" - " unsupported\n", - pci_name(pe->pbus->self), count); + dev_err(&pe->pbus->dev, "Number of subordinate buses %d unsupported\n", + count); /* Do an exact match only */ bcomp = OpalPciBusAll; } rid_end = pe->rid + (count << 8); } else { - parent = pe->pdev->bus->self; +#ifdef CONFIG_PCI_IOV + if (pe->flags & PNV_IODA_PE_VF) + parent = pe->parent_dev; + else +#endif /* CONFIG_PCI_IOV */ + parent = pe->pdev->bus->self; bcomp = OpalPciBusAll; dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER; fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER; @@ -774,6 +873,78 @@ static unsigned int pnv_ioda_dma_weight(struct pci_dev *dev) return 10; } +#ifdef CONFIG_PCI_IOV +static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset) +{ + struct pci_dn *pdn = pci_get_pdn(dev); + int i; + struct resource *res, res2; + resource_size_t size; + u16 num_vfs; + + if (!dev->is_physfn) + return -EINVAL; + + /* + * "offset" is in VFs. The M64 windows are sized so that when they + * are segmented, each segment is the same size as the IOV BAR. 
+ * Each segment is in a separate PE, and the high order bits of the + * address are the PE number. Therefore, each VF's BAR is in a + * separate PE, and changing the IOV BAR start address changes the + * range of PEs the VFs are in. + */ + num_vfs = pdn->num_vfs; + for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { + res = &dev->resource[i + PCI_IOV_RESOURCES]; + if (!res->flags || !res->parent) + continue; + + if (!pnv_pci_is_mem_pref_64(res->flags)) + continue; + + /* + * The actual IOV BAR range is determined by the start address + * and the actual size for num_vfs VFs BAR. This check is to + * make sure that after shifting, the range will not overlap + * with another device. + */ + size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES); + res2.flags = res->flags; + res2.start = res->start + (size * offset); + res2.end = res2.start + (size * num_vfs) - 1; + + if (res2.end > res->end) { + dev_err(&dev->dev, "VF BAR%d: %pR would extend past %pR (trying to enable %d VFs shifted by %d)\n", + i, &res2, res, num_vfs, offset); + return -EBUSY; + } + } + + /* + * After doing so, there would be a "hole" in the /proc/iomem when + * offset is a positive value. It looks like the device return some + * mmio back to the system, which actually no one could use it. 
+ */ + for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { + res = &dev->resource[i + PCI_IOV_RESOURCES]; + if (!res->flags || !res->parent) + continue; + + if (!pnv_pci_is_mem_pref_64(res->flags)) + continue; + + size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES); + res2 = *res; + res->start += size * offset; + + dev_info(&dev->dev, "VF BAR%d: %pR shifted to %pR (enabling %d VFs shifted by %d)\n", + i, &res2, res, num_vfs, offset); + pci_update_resource(dev, i + PCI_IOV_RESOURCES); + } + return 0; +} +#endif /* CONFIG_PCI_IOV */ + #if 0 static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev) { @@ -979,8 +1150,312 @@ static void pnv_pci_ioda_setup_PEs(void) } #ifdef CONFIG_PCI_IOV +static int pnv_pci_vf_release_m64(struct pci_dev *pdev) +{ + struct pci_bus *bus; + struct pci_controller *hose; + struct pnv_phb *phb; + struct pci_dn *pdn; + int i; + + bus = pdev->bus; + hose = pci_bus_to_host(bus); + phb = hose->private_data; + pdn = pci_get_pdn(pdev); + + for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { + if (pdn->m64_wins[i] == IODA_INVALID_M64) + continue; + opal_pci_phb_mmio_enable(phb->opal_id, + OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i], 0); + clear_bit(pdn->m64_wins[i], &phb->ioda.m64_bar_alloc); + pdn->m64_wins[i] = IODA_INVALID_M64; + } + + return 0; +} + +static int pnv_pci_vf_assign_m64(struct pci_dev *pdev) +{ + struct pci_bus *bus; + struct pci_controller *hose; + struct pnv_phb *phb; + struct pci_dn *pdn; + unsigned int win; + struct resource *res; + int i; + int64_t rc; + + bus = pdev->bus; + hose = pci_bus_to_host(bus); + phb = hose->private_data; + pdn = pci_get_pdn(pdev); + + /* Initialize the m64_wins to IODA_INVALID_M64 */ + for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) + pdn->m64_wins[i] = IODA_INVALID_M64; + + for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { + res = &pdev->resource[i + PCI_IOV_RESOURCES]; + if (!res->flags || !res->parent) + continue; + + if (!pnv_pci_is_mem_pref_64(res->flags)) + continue; + + do { + win = 
find_next_zero_bit(&phb->ioda.m64_bar_alloc, + phb->ioda.m64_bar_idx + 1, 0); + + if (win >= phb->ioda.m64_bar_idx + 1) + goto m64_failed; + } while (test_and_set_bit(win, &phb->ioda.m64_bar_alloc)); + + pdn->m64_wins[i] = win; + + /* Map the M64 here */ + rc = opal_pci_set_phb_mem_window(phb->opal_id, + OPAL_M64_WINDOW_TYPE, + pdn->m64_wins[i], + res->start, + 0, /* unused */ + resource_size(res)); + if (rc != OPAL_SUCCESS) { + dev_err(&pdev->dev, "Failed to map M64 window #%d: %lld\n", + win, rc); + goto m64_failed; + } + + rc = opal_pci_phb_mmio_enable(phb->opal_id, + OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i], 1); + if (rc != OPAL_SUCCESS) { + dev_err(&pdev->dev, "Failed to enable M64 window #%d: %llx\n", + win, rc); + goto m64_failed; + } + } + return 0; + +m64_failed: + pnv_pci_vf_release_m64(pdev); + return -EBUSY; +} + +static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe *pe) +{ + struct pci_bus *bus; + struct pci_controller *hose; + struct pnv_phb *phb; + struct iommu_table *tbl; + unsigned long addr; + int64_t rc; + + bus = dev->bus; + hose = pci_bus_to_host(bus); + phb = hose->private_data; + tbl = pe->tce32_table; + addr = tbl->it_base; + + opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number, + pe->pe_number << 1, 1, __pa(addr), + 0, 0x1000); + + rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id, + pe->pe_number, + (pe->pe_number << 1) + 1, + pe->tce_bypass_base, + 0); + if (rc) + pe_warn(pe, "OPAL error %ld release DMA window\n", rc); + + iommu_free_table(tbl, of_node_full_name(dev->dev.of_node)); + free_pages(addr, get_order(TCE32_TABLE_SIZE)); + pe->tce32_table = NULL; +} + +static void pnv_ioda_release_vf_PE(struct pci_dev *pdev) +{ + struct pci_bus *bus; + struct pci_controller *hose; + struct pnv_phb *phb; + struct pnv_ioda_pe *pe, *pe_n; + struct pci_dn *pdn; + + bus = pdev->bus; + hose = pci_bus_to_host(bus); + phb = hose->private_data; + + if (!pdev->is_physfn) + return; + + pdn = pci_get_pdn(pdev); + 
list_for_each_entry_safe(pe, pe_n, &phb->ioda.pe_list, list) { + if (pe->parent_dev != pdev) + continue; + + pnv_pci_ioda2_release_dma_pe(pdev, pe); + + /* Remove from list */ + mutex_lock(&phb->ioda.pe_list_mutex); + list_del(&pe->list); + mutex_unlock(&phb->ioda.pe_list_mutex); + + pnv_ioda_deconfigure_pe(phb, pe); + + pnv_ioda_free_pe(phb, pe->pe_number); + } +} + +void pnv_pci_sriov_disable(struct pci_dev *pdev) +{ + struct pci_bus *bus; + struct pci_controller *hose; + struct pnv_phb *phb; + struct pci_dn *pdn; + struct pci_sriov *iov; + u16 num_vfs; + + bus = pdev->bus; + hose = pci_bus_to_host(bus); + phb = hose->private_data; + pdn = pci_get_pdn(pdev); + iov = pdev->sriov; + num_vfs = pdn->num_vfs; + + /* Release VF PEs */ + pnv_ioda_release_vf_PE(pdev); + + if (phb->type == PNV_PHB_IODA2) { + pnv_pci_vf_resource_shift(pdev, -pdn->offset); + + /* Release M64 windows */ + pnv_pci_vf_release_m64(pdev); + + /* Release PE numbers */ + bitmap_clear(phb->ioda.pe_alloc, pdn->offset, num_vfs); + pdn->offset = 0; + } +} + +static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, + struct pnv_ioda_pe *pe); +static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs) +{ + struct pci_bus *bus; + struct pci_controller *hose; + struct pnv_phb *phb; + struct pnv_ioda_pe *pe; + int pe_num; + u16 vf_index; + struct pci_dn *pdn; + + bus = pdev->bus; + hose = pci_bus_to_host(bus); + phb = hose->private_data; + pdn = pci_get_pdn(pdev); + + if (!pdev->is_physfn) + return; + + /* Reserve PE for each VF */ + for (vf_index = 0; vf_index < num_vfs; vf_index++) { + pe_num = pdn->offset + vf_index; + + pe = &phb->ioda.pe_array[pe_num]; + pe->pe_number = pe_num; + pe->phb = phb; + pe->flags = PNV_IODA_PE_VF; + pe->pbus = NULL; + pe->parent_dev = pdev; + pe->tce32_seg = -1; + pe->mve_number = -1; + pe->rid = (pci_iov_virtfn_bus(pdev, vf_index) << 8) | + pci_iov_virtfn_devfn(pdev, vf_index); + + pe_info(pe, "VF %04d:%02d:%02d.%d associated with PE#%d\n", + 
hose->global_number, pdev->bus->number, + PCI_SLOT(pci_iov_virtfn_devfn(pdev, vf_index)), + PCI_FUNC(pci_iov_virtfn_devfn(pdev, vf_index)), pe_num); + + if (pnv_ioda_configure_pe(phb, pe)) { + /* XXX What do we do here ? */ + if (pe_num) + pnv_ioda_free_pe(phb, pe_num); + pe->pdev = NULL; + continue; + } + + pe->tce32_table = kzalloc_node(sizeof(struct iommu_table), + GFP_KERNEL, hose->node); + pe->tce32_table->data = pe; + + /* Put PE to the list */ + mutex_lock(&phb->ioda.pe_list_mutex); + list_add_tail(&pe->list, &phb->ioda.pe_list); + mutex_unlock(&phb->ioda.pe_list_mutex); + + pnv_pci_ioda2_setup_dma_pe(phb, pe); + } +} + +int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs) +{ + struct pci_bus *bus; + struct pci_controller *hose; + struct pnv_phb *phb; + struct pci_dn *pdn; + int ret; + + bus = pdev->bus; + hose = pci_bus_to_host(bus); + phb = hose->private_data; + pdn = pci_get_pdn(pdev); + + if (phb->type == PNV_PHB_IODA2) { + /* Calculate available PE for required VFs */ + mutex_lock(&phb->ioda.pe_alloc_mutex); + pdn->offset = bitmap_find_next_zero_area( + phb->ioda.pe_alloc, phb->ioda.total_pe, + 0, num_vfs, 0); + if (pdn->offset >= phb->ioda.total_pe) { + mutex_unlock(&phb->ioda.pe_alloc_mutex); + dev_info(&pdev->dev, "Failed to enable VF%d\n", num_vfs); + pdn->offset = 0; + return -EBUSY; + } + bitmap_set(phb->ioda.pe_alloc, pdn->offset, num_vfs); + pdn->num_vfs = num_vfs; + mutex_unlock(&phb->ioda.pe_alloc_mutex); + + /* Assign M64 window accordingly */ + ret = pnv_pci_vf_assign_m64(pdev); + if (ret) { + dev_info(&pdev->dev, "Not enough M64 window resources\n"); + goto m64_failed; + } + + /* Do some magic shift */ + ret = pnv_pci_vf_resource_shift(pdev, pdn->offset); + if (ret) + goto m64_failed; + } + + /* Setup VF PEs */ + pnv_ioda_setup_vf_PE(pdev, num_vfs); + + return 0; + +m64_failed: + bitmap_clear(phb->ioda.pe_alloc, pdn->offset, num_vfs); + pdn->offset = 0; + + return ret; +} + int pcibios_sriov_disable(struct pci_dev *pdev) { + 
pnv_pci_sriov_disable(pdev); + /* Release firmware data */ remove_dev_pci_info(pdev); return 0; @@ -990,6 +1465,8 @@ int pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs) { /* Allocate firmware data */ add_dev_pci_info(pdev); + + pnv_pci_sriov_enable(pdev, num_vfs); return 0; } #endif /* CONFIG_PCI_IOV */ @@ -1186,9 +1663,6 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, int64_t rc; void *addr; - /* 256M DMA window, 4K TCE pages, 8 bytes TCE */ -#define TCE32_TABLE_SIZE ((0x10000000 / 0x1000) * 8) - /* XXX FIXME: Handle 64-bit only DMA devices */ /* XXX FIXME: Provide 64-bit DMA facilities & non-4K TCE tables etc.. */ /* XXX FIXME: Allocate multi-level tables on PHB3 */ @@ -1251,12 +1725,19 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, TCE_PCI_SWINV_PAIR); } iommu_init_table(tbl, phb->hose->node); - iommu_register_group(tbl, phb->hose->global_number, pe->pe_number); - if (pe->pdev) + if (pe->flags & PNV_IODA_PE_DEV) { + iommu_register_group(tbl, phb->hose->global_number, + pe->pe_number); set_iommu_table_base_and_group(&pe->pdev->dev, tbl); - else + } else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) { + iommu_register_group(tbl, phb->hose->global_number, + pe->pe_number); pnv_ioda_setup_bus_dma(pe, pe->pbus, true); + } else if (pe->flags & PNV_IODA_PE_VF) { + iommu_register_group(tbl, phb->hose->global_number, + pe->pe_number); + } return; fail: @@ -1383,12 +1864,19 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE); } iommu_init_table(tbl, phb->hose->node); - iommu_register_group(tbl, phb->hose->global_number, pe->pe_number); - if (pe->pdev) + if (pe->flags & PNV_IODA_PE_DEV) { + iommu_register_group(tbl, phb->hose->global_number, + pe->pe_number); set_iommu_table_base_and_group(&pe->pdev->dev, tbl); - else + } else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) { + iommu_register_group(tbl, phb->hose->global_number, + pe->pe_number); 
pnv_ioda_setup_bus_dma(pe, pe->pbus, true); + } else if (pe->flags & PNV_IODA_PE_VF) { + iommu_register_group(tbl, phb->hose->global_number, + pe->pe_number); + } /* Also create a bypass window */ if (!pnv_iommu_bypass_disabled) @@ -2068,6 +2556,7 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, phb->hub_id = hub_id; phb->opal_id = phb_id; phb->type = ioda_type; + mutex_init(&phb->ioda.pe_alloc_mutex); /* Detect specific models for error handling */ if (of_device_is_compatible(np, "ibm,p7ioc-pciex")) @@ -2127,6 +2616,7 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, INIT_LIST_HEAD(&phb->ioda.pe_dma_list); INIT_LIST_HEAD(&phb->ioda.pe_list); + mutex_init(&phb->ioda.pe_list_mutex); /* Calculate how many 32-bit TCE segments we have */ phb->ioda.tce32_count = phb->ioda.m32_pci_base >> 28; diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index 6c20d6e..a88f915 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -714,6 +714,24 @@ static void pnv_pci_dma_dev_setup(struct pci_dev *pdev) { struct pci_controller *hose = pci_bus_to_host(pdev->bus); struct pnv_phb *phb = hose->private_data; +#ifdef CONFIG_PCI_IOV + struct pnv_ioda_pe *pe; + struct pci_dn *pdn; + + /* Fix the VF pdn PE number */ + if (pdev->is_virtfn) { + pdn = pci_get_pdn(pdev); + WARN_ON(pdn->pe_number != IODA_INVALID_PE); + list_for_each_entry(pe, &phb->ioda.pe_list, list) { + if (pe->rid == ((pdev->bus->number << 8) | + (pdev->devfn & 0xff))) { + pdn->pe_number = pe->pe_number; + pe->pdev = pdev; + break; + } + } + } +#endif /* CONFIG_PCI_IOV */ /* If we have no phb structure, try to setup a fallback based on * the device-tree (RTAS PCI for example) diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index 7317777..39d42f2 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -23,6 +23,7 @@ enum pnv_phb_model 
{ #define PNV_IODA_PE_BUS_ALL (1 << 2) /* PE has subordinate buses */ #define PNV_IODA_PE_MASTER (1 << 3) /* Master PE in compound case */ #define PNV_IODA_PE_SLAVE (1 << 4) /* Slave PE in compound case */ +#define PNV_IODA_PE_VF (1 << 5) /* PE for one VF */ /* Data associated with a PE, including IOMMU tracking etc.. */ struct pnv_phb; @@ -34,6 +35,9 @@ struct pnv_ioda_pe { * entire bus (& children). In the former case, pdev * is populated, in the later case, pbus is. */ +#ifdef CONFIG_PCI_IOV + struct pci_dev *parent_dev; +#endif struct pci_dev *pdev; struct pci_bus *pbus; @@ -165,6 +169,8 @@ struct pnv_phb { /* PE allocation bitmap */ unsigned long *pe_alloc; + /* PE allocation mutex */ + struct mutex pe_alloc_mutex; /* M32 & IO segment maps */ unsigned int *m32_segmap; @@ -179,6 +185,7 @@ struct pnv_phb { * on the sequence of creation */ struct list_head pe_list; + struct mutex pe_list_mutex; /* Reverse map of PEs, will have to extend if * we are to support more than 256 PEs, indexed
On the PowerNV platform, a resource's position in the M64 BAR implies the PE# the resource belongs to. In some cases, a resource must be adjusted to place it at the correct position in the M64 BAR. This patch adds pnv_pci_vf_resource_shift() to shift the 'real' PF IOV BAR address according to an offset. Note: After doing so, there will be a "hole" in /proc/iomem when the offset is a positive value. It looks as if the device returned some MMIO space to the system, but no one can actually use it. [bhelgaas: rework loops, rework overlap check, index resource[] conventionally, remove pci_regs.h include, squashed with next patch] Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com> --- arch/powerpc/include/asm/pci-bridge.h | 4 + arch/powerpc/kernel/pci_dn.c | 13 + arch/powerpc/platforms/powernv/pci-ioda.c | 524 ++++++++++++++++++++++++++++- arch/powerpc/platforms/powernv/pci.c | 18 + arch/powerpc/platforms/powernv/pci.h | 7 + 5 files changed, 549 insertions(+), 17 deletions(-)