| Message ID | 20180712154709.16444-1-keith.busch@intel.com (mailing list archive) |
|---|---|
| State | New, archived |
On Thu, 2018-07-12 at 09:47 -0600, Keith Busch wrote:
> This patch will find the max contiguous area to determine the largest
> pmem namespace size that can be created. If the requested size exceeds
> the largest available, ENOSPC error will be returned.
>
> This fixes the allocation underrun error and wrong error return code
> that have otherwise been observed as the following kernel warning:
>
> WARNING: CPU: <CPU> PID: <PID> at drivers/nvdimm/namespace_devs.c:913 size_store
>
> Fixes: a1f3e4d6a0c3 ("libnvdimm, region: update nd_region_available_dpa() for multi-pmem support")
> Cc: <stable@vger.kernel.org>
> Signed-off-by: Keith Busch <keith.busch@intel.com>

Hi Keith,

I was testing these patches and I found:

When booting a VM which has both a qemu ACPI.NFIT bus and nfit_test
buses, the nfit_test buses initially show the correct
max_available_extent. But the qemu ACPI.NFIT bus regions (which have an
automatic full-capacity namespace created on them when they come up)
show a max_available_extent of the full region size, even though the
available_size attribute is zero.

$ cat /sys/bus/nd/devices/region1/max_available_extent
17045651456

$ ndctl list -BNR --region=region1
[
  {
    "provider":"ACPI.NFIT",
    "dev":"ndbus1",
    "regions":[
      {
        "dev":"region1",
        "size":17045651456,
        "available_size":0,
        "type":"pmem",
        "numa_node":0,
        "persistence_domain":"unknown",
        "namespaces":[
          {
            "dev":"namespace1.0",
            "mode":"raw",
            "size":17045651456,
            "sector_size":512,
            "blockdev":"pmem1",
            "numa_node":0
          }
...

If I reconfigure the default namespace:

$ sudo ndctl create-namespace --region=region1 --type=pmem --reconfig=namespace1.0 --type=pmem --mode=fsdax --force
{
  "dev":"namespace1.0",
  "mode":"fsdax",
  "map":"dev",
  "size":"15.63 GiB (16.78 GB)",
  "uuid":"55411e87-41a6-44e0-8198-97023de70413",
  "raw_uuid":"cb80c5c1-c582-4e12-9d24-2fd30bb7da20",
  "sector_size":512,
  "blockdev":"pmem1",
  "numa_node":0
}

then max_available_extent gets updated correctly:

$ cat /sys/bus/nd/devices/region1/max_available_extent
0

On Fri, Jul 20, 2018 at 01:46:06PM -0700, Verma, Vishal L wrote:
> On Thu, 2018-07-12 at 09:47 -0600, Keith Busch wrote:
> > This patch will find the max contiguous area to determine the largest
> > pmem namespace size that can be created. If the requested size exceeds
> > the largest available, ENOSPC error will be returned.
> >
> > This fixes the allocation underrun error and wrong error return code
> > that have otherwise been observed as the following kernel warning:
> >
> > WARNING: CPU: <CPU> PID: <PID> at drivers/nvdimm/namespace_devs.c:913 size_store
> >
> > Fixes: a1f3e4d6a0c3 ("libnvdimm, region: update nd_region_available_dpa() for multi-pmem support")
> > Cc: <stable@vger.kernel.org>
> > Signed-off-by: Keith Busch <keith.busch@intel.com>
>
> Hi Keith,
>
> I was testing these patches and I found:
>
> When booting a VM which has both a qemu ACPI.NFIT bus and nfit_test
> buses, the nfit_test buses initially show the correct
> max_available_extent. But the qemu ACPI.NFIT bus regions (which have an
> automatic full-capacity namespace created on them when they come up)
> show a max_available_extent of the full region size, even though the
> available_size attribute is zero.

The max extent only counts the free pmem that it can reserve. We
shouldn't have been able to reserve non-free pmem, so it sounds like
something must be wrong with how the resources were set up. I'll make a
similar qemu config and see why/if the resource was considered free.

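Keith's point above, that the max extent is just the largest span of free pmem the driver can reserve, can be illustrated with a small standalone model. This is not the libnvdimm code, only a sketch of the idea; the region size and allocated ranges below are made up:

```c
/* Toy model of the "largest free extent" idea: walk the allocated ranges in
 * one DIMM's portion of a region and keep the biggest gap between them.
 * Not kernel code; all values are hypothetical.
 */
#include <stdio.h>

struct range { unsigned long long start, end; }; /* [start, end) */

int main(void)
{
        /* hypothetical region portion on one DIMM: 0 .. 16 GiB */
        const unsigned long long region_end = 16ULL << 30;
        /* already-allocated namespace label areas, sorted by start */
        const struct range busy[] = {
                { 0,          2ULL << 30 },   /* 2 GiB namespace */
                { 6ULL << 30, 7ULL << 30 },   /* 1 GiB namespace */
        };
        unsigned long long cursor = 0, max_free = 0;

        for (size_t i = 0; i < sizeof(busy) / sizeof(busy[0]); i++) {
                unsigned long long gap = busy[i].start - cursor;

                if (gap > max_free)
                        max_free = gap;         /* free span before this allocation */
                cursor = busy[i].end;
        }
        if (region_end - cursor > max_free)
                max_free = region_end - cursor; /* trailing free span */

        printf("max contiguous free extent: %llu bytes\n", max_free);
        return 0;
}
```

With those made-up ranges the gaps are 4 GiB and a 9 GiB tail, so the reported maximum extent would be the 9 GiB span, regardless of how much total free capacity the region has.
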
On Fri, Jul 20, 2018 at 01:46:06PM -0700, Verma, Vishal L wrote:
> $ cat /sys/bus/nd/devices/region1/max_available_extent
> 17045651456
>
> $ ndctl list -BNR --region=region1
> [
>   {
>     "provider":"ACPI.NFIT",
>     "dev":"ndbus1",
>     "regions":[
>       {
>         "dev":"region1",
>         "size":17045651456,
>         "available_size":0,
>         "type":"pmem",
>         "numa_node":0,
>         "persistence_domain":"unknown",
>         "namespaces":[
>           {
>             "dev":"namespace1.0",
>             "mode":"raw",
>             "size":17045651456,
>             "sector_size":512,
>             "blockdev":"pmem1",
>             "numa_node":0
>           }
> ...

As we saw, getting the "available_size" directly from the region's
sysfs entry also returned the same value as the max extent:

$ cat /sys/bus/nd/devices/region1/available_size
17045651456

The reason ndctl shows available_size as '0' is that the nstype is
neither PMEM nor BLK.

So I think max_available_extent is doing the right thing.

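For anyone reproducing this, the two sysfs attributes discussed here can also be compared from a tiny C program instead of cat. The region path is just the example region from this thread; adjust it for your system:

```c
/* Cross-check the raw sysfs values that ndctl interprets. */
#include <stdio.h>
#include <stdlib.h>

static long long read_attr(const char *path)
{
        FILE *f = fopen(path, "r");
        long long val = -1;

        if (!f)
                return -1;
        if (fscanf(f, "%lld", &val) != 1)
                val = -1;
        fclose(f);
        return val;
}

int main(void)
{
        long long avail = read_attr("/sys/bus/nd/devices/region1/available_size");
        long long max_extent = read_attr("/sys/bus/nd/devices/region1/max_available_extent");

        printf("available_size:       %lld\n", avail);
        printf("max_available_extent: %lld\n", max_extent);
        return (avail < 0 || max_extent < 0) ? EXIT_FAILURE : EXIT_SUCCESS;
}
```
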
On Fri, 2018-07-20 at 15:48 -0600, Keith Busch wrote:
> As we saw, getting the "available_size" directly from the region's
> sysfs entry also returned the same value as the max extent:
>
> $ cat /sys/bus/nd/devices/region1/available_size
> 17045651456
>
> The reason ndctl shows available_size as '0' is that the nstype is
> neither PMEM nor BLK.
>
> So I think max_available_extent is doing the right thing.

Yep, I agree. I did however see another potential breakage (the
blk-exhaust unit test fails because of it):

ndctl create-namespace --bus=nfit_test.0 creates a namespace on, say,
region3. That makes available_size for region3 zero (as reported by
ndctl list as well as directly from sysfs), but max_available_extent
still shows the full size as available.

$ sudo ndctl create-namespace --bus=nfit_test.0
{
  "dev":"namespace3.0",
  "mode":"fsdax",
  "map":"dev",
  "size":"28.50 MiB (29.89 MB)",
  "uuid":"592071ed-0928-4be8-96fb-4be944e4c6f4",
  "raw_uuid":"c4ac44fa-d3bd-43ea-9a1a-3a083d9fed1d",
  "sector_size":512,
  "blockdev":"pmem3"
}

$ cat /sys/bus/nd/devices/region3/max_available_extent
33554432

$ cat /sys/bus/nd/devices/region3/available_size
0

A subsequent ndctl create-namespace --bus=nfit_test.0 then sees the
max_available_extent on region3 (with the corresponding ndctl patches
for this applied), tries to create a namespace there again, and
obviously fails.

As a side note, I think it would be useful for the related ndctl patch
to include a JSON entry for max_available_extent in region listings.

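To make that side note concrete, here is a rough, purely illustrative sketch, not ndctl's actual listing code, of what such a region JSON entry could look like. It assumes the json-c library; the function name, the surrounding fields, and the value source (e.g. read_attr() from the earlier snippet) are hypothetical:

```c
/* Illustrative only -- not ndctl's region listing implementation. */
#include <json-c/json.h>

struct json_object *region_to_json_sketch(long long size, long long avail,
                                          long long max_extent)
{
        struct json_object *jregion = json_object_new_object();

        json_object_object_add(jregion, "size", json_object_new_int64(size));
        json_object_object_add(jregion, "available_size",
                               json_object_new_int64(avail));
        /* the additional field suggested above for region listings */
        json_object_object_add(jregion, "max_available_extent",
                               json_object_new_int64(max_extent));
        return jregion;
}
```
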
This patch will find the max contiguous area to determine the largest
pmem namespace size that can be created. If the requested size exceeds
the largest available, ENOSPC error will be returned.

This fixes the allocation underrun error and wrong error return code
that have otherwise been observed as the following kernel warning:

WARNING: CPU: <CPU> PID: <PID> at drivers/nvdimm/namespace_devs.c:913 size_store

Fixes: a1f3e4d6a0c3 ("libnvdimm, region: update nd_region_available_dpa() for multi-pmem support")
Cc: <stable@vger.kernel.org>
Signed-off-by: Keith Busch <keith.busch@intel.com>
---
v2 -> v3:

  This one takes block regions into account by reserving pmem regions
  on dimms and finding the largest intersection among all dimms in
  the region.

 drivers/nvdimm/dimm_devs.c      | 30 ++++++++++++++++++++++++++++++
 drivers/nvdimm/namespace_devs.c |  6 +++---
 drivers/nvdimm/nd-core.h        |  9 +++++++++
 drivers/nvdimm/region_devs.c    | 24 ++++++++++++++++++++++++
 4 files changed, 66 insertions(+), 3 deletions(-)

diff --git a/drivers/nvdimm/dimm_devs.c b/drivers/nvdimm/dimm_devs.c
index 8d348b22ba45..9e977cbd1a60 100644
--- a/drivers/nvdimm/dimm_devs.c
+++ b/drivers/nvdimm/dimm_devs.c
@@ -536,6 +536,36 @@ resource_size_t nd_blk_available_dpa(struct nd_region *nd_region)
         return info.available;
 }
 
+/**
+ * nd_pmem_max_contiguous_dpa - For the given dimm+region, return the max
+ *                contiguous unallocated dpa range.
+ * @nd_region: constrain available space check to this reference region
+ * @nd_mapping: container of dpa-resource-root + labels
+ */
+resource_size_t nd_pmem_max_contiguous_dpa(struct nd_region *nd_region,
+                struct nd_mapping *nd_mapping)
+{
+        struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+        struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(ndd->dev);
+        resource_size_t max = 0;
+        struct resource *res;
+
+        /* if a dimm is disabled the available capacity is zero */
+        if (!ndd)
+                return 0;
+
+        if (reserve_free_pmem(nvdimm_bus, nd_mapping))
+                return 0;
+        for_each_dpa_resource(ndd, res) {
+                if (strcmp(res->name, "pmem-reserve") != 0)
+                        continue;
+                if (resource_size(res) > max)
+                        max = resource_size(res);
+        }
+        release_free_pmem(nvdimm_bus, nd_mapping);
+        return max;
+}
+
 /**
  * nd_pmem_available_dpa - for the given dimm+region account unallocated dpa
  * @nd_mapping: container of dpa-resource-root + labels
diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c
index 28afdd668905..c3afff2cdf1d 100644
--- a/drivers/nvdimm/namespace_devs.c
+++ b/drivers/nvdimm/namespace_devs.c
@@ -836,7 +836,7 @@ static int __reserve_free_pmem(struct device *dev, void *data)
         return 0;
 }
 
-static void release_free_pmem(struct nvdimm_bus *nvdimm_bus,
+void release_free_pmem(struct nvdimm_bus *nvdimm_bus,
                 struct nd_mapping *nd_mapping)
 {
         struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
@@ -847,7 +847,7 @@ static void release_free_pmem(struct nvdimm_bus *nvdimm_bus,
                         nvdimm_free_dpa(ndd, res);
 }
 
-static int reserve_free_pmem(struct nvdimm_bus *nvdimm_bus,
+int reserve_free_pmem(struct nvdimm_bus *nvdimm_bus,
                 struct nd_mapping *nd_mapping)
 {
         struct nvdimm *nvdimm = nd_mapping->nvdimm;
@@ -1032,7 +1032,7 @@ static ssize_t __size_store(struct device *dev, unsigned long long val)
 
                 allocated += nvdimm_allocated_dpa(ndd, &label_id);
         }
-        available = nd_region_available_dpa(nd_region);
+        available = nd_region_allocatable_dpa(nd_region);
 
         if (val > available + allocated)
                 return -ENOSPC;
diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h
index 79274ead54fb..1c5f5b389940 100644
--- a/drivers/nvdimm/nd-core.h
+++ b/drivers/nvdimm/nd-core.h
@@ -100,6 +100,15 @@ struct nd_region;
 struct nvdimm_drvdata;
 struct nd_mapping;
 void nd_mapping_free_labels(struct nd_mapping *nd_mapping);
+
+int reserve_free_pmem(struct nvdimm_bus *nvdimm_bus,
+                struct nd_mapping *nd_mapping);
+void release_free_pmem(struct nvdimm_bus *nvdimm_bus,
+                struct nd_mapping *nd_mapping);
+
+resource_size_t nd_pmem_max_contiguous_dpa(struct nd_region *nd_region,
+                struct nd_mapping *nd_mapping);
+resource_size_t nd_region_allocatable_dpa(struct nd_region *nd_region);
 resource_size_t nd_pmem_available_dpa(struct nd_region *nd_region,
                 struct nd_mapping *nd_mapping, resource_size_t *overlap);
 resource_size_t nd_blk_available_dpa(struct nd_region *nd_region);
diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index ec3543b83330..c30d5af02cc2 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -389,6 +389,30 @@ resource_size_t nd_region_available_dpa(struct nd_region *nd_region)
         return available;
 }
 
+resource_size_t nd_region_allocatable_dpa(struct nd_region *nd_region)
+{
+        resource_size_t available = 0;
+        int i;
+
+        if (is_memory(&nd_region->dev))
+                available = PHYS_ADDR_MAX;
+
+        WARN_ON(!is_nvdimm_bus_locked(&nd_region->dev));
+        for (i = 0; i < nd_region->ndr_mappings; i++) {
+                struct nd_mapping *nd_mapping = &nd_region->mapping[i];
+
+                if (is_memory(&nd_region->dev))
+                        available = min(available,
+                                nd_pmem_max_contiguous_dpa(nd_region,
+                                                nd_mapping));
+                else if (is_nd_blk(&nd_region->dev))
+                        available += nd_blk_available_dpa(nd_region);
+        }
+        if (is_memory(&nd_region->dev))
+                return available * nd_region->ndr_mappings;
+        return available;
+}
+
 static ssize_t available_size_show(struct device *dev,
                 struct device_attribute *attr, char *buf)
 {
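
To make the interleave arithmetic in nd_region_allocatable_dpa() above concrete: for a pmem region interleaved across multiple DIMMs, the allocatable size is the smallest per-DIMM contiguous free extent multiplied by the number of mappings. A short worked example, with hypothetical extent values:

```c
/* Worked example of the min-across-mappings calculation; the per-DIMM
 * extent values are made up for illustration.
 */
#include <stdio.h>

int main(void)
{
        /* largest contiguous free DPA per DIMM in a 2-way interleave set */
        unsigned long long per_dimm_extent[] = { 8ULL << 30, 6ULL << 30 };
        int ndr_mappings = 2;
        unsigned long long min_extent = per_dimm_extent[0];

        for (int i = 1; i < ndr_mappings; i++)
                if (per_dimm_extent[i] < min_extent)
                        min_extent = per_dimm_extent[i];

        /* 6 GiB * 2 mappings = 12 GiB allocatable for one new pmem namespace */
        printf("allocatable: %llu bytes\n", min_extent * ndr_mappings);
        return 0;
}
```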