diff mbox series

[5/5] cxl/region: Constrain region granularity scaling factor

Message ID 165853778028.2430596.7493880465382850752.stgit@dwillia2-xfh.jf.intel.com
State Superseded
Headers show
Series CXL Region Provisioning Fixes | expand

Commit Message

Dan Williams July 23, 2022, 12:56 a.m. UTC
Consider the scenario where a platform provides for a x2 host-bridge
interleave at a granularity of 256 bytes. The only permitted region
granularities for that configuration are 256 or 512. Anything larger
than 512 results in unmapped capacity within a given decoder. Also, if
the region granularity is 512 then the interleave_ways for the region
must be 4 to keep the scaling matched.

Here are the translations for the first (4) 256-byte blocks where an
endpoint decoder is configured for a 512-byte granularity:

Block[0] => HB0 => DPA: 0
Block[1] => HB1 => DPA: 0
Block[2] => HB0 => DPA: 0
Block[3] => HB1 => DPA: 0

In order for those translations to not alias the region interleave ways
must be 4 resulting in:

Block[0] => HB0 => Dev0 => DPA: 0
Block[1] => HB1 => Dev1 => DPA: 0
Block[2] => HB0 => Dev2 => DPA: 0
Block[3] => HB1 => Dev3 => DPA: 0

...not 2, resulting in:

Block[0] => HB0 => Dev0 => DPA: 0
Block[1] => HB1 => Dev1 => DPA: 0
Block[2] => HB0 => Dev0 => DPA: 0 !
Block[3] => HB1 => Dev1 => DPA: 0 !

Given tooling is already being built around this ABI allow for
granularity and ways to be set in either order and validate the
combination once both are set.

Reported-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/cxl/core/region.c |   63 +++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 60 insertions(+), 3 deletions(-)

Comments

Alison Schofield Aug. 1, 2022, 7:43 p.m. UTC | #1
On Fri, Jul 22, 2022 at 05:56:20PM -0700, Dan Williams wrote:
> Consider the scenario where a platform provides for a x2 host-bridge
> interleave at a granularity of 256 bytes. The only permitted region
> granularities for that configuration are 256 or 512. Anything larger
> than 512 results in unmapped capacity within a given decoder. Also, if
> the region granularity is 512 then the interleave_ways for the region
> must be 4 to keep the scaling matched.
> 
> Here are the translations for the first (4) 256-byte blocks where an
> endpoint decoder is configured for a 512-byte granularity:
> 
> Block[0] => HB0 => DPA: 0
> Block[1] => HB1 => DPA: 0
> Block[2] => HB0 => DPA: 0
> Block[3] => HB1 => DPA: 0
> 
> In order for those translations to not alias the region interleave ways
> must be 4 resulting in:
> 
> Block[0] => HB0 => Dev0 => DPA: 0
> Block[1] => HB1 => Dev1 => DPA: 0
> Block[2] => HB0 => Dev2 => DPA: 0
> Block[3] => HB1 => Dev3 => DPA: 0
> 
> ...not 2, resulting in:
> 
> Block[0] => HB0 => Dev0 => DPA: 0
> Block[1] => HB1 => Dev1 => DPA: 0
> Block[2] => HB0 => Dev0 => DPA: 0 !
> Block[3] => HB1 => Dev1 => DPA: 0 !
> 
> Given tooling is already being built around this ABI allow for
> granularity and ways to be set in either order and validate the
> combination once both are set.
> 
> Reported-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> Signed-off-by: Dan Williams <dan.j.williams@intel.com>

I appreciate the code comments!

Reviewed-by: Alison Schofield <alison.schofield@intel.com>

> ---
>  drivers/cxl/core/region.c |   63 +++++++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 60 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
> index 05b6212e6399..a34e537e4cb2 100644
> --- a/drivers/cxl/core/region.c
> +++ b/drivers/cxl/core/region.c
> @@ -348,6 +348,25 @@ static ssize_t interleave_ways_store(struct device *dev,
>  		goto out;
>  	}
>  
> +	/*
> +	 * If region granularity has been set and xHB interleave is active,
> +	 * validate that granularity is compatible with specified ways.
> +	 * Otherwise allow ways to be set now and depend on
> +	 * interleave_granularity_store() to validate this constraint.
> +	 */
> +	if (cxld->interleave_ways > 1 &&
> +	    p->interleave_granularity > cxld->interleave_granularity &&
> +	    p->interleave_granularity / cxld->interleave_granularity !=
> +		    val / cxld->interleave_ways) {
> +		dev_dbg(dev,
> +			"ways scaling factor %d mismatch with granularity %d\n",
> +			val / cxld->interleave_ways,
> +			p->interleave_granularity /
> +				cxld->interleave_granularity);
> +		rc = -EINVAL;
> +		goto out;
> +	}
> +
>  	save = p->interleave_ways;
>  	p->interleave_ways = val;
>  	rc = sysfs_update_group(&cxlr->dev.kobj, get_cxl_region_target_group());
> @@ -386,7 +405,7 @@ static ssize_t interleave_granularity_store(struct device *dev,
>  	struct cxl_decoder *cxld = &cxlrd->cxlsd.cxld;
>  	struct cxl_region *cxlr = to_cxl_region(dev);
>  	struct cxl_region_params *p = &cxlr->params;
> -	int rc, val;
> +	int rc, val, ways;
>  	u16 ig;
>  
>  	rc = kstrtoint(buf, 0, &val);
> @@ -403,9 +422,29 @@ static ssize_t interleave_granularity_store(struct device *dev,
>  	 * granularity less than the root interleave result in needing
>  	 * multiple endpoints to support a single slot in the
>  	 * interleave.
> +	 *
> +	 * When the root interleave ways is 1 then the root granularity is a
> +	 * don't care.
> +	 *
> +	 * Limit region granularity to cxld->interleave_granularity *
> +	 * rounddown_pow_of_two(cxld->interleave_ways) otherwise holes result in
> +	 * the decode at each endpoint. Note that rounddown_pow_of_two()
> +	 * accounts for x3, x6, and x9 root intereleave.
>  	 */
> -	if (val < cxld->interleave_granularity)
> -		return -EINVAL;
> +	ways = rounddown_pow_of_two(cxld->interleave_ways);
> +	if (ways > 1) {
> +		if (val < cxld->interleave_granularity) {
> +			dev_dbg(dev, "granularity %d must be >= %d\n", val,
> +				cxld->interleave_granularity);
> +			return -EINVAL;
> +		}
> +
> +		if (val > cxld->interleave_granularity * ways) {
> +			dev_dbg(dev, "granularity %d must be <= %d\n", val,
> +				cxld->interleave_granularity * ways);
> +			return -EINVAL;
> +		}
> +	}
>  
>  	rc = down_write_killable(&cxl_region_rwsem);
>  	if (rc)
> @@ -415,6 +454,24 @@ static ssize_t interleave_granularity_store(struct device *dev,
>  		goto out;
>  	}
>  
> +	/*
> +	 * If region ways has been set and xHB interleave is active, validate
> +	 * that ways is compatible with specified granularity.  Otherwise allow
> +	 * granularity to be set now and depend on interleave_ways_store() to
> +	 * validate this constraint.
> +	 */
> +	if (cxld->interleave_ways > 1 && p->interleave_ways &&
> +	    val > cxld->interleave_granularity &&
> +	    p->interleave_ways / cxld->interleave_ways !=
> +		    val / cxld->interleave_granularity) {
> +		dev_dbg(dev,
> +			"granularity scaling factor %d mismatch with ways %d\n",
> +			val / cxld->interleave_granularity,
> +			p->interleave_ways / cxld->interleave_ways);
> +		rc = -EINVAL;
> +		goto out;
> +	}
> +
>  	p->interleave_granularity = val;
>  out:
>  	up_write(&cxl_region_rwsem);
>
Verma, Vishal L Aug. 1, 2022, 8:55 p.m. UTC | #2
On Fri, 2022-07-22 at 17:56 -0700, Dan Williams wrote:
> Consider the scenario where a platform provides for a x2 host-bridge
> interleave at a granularity of 256 bytes. The only permitted region
> granularities for that configuration are 256 or 512. Anything larger
> than 512 results in unmapped capacity within a given decoder. Also, if
> the region granularity is 512 then the interleave_ways for the region
> must be 4 to keep the scaling matched.
> 
> Here are the translations for the first (4) 256-byte blocks where an
> endpoint decoder is configured for a 512-byte granularity:
> 
> Block[0] => HB0 => DPA: 0
> Block[1] => HB1 => DPA: 0
> Block[2] => HB0 => DPA: 0
> Block[3] => HB1 => DPA: 0
> 
> In order for those translations to not alias the region interleave ways
> must be 4 resulting in:
> 
> Block[0] => HB0 => Dev0 => DPA: 0
> Block[1] => HB1 => Dev1 => DPA: 0
> Block[2] => HB0 => Dev2 => DPA: 0
> Block[3] => HB1 => Dev3 => DPA: 0
> 
> ...not 2, resulting in:
> 
> Block[0] => HB0 => Dev0 => DPA: 0
> Block[1] => HB1 => Dev1 => DPA: 0
> Block[2] => HB0 => Dev0 => DPA: 0 !
> Block[3] => HB1 => Dev1 => DPA: 0 !
> 
> Given tooling is already being built around this ABI allow for
> granularity and ways to be set in either order and validate the
> combination once both are set.
> 
> Reported-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
> ---
>  drivers/cxl/core/region.c |   63 +++++++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 60 insertions(+), 3 deletions(-)

Looks good, 

Reviewed-by: Vishal Verma <vishal.l.verma@intel.com>

> 
> diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
> index 05b6212e6399..a34e537e4cb2 100644
> --- a/drivers/cxl/core/region.c
> +++ b/drivers/cxl/core/region.c
> @@ -348,6 +348,25 @@ static ssize_t interleave_ways_store(struct device *dev,
>                 goto out;
>         }
>  
> +       /*
> +        * If region granularity has been set and xHB interleave is active,
> +        * validate that granularity is compatible with specified ways.
> +        * Otherwise allow ways to be set now and depend on
> +        * interleave_granularity_store() to validate this constraint.
> +        */
> +       if (cxld->interleave_ways > 1 &&
> +           p->interleave_granularity > cxld->interleave_granularity &&
> +           p->interleave_granularity / cxld->interleave_granularity !=
> +                   val / cxld->interleave_ways) {
> +               dev_dbg(dev,
> +                       "ways scaling factor %d mismatch with granularity %d\n",
> +                       val / cxld->interleave_ways,
> +                       p->interleave_granularity /
> +                               cxld->interleave_granularity);
> +               rc = -EINVAL;
> +               goto out;
> +       }
> +
>         save = p->interleave_ways;
>         p->interleave_ways = val;
>         rc = sysfs_update_group(&cxlr->dev.kobj, get_cxl_region_target_group());
> @@ -386,7 +405,7 @@ static ssize_t interleave_granularity_store(struct device *dev,
>         struct cxl_decoder *cxld = &cxlrd->cxlsd.cxld;
>         struct cxl_region *cxlr = to_cxl_region(dev);
>         struct cxl_region_params *p = &cxlr->params;
> -       int rc, val;
> +       int rc, val, ways;
>         u16 ig;
>  
>         rc = kstrtoint(buf, 0, &val);
> @@ -403,9 +422,29 @@ static ssize_t interleave_granularity_store(struct device *dev,
>          * granularity less than the root interleave result in needing
>          * multiple endpoints to support a single slot in the
>          * interleave.
> +        *
> +        * When the root interleave ways is 1 then the root granularity is a
> +        * don't care.
> +        *
> +        * Limit region granularity to cxld->interleave_granularity *
> +        * rounddown_pow_of_two(cxld->interleave_ways) otherwise holes result in
> +        * the decode at each endpoint. Note that rounddown_pow_of_two()
> +        * accounts for x3, x6, and x9 root intereleave.
>          */
> -       if (val < cxld->interleave_granularity)
> -               return -EINVAL;
> +       ways = rounddown_pow_of_two(cxld->interleave_ways);
> +       if (ways > 1) {
> +               if (val < cxld->interleave_granularity) {
> +                       dev_dbg(dev, "granularity %d must be >= %d\n", val,
> +                               cxld->interleave_granularity);
> +                       return -EINVAL;
> +               }
> +
> +               if (val > cxld->interleave_granularity * ways) {
> +                       dev_dbg(dev, "granularity %d must be <= %d\n", val,
> +                               cxld->interleave_granularity * ways);
> +                       return -EINVAL;
> +               }
> +       }
>  
>         rc = down_write_killable(&cxl_region_rwsem);
>         if (rc)
> @@ -415,6 +454,24 @@ static ssize_t interleave_granularity_store(struct device *dev,
>                 goto out;
>         }
>  
> +       /*
> +        * If region ways has been set and xHB interleave is active, validate
> +        * that ways is compatible with specified granularity.  Otherwise allow
> +        * granularity to be set now and depend on interleave_ways_store() to
> +        * validate this constraint.
> +        */
> +       if (cxld->interleave_ways > 1 && p->interleave_ways &&
> +           val > cxld->interleave_granularity &&
> +           p->interleave_ways / cxld->interleave_ways !=
> +                   val / cxld->interleave_granularity) {
> +               dev_dbg(dev,
> +                       "granularity scaling factor %d mismatch with ways %d\n",
> +                       val / cxld->interleave_granularity,
> +                       p->interleave_ways / cxld->interleave_ways);
> +               rc = -EINVAL;
> +               goto out;
> +       }
> +
>         p->interleave_granularity = val;
>  out:
>         up_write(&cxl_region_rwsem);
>
Jonathan Cameron Aug. 3, 2022, 4:17 p.m. UTC | #3
On Fri, 22 Jul 2022 17:56:20 -0700
Dan Williams <dan.j.williams@intel.com> wrote:

> Consider the scenario where a platform provides for a x2 host-bridge
> interleave at a granularity of 256 bytes.
Hi Dan,

Terminology not clear to me.  Is this interleave across 2 host bridges?

I'm lost in the explanation.

> The only permitted region
> granularities for that configuration are 256 or 512. Anything larger
> than 512 results in unmapped capacity within a given decoder. Also, if
> the region granularity is 512 then the interleave_ways for the region
> must be 4 to keep the scaling matched.
> 
> Here are the translations for the first (4) 256-byte blocks where an
> endpoint decoder is configured for a 512-byte granularity:
> 
> Block[0] => HB0 => DPA: 0
> Block[1] => HB1 => DPA: 0
> Block[2] => HB0 => DPA: 0
> Block[3] => HB1 => DPA: 0
> 
> In order for those translations to not alias the region interleave ways
> must be 4 resulting in:
> 
> Block[0] => HB0 => Dev0 => DPA: 0
> Block[1] => HB1 => Dev1 => DPA: 0
> Block[2] => HB0 => Dev2 => DPA: 0
> Block[3] => HB1 => Dev3 => DPA: 0

Block[4] => HBA0 => Dev0 => DPA: 512 ?

which means we are only using alternate 256 blocks of the EP.  Does that make
sense?

> 
> ...not 2, resulting in:
> 
> Block[0] => HB0 => Dev0 => DPA: 0
> Block[1] => HB1 => Dev1 => DPA: 0
> Block[2] => HB0 => Dev0 => DPA: 0 !
> Block[3] => HB1 => Dev1 => DPA: 0 !

> 
> Given tooling is already being built around this ABI allow for
> granularity and ways to be set in either order and validate the
> combination once both are set.
> 
> Reported-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> Signed-off-by: Dan Williams <dan.j.williams@intel.com>

Typo inline.

> ---
>  drivers/cxl/core/region.c |   63 +++++++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 60 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
> index 05b6212e6399..a34e537e4cb2 100644
> --- a/drivers/cxl/core/region.c
> +++ b/drivers/cxl/core/region.c
> @@ -348,6 +348,25 @@ static ssize_t interleave_ways_store(struct device *dev,
>  		goto out;
>  	}
>  
> +	/*
> +	 * If region granularity has been set and xHB interleave is active,
> +	 * validate that granularity is compatible with specified ways.
> +	 * Otherwise allow ways to be set now and depend on
> +	 * interleave_granularity_store() to validate this constraint.
> +	 */
> +	if (cxld->interleave_ways > 1 &&
> +	    p->interleave_granularity > cxld->interleave_granularity &&
> +	    p->interleave_granularity / cxld->interleave_granularity !=
> +		    val / cxld->interleave_ways) {
> +		dev_dbg(dev,
> +			"ways scaling factor %d mismatch with granularity %d\n",
> +			val / cxld->interleave_ways,
> +			p->interleave_granularity /
> +				cxld->interleave_granularity);
> +		rc = -EINVAL;
> +		goto out;
> +	}
> +
>  	save = p->interleave_ways;
>  	p->interleave_ways = val;
>  	rc = sysfs_update_group(&cxlr->dev.kobj, get_cxl_region_target_group());
> @@ -386,7 +405,7 @@ static ssize_t interleave_granularity_store(struct device *dev,
>  	struct cxl_decoder *cxld = &cxlrd->cxlsd.cxld;
>  	struct cxl_region *cxlr = to_cxl_region(dev);
>  	struct cxl_region_params *p = &cxlr->params;
> -	int rc, val;
> +	int rc, val, ways;
>  	u16 ig;
>  
>  	rc = kstrtoint(buf, 0, &val);
> @@ -403,9 +422,29 @@ static ssize_t interleave_granularity_store(struct device *dev,
>  	 * granularity less than the root interleave result in needing
>  	 * multiple endpoints to support a single slot in the
>  	 * interleave.
> +	 *
> +	 * When the root interleave ways is 1 then the root granularity is a
> +	 * don't care.
> +	 *
> +	 * Limit region granularity to cxld->interleave_granularity *
> +	 * rounddown_pow_of_two(cxld->interleave_ways) otherwise holes result in
> +	 * the decode at each endpoint. Note that rounddown_pow_of_two()
> +	 * accounts for x3, x6, and x9 root intereleave.

interleave

>  	 */
> -	if (val < cxld->interleave_granularity)
> -		return -EINVAL;
> +	ways = rounddown_pow_of_two(cxld->interleave_ways);
> +	if (ways > 1) {
> +		if (val < cxld->interleave_granularity) {
> +			dev_dbg(dev, "granularity %d must be >= %d\n", val,
> +				cxld->interleave_granularity);
> +			return -EINVAL;
> +		}
> +
> +		if (val > cxld->interleave_granularity * ways) {
> +			dev_dbg(dev, "granularity %d must be <= %d\n", val,
> +				cxld->interleave_granularity * ways);
> +			return -EINVAL;
> +		}
> +	}
>  
>  	rc = down_write_killable(&cxl_region_rwsem);
>  	if (rc)
> @@ -415,6 +454,24 @@ static ssize_t interleave_granularity_store(struct device *dev,
>  		goto out;
>  	}
>  
> +	/*
> +	 * If region ways has been set and xHB interleave is active, validate
> +	 * that ways is compatible with specified granularity.  Otherwise allow
> +	 * granularity to be set now and depend on interleave_ways_store() to
> +	 * validate this constraint.
> +	 */
> +	if (cxld->interleave_ways > 1 && p->interleave_ways &&
> +	    val > cxld->interleave_granularity &&
> +	    p->interleave_ways / cxld->interleave_ways !=
> +		    val / cxld->interleave_granularity) {
> +		dev_dbg(dev,
> +			"granularity scaling factor %d mismatch with ways %d\n",
> +			val / cxld->interleave_granularity,
> +			p->interleave_ways / cxld->interleave_ways);
> +		rc = -EINVAL;
> +		goto out;
> +	}
> +
>  	p->interleave_granularity = val;
>  out:
>  	up_write(&cxl_region_rwsem);
> 
>
Dan Williams Aug. 4, 2022, 4:33 p.m. UTC | #4
Jonathan Cameron wrote:
> On Fri, 22 Jul 2022 17:56:20 -0700
> Dan Williams <dan.j.williams@intel.com> wrote:
> 
> > Consider the scenario where a platform provides for a x2 host-bridge
> > interleave at a granularity of 256 bytes.
> Hi Dan,
> 
> Terminology not clear to me.  Is this interleave across 2 host bridges?

Yes.

> 
> I'm lost in the explanation.
> 
> > The only permitted region
> > granularities for that configuration are 256 or 512. Anything larger
> > than 512 results in unmapped capacity within a given decoder. Also, if
> > the region granularity is 512 then the interleave_ways for the region
> > must be 4 to keep the scaling matched.
> > 
> > Here are the translations for the first (4) 256-byte blocks where an
> > endpoint decoder is configured for a 512-byte granularity:
> > 
> > Block[0] => HB0 => DPA: 0
> > Block[1] => HB1 => DPA: 0
> > Block[2] => HB0 => DPA: 0
> > Block[3] => HB1 => DPA: 0
> > 
> > In order for those translations to not alias the region interleave ways
> > must be 4 resulting in:
> > 
> > Block[0] => HB0 => Dev0 => DPA: 0
> > Block[1] => HB1 => Dev1 => DPA: 0
> > Block[2] => HB0 => Dev2 => DPA: 0
> > Block[3] => HB1 => Dev3 => DPA: 0
> 
> Block[4] => HBA0 => Dev0 => DPA: 512 ?
> 
> which means we are only using alternate 256 blocks of the EP.  Does that make
> sense?

Yes, but I found a subtle bug in my logic. Here is the output from
my spreadsheet calculating the DPA_Offset for the first 8 blocks when
the topology is "x2 host-bridge @ 256 => x2 endpoint @ 512"

Address	HB Index	HPA Offset	DPA Offset
0x0	0		0x0		0x0
0x100	1		0x100		0x0
0x200	0		0x200		0x0
0x300	1		0x300		0x0
0x400	0		0x400		0x200
0x500	1		0x500		0x200
0x600	0		0x600		0x200
0x700	1		0x700		0x200
0x800	0		0x800		0x400

So we have 2 endpoints at 4 DPA_Offset == 0, so there is aliasing.
However, the bug in my logic is that the fix for this is not:

"...if the region granularity is 512 then the interleave_ways for the
region must be 4 to keep the scaling matched."

...it is that the number of *targets* in the interleave must be 4 with
that split handled by the host bridge, and leave the endpoint interleave
ways setting at 2. So, in general to support region-granularity >
root-granularity, the host-bridges and / or switches in the path must
scale the interleave.

---
"x4 region @ 512 granularity under an x2 window @ 256"

 ┌───────────────────────────────────┬──┐
 │WINDOW0                            │x2│
 └─────────┬─────────────────┬───────┴──┘
           │                 │
 ┌─────────▼────┬──┐  ┌──────▼───────┬──┐
 │HB0           │x2│  │HB1           │x2│
 └──┬────────┬──┴──┘  └─┬─────────┬──┴──┘
    │        │          │         │
 ┌──▼─┬──┐ ┌─▼──┬──┐  ┌─▼──┬──┐ ┌─▼──┬──┐
 │DEV0│x2│ │DEV1│x2│  │DEV2│x2│ │DEV3│x2│
 └────┴──┘ └────┴──┘  └────┴──┘ └────┴──┘
---

---
"x4 region @ 256 granularity under an x2 window @ 256"

 ┌───────────────────────────────────┬──┐
 │WINDOW0                            │x2│
 └─────────┬─────────────────┬───────┴──┘
           │                 │
 ┌─────────▼────┬──┐  ┌──────▼───────┬──┐
 │HB0           │x2│  │HB1           │x2│
 └──┬────────┬──┴──┘  └─┬─────────┬──┴──┘
    │        │          │         │
 ┌──▼─┬──┐ ┌─▼──┬──┐  ┌─▼──┬──┐ ┌─▼──┬──┐
 │DEV0│x4│ │DEV1│x4│  │DEV2│x4│ │DEV3│x4│
 └────┴──┘ └────┴──┘  └────┴──┘ └────┴──┘
---

...i.e. it is not always the case that the endpoint interleave ways
setting matches the number of devices in the set.

In the interests of settling the code for v6.0 merge the smallest fix is
to just not allow region granularity != window granularity
configurations for now. Then for v6.1 the algorithm can be expanded to
take switching into account for endpoint intereleave geometry.

> > ...not 2, resulting in:
> > 
> > Block[0] => HB0 => Dev0 => DPA: 0
> > Block[1] => HB1 => Dev1 => DPA: 0
> > Block[2] => HB0 => Dev0 => DPA: 0 !
> > Block[3] => HB1 => Dev1 => DPA: 0 !
> 
> > 
> > Given tooling is already being built around this ABI allow for
> > granularity and ways to be set in either order and validate the
> > combination once both are set.
> > 
> > Reported-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> > Signed-off-by: Dan Williams <dan.j.williams@intel.com>
> 
> Typo inline.

Thanks, but moot now that I'm dropping this in favor of the safe fix,
and push ongoing improvements to the next merge window.
Dan Williams Aug. 4, 2022, 5:57 p.m. UTC | #5
Dan Williams wrote:
> Jonathan Cameron wrote:
> > On Fri, 22 Jul 2022 17:56:20 -0700
> > Dan Williams <dan.j.williams@intel.com> wrote:
> > 
> > > Consider the scenario where a platform provides for a x2 host-bridge
> > > interleave at a granularity of 256 bytes.
> > Hi Dan,
> > 
> > Terminology not clear to me.  Is this interleave across 2 host bridges?
> 
> Yes.
> 
> > 
> > I'm lost in the explanation.
> > 
> > > The only permitted region
> > > granularities for that configuration are 256 or 512. Anything larger
> > > than 512 results in unmapped capacity within a given decoder. Also, if
> > > the region granularity is 512 then the interleave_ways for the region
> > > must be 4 to keep the scaling matched.
> > > 
> > > Here are the translations for the first (4) 256-byte blocks where an
> > > endpoint decoder is configured for a 512-byte granularity:
> > > 
> > > Block[0] => HB0 => DPA: 0
> > > Block[1] => HB1 => DPA: 0
> > > Block[2] => HB0 => DPA: 0
> > > Block[3] => HB1 => DPA: 0
> > > 
> > > In order for those translations to not alias the region interleave ways
> > > must be 4 resulting in:
> > > 
> > > Block[0] => HB0 => Dev0 => DPA: 0
> > > Block[1] => HB1 => Dev1 => DPA: 0
> > > Block[2] => HB0 => Dev2 => DPA: 0
> > > Block[3] => HB1 => Dev3 => DPA: 0
> > 
> > Block[4] => HBA0 => Dev0 => DPA: 512 ?
> > 
> > which means we are only using alternate 256 blocks of the EP.  Does that make
> > sense?
> 
> Yes, but I found a subtle bug in my logic. Here is the output from
> my spreadsheet calculating the DPA_Offset for the first 8 blocks when
> the topology is "x2 host-bridge @ 256 => x2 endpoint @ 512"
> 
> Address	HB Index	HPA Offset	DPA Offset
> 0x0	0		0x0		0x0
> 0x100	1		0x100		0x0
> 0x200	0		0x200		0x0
> 0x300	1		0x300		0x0
> 0x400	0		0x400		0x200
> 0x500	1		0x500		0x200
> 0x600	0		0x600		0x200
> 0x700	1		0x700		0x200
> 0x800	0		0x800		0x400
> 
> So we have 2 endpoints at 4 DPA_Offset == 0, so there is aliasing.
> However, the bug in my logic is that the fix for this is not:
> 
> "...if the region granularity is 512 then the interleave_ways for the
> region must be 4 to keep the scaling matched."
> 
> ...it is that the number of *targets* in the interleave must be 4 with
> that split handled by the host bridge, and leave the endpoint interleave
> ways setting at 2. So, in general to support region-granularity >
> root-granularity, the host-bridges and / or switches in the path must
> scale the interleave.
> 
> ---
> "x4 region @ 512 granularity under an x2 window @ 256"
> 
>  ┌───────────────────────────────────┬──┐
>  │WINDOW0                            │x2│
>  └─────────┬─────────────────┬───────┴──┘
>            │                 │
>  ┌─────────▼────┬──┐  ┌──────▼───────┬──┐
>  │HB0           │x2│  │HB1           │x2│
>  └──┬────────┬──┴──┘  └─┬─────────┬──┴──┘
>     │        │          │         │
>  ┌──▼─┬──┐ ┌─▼──┬──┐  ┌─▼──┬──┐ ┌─▼──┬──┐
>  │DEV0│x2│ │DEV1│x2│  │DEV2│x2│ │DEV3│x2│
>  └────┴──┘ └────┴──┘  └────┴──┘ └────┴──┘
> ---
> 
> ---
> "x4 region @ 256 granularity under an x2 window @ 256"
> 
>  ┌───────────────────────────────────┬──┐
>  │WINDOW0                            │x2│
>  └─────────┬─────────────────┬───────┴──┘
>            │                 │
>  ┌─────────▼────┬──┐  ┌──────▼───────┬──┐
>  │HB0           │x2│  │HB1           │x2│
>  └──┬────────┬──┴──┘  └─┬─────────┬──┴──┘
>     │        │          │         │
>  ┌──▼─┬──┐ ┌─▼──┬──┐  ┌─▼──┬──┐ ┌─▼──┬──┐
>  │DEV0│x4│ │DEV1│x4│  │DEV2│x4│ │DEV3│x4│
>  └────┴──┘ └────┴──┘  └────┴──┘ └────┴──┘
> ---
> 
> ...i.e. it is not always the case that the endpoint interleave ways
> setting matches the number of devices in the set.

No, that's still broken. Even with the host-bridge scaling the interleave it
still results in stranded capacity at the endpoints. So, in general region IG must
be <= Window IG. The current implementation is already blocking the region IG <
Window IG case, so forcing region IG to be equal to Window IG is the
solution for now.
diff mbox series

Patch

diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
index 05b6212e6399..a34e537e4cb2 100644
--- a/drivers/cxl/core/region.c
+++ b/drivers/cxl/core/region.c
@@ -348,6 +348,25 @@  static ssize_t interleave_ways_store(struct device *dev,
 		goto out;
 	}
 
+	/*
+	 * If region granularity has been set and xHB interleave is active,
+	 * validate that granularity is compatible with specified ways.
+	 * Otherwise allow ways to be set now and depend on
+	 * interleave_granularity_store() to validate this constraint.
+	 */
+	if (cxld->interleave_ways > 1 &&
+	    p->interleave_granularity > cxld->interleave_granularity &&
+	    p->interleave_granularity / cxld->interleave_granularity !=
+		    val / cxld->interleave_ways) {
+		dev_dbg(dev,
+			"ways scaling factor %d mismatch with granularity %d\n",
+			val / cxld->interleave_ways,
+			p->interleave_granularity /
+				cxld->interleave_granularity);
+		rc = -EINVAL;
+		goto out;
+	}
+
 	save = p->interleave_ways;
 	p->interleave_ways = val;
 	rc = sysfs_update_group(&cxlr->dev.kobj, get_cxl_region_target_group());
@@ -386,7 +405,7 @@  static ssize_t interleave_granularity_store(struct device *dev,
 	struct cxl_decoder *cxld = &cxlrd->cxlsd.cxld;
 	struct cxl_region *cxlr = to_cxl_region(dev);
 	struct cxl_region_params *p = &cxlr->params;
-	int rc, val;
+	int rc, val, ways;
 	u16 ig;
 
 	rc = kstrtoint(buf, 0, &val);
@@ -403,9 +422,29 @@  static ssize_t interleave_granularity_store(struct device *dev,
 	 * granularity less than the root interleave result in needing
 	 * multiple endpoints to support a single slot in the
 	 * interleave.
+	 *
+	 * When the root interleave ways is 1 then the root granularity is a
+	 * don't care.
+	 *
+	 * Limit region granularity to cxld->interleave_granularity *
+	 * rounddown_pow_of_two(cxld->interleave_ways) otherwise holes result in
+	 * the decode at each endpoint. Note that rounddown_pow_of_two()
 +	 * accounts for x3, x6, and x9 root interleave.
 	 */
-	if (val < cxld->interleave_granularity)
-		return -EINVAL;
+	ways = rounddown_pow_of_two(cxld->interleave_ways);
+	if (ways > 1) {
+		if (val < cxld->interleave_granularity) {
+			dev_dbg(dev, "granularity %d must be >= %d\n", val,
+				cxld->interleave_granularity);
+			return -EINVAL;
+		}
+
+		if (val > cxld->interleave_granularity * ways) {
+			dev_dbg(dev, "granularity %d must be <= %d\n", val,
+				cxld->interleave_granularity * ways);
+			return -EINVAL;
+		}
+	}
 
 	rc = down_write_killable(&cxl_region_rwsem);
 	if (rc)
@@ -415,6 +454,24 @@  static ssize_t interleave_granularity_store(struct device *dev,
 		goto out;
 	}
 
+	/*
+	 * If region ways has been set and xHB interleave is active, validate
+	 * that ways is compatible with specified granularity.  Otherwise allow
+	 * granularity to be set now and depend on interleave_ways_store() to
+	 * validate this constraint.
+	 */
+	if (cxld->interleave_ways > 1 && p->interleave_ways &&
+	    val > cxld->interleave_granularity &&
+	    p->interleave_ways / cxld->interleave_ways !=
+		    val / cxld->interleave_granularity) {
+		dev_dbg(dev,
+			"granularity scaling factor %d mismatch with ways %d\n",
+			val / cxld->interleave_granularity,
+			p->interleave_ways / cxld->interleave_ways);
+		rc = -EINVAL;
+		goto out;
+	}
+
 	p->interleave_granularity = val;
 out:
 	up_write(&cxl_region_rwsem);