diff mbox series

[v2,4/4] drm/i915/icl: Add Multi-segmented gamma support

Message ID 20190430152108.31814-5-shashank.sharma@intel.com (mailing list archive)
State New, archived
Headers show
Series Enable Multi-segmented-gamma for ICL | expand

Commit Message

Sharma, Shashank April 30, 2019, 3:21 p.m. UTC
ICL introduces a new gamma correction mode in display engine, called
multi-segmented-gamma mode. This mode allows users to program the
darker region of the gamma curve with sueprfine precision. An
example use case for this is HDR curves (like PQ ST-2084).

If we plot a gamma correction curve from value range between 0.0 to 1.0,
ICL's multi-segment has 3 different sections:
- superfine segment: 9 values, ranges between 0 - 1/(128 * 256)
- fine segment: 257 values, ranges between 0 - 1/(128)
- corase segment: 257 values, ranges between 0 - 1

This patch:
- Changes gamma LUTs size for ICL/GEN11 to 262144 entries (8 * 128 * 256),
  so that userspace can program with highest precision supported.
- Changes default gamma mode (non-legacy) to multi-segmented-gamma mode.
- Adds functions to program/detect multi-segment gamma.

V2: Addressed review comments from Ville
    - separate function for superfine and fine segments.
    - remove enum for segments.
    - reuse last entry of the LUT as gc_max value.
    - replace if() ....cond with switch...case in icl_load_luts.
    - add an entry variable, instead of 'word'

Cc: Ville Syrjälä <ville.syrjala@linux.intel.com>
Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>

Suggested-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
Signed-off-by: Shashank Sharma <shashank.sharma@intel.com>
Signed-off-by: Uma Shankar <uma.shankar@intel.com>
---
 drivers/gpu/drm/i915/i915_pci.c    |   3 +-
 drivers/gpu/drm/i915/intel_color.c | 125 ++++++++++++++++++++++++++++-
 2 files changed, 123 insertions(+), 5 deletions(-)

Comments

Ville Syrjala May 3, 2019, 3:50 p.m. UTC | #1
On Tue, Apr 30, 2019 at 08:51:08PM +0530, Shashank Sharma wrote:
> ICL introduces a new gamma correction mode in display engine, called
> multi-segmented-gamma mode. This mode allows users to program the
> darker region of the gamma curve with sueprfine precision. An
> example use case for this is HDR curves (like PQ ST-2084).
> 
> If we plot a gamma correction curve from value range between 0.0 to 1.0,
> ICL's multi-segment has 3 different sections:
> - superfine segment: 9 values, ranges between 0 - 1/(128 * 256)
> - fine segment: 257 values, ranges between 0 - 1/(128)
> - corase segment: 257 values, ranges between 0 - 1
> 
> This patch:
> - Changes gamma LUTs size for ICL/GEN11 to 262144 entries (8 * 128 * 256),
>   so that userspace can program with highest precision supported.
> - Changes default gamma mode (non-legacy) to multi-segmented-gamma mode.
> - Adds functions to program/detect multi-segment gamma.
> 
> V2: Addressed review comments from Ville
>     - separate function for superfine and fine segments.
>     - remove enum for segments.
>     - reuse last entry of the LUT as gc_max value.
>     - replace if() ....cond with switch...case in icl_load_luts.
>     - add an entry variable, instead of 'word'
> 
> Cc: Ville Syrjälä <ville.syrjala@linux.intel.com>
> Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
> Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
> 
> Suggested-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
> Signed-off-by: Shashank Sharma <shashank.sharma@intel.com>
> Signed-off-by: Uma Shankar <uma.shankar@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_pci.c    |   3 +-
>  drivers/gpu/drm/i915/intel_color.c | 125 ++++++++++++++++++++++++++++-
>  2 files changed, 123 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_pci.c b/drivers/gpu/drm/i915/i915_pci.c
> index ffa2ee70a03d..83698951760b 100644
> --- a/drivers/gpu/drm/i915/i915_pci.c
> +++ b/drivers/gpu/drm/i915/i915_pci.c
> @@ -749,7 +749,8 @@ static const struct intel_device_info intel_cannonlake_info = {
>  	GEN(11), \
>  	.ddb_size = 2048, \
>  	.has_logical_ring_elsq = 1, \
> -	.color = { .degamma_lut_size = 33, .gamma_lut_size = 1024 }
> +	.color = { .degamma_lut_size = 33, .gamma_lut_size = 262144 }

Ugh. Thats one big LUT. But looks correct.

> +

Bogus newline.

>  
>  static const struct intel_device_info intel_icelake_11_info = {
>  	GEN11_FEATURES,
> diff --git a/drivers/gpu/drm/i915/intel_color.c b/drivers/gpu/drm/i915/intel_color.c
> index 6c341bea514c..49831e8d02fb 100644
> --- a/drivers/gpu/drm/i915/intel_color.c
> +++ b/drivers/gpu/drm/i915/intel_color.c
> @@ -41,6 +41,9 @@
>  #define CTM_COEFF_ABS(coeff)		((coeff) & (CTM_COEFF_SIGN - 1))
>  
>  #define LEGACY_LUT_LENGTH		256
> +#define ICL_GAMMA_MULTISEG_LUT_LENGTH		(256 * 128 * 8)
> +#define ICL_GAMMA_SUPERFINE_SEG_LENGTH	9
> +
>  /*
>   * Extract the CSC coefficient from a CTM coefficient (in U32.32 fixed point
>   * format). This macro takes the coefficient we want transformed and the
> @@ -767,6 +770,113 @@ static void glk_load_luts(const struct intel_crtc_state *crtc_state)
>  	}
>  }
>  
> +/* ilk+ "12.4" interpolated format (high 10 bits) */
> +static u32 ilk_lut_12p4_ldw(const struct drm_color_lut *color)
> +{
> +	return (color->red >> 6) << 20 | (color->green >> 6) << 10 |
> +		(color->blue >> 6);
> +}
> +
> +/* ilk+ "12.4" interpolated format (low 6 bits) */
> +static u32 ilk_lut_12p4_udw(const struct drm_color_lut *color)
> +{
> +	return (color->red & 0x3f) << 24 | (color->green & 0x3f) << 14 |
> +		(color->blue & 0x3f);
> +}
> +
> +static void
> +icl_load_gcmax(const struct intel_crtc_state *crtc_state,
> +			const struct drm_color_lut *entry)

Indentation looks off. Also s/entry/color/ to match the other similarish
funcs maybe?

> +{
> +	struct intel_crtc *crtc = to_intel_crtc(crtc_state->base.crtc);
> +	struct drm_i915_private *dev_priv = to_i915(crtc->base.dev);
> +	enum pipe pipe = crtc->pipe;
> +
> +	/* Fixme: LUT entries are 16 bit only, so we can prog 0xFFFF max */
> +	I915_WRITE(PREC_PAL_GC_MAX(pipe, 0), entry->red);
> +	I915_WRITE(PREC_PAL_GC_MAX(pipe, 1), entry->green);
> +	I915_WRITE(PREC_PAL_GC_MAX(pipe, 2), entry->blue);
> +}
> +
> +static void
> +icl_program_gamma_superfine_segment(const struct intel_crtc_state *crtc_state)
> +{
> +	struct intel_crtc *crtc = to_intel_crtc(crtc_state->base.crtc);
> +	struct drm_i915_private *dev_priv = to_i915(crtc->base.dev);
> +	const struct drm_property_blob *blob = crtc_state->base.gamma_lut;
> +	const struct drm_color_lut *lut = blob->data;
> +	enum pipe pipe = crtc->pipe;
> +	u32 i;
> +
> +	if (!lut || drm_color_lut_size(blob) < ICL_GAMMA_SUPERFINE_SEG_LENGTH)
> +		return;

These checks aren't needed. Just dead code.

> +
> +	/*
> +	 * Every entry in the multi-segment LUT is corresponding to a superfine
> +	 * segment step which is 1/(8 * 128 * 256).
> +	 *
> +	 * Superfine segment has 9 entries, corresponding to values
> +	 * 0, 1/(8 * 128 * 256), 2/(8 * 128 * 256) .... 8/(8 * 128 * 256).
> +	 */
> +	I915_WRITE(PREC_PAL_MULTI_SEG_INDEX(pipe), PAL_PREC_AUTO_INCREMENT);
> +
> +	for (i = 0; i < 9; i++) {
> +		const struct drm_color_lut *entry = &lut[i];
> +
> +		I915_WRITE(PREC_PAL_MULTI_SEG_DATA(pipe),
> +			   ilk_lut_12p4_udw(entry));

ldw should come before udw.

> +		I915_WRITE(PREC_PAL_MULTI_SEG_DATA(pipe),
> +			   ilk_lut_12p4_ldw(entry));
> +	}
> +}
> +
> +static void
> +icl_program_gamma_multi_segment(const struct intel_crtc_state *crtc_state)
> +{
> +	struct intel_crtc *crtc = to_intel_crtc(crtc_state->base.crtc);
> +	struct drm_i915_private *dev_priv = to_i915(crtc->base.dev);
> +	const struct drm_property_blob *blob = crtc_state->base.gamma_lut;
> +	const struct drm_color_lut *lut = blob->data;
> +	const struct drm_color_lut *entry;

'entry' declaration can be moved into the loops.

> +	enum pipe pipe = crtc->pipe;
> +	u32 i;
> +
> +	if (!lut || (drm_color_lut_size(blob) < ICL_GAMMA_MULTISEG_LUT_LENGTH))
> +		return;

More checks that aren't needed.

> +
> +	/*
> +	 * Every entry in the multi-segment LUT is corresponding to a superfine
> +	 * segment step which is 1/(8*128*256).
> +	 *
> +	 * Fine segment's step is 1/(128 * 256) ie 1/(128 * 256),  2/(128*256)
> +	 * ... 256/(128*256). So in order to program fine segment of LUT we
> +	 * need to pick every 8'th entry in LUT, and program 256 indexes.
> +	 * Fine segment's index 0 is programmed in HW, and it starts from
> +	 * index 1.

The wording here is a bit confusing. I guess the problem is what to call
things. PAL_PREC_INDEX[0/1] is what we program, but that maps to the point
seg2[1] with seg2[0] being unused by the hw. Well, the spec says it's
implicit but IIRC I was told long ago that it's not actually used.

Not sure how to word that in the best way. Maybe something like?

/*
 * Fine segment (seg2) ...
 *
 * PAL_PREC_INDEX[0] and PAL_PREC_INDEX[1] map to seg2[1],
 * with seg2[0] being unused by the hardware.
 */

Not sure that's any clearer.

> +	 */
> +	I915_WRITE(PREC_PAL_INDEX(pipe), PAL_PREC_AUTO_INCREMENT);
> +	for (i = 1; i < 257; i++) {
> +		entry = &lut[i * 8];
> +		I915_WRITE(PREC_PAL_DATA(pipe), ilk_lut_12p4_udw(entry));
> +		I915_WRITE(PREC_PAL_DATA(pipe), ilk_lut_12p4_ldw(entry));
> +	}
> +
> +	/*
> +	 * Coarse segment's starts from index 0 and it's step is 1/256 ie 0,
> +	 * 1/256, 2/256 ...256/256. As per the description of each entry in LUT
> +	 * above, we need to pick every 8 * 128 = 1024th entry in LUT, and
> +	 * program 256 of those.
> +	 */

Could make a note here stating that seg3[0] and seg3[1] are also unused
by the hardware, even though we have to program them to advance the
index. I don't see it mentioned in the spec, but this one I definitely
remember confirming from Art way back when. However I never verified
that on actual hw. We could also consider just programming those two
entries to 0 and start the actual coarse segment programming from index 2.
Or we could skip them by reprogramming the index directly.

> +	for (i = 0; i < 256; i++) {
> +		entry = &lut[i * 1024];

s/1024/8 * 128/ maybe?

> +		I915_WRITE(PREC_PAL_DATA(pipe), ilk_lut_12p4_udw(entry));
> +		I915_WRITE(PREC_PAL_DATA(pipe), ilk_lut_12p4_ldw(entry));
> +	}
> +
> +	icl_load_gcmax(crtc_state, entry);
> +	ivb_load_lut_ext_max(crtc);
> +}
> +
>  static void icl_load_luts(const struct intel_crtc_state *crtc_state)
>  {
>  	const struct drm_property_blob *gamma_lut = crtc_state->base.gamma_lut;
> @@ -775,10 +885,17 @@ static void icl_load_luts(const struct intel_crtc_state *crtc_state)
>  	if (crtc_state->base.degamma_lut)
>  		glk_load_degamma_lut(crtc_state);
>  
> -	if ((crtc_state->gamma_mode & GAMMA_MODE_MODE_MASK) ==
> -	    GAMMA_MODE_MODE_8BIT) {
> +	switch (crtc_state->gamma_mode & GAMMA_MODE_MODE_MASK) {
> +	case GAMMA_MODE_MODE_8BIT:
>  		i9xx_load_luts(crtc_state);
> -	} else {
> +		break;
> +
> +	case GAMMA_MODE_MODE_12BIT_MULTI_SEGMENTED:
> +		icl_program_gamma_superfine_segment(crtc_state);
> +		icl_program_gamma_multi_segment(crtc_state);
> +		break;
> +
> +	default:
>  		bdw_load_lut_10(crtc, gamma_lut, PAL_PREC_INDEX_VALUE(0));
>  		ivb_load_lut_ext_max(crtc);
>  	}
> @@ -1209,7 +1326,7 @@ static u32 icl_gamma_mode(const struct intel_crtc_state *crtc_state)
>  	    crtc_state_is_legacy_gamma(crtc_state))
>  		gamma_mode |= GAMMA_MODE_MODE_8BIT;
>  	else
> -		gamma_mode |= GAMMA_MODE_MODE_10BIT;
> +		gamma_mode |= GAMMA_MODE_MODE_12BIT_MULTI_SEGMENTED;
>  
>  	return gamma_mode;
>  }
> -- 
> 2.17.1
Sharma, Shashank May 6, 2019, 10:39 a.m. UTC | #2
Regards

Shashank

On 5/3/2019 9:20 PM, Ville Syrjälä wrote:
> On Tue, Apr 30, 2019 at 08:51:08PM +0530, Shashank Sharma wrote:
>> ICL introduces a new gamma correction mode in display engine, called
>> multi-segmented-gamma mode. This mode allows users to program the
>> darker region of the gamma curve with sueprfine precision. An
>> example use case for this is HDR curves (like PQ ST-2084).
>>
>> If we plot a gamma correction curve from value range between 0.0 to 1.0,
>> ICL's multi-segment has 3 different sections:
>> - superfine segment: 9 values, ranges between 0 - 1/(128 * 256)
>> - fine segment: 257 values, ranges between 0 - 1/(128)
>> - corase segment: 257 values, ranges between 0 - 1
>>
>> This patch:
>> - Changes gamma LUTs size for ICL/GEN11 to 262144 entries (8 * 128 * 256),
>>    so that userspace can program with highest precision supported.
>> - Changes default gamma mode (non-legacy) to multi-segmented-gamma mode.
>> - Adds functions to program/detect multi-segment gamma.
>>
>> V2: Addressed review comments from Ville
>>      - separate function for superfine and fine segments.
>>      - remove enum for segments.
>>      - reuse last entry of the LUT as gc_max value.
>>      - replace if() ....cond with switch...case in icl_load_luts.
>>      - add an entry variable, instead of 'word'
>>
>> Cc: Ville Syrjälä <ville.syrjala@linux.intel.com>
>> Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
>> Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
>>
>> Suggested-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
>> Signed-off-by: Shashank Sharma <shashank.sharma@intel.com>
>> Signed-off-by: Uma Shankar <uma.shankar@intel.com>
>> ---
>>   drivers/gpu/drm/i915/i915_pci.c    |   3 +-
>>   drivers/gpu/drm/i915/intel_color.c | 125 ++++++++++++++++++++++++++++-
>>   2 files changed, 123 insertions(+), 5 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/i915_pci.c b/drivers/gpu/drm/i915/i915_pci.c
>> index ffa2ee70a03d..83698951760b 100644
>> --- a/drivers/gpu/drm/i915/i915_pci.c
>> +++ b/drivers/gpu/drm/i915/i915_pci.c
>> @@ -749,7 +749,8 @@ static const struct intel_device_info intel_cannonlake_info = {
>>   	GEN(11), \
>>   	.ddb_size = 2048, \
>>   	.has_logical_ring_elsq = 1, \
>> -	.color = { .degamma_lut_size = 33, .gamma_lut_size = 1024 }
>> +	.color = { .degamma_lut_size = 33, .gamma_lut_size = 262144 }
> Ugh. Thats one big LUT. But looks correct.
>
>> +
> Bogus newline.
Got it.
>>   
>>   static const struct intel_device_info intel_icelake_11_info = {
>>   	GEN11_FEATURES,
>> diff --git a/drivers/gpu/drm/i915/intel_color.c b/drivers/gpu/drm/i915/intel_color.c
>> index 6c341bea514c..49831e8d02fb 100644
>> --- a/drivers/gpu/drm/i915/intel_color.c
>> +++ b/drivers/gpu/drm/i915/intel_color.c
>> @@ -41,6 +41,9 @@
>>   #define CTM_COEFF_ABS(coeff)		((coeff) & (CTM_COEFF_SIGN - 1))
>>   
>>   #define LEGACY_LUT_LENGTH		256
>> +#define ICL_GAMMA_MULTISEG_LUT_LENGTH		(256 * 128 * 8)
>> +#define ICL_GAMMA_SUPERFINE_SEG_LENGTH	9
>> +
>>   /*
>>    * Extract the CSC coefficient from a CTM coefficient (in U32.32 fixed point
>>    * format). This macro takes the coefficient we want transformed and the
>> @@ -767,6 +770,113 @@ static void glk_load_luts(const struct intel_crtc_state *crtc_state)
>>   	}
>>   }
>>   
>> +/* ilk+ "12.4" interpolated format (high 10 bits) */
>> +static u32 ilk_lut_12p4_ldw(const struct drm_color_lut *color)
>> +{
>> +	return (color->red >> 6) << 20 | (color->green >> 6) << 10 |
>> +		(color->blue >> 6);
>> +}
>> +
>> +/* ilk+ "12.4" interpolated format (low 6 bits) */
>> +static u32 ilk_lut_12p4_udw(const struct drm_color_lut *color)
>> +{
>> +	return (color->red & 0x3f) << 24 | (color->green & 0x3f) << 14 |
>> +		(color->blue & 0x3f);
>> +}
>> +
>> +static void
>> +icl_load_gcmax(const struct intel_crtc_state *crtc_state,
>> +			const struct drm_color_lut *entry)
> Indentation looks off. Also s/entry/color/ to match the other similarish
> funcs maybe?
Sure.
>> +{
>> +	struct intel_crtc *crtc = to_intel_crtc(crtc_state->base.crtc);
>> +	struct drm_i915_private *dev_priv = to_i915(crtc->base.dev);
>> +	enum pipe pipe = crtc->pipe;
>> +
>> +	/* Fixme: LUT entries are 16 bit only, so we can prog 0xFFFF max */
>> +	I915_WRITE(PREC_PAL_GC_MAX(pipe, 0), entry->red);
>> +	I915_WRITE(PREC_PAL_GC_MAX(pipe, 1), entry->green);
>> +	I915_WRITE(PREC_PAL_GC_MAX(pipe, 2), entry->blue);
>> +}
>> +
>> +static void
>> +icl_program_gamma_superfine_segment(const struct intel_crtc_state *crtc_state)
>> +{
>> +	struct intel_crtc *crtc = to_intel_crtc(crtc_state->base.crtc);
>> +	struct drm_i915_private *dev_priv = to_i915(crtc->base.dev);
>> +	const struct drm_property_blob *blob = crtc_state->base.gamma_lut;
>> +	const struct drm_color_lut *lut = blob->data;
>> +	enum pipe pipe = crtc->pipe;
>> +	u32 i;
>> +
>> +	if (!lut || drm_color_lut_size(blob) < ICL_GAMMA_SUPERFINE_SEG_LENGTH)
>> +		return;
> These checks aren't needed. Just dead code.
Will remove this and similars.
>> +
>> +	/*
>> +	 * Every entry in the multi-segment LUT is corresponding to a superfine
>> +	 * segment step which is 1/(8 * 128 * 256).
>> +	 *
>> +	 * Superfine segment has 9 entries, corresponding to values
>> +	 * 0, 1/(8 * 128 * 256), 2/(8 * 128 * 256) .... 8/(8 * 128 * 256).
>> +	 */
>> +	I915_WRITE(PREC_PAL_MULTI_SEG_INDEX(pipe), PAL_PREC_AUTO_INCREMENT);
>> +
>> +	for (i = 0; i < 9; i++) {
>> +		const struct drm_color_lut *entry = &lut[i];
>> +
>> +		I915_WRITE(PREC_PAL_MULTI_SEG_DATA(pipe),
>> +			   ilk_lut_12p4_udw(entry));
> ldw should come before udw.
Got it.
>> +		I915_WRITE(PREC_PAL_MULTI_SEG_DATA(pipe),
>> +			   ilk_lut_12p4_ldw(entry));
>> +	}
>> +}
>> +
>> +static void
>> +icl_program_gamma_multi_segment(const struct intel_crtc_state *crtc_state)
>> +{
>> +	struct intel_crtc *crtc = to_intel_crtc(crtc_state->base.crtc);
>> +	struct drm_i915_private *dev_priv = to_i915(crtc->base.dev);
>> +	const struct drm_property_blob *blob = crtc_state->base.gamma_lut;
>> +	const struct drm_color_lut *lut = blob->data;
>> +	const struct drm_color_lut *entry;
> 'entry' declaration can be moved into the loops.
Its being used in multiple loops, also being used for GCMax outside the 
loop.
>> +	enum pipe pipe = crtc->pipe;
>> +	u32 i;
>> +
>> +	if (!lut || (drm_color_lut_size(blob) < ICL_GAMMA_MULTISEG_LUT_LENGTH))
>> +		return;
> More checks that aren't needed.
Got it.
>> +
>> +	/*
>> +	 * Every entry in the multi-segment LUT is corresponding to a superfine
>> +	 * segment step which is 1/(8*128*256).
>> +	 *
>> +	 * Fine segment's step is 1/(128 * 256) ie 1/(128 * 256),  2/(128*256)
>> +	 * ... 256/(128*256). So in order to program fine segment of LUT we
>> +	 * need to pick every 8'th entry in LUT, and program 256 indexes.
>> +	 * Fine segment's index 0 is programmed in HW, and it starts from
>> +	 * index 1.
> The wording here is a bit confusing. I guess the problem is what to call
> things. PAL_PREC_INDEX[0/1] is what we program, but that maps to the point
> seg2[1] with seg2[0] being unused by the hw. Well, the spec says it's
> implicit but IIRC I was told long ago that it's not actually used.
>
> Not sure how to word that in the best way. Maybe something like?
>
> /*
>   * Fine segment (seg2) ...
>   *
>   * PAL_PREC_INDEX[0] and PAL_PREC_INDEX[1] map to seg2[1],
>   * with seg2[0] being unused by the hardware.
>   */
>
> Not sure that's any clearer.
Ok, will try to come up with something in similar lines.
>> +	 */
>> +	I915_WRITE(PREC_PAL_INDEX(pipe), PAL_PREC_AUTO_INCREMENT);
>> +	for (i = 1; i < 257; i++) {
>> +		entry = &lut[i * 8];
>> +		I915_WRITE(PREC_PAL_DATA(pipe), ilk_lut_12p4_udw(entry));
>> +		I915_WRITE(PREC_PAL_DATA(pipe), ilk_lut_12p4_ldw(entry));
>> +	}
>> +
>> +	/*
>> +	 * Coarse segment's starts from index 0 and it's step is 1/256 ie 0,
>> +	 * 1/256, 2/256 ...256/256. As per the description of each entry in LUT
>> +	 * above, we need to pick every 8 * 128 = 1024th entry in LUT, and
>> +	 * program 256 of those.
>> +	 */
> Could make a note here stating that seg3[0] and seg3[1] are also unused
> by the hardware, even though we have to program them to advance the
> index. I don't see it mentioned in the spec, but this one I definitely
> remember confirming from Art way back when. However I never verified
> that on actual hw. We could also consider just programming those two
> entries to 0 and start the actual coarse segment programming from index 2.
> Or we could skip them by reprogramming the index directly.
If they are not being used, does it matter what and if we program into 
them ? We can add a note though, mentioning this.
>> +	for (i = 0; i < 256; i++) {
>> +		entry = &lut[i * 1024];
> s/1024/8 * 128/ maybe?

Sure.

Regards

Shashank

>
>> +		I915_WRITE(PREC_PAL_DATA(pipe), ilk_lut_12p4_udw(entry));
>> +		I915_WRITE(PREC_PAL_DATA(pipe), ilk_lut_12p4_ldw(entry));
>> +	}
>> +
>> +	icl_load_gcmax(crtc_state, entry);
>> +	ivb_load_lut_ext_max(crtc);
>> +}
>> +
>>   static void icl_load_luts(const struct intel_crtc_state *crtc_state)
>>   {
>>   	const struct drm_property_blob *gamma_lut = crtc_state->base.gamma_lut;
>> @@ -775,10 +885,17 @@ static void icl_load_luts(const struct intel_crtc_state *crtc_state)
>>   	if (crtc_state->base.degamma_lut)
>>   		glk_load_degamma_lut(crtc_state);
>>   
>> -	if ((crtc_state->gamma_mode & GAMMA_MODE_MODE_MASK) ==
>> -	    GAMMA_MODE_MODE_8BIT) {
>> +	switch (crtc_state->gamma_mode & GAMMA_MODE_MODE_MASK) {
>> +	case GAMMA_MODE_MODE_8BIT:
>>   		i9xx_load_luts(crtc_state);
>> -	} else {
>> +		break;
>> +
>> +	case GAMMA_MODE_MODE_12BIT_MULTI_SEGMENTED:
>> +		icl_program_gamma_superfine_segment(crtc_state);
>> +		icl_program_gamma_multi_segment(crtc_state);
>> +		break;
>> +
>> +	default:
>>   		bdw_load_lut_10(crtc, gamma_lut, PAL_PREC_INDEX_VALUE(0));
>>   		ivb_load_lut_ext_max(crtc);
>>   	}
>> @@ -1209,7 +1326,7 @@ static u32 icl_gamma_mode(const struct intel_crtc_state *crtc_state)
>>   	    crtc_state_is_legacy_gamma(crtc_state))
>>   		gamma_mode |= GAMMA_MODE_MODE_8BIT;
>>   	else
>> -		gamma_mode |= GAMMA_MODE_MODE_10BIT;
>> +		gamma_mode |= GAMMA_MODE_MODE_12BIT_MULTI_SEGMENTED;
>>   
>>   	return gamma_mode;
>>   }
>> -- 
>> 2.17.1
Ville Syrjala May 6, 2019, 12:25 p.m. UTC | #3
On Mon, May 06, 2019 at 04:09:33PM +0530, Sharma, Shashank wrote:
> Regards
> 
> Shashank
> 
> On 5/3/2019 9:20 PM, Ville Syrjälä wrote:
> > On Tue, Apr 30, 2019 at 08:51:08PM +0530, Shashank Sharma wrote:
> >> ICL introduces a new gamma correction mode in display engine, called
> >> multi-segmented-gamma mode. This mode allows users to program the
> >> darker region of the gamma curve with sueprfine precision. An
> >> example use case for this is HDR curves (like PQ ST-2084).
> >>
> >> If we plot a gamma correction curve from value range between 0.0 to 1.0,
> >> ICL's multi-segment has 3 different sections:
> >> - superfine segment: 9 values, ranges between 0 - 1/(128 * 256)
> >> - fine segment: 257 values, ranges between 0 - 1/(128)
> >> - corase segment: 257 values, ranges between 0 - 1
> >>
> >> This patch:
> >> - Changes gamma LUTs size for ICL/GEN11 to 262144 entries (8 * 128 * 256),
> >>    so that userspace can program with highest precision supported.
> >> - Changes default gamma mode (non-legacy) to multi-segmented-gamma mode.
> >> - Adds functions to program/detect multi-segment gamma.
> >>
> >> V2: Addressed review comments from Ville
> >>      - separate function for superfine and fine segments.
> >>      - remove enum for segments.
> >>      - reuse last entry of the LUT as gc_max value.
> >>      - replace if() ....cond with switch...case in icl_load_luts.
> >>      - add an entry variable, instead of 'word'
> >>
> >> Cc: Ville Syrjälä <ville.syrjala@linux.intel.com>
> >> Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
> >> Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
> >>
> >> Suggested-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
> >> Signed-off-by: Shashank Sharma <shashank.sharma@intel.com>
> >> Signed-off-by: Uma Shankar <uma.shankar@intel.com>
> >> ---
> >>   drivers/gpu/drm/i915/i915_pci.c    |   3 +-
> >>   drivers/gpu/drm/i915/intel_color.c | 125 ++++++++++++++++++++++++++++-
> >>   2 files changed, 123 insertions(+), 5 deletions(-)
> >>
> >> diff --git a/drivers/gpu/drm/i915/i915_pci.c b/drivers/gpu/drm/i915/i915_pci.c
> >> index ffa2ee70a03d..83698951760b 100644
> >> --- a/drivers/gpu/drm/i915/i915_pci.c
> >> +++ b/drivers/gpu/drm/i915/i915_pci.c
> >> @@ -749,7 +749,8 @@ static const struct intel_device_info intel_cannonlake_info = {
> >>   	GEN(11), \
> >>   	.ddb_size = 2048, \
> >>   	.has_logical_ring_elsq = 1, \
> >> -	.color = { .degamma_lut_size = 33, .gamma_lut_size = 1024 }
> >> +	.color = { .degamma_lut_size = 33, .gamma_lut_size = 262144 }
> > Ugh. Thats one big LUT. But looks correct.
> >
> >> +
> > Bogus newline.
> Got it.
> >>   
> >>   static const struct intel_device_info intel_icelake_11_info = {
> >>   	GEN11_FEATURES,
> >> diff --git a/drivers/gpu/drm/i915/intel_color.c b/drivers/gpu/drm/i915/intel_color.c
> >> index 6c341bea514c..49831e8d02fb 100644
> >> --- a/drivers/gpu/drm/i915/intel_color.c
> >> +++ b/drivers/gpu/drm/i915/intel_color.c
> >> @@ -41,6 +41,9 @@
> >>   #define CTM_COEFF_ABS(coeff)		((coeff) & (CTM_COEFF_SIGN - 1))
> >>   
> >>   #define LEGACY_LUT_LENGTH		256
> >> +#define ICL_GAMMA_MULTISEG_LUT_LENGTH		(256 * 128 * 8)
> >> +#define ICL_GAMMA_SUPERFINE_SEG_LENGTH	9
> >> +
> >>   /*
> >>    * Extract the CSC coefficient from a CTM coefficient (in U32.32 fixed point
> >>    * format). This macro takes the coefficient we want transformed and the
> >> @@ -767,6 +770,113 @@ static void glk_load_luts(const struct intel_crtc_state *crtc_state)
> >>   	}
> >>   }
> >>   
> >> +/* ilk+ "12.4" interpolated format (high 10 bits) */
> >> +static u32 ilk_lut_12p4_ldw(const struct drm_color_lut *color)
> >> +{
> >> +	return (color->red >> 6) << 20 | (color->green >> 6) << 10 |
> >> +		(color->blue >> 6);
> >> +}
> >> +
> >> +/* ilk+ "12.4" interpolated format (low 6 bits) */
> >> +static u32 ilk_lut_12p4_udw(const struct drm_color_lut *color)
> >> +{
> >> +	return (color->red & 0x3f) << 24 | (color->green & 0x3f) << 14 |
> >> +		(color->blue & 0x3f);
> >> +}
> >> +
> >> +static void
> >> +icl_load_gcmax(const struct intel_crtc_state *crtc_state,
> >> +			const struct drm_color_lut *entry)
> > Indentation looks off. Also s/entry/color/ to match the other similarish
> > funcs maybe?
> Sure.
> >> +{
> >> +	struct intel_crtc *crtc = to_intel_crtc(crtc_state->base.crtc);
> >> +	struct drm_i915_private *dev_priv = to_i915(crtc->base.dev);
> >> +	enum pipe pipe = crtc->pipe;
> >> +
> >> +	/* Fixme: LUT entries are 16 bit only, so we can prog 0xFFFF max */
> >> +	I915_WRITE(PREC_PAL_GC_MAX(pipe, 0), entry->red);
> >> +	I915_WRITE(PREC_PAL_GC_MAX(pipe, 1), entry->green);
> >> +	I915_WRITE(PREC_PAL_GC_MAX(pipe, 2), entry->blue);
> >> +}
> >> +
> >> +static void
> >> +icl_program_gamma_superfine_segment(const struct intel_crtc_state *crtc_state)
> >> +{
> >> +	struct intel_crtc *crtc = to_intel_crtc(crtc_state->base.crtc);
> >> +	struct drm_i915_private *dev_priv = to_i915(crtc->base.dev);
> >> +	const struct drm_property_blob *blob = crtc_state->base.gamma_lut;
> >> +	const struct drm_color_lut *lut = blob->data;
> >> +	enum pipe pipe = crtc->pipe;
> >> +	u32 i;
> >> +
> >> +	if (!lut || drm_color_lut_size(blob) < ICL_GAMMA_SUPERFINE_SEG_LENGTH)
> >> +		return;
> > These checks aren't needed. Just dead code.
> Will remove this and similars.
> >> +
> >> +	/*
> >> +	 * Every entry in the multi-segment LUT is corresponding to a superfine
> >> +	 * segment step which is 1/(8 * 128 * 256).
> >> +	 *
> >> +	 * Superfine segment has 9 entries, corresponding to values
> >> +	 * 0, 1/(8 * 128 * 256), 2/(8 * 128 * 256) .... 8/(8 * 128 * 256).
> >> +	 */
> >> +	I915_WRITE(PREC_PAL_MULTI_SEG_INDEX(pipe), PAL_PREC_AUTO_INCREMENT);
> >> +
> >> +	for (i = 0; i < 9; i++) {
> >> +		const struct drm_color_lut *entry = &lut[i];
> >> +
> >> +		I915_WRITE(PREC_PAL_MULTI_SEG_DATA(pipe),
> >> +			   ilk_lut_12p4_udw(entry));
> > ldw should come before udw.
> Got it.
> >> +		I915_WRITE(PREC_PAL_MULTI_SEG_DATA(pipe),
> >> +			   ilk_lut_12p4_ldw(entry));
> >> +	}
> >> +}
> >> +
> >> +static void
> >> +icl_program_gamma_multi_segment(const struct intel_crtc_state *crtc_state)
> >> +{
> >> +	struct intel_crtc *crtc = to_intel_crtc(crtc_state->base.crtc);
> >> +	struct drm_i915_private *dev_priv = to_i915(crtc->base.dev);
> >> +	const struct drm_property_blob *blob = crtc_state->base.gamma_lut;
> >> +	const struct drm_color_lut *lut = blob->data;
> >> +	const struct drm_color_lut *entry;
> > 'entry' declaration can be moved into the loops.
> Its being used in multiple loops,

IMO still better to move into the loops. We don't want to pass any
information between the loops.

> also being used for GCMax outside the 
> loop.

Hmm. That might be an arguemnt for keeping it out. But the current
gcmax usage looks broken. You're programming the same value into
the last PAL_PREC index and GCMAX.

> >> +	enum pipe pipe = crtc->pipe;
> >> +	u32 i;
> >> +
> >> +	if (!lut || (drm_color_lut_size(blob) < ICL_GAMMA_MULTISEG_LUT_LENGTH))
> >> +		return;
> > More checks that aren't needed.
> Got it.
> >> +
> >> +	/*
> >> +	 * Every entry in the multi-segment LUT is corresponding to a superfine
> >> +	 * segment step which is 1/(8*128*256).
> >> +	 *
> >> +	 * Fine segment's step is 1/(128 * 256) ie 1/(128 * 256),  2/(128*256)
> >> +	 * ... 256/(128*256). So in order to program fine segment of LUT we
> >> +	 * need to pick every 8'th entry in LUT, and program 256 indexes.
> >> +	 * Fine segment's index 0 is programmed in HW, and it starts from
> >> +	 * index 1.
> > The wording here is a bit confusing. I guess the problem is what to call
> > things. PAL_PREC_INDEX[0/1] is what we program, but that maps to the point
> > seg2[1] with seg2[0] being unused by the hw. Well, the spec says it's
> > implicit but IIRC I was told long ago that it's not actually used.
> >
> > Not sure how to word that in the best way. Maybe something like?
> >
> > /*
> >   * Fine segment (seg2) ...
> >   *
> >   * PAL_PREC_INDEX[0] and PAL_PREC_INDEX[1] map to seg2[1],
> >   * with seg2[0] being unused by the hardware.
> >   */
> >
> > Not sure that's any clearer.
> Ok, will try to come up with something in similar lines.
> >> +	 */
> >> +	I915_WRITE(PREC_PAL_INDEX(pipe), PAL_PREC_AUTO_INCREMENT);
> >> +	for (i = 1; i < 257; i++) {
> >> +		entry = &lut[i * 8];
> >> +		I915_WRITE(PREC_PAL_DATA(pipe), ilk_lut_12p4_udw(entry));
> >> +		I915_WRITE(PREC_PAL_DATA(pipe), ilk_lut_12p4_ldw(entry));
> >> +	}
> >> +
> >> +	/*
> >> +	 * Coarse segment's starts from index 0 and it's step is 1/256 ie 0,
> >> +	 * 1/256, 2/256 ...256/256. As per the description of each entry in LUT
> >> +	 * above, we need to pick every 8 * 128 = 1024th entry in LUT, and
> >> +	 * program 256 of those.
> >> +	 */
> > Could make a note here stating that seg3[0] and seg3[1] are also unused
> > by the hardware, even though we have to program them to advance the
> > index. I don't see it mentioned in the spec, but this one I definitely
> > remember confirming from Art way back when. However I never verified
> > that on actual hw. We could also consider just programming those two
> > entries to 0 and start the actual coarse segment programming from index 2.
> > Or we could skip them by reprogramming the index directly.
> If they are not being used, does it matter what and if we program into 
> them ? We can add a note though, mentioning this.

It shouldn't matter what we programing into them. But as mentioned I
never actually confirmed this on actual hardware. Would be nice to
double check that so we don't end up with incorrect comment.

> >> +	for (i = 0; i < 256; i++) {
> >> +		entry = &lut[i * 1024];
> > s/1024/8 * 128/ maybe?
> 
> Sure.
> 
> Regards
> 
> Shashank
> 
> >
> >> +		I915_WRITE(PREC_PAL_DATA(pipe), ilk_lut_12p4_udw(entry));
> >> +		I915_WRITE(PREC_PAL_DATA(pipe), ilk_lut_12p4_ldw(entry));
> >> +	}
> >> +
> >> +	icl_load_gcmax(crtc_state, entry);
> >> +	ivb_load_lut_ext_max(crtc);
> >> +}
> >> +
> >>   static void icl_load_luts(const struct intel_crtc_state *crtc_state)
> >>   {
> >>   	const struct drm_property_blob *gamma_lut = crtc_state->base.gamma_lut;
> >> @@ -775,10 +885,17 @@ static void icl_load_luts(const struct intel_crtc_state *crtc_state)
> >>   	if (crtc_state->base.degamma_lut)
> >>   		glk_load_degamma_lut(crtc_state);
> >>   
> >> -	if ((crtc_state->gamma_mode & GAMMA_MODE_MODE_MASK) ==
> >> -	    GAMMA_MODE_MODE_8BIT) {
> >> +	switch (crtc_state->gamma_mode & GAMMA_MODE_MODE_MASK) {
> >> +	case GAMMA_MODE_MODE_8BIT:
> >>   		i9xx_load_luts(crtc_state);
> >> -	} else {
> >> +		break;
> >> +
> >> +	case GAMMA_MODE_MODE_12BIT_MULTI_SEGMENTED:
> >> +		icl_program_gamma_superfine_segment(crtc_state);
> >> +		icl_program_gamma_multi_segment(crtc_state);
> >> +		break;
> >> +
> >> +	default:
> >>   		bdw_load_lut_10(crtc, gamma_lut, PAL_PREC_INDEX_VALUE(0));
> >>   		ivb_load_lut_ext_max(crtc);
> >>   	}
> >> @@ -1209,7 +1326,7 @@ static u32 icl_gamma_mode(const struct intel_crtc_state *crtc_state)
> >>   	    crtc_state_is_legacy_gamma(crtc_state))
> >>   		gamma_mode |= GAMMA_MODE_MODE_8BIT;
> >>   	else
> >> -		gamma_mode |= GAMMA_MODE_MODE_10BIT;
> >> +		gamma_mode |= GAMMA_MODE_MODE_12BIT_MULTI_SEGMENTED;
> >>   
> >>   	return gamma_mode;
> >>   }
> >> -- 
> >> 2.17.1
Sharma, Shashank May 6, 2019, 12:55 p.m. UTC | #4
On 5/6/2019 5:55 PM, Ville Syrjälä wrote:
> On Mon, May 06, 2019 at 04:09:33PM +0530, Sharma, Shashank wrote:
>> Regards
>>
>> Shashank
>>
>> On 5/3/2019 9:20 PM, Ville Syrjälä wrote:
>>> On Tue, Apr 30, 2019 at 08:51:08PM +0530, Shashank Sharma wrote:
>>>> ICL introduces a new gamma correction mode in display engine, called
>>>> multi-segmented-gamma mode. This mode allows users to program the
>>>> darker region of the gamma curve with sueprfine precision. An
>>>> example use case for this is HDR curves (like PQ ST-2084).
>>>>
>>>> If we plot a gamma correction curve from value range between 0.0 to 1.0,
>>>> ICL's multi-segment has 3 different sections:
>>>> - superfine segment: 9 values, ranges between 0 - 1/(128 * 256)
>>>> - fine segment: 257 values, ranges between 0 - 1/(128)
>>>> - corase segment: 257 values, ranges between 0 - 1
>>>>
>>>> This patch:
>>>> - Changes gamma LUTs size for ICL/GEN11 to 262144 entries (8 * 128 * 256),
>>>>     so that userspace can program with highest precision supported.
>>>> - Changes default gamma mode (non-legacy) to multi-segmented-gamma mode.
>>>> - Adds functions to program/detect multi-segment gamma.
>>>>
>>>> V2: Addressed review comments from Ville
>>>>       - separate function for superfine and fine segments.
>>>>       - remove enum for segments.
>>>>       - reuse last entry of the LUT as gc_max value.
>>>>       - replace if() ....cond with switch...case in icl_load_luts.
>>>>       - add an entry variable, instead of 'word'
>>>>
>>>> Cc: Ville Syrjälä <ville.syrjala@linux.intel.com>
>>>> Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
>>>> Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
>>>>
>>>> Suggested-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
>>>> Signed-off-by: Shashank Sharma <shashank.sharma@intel.com>
>>>> Signed-off-by: Uma Shankar <uma.shankar@intel.com>
>>>> ---
>>>>    drivers/gpu/drm/i915/i915_pci.c    |   3 +-
>>>>    drivers/gpu/drm/i915/intel_color.c | 125 ++++++++++++++++++++++++++++-
>>>>    2 files changed, 123 insertions(+), 5 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/i915/i915_pci.c b/drivers/gpu/drm/i915/i915_pci.c
>>>> index ffa2ee70a03d..83698951760b 100644
>>>> --- a/drivers/gpu/drm/i915/i915_pci.c
>>>> +++ b/drivers/gpu/drm/i915/i915_pci.c
>>>> @@ -749,7 +749,8 @@ static const struct intel_device_info intel_cannonlake_info = {
>>>>    	GEN(11), \
>>>>    	.ddb_size = 2048, \
>>>>    	.has_logical_ring_elsq = 1, \
>>>> -	.color = { .degamma_lut_size = 33, .gamma_lut_size = 1024 }
>>>> +	.color = { .degamma_lut_size = 33, .gamma_lut_size = 262144 }
>>> Ugh. Thats one big LUT. But looks correct.
>>>
>>>> +
>>> Bogus newline.
>> Got it.
>>>>    
>>>>    static const struct intel_device_info intel_icelake_11_info = {
>>>>    	GEN11_FEATURES,
>>>> diff --git a/drivers/gpu/drm/i915/intel_color.c b/drivers/gpu/drm/i915/intel_color.c
>>>> index 6c341bea514c..49831e8d02fb 100644
>>>> --- a/drivers/gpu/drm/i915/intel_color.c
>>>> +++ b/drivers/gpu/drm/i915/intel_color.c
>>>> @@ -41,6 +41,9 @@
>>>>    #define CTM_COEFF_ABS(coeff)		((coeff) & (CTM_COEFF_SIGN - 1))
>>>>    
>>>>    #define LEGACY_LUT_LENGTH		256
>>>> +#define ICL_GAMMA_MULTISEG_LUT_LENGTH		(256 * 128 * 8)
>>>> +#define ICL_GAMMA_SUPERFINE_SEG_LENGTH	9
>>>> +
>>>>    /*
>>>>     * Extract the CSC coefficient from a CTM coefficient (in U32.32 fixed point
>>>>     * format). This macro takes the coefficient we want transformed and the
>>>> @@ -767,6 +770,113 @@ static void glk_load_luts(const struct intel_crtc_state *crtc_state)
>>>>    	}
>>>>    }
>>>>    
>>>> +/* ilk+ "12.4" interpolated format (high 10 bits) */
>>>> +static u32 ilk_lut_12p4_ldw(const struct drm_color_lut *color)
>>>> +{
>>>> +	return (color->red >> 6) << 20 | (color->green >> 6) << 10 |
>>>> +		(color->blue >> 6);
>>>> +}
>>>> +
>>>> +/* ilk+ "12.4" interpolated format (low 6 bits) */
>>>> +static u32 ilk_lut_12p4_udw(const struct drm_color_lut *color)
>>>> +{
>>>> +	return (color->red & 0x3f) << 24 | (color->green & 0x3f) << 14 |
>>>> +		(color->blue & 0x3f);
>>>> +}
>>>> +
>>>> +static void
>>>> +icl_load_gcmax(const struct intel_crtc_state *crtc_state,
>>>> +			const struct drm_color_lut *entry)
>>> Indentation looks off. Also s/entry/color/ to match the other similarish
>>> funcs maybe?
>> Sure.
>>>> +{
>>>> +	struct intel_crtc *crtc = to_intel_crtc(crtc_state->base.crtc);
>>>> +	struct drm_i915_private *dev_priv = to_i915(crtc->base.dev);
>>>> +	enum pipe pipe = crtc->pipe;
>>>> +
>>>> +	/* Fixme: LUT entries are 16 bit only, so we can prog 0xFFFF max */
>>>> +	I915_WRITE(PREC_PAL_GC_MAX(pipe, 0), entry->red);
>>>> +	I915_WRITE(PREC_PAL_GC_MAX(pipe, 1), entry->green);
>>>> +	I915_WRITE(PREC_PAL_GC_MAX(pipe, 2), entry->blue);
>>>> +}
>>>> +
>>>> +static void
>>>> +icl_program_gamma_superfine_segment(const struct intel_crtc_state *crtc_state)
>>>> +{
>>>> +	struct intel_crtc *crtc = to_intel_crtc(crtc_state->base.crtc);
>>>> +	struct drm_i915_private *dev_priv = to_i915(crtc->base.dev);
>>>> +	const struct drm_property_blob *blob = crtc_state->base.gamma_lut;
>>>> +	const struct drm_color_lut *lut = blob->data;
>>>> +	enum pipe pipe = crtc->pipe;
>>>> +	u32 i;
>>>> +
>>>> +	if (!lut || drm_color_lut_size(blob) < ICL_GAMMA_SUPERFINE_SEG_LENGTH)
>>>> +		return;
>>> These checks aren't needed. Just dead code.
>> Will remove this and similars.
>>>> +
>>>> +	/*
>>>> +	 * Every entry in the multi-segment LUT is corresponding to a superfine
>>>> +	 * segment step which is 1/(8 * 128 * 256).
>>>> +	 *
>>>> +	 * Superfine segment has 9 entries, corresponding to values
>>>> +	 * 0, 1/(8 * 128 * 256), 2/(8 * 128 * 256) .... 8/(8 * 128 * 256).
>>>> +	 */
>>>> +	I915_WRITE(PREC_PAL_MULTI_SEG_INDEX(pipe), PAL_PREC_AUTO_INCREMENT);
>>>> +
>>>> +	for (i = 0; i < 9; i++) {
>>>> +		const struct drm_color_lut *entry = &lut[i];
>>>> +
>>>> +		I915_WRITE(PREC_PAL_MULTI_SEG_DATA(pipe),
>>>> +			   ilk_lut_12p4_udw(entry));
>>> ldw should come before udw.
>> Got it.
>>>> +		I915_WRITE(PREC_PAL_MULTI_SEG_DATA(pipe),
>>>> +			   ilk_lut_12p4_ldw(entry));
>>>> +	}
>>>> +}
>>>> +
>>>> +static void
>>>> +icl_program_gamma_multi_segment(const struct intel_crtc_state *crtc_state)
>>>> +{
>>>> +	struct intel_crtc *crtc = to_intel_crtc(crtc_state->base.crtc);
>>>> +	struct drm_i915_private *dev_priv = to_i915(crtc->base.dev);
>>>> +	const struct drm_property_blob *blob = crtc_state->base.gamma_lut;
>>>> +	const struct drm_color_lut *lut = blob->data;
>>>> +	const struct drm_color_lut *entry;
>>> 'entry' declaration can be moved into the loops.
>> Its being used in multiple loops,
> IMO still better to move into the loops. We don't want to pass any
> information between the loops.
>
>> also being used for GCMax outside the
>> loop.
> Hmm. That might be an arguemnt for keeping it out. But the current
> gcmax usage looks broken. You're programming the same value into
> the last PAL_PREC index and GCMAX.

Isn't this what you wanted ? IIRC, your recommendation was to program 
the highest values user wants to program in GCMAX reg. We also had a 
discussion on how user cant program 1.0, as we have LUT depth on 16 bit 
only, and we decided to use the last value of the LUT as GCMAX. Did I 
misunderstand anything there ?

- Shashank

>>>> +	enum pipe pipe = crtc->pipe;
>>>> +	u32 i;
>>>> +
>>>> +	if (!lut || (drm_color_lut_size(blob) < ICL_GAMMA_MULTISEG_LUT_LENGTH))
>>>> +		return;
>>> More checks that aren't needed.
>> Got it.
>>>> +
>>>> +	/*
>>>> +	 * Every entry in the multi-segment LUT is corresponding to a superfine
>>>> +	 * segment step which is 1/(8*128*256).
>>>> +	 *
>>>> +	 * Fine segment's step is 1/(128 * 256) ie 1/(128 * 256),  2/(128*256)
>>>> +	 * ... 256/(128*256). So in order to program fine segment of LUT we
>>>> +	 * need to pick every 8'th entry in LUT, and program 256 indexes.
>>>> +	 * Fine segment's index 0 is programmed in HW, and it starts from
>>>> +	 * index 1.
>>> The wording here is a bit confusing. I guess the problem is what to call
>>> things. PAL_PREC_INDEX[0/1] is what we program, but that maps to the point
>>> seg2[1] with seg2[0] being unused by the hw. Well, the spec says it's
>>> implicit but IIRC I was told long ago that it's not actually used.
>>>
>>> Not sure how to word that in the best way. Maybe something like?
>>>
>>> /*
>>>    * Fine segment (seg2) ...
>>>    *
>>>    * PAL_PREC_INDEX[0] and PAL_PREC_INDEX[1] map to seg2[1],
>>>    * with seg2[0] being unused by the hardware.
>>>    */
>>>
>>> Not sure that's any clearer.
>> Ok, will try to come up with something in similar lines.
>>>> +	 */
>>>> +	I915_WRITE(PREC_PAL_INDEX(pipe), PAL_PREC_AUTO_INCREMENT);
>>>> +	for (i = 1; i < 257; i++) {
>>>> +		entry = &lut[i * 8];
>>>> +		I915_WRITE(PREC_PAL_DATA(pipe), ilk_lut_12p4_udw(entry));
>>>> +		I915_WRITE(PREC_PAL_DATA(pipe), ilk_lut_12p4_ldw(entry));
>>>> +	}
>>>> +
>>>> +	/*
>>>> +	 * Coarse segment's starts from index 0 and it's step is 1/256 ie 0,
>>>> +	 * 1/256, 2/256 ...256/256. As per the description of each entry in LUT
>>>> +	 * above, we need to pick every 8 * 128 = 1024th entry in LUT, and
>>>> +	 * program 256 of those.
>>>> +	 */
>>> Could make a note here stating that seg3[0] and seg3[1] are also unused
>>> by the hardware, even though we have to program them to advance the
>>> index. I don't see it mentioned in the spec, but this one I definitely
>>> remember confirming from Art way back when. However I never verified
>>> that on actual hw. We could also consider just programming those two
>>> entries to 0 and start the actual coarse segment programming from index 2.
>>> Or we could skip them by reprogramming the index directly.
>> If they are not being used, does it matter what and if we program into
>> them ? We can add a note though, mentioning this.
> It shouldn't matter what we programing into them. But as mentioned I
> never actually confirmed this on actual hardware. Would be nice to
> double check that so we don't end up with incorrect comment.
>
>>>> +	for (i = 0; i < 256; i++) {
>>>> +		entry = &lut[i * 1024];
>>> s/1024/8 * 128/ maybe?
>> Sure.
>>
>> Regards
>>
>> Shashank
>>
>>>> +		I915_WRITE(PREC_PAL_DATA(pipe), ilk_lut_12p4_udw(entry));
>>>> +		I915_WRITE(PREC_PAL_DATA(pipe), ilk_lut_12p4_ldw(entry));
>>>> +	}
>>>> +
>>>> +	icl_load_gcmax(crtc_state, entry);
>>>> +	ivb_load_lut_ext_max(crtc);
>>>> +}
>>>> +
>>>>    static void icl_load_luts(const struct intel_crtc_state *crtc_state)
>>>>    {
>>>>    	const struct drm_property_blob *gamma_lut = crtc_state->base.gamma_lut;
>>>> @@ -775,10 +885,17 @@ static void icl_load_luts(const struct intel_crtc_state *crtc_state)
>>>>    	if (crtc_state->base.degamma_lut)
>>>>    		glk_load_degamma_lut(crtc_state);
>>>>    
>>>> -	if ((crtc_state->gamma_mode & GAMMA_MODE_MODE_MASK) ==
>>>> -	    GAMMA_MODE_MODE_8BIT) {
>>>> +	switch (crtc_state->gamma_mode & GAMMA_MODE_MODE_MASK) {
>>>> +	case GAMMA_MODE_MODE_8BIT:
>>>>    		i9xx_load_luts(crtc_state);
>>>> -	} else {
>>>> +		break;
>>>> +
>>>> +	case GAMMA_MODE_MODE_12BIT_MULTI_SEGMENTED:
>>>> +		icl_program_gamma_superfine_segment(crtc_state);
>>>> +		icl_program_gamma_multi_segment(crtc_state);
>>>> +		break;
>>>> +
>>>> +	default:
>>>>    		bdw_load_lut_10(crtc, gamma_lut, PAL_PREC_INDEX_VALUE(0));
>>>>    		ivb_load_lut_ext_max(crtc);
>>>>    	}
>>>> @@ -1209,7 +1326,7 @@ static u32 icl_gamma_mode(const struct intel_crtc_state *crtc_state)
>>>>    	    crtc_state_is_legacy_gamma(crtc_state))
>>>>    		gamma_mode |= GAMMA_MODE_MODE_8BIT;
>>>>    	else
>>>> -		gamma_mode |= GAMMA_MODE_MODE_10BIT;
>>>> +		gamma_mode |= GAMMA_MODE_MODE_12BIT_MULTI_SEGMENTED;
>>>>    
>>>>    	return gamma_mode;
>>>>    }
>>>> -- 
>>>> 2.17.1
Ville Syrjala May 6, 2019, 1:11 p.m. UTC | #5
On Mon, May 06, 2019 at 06:25:19PM +0530, Sharma, Shashank wrote:
> 
> On 5/6/2019 5:55 PM, Ville Syrjälä wrote:
> > On Mon, May 06, 2019 at 04:09:33PM +0530, Sharma, Shashank wrote:
> >> Regards
> >>
> >> Shashank
> >>
> >> On 5/3/2019 9:20 PM, Ville Syrjälä wrote:
> >>> On Tue, Apr 30, 2019 at 08:51:08PM +0530, Shashank Sharma wrote:
> >>>> ICL introduces a new gamma correction mode in display engine, called
> >>>> multi-segmented-gamma mode. This mode allows users to program the
> >>>> darker region of the gamma curve with sueprfine precision. An
> >>>> example use case for this is HDR curves (like PQ ST-2084).
> >>>>
> >>>> If we plot a gamma correction curve from value range between 0.0 to 1.0,
> >>>> ICL's multi-segment has 3 different sections:
> >>>> - superfine segment: 9 values, ranges between 0 - 1/(128 * 256)
> >>>> - fine segment: 257 values, ranges between 0 - 1/(128)
> >>>> - corase segment: 257 values, ranges between 0 - 1
> >>>>
> >>>> This patch:
> >>>> - Changes gamma LUTs size for ICL/GEN11 to 262144 entries (8 * 128 * 256),
> >>>>     so that userspace can program with highest precision supported.
> >>>> - Changes default gamma mode (non-legacy) to multi-segmented-gamma mode.
> >>>> - Adds functions to program/detect multi-segment gamma.
> >>>>
> >>>> V2: Addressed review comments from Ville
> >>>>       - separate function for superfine and fine segments.
> >>>>       - remove enum for segments.
> >>>>       - reuse last entry of the LUT as gc_max value.
> >>>>       - replace if() ....cond with switch...case in icl_load_luts.
> >>>>       - add an entry variable, instead of 'word'
> >>>>
> >>>> Cc: Ville Syrjälä <ville.syrjala@linux.intel.com>
> >>>> Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
> >>>> Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
> >>>>
> >>>> Suggested-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
> >>>> Signed-off-by: Shashank Sharma <shashank.sharma@intel.com>
> >>>> Signed-off-by: Uma Shankar <uma.shankar@intel.com>
> >>>> ---
> >>>>    drivers/gpu/drm/i915/i915_pci.c    |   3 +-
> >>>>    drivers/gpu/drm/i915/intel_color.c | 125 ++++++++++++++++++++++++++++-
> >>>>    2 files changed, 123 insertions(+), 5 deletions(-)
> >>>>
> >>>> diff --git a/drivers/gpu/drm/i915/i915_pci.c b/drivers/gpu/drm/i915/i915_pci.c
> >>>> index ffa2ee70a03d..83698951760b 100644
> >>>> --- a/drivers/gpu/drm/i915/i915_pci.c
> >>>> +++ b/drivers/gpu/drm/i915/i915_pci.c
> >>>> @@ -749,7 +749,8 @@ static const struct intel_device_info intel_cannonlake_info = {
> >>>>    	GEN(11), \
> >>>>    	.ddb_size = 2048, \
> >>>>    	.has_logical_ring_elsq = 1, \
> >>>> -	.color = { .degamma_lut_size = 33, .gamma_lut_size = 1024 }
> >>>> +	.color = { .degamma_lut_size = 33, .gamma_lut_size = 262144 }
> >>> Ugh. Thats one big LUT. But looks correct.
> >>>
> >>>> +
> >>> Bogus newline.
> >> Got it.
> >>>>    
> >>>>    static const struct intel_device_info intel_icelake_11_info = {
> >>>>    	GEN11_FEATURES,
> >>>> diff --git a/drivers/gpu/drm/i915/intel_color.c b/drivers/gpu/drm/i915/intel_color.c
> >>>> index 6c341bea514c..49831e8d02fb 100644
> >>>> --- a/drivers/gpu/drm/i915/intel_color.c
> >>>> +++ b/drivers/gpu/drm/i915/intel_color.c
> >>>> @@ -41,6 +41,9 @@
> >>>>    #define CTM_COEFF_ABS(coeff)		((coeff) & (CTM_COEFF_SIGN - 1))
> >>>>    
> >>>>    #define LEGACY_LUT_LENGTH		256
> >>>> +#define ICL_GAMMA_MULTISEG_LUT_LENGTH		(256 * 128 * 8)
> >>>> +#define ICL_GAMMA_SUPERFINE_SEG_LENGTH	9
> >>>> +
> >>>>    /*
> >>>>     * Extract the CSC coefficient from a CTM coefficient (in U32.32 fixed point
> >>>>     * format). This macro takes the coefficient we want transformed and the
> >>>> @@ -767,6 +770,113 @@ static void glk_load_luts(const struct intel_crtc_state *crtc_state)
> >>>>    	}
> >>>>    }
> >>>>    
> >>>> +/* ilk+ "12.4" interpolated format (high 10 bits) */
> >>>> +static u32 ilk_lut_12p4_ldw(const struct drm_color_lut *color)
> >>>> +{
> >>>> +	return (color->red >> 6) << 20 | (color->green >> 6) << 10 |
> >>>> +		(color->blue >> 6);
> >>>> +}
> >>>> +
> >>>> +/* ilk+ "12.4" interpolated format (low 6 bits) */
> >>>> +static u32 ilk_lut_12p4_udw(const struct drm_color_lut *color)
> >>>> +{
> >>>> +	return (color->red & 0x3f) << 24 | (color->green & 0x3f) << 14 |
> >>>> +		(color->blue & 0x3f);
> >>>> +}
> >>>> +
> >>>> +static void
> >>>> +icl_load_gcmax(const struct intel_crtc_state *crtc_state,
> >>>> +			const struct drm_color_lut *entry)
> >>> Indentation looks off. Also s/entry/color/ to match the other similarish
> >>> funcs maybe?
> >> Sure.
> >>>> +{
> >>>> +	struct intel_crtc *crtc = to_intel_crtc(crtc_state->base.crtc);
> >>>> +	struct drm_i915_private *dev_priv = to_i915(crtc->base.dev);
> >>>> +	enum pipe pipe = crtc->pipe;
> >>>> +
> >>>> +	/* Fixme: LUT entries are 16 bit only, so we can prog 0xFFFF max */
> >>>> +	I915_WRITE(PREC_PAL_GC_MAX(pipe, 0), entry->red);
> >>>> +	I915_WRITE(PREC_PAL_GC_MAX(pipe, 1), entry->green);
> >>>> +	I915_WRITE(PREC_PAL_GC_MAX(pipe, 2), entry->blue);
> >>>> +}
> >>>> +
> >>>> +static void
> >>>> +icl_program_gamma_superfine_segment(const struct intel_crtc_state *crtc_state)
> >>>> +{
> >>>> +	struct intel_crtc *crtc = to_intel_crtc(crtc_state->base.crtc);
> >>>> +	struct drm_i915_private *dev_priv = to_i915(crtc->base.dev);
> >>>> +	const struct drm_property_blob *blob = crtc_state->base.gamma_lut;
> >>>> +	const struct drm_color_lut *lut = blob->data;
> >>>> +	enum pipe pipe = crtc->pipe;
> >>>> +	u32 i;
> >>>> +
> >>>> +	if (!lut || drm_color_lut_size(blob) < ICL_GAMMA_SUPERFINE_SEG_LENGTH)
> >>>> +		return;
> >>> These checks aren't needed. Just dead code.
> >> Will remove this and similars.
> >>>> +
> >>>> +	/*
> >>>> +	 * Every entry in the multi-segment LUT is corresponding to a superfine
> >>>> +	 * segment step which is 1/(8 * 128 * 256).
> >>>> +	 *
> >>>> +	 * Superfine segment has 9 entries, corresponding to values
> >>>> +	 * 0, 1/(8 * 128 * 256), 2/(8 * 128 * 256) .... 8/(8 * 128 * 256).
> >>>> +	 */
> >>>> +	I915_WRITE(PREC_PAL_MULTI_SEG_INDEX(pipe), PAL_PREC_AUTO_INCREMENT);
> >>>> +
> >>>> +	for (i = 0; i < 9; i++) {
> >>>> +		const struct drm_color_lut *entry = &lut[i];
> >>>> +
> >>>> +		I915_WRITE(PREC_PAL_MULTI_SEG_DATA(pipe),
> >>>> +			   ilk_lut_12p4_udw(entry));
> >>> ldw should come before udw.
> >> Got it.
> >>>> +		I915_WRITE(PREC_PAL_MULTI_SEG_DATA(pipe),
> >>>> +			   ilk_lut_12p4_ldw(entry));
> >>>> +	}
> >>>> +}
> >>>> +
> >>>> +static void
> >>>> +icl_program_gamma_multi_segment(const struct intel_crtc_state *crtc_state)
> >>>> +{
> >>>> +	struct intel_crtc *crtc = to_intel_crtc(crtc_state->base.crtc);
> >>>> +	struct drm_i915_private *dev_priv = to_i915(crtc->base.dev);
> >>>> +	const struct drm_property_blob *blob = crtc_state->base.gamma_lut;
> >>>> +	const struct drm_color_lut *lut = blob->data;
> >>>> +	const struct drm_color_lut *entry;
> >>> 'entry' declaration can be moved into the loops.
> >> Its being used in multiple loops,
> > IMO still better to move into the loops. We don't want to pass any
> > information between the loops.
> >
> >> also being used for GCMax outside the
> >> loop.
> > Hmm. That might be an arguemnt for keeping it out. But the current
> > gcmax usage looks broken. You're programming the same value into
> > the last PAL_PREC index and GCMAX.
> 
> Isn't this what you wanted ? IIRC, your recommendation was to program 
> the highest values user wants to program in GCMAX reg. We also had a 
> discussion on how user cant program 1.0, as we have LUT depth on 16 bit 
> only, and we decided to use the last value of the LUT as GCMAX. Did I 
> misunderstand anything there ?

It should be:
PAL_PREC[max] = lut[size-2];
GCMAX = lut[size-1];

Oh, we also need to increase the LUT size by one to make that
work correctly.
Sharma, Shashank May 6, 2019, 1:14 p.m. UTC | #6
Regards

Shashank

On 5/6/2019 6:41 PM, Ville Syrjälä wrote:
> On Mon, May 06, 2019 at 06:25:19PM +0530, Sharma, Shashank wrote:
>> On 5/6/2019 5:55 PM, Ville Syrjälä wrote:
>>> On Mon, May 06, 2019 at 04:09:33PM +0530, Sharma, Shashank wrote:
>>>> Regards
>>>>
>>>> Shashank
>>>>
>>>> On 5/3/2019 9:20 PM, Ville Syrjälä wrote:
>>>>> On Tue, Apr 30, 2019 at 08:51:08PM +0530, Shashank Sharma wrote:
>>>>>> ICL introduces a new gamma correction mode in display engine, called
>>>>>> multi-segmented-gamma mode. This mode allows users to program the
>>>>>> darker region of the gamma curve with sueprfine precision. An
>>>>>> example use case for this is HDR curves (like PQ ST-2084).
>>>>>>
>>>>>> If we plot a gamma correction curve from value range between 0.0 to 1.0,
>>>>>> ICL's multi-segment has 3 different sections:
>>>>>> - superfine segment: 9 values, ranges between 0 - 1/(128 * 256)
>>>>>> - fine segment: 257 values, ranges between 0 - 1/(128)
>>>>>> - corase segment: 257 values, ranges between 0 - 1
>>>>>>
>>>>>> This patch:
>>>>>> - Changes gamma LUTs size for ICL/GEN11 to 262144 entries (8 * 128 * 256),
>>>>>>      so that userspace can program with highest precision supported.
>>>>>> - Changes default gamma mode (non-legacy) to multi-segmented-gamma mode.
>>>>>> - Adds functions to program/detect multi-segment gamma.
>>>>>>
>>>>>> V2: Addressed review comments from Ville
>>>>>>        - separate function for superfine and fine segments.
>>>>>>        - remove enum for segments.
>>>>>>        - reuse last entry of the LUT as gc_max value.
>>>>>>        - replace if() ....cond with switch...case in icl_load_luts.
>>>>>>        - add an entry variable, instead of 'word'
>>>>>>
>>>>>> Cc: Ville Syrjälä <ville.syrjala@linux.intel.com>
>>>>>> Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
>>>>>> Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
>>>>>>
>>>>>> Suggested-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
>>>>>> Signed-off-by: Shashank Sharma <shashank.sharma@intel.com>
>>>>>> Signed-off-by: Uma Shankar <uma.shankar@intel.com>
>>>>>> ---
>>>>>>     drivers/gpu/drm/i915/i915_pci.c    |   3 +-
>>>>>>     drivers/gpu/drm/i915/intel_color.c | 125 ++++++++++++++++++++++++++++-
>>>>>>     2 files changed, 123 insertions(+), 5 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/gpu/drm/i915/i915_pci.c b/drivers/gpu/drm/i915/i915_pci.c
>>>>>> index ffa2ee70a03d..83698951760b 100644
>>>>>> --- a/drivers/gpu/drm/i915/i915_pci.c
>>>>>> +++ b/drivers/gpu/drm/i915/i915_pci.c
>>>>>> @@ -749,7 +749,8 @@ static const struct intel_device_info intel_cannonlake_info = {
>>>>>>     	GEN(11), \
>>>>>>     	.ddb_size = 2048, \
>>>>>>     	.has_logical_ring_elsq = 1, \
>>>>>> -	.color = { .degamma_lut_size = 33, .gamma_lut_size = 1024 }
>>>>>> +	.color = { .degamma_lut_size = 33, .gamma_lut_size = 262144 }
>>>>> Ugh. Thats one big LUT. But looks correct.
>>>>>
>>>>>> +
>>>>> Bogus newline.
>>>> Got it.
>>>>>>     
>>>>>>     static const struct intel_device_info intel_icelake_11_info = {
>>>>>>     	GEN11_FEATURES,
>>>>>> diff --git a/drivers/gpu/drm/i915/intel_color.c b/drivers/gpu/drm/i915/intel_color.c
>>>>>> index 6c341bea514c..49831e8d02fb 100644
>>>>>> --- a/drivers/gpu/drm/i915/intel_color.c
>>>>>> +++ b/drivers/gpu/drm/i915/intel_color.c
>>>>>> @@ -41,6 +41,9 @@
>>>>>>     #define CTM_COEFF_ABS(coeff)		((coeff) & (CTM_COEFF_SIGN - 1))
>>>>>>     
>>>>>>     #define LEGACY_LUT_LENGTH		256
>>>>>> +#define ICL_GAMMA_MULTISEG_LUT_LENGTH		(256 * 128 * 8)
>>>>>> +#define ICL_GAMMA_SUPERFINE_SEG_LENGTH	9
>>>>>> +
>>>>>>     /*
>>>>>>      * Extract the CSC coefficient from a CTM coefficient (in U32.32 fixed point
>>>>>>      * format). This macro takes the coefficient we want transformed and the
>>>>>> @@ -767,6 +770,113 @@ static void glk_load_luts(const struct intel_crtc_state *crtc_state)
>>>>>>     	}
>>>>>>     }
>>>>>>     
>>>>>> +/* ilk+ "12.4" interpolated format (high 10 bits) */
>>>>>> +static u32 ilk_lut_12p4_ldw(const struct drm_color_lut *color)
>>>>>> +{
>>>>>> +	return (color->red >> 6) << 20 | (color->green >> 6) << 10 |
>>>>>> +		(color->blue >> 6);
>>>>>> +}
>>>>>> +
>>>>>> +/* ilk+ "12.4" interpolated format (low 6 bits) */
>>>>>> +static u32 ilk_lut_12p4_udw(const struct drm_color_lut *color)
>>>>>> +{
>>>>>> +	return (color->red & 0x3f) << 24 | (color->green & 0x3f) << 14 |
>>>>>> +		(color->blue & 0x3f);
>>>>>> +}
>>>>>> +
>>>>>> +static void
>>>>>> +icl_load_gcmax(const struct intel_crtc_state *crtc_state,
>>>>>> +			const struct drm_color_lut *entry)
>>>>> Indentation looks off. Also s/entry/color/ to match the other similarish
>>>>> funcs maybe?
>>>> Sure.
>>>>>> +{
>>>>>> +	struct intel_crtc *crtc = to_intel_crtc(crtc_state->base.crtc);
>>>>>> +	struct drm_i915_private *dev_priv = to_i915(crtc->base.dev);
>>>>>> +	enum pipe pipe = crtc->pipe;
>>>>>> +
>>>>>> +	/* Fixme: LUT entries are 16 bit only, so we can prog 0xFFFF max */
>>>>>> +	I915_WRITE(PREC_PAL_GC_MAX(pipe, 0), entry->red);
>>>>>> +	I915_WRITE(PREC_PAL_GC_MAX(pipe, 1), entry->green);
>>>>>> +	I915_WRITE(PREC_PAL_GC_MAX(pipe, 2), entry->blue);
>>>>>> +}
>>>>>> +
>>>>>> +static void
>>>>>> +icl_program_gamma_superfine_segment(const struct intel_crtc_state *crtc_state)
>>>>>> +{
>>>>>> +	struct intel_crtc *crtc = to_intel_crtc(crtc_state->base.crtc);
>>>>>> +	struct drm_i915_private *dev_priv = to_i915(crtc->base.dev);
>>>>>> +	const struct drm_property_blob *blob = crtc_state->base.gamma_lut;
>>>>>> +	const struct drm_color_lut *lut = blob->data;
>>>>>> +	enum pipe pipe = crtc->pipe;
>>>>>> +	u32 i;
>>>>>> +
>>>>>> +	if (!lut || drm_color_lut_size(blob) < ICL_GAMMA_SUPERFINE_SEG_LENGTH)
>>>>>> +		return;
>>>>> These checks aren't needed. Just dead code.
>>>> Will remove this and similars.
>>>>>> +
>>>>>> +	/*
>>>>>> +	 * Every entry in the multi-segment LUT is corresponding to a superfine
>>>>>> +	 * segment step which is 1/(8 * 128 * 256).
>>>>>> +	 *
>>>>>> +	 * Superfine segment has 9 entries, corresponding to values
>>>>>> +	 * 0, 1/(8 * 128 * 256), 2/(8 * 128 * 256) .... 8/(8 * 128 * 256).
>>>>>> +	 */
>>>>>> +	I915_WRITE(PREC_PAL_MULTI_SEG_INDEX(pipe), PAL_PREC_AUTO_INCREMENT);
>>>>>> +
>>>>>> +	for (i = 0; i < 9; i++) {
>>>>>> +		const struct drm_color_lut *entry = &lut[i];
>>>>>> +
>>>>>> +		I915_WRITE(PREC_PAL_MULTI_SEG_DATA(pipe),
>>>>>> +			   ilk_lut_12p4_udw(entry));
>>>>> ldw should come before udw.
>>>> Got it.
>>>>>> +		I915_WRITE(PREC_PAL_MULTI_SEG_DATA(pipe),
>>>>>> +			   ilk_lut_12p4_ldw(entry));
>>>>>> +	}
>>>>>> +}
>>>>>> +
>>>>>> +static void
>>>>>> +icl_program_gamma_multi_segment(const struct intel_crtc_state *crtc_state)
>>>>>> +{
>>>>>> +	struct intel_crtc *crtc = to_intel_crtc(crtc_state->base.crtc);
>>>>>> +	struct drm_i915_private *dev_priv = to_i915(crtc->base.dev);
>>>>>> +	const struct drm_property_blob *blob = crtc_state->base.gamma_lut;
>>>>>> +	const struct drm_color_lut *lut = blob->data;
>>>>>> +	const struct drm_color_lut *entry;
>>>>> 'entry' declaration can be moved into the loops.
>>>> Its being used in multiple loops,
>>> IMO still better to move into the loops. We don't want to pass any
>>> information between the loops.
>>>
>>>> also being used for GCMax outside the
>>>> loop.
>>> Hmm. That might be an arguemnt for keeping it out. But the current
>>> gcmax usage looks broken. You're programming the same value into
>>> the last PAL_PREC index and GCMAX.
>> Isn't this what you wanted ? IIRC, your recommendation was to program
>> the highest values user wants to program in GCMAX reg. We also had a
>> discussion on how user cant program 1.0, as we have LUT depth on 16 bit
>> only, and we decided to use the last value of the LUT as GCMAX. Did I
>> misunderstand anything there ?
> It should be:
> PAL_PREC[max] = lut[size-2];
> GCMAX = lut[size-1];
>
> Oh, we also need to increase the LUT size by one to make that
> work correctly.

Yep, that's why I was not so sure about this, and thought programming a 
1.0 is a better idea.

So now the new size would be 257 * 256 * 128 = 263,168 entries.I will do 
changes and send V3.

Regards

Shashank

>
Ville Syrjala May 6, 2019, 1:26 p.m. UTC | #7
On Mon, May 06, 2019 at 06:44:43PM +0530, Sharma, Shashank wrote:
> Regards
> 
> Shashank
> 
> On 5/6/2019 6:41 PM, Ville Syrjälä wrote:
> > On Mon, May 06, 2019 at 06:25:19PM +0530, Sharma, Shashank wrote:
> >> On 5/6/2019 5:55 PM, Ville Syrjälä wrote:
> >>> On Mon, May 06, 2019 at 04:09:33PM +0530, Sharma, Shashank wrote:
> >>>> Regards
> >>>>
> >>>> Shashank
> >>>>
> >>>> On 5/3/2019 9:20 PM, Ville Syrjälä wrote:
> >>>>> On Tue, Apr 30, 2019 at 08:51:08PM +0530, Shashank Sharma wrote:
> >>>>>> ICL introduces a new gamma correction mode in display engine, called
> >>>>>> multi-segmented-gamma mode. This mode allows users to program the
> >>>>>> darker region of the gamma curve with sueprfine precision. An
> >>>>>> example use case for this is HDR curves (like PQ ST-2084).
> >>>>>>
> >>>>>> If we plot a gamma correction curve from value range between 0.0 to 1.0,
> >>>>>> ICL's multi-segment has 3 different sections:
> >>>>>> - superfine segment: 9 values, ranges between 0 - 1/(128 * 256)
> >>>>>> - fine segment: 257 values, ranges between 0 - 1/(128)
> >>>>>> - corase segment: 257 values, ranges between 0 - 1
> >>>>>>
> >>>>>> This patch:
> >>>>>> - Changes gamma LUTs size for ICL/GEN11 to 262144 entries (8 * 128 * 256),
> >>>>>>      so that userspace can program with highest precision supported.
> >>>>>> - Changes default gamma mode (non-legacy) to multi-segmented-gamma mode.
> >>>>>> - Adds functions to program/detect multi-segment gamma.
> >>>>>>
> >>>>>> V2: Addressed review comments from Ville
> >>>>>>        - separate function for superfine and fine segments.
> >>>>>>        - remove enum for segments.
> >>>>>>        - reuse last entry of the LUT as gc_max value.
> >>>>>>        - replace if() ....cond with switch...case in icl_load_luts.
> >>>>>>        - add an entry variable, instead of 'word'
> >>>>>>
> >>>>>> Cc: Ville Syrjälä <ville.syrjala@linux.intel.com>
> >>>>>> Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
> >>>>>> Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
> >>>>>>
> >>>>>> Suggested-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
> >>>>>> Signed-off-by: Shashank Sharma <shashank.sharma@intel.com>
> >>>>>> Signed-off-by: Uma Shankar <uma.shankar@intel.com>
> >>>>>> ---
> >>>>>>     drivers/gpu/drm/i915/i915_pci.c    |   3 +-
> >>>>>>     drivers/gpu/drm/i915/intel_color.c | 125 ++++++++++++++++++++++++++++-
> >>>>>>     2 files changed, 123 insertions(+), 5 deletions(-)
> >>>>>>
> >>>>>> diff --git a/drivers/gpu/drm/i915/i915_pci.c b/drivers/gpu/drm/i915/i915_pci.c
> >>>>>> index ffa2ee70a03d..83698951760b 100644
> >>>>>> --- a/drivers/gpu/drm/i915/i915_pci.c
> >>>>>> +++ b/drivers/gpu/drm/i915/i915_pci.c
> >>>>>> @@ -749,7 +749,8 @@ static const struct intel_device_info intel_cannonlake_info = {
> >>>>>>     	GEN(11), \
> >>>>>>     	.ddb_size = 2048, \
> >>>>>>     	.has_logical_ring_elsq = 1, \
> >>>>>> -	.color = { .degamma_lut_size = 33, .gamma_lut_size = 1024 }
> >>>>>> +	.color = { .degamma_lut_size = 33, .gamma_lut_size = 262144 }
> >>>>> Ugh. Thats one big LUT. But looks correct.
> >>>>>
> >>>>>> +
> >>>>> Bogus newline.
> >>>> Got it.
> >>>>>>     
> >>>>>>     static const struct intel_device_info intel_icelake_11_info = {
> >>>>>>     	GEN11_FEATURES,
> >>>>>> diff --git a/drivers/gpu/drm/i915/intel_color.c b/drivers/gpu/drm/i915/intel_color.c
> >>>>>> index 6c341bea514c..49831e8d02fb 100644
> >>>>>> --- a/drivers/gpu/drm/i915/intel_color.c
> >>>>>> +++ b/drivers/gpu/drm/i915/intel_color.c
> >>>>>> @@ -41,6 +41,9 @@
> >>>>>>     #define CTM_COEFF_ABS(coeff)		((coeff) & (CTM_COEFF_SIGN - 1))
> >>>>>>     
> >>>>>>     #define LEGACY_LUT_LENGTH		256
> >>>>>> +#define ICL_GAMMA_MULTISEG_LUT_LENGTH		(256 * 128 * 8)
> >>>>>> +#define ICL_GAMMA_SUPERFINE_SEG_LENGTH	9
> >>>>>> +
> >>>>>>     /*
> >>>>>>      * Extract the CSC coefficient from a CTM coefficient (in U32.32 fixed point
> >>>>>>      * format). This macro takes the coefficient we want transformed and the
> >>>>>> @@ -767,6 +770,113 @@ static void glk_load_luts(const struct intel_crtc_state *crtc_state)
> >>>>>>     	}
> >>>>>>     }
> >>>>>>     
> >>>>>> +/* ilk+ "12.4" interpolated format (high 10 bits) */
> >>>>>> +static u32 ilk_lut_12p4_ldw(const struct drm_color_lut *color)
> >>>>>> +{
> >>>>>> +	return (color->red >> 6) << 20 | (color->green >> 6) << 10 |
> >>>>>> +		(color->blue >> 6);
> >>>>>> +}
> >>>>>> +
> >>>>>> +/* ilk+ "12.4" interpolated format (low 6 bits) */
> >>>>>> +static u32 ilk_lut_12p4_udw(const struct drm_color_lut *color)
> >>>>>> +{
> >>>>>> +	return (color->red & 0x3f) << 24 | (color->green & 0x3f) << 14 |
> >>>>>> +		(color->blue & 0x3f);
> >>>>>> +}
> >>>>>> +
> >>>>>> +static void
> >>>>>> +icl_load_gcmax(const struct intel_crtc_state *crtc_state,
> >>>>>> +			const struct drm_color_lut *entry)
> >>>>> Indentation looks off. Also s/entry/color/ to match the other similarish
> >>>>> funcs maybe?
> >>>> Sure.
> >>>>>> +{
> >>>>>> +	struct intel_crtc *crtc = to_intel_crtc(crtc_state->base.crtc);
> >>>>>> +	struct drm_i915_private *dev_priv = to_i915(crtc->base.dev);
> >>>>>> +	enum pipe pipe = crtc->pipe;
> >>>>>> +
> >>>>>> +	/* Fixme: LUT entries are 16 bit only, so we can prog 0xFFFF max */
> >>>>>> +	I915_WRITE(PREC_PAL_GC_MAX(pipe, 0), entry->red);
> >>>>>> +	I915_WRITE(PREC_PAL_GC_MAX(pipe, 1), entry->green);
> >>>>>> +	I915_WRITE(PREC_PAL_GC_MAX(pipe, 2), entry->blue);
> >>>>>> +}
> >>>>>> +
> >>>>>> +static void
> >>>>>> +icl_program_gamma_superfine_segment(const struct intel_crtc_state *crtc_state)
> >>>>>> +{
> >>>>>> +	struct intel_crtc *crtc = to_intel_crtc(crtc_state->base.crtc);
> >>>>>> +	struct drm_i915_private *dev_priv = to_i915(crtc->base.dev);
> >>>>>> +	const struct drm_property_blob *blob = crtc_state->base.gamma_lut;
> >>>>>> +	const struct drm_color_lut *lut = blob->data;
> >>>>>> +	enum pipe pipe = crtc->pipe;
> >>>>>> +	u32 i;
> >>>>>> +
> >>>>>> +	if (!lut || drm_color_lut_size(blob) < ICL_GAMMA_SUPERFINE_SEG_LENGTH)
> >>>>>> +		return;
> >>>>> These checks aren't needed. Just dead code.
> >>>> Will remove this and similars.
> >>>>>> +
> >>>>>> +	/*
> >>>>>> +	 * Every entry in the multi-segment LUT is corresponding to a superfine
> >>>>>> +	 * segment step which is 1/(8 * 128 * 256).
> >>>>>> +	 *
> >>>>>> +	 * Superfine segment has 9 entries, corresponding to values
> >>>>>> +	 * 0, 1/(8 * 128 * 256), 2/(8 * 128 * 256) .... 8/(8 * 128 * 256).
> >>>>>> +	 */
> >>>>>> +	I915_WRITE(PREC_PAL_MULTI_SEG_INDEX(pipe), PAL_PREC_AUTO_INCREMENT);
> >>>>>> +
> >>>>>> +	for (i = 0; i < 9; i++) {
> >>>>>> +		const struct drm_color_lut *entry = &lut[i];
> >>>>>> +
> >>>>>> +		I915_WRITE(PREC_PAL_MULTI_SEG_DATA(pipe),
> >>>>>> +			   ilk_lut_12p4_udw(entry));
> >>>>> ldw should come before udw.
> >>>> Got it.
> >>>>>> +		I915_WRITE(PREC_PAL_MULTI_SEG_DATA(pipe),
> >>>>>> +			   ilk_lut_12p4_ldw(entry));
> >>>>>> +	}
> >>>>>> +}
> >>>>>> +
> >>>>>> +static void
> >>>>>> +icl_program_gamma_multi_segment(const struct intel_crtc_state *crtc_state)
> >>>>>> +{
> >>>>>> +	struct intel_crtc *crtc = to_intel_crtc(crtc_state->base.crtc);
> >>>>>> +	struct drm_i915_private *dev_priv = to_i915(crtc->base.dev);
> >>>>>> +	const struct drm_property_blob *blob = crtc_state->base.gamma_lut;
> >>>>>> +	const struct drm_color_lut *lut = blob->data;
> >>>>>> +	const struct drm_color_lut *entry;
> >>>>> 'entry' declaration can be moved into the loops.
> >>>> Its being used in multiple loops,
> >>> IMO still better to move into the loops. We don't want to pass any
> >>> information between the loops.
> >>>
> >>>> also being used for GCMax outside the
> >>>> loop.
> >>> Hmm. That might be an arguemnt for keeping it out. But the current
> >>> gcmax usage looks broken. You're programming the same value into
> >>> the last PAL_PREC index and GCMAX.
> >> Isn't this what you wanted ? IIRC, your recommendation was to program
> >> the highest values user wants to program in GCMAX reg. We also had a
> >> discussion on how user cant program 1.0, as we have LUT depth on 16 bit
> >> only, and we decided to use the last value of the LUT as GCMAX. Did I
> >> misunderstand anything there ?
> > It should be:
> > PAL_PREC[max] = lut[size-2];
> > GCMAX = lut[size-1];
> >
> > Oh, we also need to increase the LUT size by one to make that
> > work correctly.
> 
> Yep, that's why I was not so sure about this, and thought programming a 
> 1.0 is a better idea.
> 
> So now the new size would be 257 * 256 * 128 = 263,168 entries.I will do 
> changes and send V3.

Should be just 256*128*8+1=262145
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/i915_pci.c b/drivers/gpu/drm/i915/i915_pci.c
index ffa2ee70a03d..83698951760b 100644
--- a/drivers/gpu/drm/i915/i915_pci.c
+++ b/drivers/gpu/drm/i915/i915_pci.c
@@ -749,7 +749,8 @@  static const struct intel_device_info intel_cannonlake_info = {
 	GEN(11), \
 	.ddb_size = 2048, \
 	.has_logical_ring_elsq = 1, \
-	.color = { .degamma_lut_size = 33, .gamma_lut_size = 1024 }
+	.color = { .degamma_lut_size = 33, .gamma_lut_size = 262144 }
+
 
 static const struct intel_device_info intel_icelake_11_info = {
 	GEN11_FEATURES,
diff --git a/drivers/gpu/drm/i915/intel_color.c b/drivers/gpu/drm/i915/intel_color.c
index 6c341bea514c..49831e8d02fb 100644
--- a/drivers/gpu/drm/i915/intel_color.c
+++ b/drivers/gpu/drm/i915/intel_color.c
@@ -41,6 +41,9 @@ 
 #define CTM_COEFF_ABS(coeff)		((coeff) & (CTM_COEFF_SIGN - 1))
 
 #define LEGACY_LUT_LENGTH		256
+#define ICL_GAMMA_MULTISEG_LUT_LENGTH		(256 * 128 * 8)
+#define ICL_GAMMA_SUPERFINE_SEG_LENGTH	9
+
 /*
  * Extract the CSC coefficient from a CTM coefficient (in U32.32 fixed point
  * format). This macro takes the coefficient we want transformed and the
@@ -767,6 +770,113 @@  static void glk_load_luts(const struct intel_crtc_state *crtc_state)
 	}
 }
 
+/* ilk+ "12.4" interpolated format (high 10 bits) */
+static u32 ilk_lut_12p4_ldw(const struct drm_color_lut *color)
+{
+	return (color->red >> 6) << 20 | (color->green >> 6) << 10 |
+		(color->blue >> 6);
+}
+
+/* ilk+ "12.4" interpolated format (low 6 bits) */
+static u32 ilk_lut_12p4_udw(const struct drm_color_lut *color)
+{
+	return (color->red & 0x3f) << 24 | (color->green & 0x3f) << 14 |
+		(color->blue & 0x3f);
+}
+
+static void
+icl_load_gcmax(const struct intel_crtc_state *crtc_state,
+			const struct drm_color_lut *entry)
+{
+	struct intel_crtc *crtc = to_intel_crtc(crtc_state->base.crtc);
+	struct drm_i915_private *dev_priv = to_i915(crtc->base.dev);
+	enum pipe pipe = crtc->pipe;
+
+	/* Fixme: LUT entries are 16 bit only, so we can prog 0xFFFF max */
+	I915_WRITE(PREC_PAL_GC_MAX(pipe, 0), entry->red);
+	I915_WRITE(PREC_PAL_GC_MAX(pipe, 1), entry->green);
+	I915_WRITE(PREC_PAL_GC_MAX(pipe, 2), entry->blue);
+}
+
+static void
+icl_program_gamma_superfine_segment(const struct intel_crtc_state *crtc_state)
+{
+	struct intel_crtc *crtc = to_intel_crtc(crtc_state->base.crtc);
+	struct drm_i915_private *dev_priv = to_i915(crtc->base.dev);
+	const struct drm_property_blob *blob = crtc_state->base.gamma_lut;
+	const struct drm_color_lut *lut = blob->data;
+	enum pipe pipe = crtc->pipe;
+	u32 i;
+
+	if (!lut || drm_color_lut_size(blob) < ICL_GAMMA_SUPERFINE_SEG_LENGTH)
+		return;
+
+	/*
+	 * Every entry in the multi-segment LUT is corresponding to a superfine
+	 * segment step which is 1/(8 * 128 * 256).
+	 *
+	 * Superfine segment has 9 entries, corresponding to values
+	 * 0, 1/(8 * 128 * 256), 2/(8 * 128 * 256) .... 8/(8 * 128 * 256).
+	 */
+	I915_WRITE(PREC_PAL_MULTI_SEG_INDEX(pipe), PAL_PREC_AUTO_INCREMENT);
+
+	for (i = 0; i < 9; i++) {
+		const struct drm_color_lut *entry = &lut[i];
+
+		I915_WRITE(PREC_PAL_MULTI_SEG_DATA(pipe),
+			   ilk_lut_12p4_udw(entry));
+		I915_WRITE(PREC_PAL_MULTI_SEG_DATA(pipe),
+			   ilk_lut_12p4_ldw(entry));
+	}
+}
+
+static void
+icl_program_gamma_multi_segment(const struct intel_crtc_state *crtc_state)
+{
+	struct intel_crtc *crtc = to_intel_crtc(crtc_state->base.crtc);
+	struct drm_i915_private *dev_priv = to_i915(crtc->base.dev);
+	const struct drm_property_blob *blob = crtc_state->base.gamma_lut;
+	const struct drm_color_lut *lut = blob->data;
+	const struct drm_color_lut *entry;
+	enum pipe pipe = crtc->pipe;
+	u32 i;
+
+	if (!lut || (drm_color_lut_size(blob) < ICL_GAMMA_MULTISEG_LUT_LENGTH))
+		return;
+
+	/*
+	 * Every entry in the multi-segment LUT is corresponding to a superfine
+	 * segment step which is 1/(8*128*256).
+	 *
+	 * Fine segment's step is 1/(128 * 256) ie 1/(128 * 256),  2/(128*256)
+	 * ... 256/(128*256). So in order to program fine segment of LUT we
+	 * need to pick every 8'th entry in LUT, and program 256 indexes.
+	 * Fine segment's index 0 is programmed in HW, and it starts from
+	 * index 1.
+	 */
+	I915_WRITE(PREC_PAL_INDEX(pipe), PAL_PREC_AUTO_INCREMENT);
+	for (i = 1; i < 257; i++) {
+		entry = &lut[i * 8];
+		I915_WRITE(PREC_PAL_DATA(pipe), ilk_lut_12p4_udw(entry));
+		I915_WRITE(PREC_PAL_DATA(pipe), ilk_lut_12p4_ldw(entry));
+	}
+
+	/*
+	 * Coarse segment's starts from index 0 and it's step is 1/256 ie 0,
+	 * 1/256, 2/256 ...256/256. As per the description of each entry in LUT
+	 * above, we need to pick every 8 * 128 = 1024th entry in LUT, and
+	 * program 256 of those.
+	 */
+	for (i = 0; i < 256; i++) {
+		entry = &lut[i * 1024];
+		I915_WRITE(PREC_PAL_DATA(pipe), ilk_lut_12p4_udw(entry));
+		I915_WRITE(PREC_PAL_DATA(pipe), ilk_lut_12p4_ldw(entry));
+	}
+
+	icl_load_gcmax(crtc_state, entry);
+	ivb_load_lut_ext_max(crtc);
+}
+
 static void icl_load_luts(const struct intel_crtc_state *crtc_state)
 {
 	const struct drm_property_blob *gamma_lut = crtc_state->base.gamma_lut;
@@ -775,10 +885,17 @@  static void icl_load_luts(const struct intel_crtc_state *crtc_state)
 	if (crtc_state->base.degamma_lut)
 		glk_load_degamma_lut(crtc_state);
 
-	if ((crtc_state->gamma_mode & GAMMA_MODE_MODE_MASK) ==
-	    GAMMA_MODE_MODE_8BIT) {
+	switch (crtc_state->gamma_mode & GAMMA_MODE_MODE_MASK) {
+	case GAMMA_MODE_MODE_8BIT:
 		i9xx_load_luts(crtc_state);
-	} else {
+		break;
+
+	case GAMMA_MODE_MODE_12BIT_MULTI_SEGMENTED:
+		icl_program_gamma_superfine_segment(crtc_state);
+		icl_program_gamma_multi_segment(crtc_state);
+		break;
+
+	default:
 		bdw_load_lut_10(crtc, gamma_lut, PAL_PREC_INDEX_VALUE(0));
 		ivb_load_lut_ext_max(crtc);
 	}
@@ -1209,7 +1326,7 @@  static u32 icl_gamma_mode(const struct intel_crtc_state *crtc_state)
 	    crtc_state_is_legacy_gamma(crtc_state))
 		gamma_mode |= GAMMA_MODE_MODE_8BIT;
 	else
-		gamma_mode |= GAMMA_MODE_MODE_10BIT;
+		gamma_mode |= GAMMA_MODE_MODE_12BIT_MULTI_SEGMENTED;
 
 	return gamma_mode;
 }