diff mbox series

[6/6] drm: vkms: Refactor the plane composer to accept new formats

Message ID 20211005201637.58563-7-igormtorrente@gmail.com (mailing list archive)
State New, archived
Headers show
Series Refactor the vkms to accept new formats | expand

Commit Message

Igor Matheus Andrade Torrente Oct. 5, 2021, 8:16 p.m. UTC
Currently the blend function only accepts XRGB_8888 and ARGB_8888
as a color input.

This patch refactors all the functions related to the plane composition
to overcome this limitation.

Now the blend function receives a format handler for each plane and a
blend function pointer. It will take two ARGB_16161616 pixels, one
for each handler, and will use the blend function to calculate and
store the final color in the output buffer.

These format handlers will receive the `vkms_composer` and a pair of
coordinates. And they should return the respective pixel in the
ARGB_16161616 format.

The blend function will receive two ARGB_16161616 pixels, x, y, and
the vkms_composer of the output buffer. It should perform the
blend operation and store the output in the aforementioned
ARGB_16161616 format.

Signed-off-by: Igor Matheus Andrade Torrente <igormtorrente@gmail.com>
---
 drivers/gpu/drm/vkms/vkms_composer.c | 275 ++++++++++++++-------------
 drivers/gpu/drm/vkms/vkms_formats.h  | 125 ++++++++++++
 2 files changed, 271 insertions(+), 129 deletions(-)
 create mode 100644 drivers/gpu/drm/vkms/vkms_formats.h

Comments

kernel test robot Oct. 5, 2021, 10:20 p.m. UTC | #1
Hi Igor,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on linus/master]
[also build test ERROR on v5.15-rc3 next-20210922]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Igor-Matheus-Andrade-Torrente/Refactor-the-vkms-to-accept-new-formats/20211006-042037
base:   https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git 02d5e016800d082058b3d3b7c3ede136cdc6ddcb
config: riscv-buildonly-randconfig-r005-20211004 (attached as .config)
compiler: riscv32-linux-gcc (GCC) 11.2.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/0day-ci/linux/commit/9cd34ac9858091dc06086b2024e8f5f111657d48
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Igor-Matheus-Andrade-Torrente/Refactor-the-vkms-to-accept-new-formats/20211006-042037
        git checkout 9cd34ac9858091dc06086b2024e8f5f111657d48
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-11.2.0 make.cross ARCH=riscv 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   In file included from drivers/gpu/drm/vkms/vkms_composer.c:12:
>> drivers/gpu/drm/vkms/vkms_formats.h:24:7: error: no previous prototype for 'packed_pixels_addr' [-Werror=missing-prototypes]
      24 | void *packed_pixels_addr(struct vkms_composer *composer, int x, int y)
         |       ^~~~~~~~~~~~~~~~~~
>> drivers/gpu/drm/vkms/vkms_formats.h:31:5: error: no previous prototype for 'ARGB8888_to_ARGB16161616' [-Werror=missing-prototypes]
      31 | u64 ARGB8888_to_ARGB16161616(struct vkms_composer *composer, int x, int y)
         |     ^~~~~~~~~~~~~~~~~~~~~~~~
>> drivers/gpu/drm/vkms/vkms_formats.h:49:5: error: no previous prototype for 'XRGB8888_to_ARGB16161616' [-Werror=missing-prototypes]
      49 | u64 XRGB8888_to_ARGB16161616(struct vkms_composer *composer, int x, int y)
         |     ^~~~~~~~~~~~~~~~~~~~~~~~
>> drivers/gpu/drm/vkms/vkms_formats.h:63:5: error: no previous prototype for 'get_ARGB16161616' [-Werror=missing-prototypes]
      63 | u64 get_ARGB16161616(struct vkms_composer *composer, int x, int y)
         |     ^~~~~~~~~~~~~~~~
>> drivers/gpu/drm/vkms/vkms_formats.h:85:6: error: no previous prototype for 'convert_to_ARGB8888' [-Werror=missing-prototypes]
      85 | void convert_to_ARGB8888(u64 argb_src1, u64 argb_src2, int x, int y,
         |      ^~~~~~~~~~~~~~~~~~~
>> drivers/gpu/drm/vkms/vkms_formats.h:106:6: error: no previous prototype for 'convert_to_XRGB8888' [-Werror=missing-prototypes]
     106 | void convert_to_XRGB8888(u64 argb_src1, u64 argb_src2, int x, int y,
         |      ^~~~~~~~~~~~~~~~~~~
>> drivers/gpu/drm/vkms/vkms_formats.h:117:6: error: no previous prototype for 'convert_to_ARGB16161616' [-Werror=missing-prototypes]
     117 | void convert_to_ARGB16161616(u64 argb_src1, u64 argb_src2, int x, int y,
         |      ^~~~~~~~~~~~~~~~~~~~~~~
   cc1: all warnings being treated as errors


vim +/packed_pixels_addr +24 drivers/gpu/drm/vkms/vkms_formats.h

     7	
     8	#define pixel_offset(composer, x, y) \
     9		((composer)->offset + ((y) * (composer)->pitch) + ((x) * (composer)->cpp))
    10	
    11	/*
    12	 * packed_pixels_addr - Get the pointer to pixel of a given pair of coordinates
    13	 *
    14	 * @composer: Buffer metadata
    15	 * @x: The x(width) coordinate of the 2D buffer
    16	 * @y: The y(Heigth) coordinate of the 2D buffer
    17	 *
    18	 * Takes the information stored in the composer, a pair of coordinates, and
    19	 * returns the address of the first color channel.
    20	 * This function assumes the channels are packed together, i.e. a color channel
    21	 * comes immediately after another. And therefore, this function doesn't work
    22	 * for YUV with chroma subsampling (e.g. YUV420 and NV21).
    23	 */
  > 24	void *packed_pixels_addr(struct vkms_composer *composer, int x, int y)
    25	{
    26		int offset = pixel_offset(composer, x, y);
    27	
    28		return (u8 *)composer->map[0].vaddr + offset;
    29	}
    30	
  > 31	u64 ARGB8888_to_ARGB16161616(struct vkms_composer *composer, int x, int y)
    32	{
    33		u8 *pixel_addr = packed_pixels_addr(composer, x, y);
    34	
    35		/*
    36		 * Organizes the channels in their respective positions and converts
    37		 * the 8 bits channel to 16.
    38		 * The 257 is the "conversion ratio". This number is obtained by the
    39		 * (2^16 - 1) / (2^8 - 1) division. Which, in this case, tries to get
    40		 * the best color value in a color space with more possibilities.
    41		 * And a similar idea applies to others RGB color conversions.
    42		 */
    43		return ((u64)pixel_addr[3] * 257) << 48 |
    44		       ((u64)pixel_addr[2] * 257) << 32 |
    45		       ((u64)pixel_addr[1] * 257) << 16 |
    46		       ((u64)pixel_addr[0] * 257);
    47	}
    48	
  > 49	u64 XRGB8888_to_ARGB16161616(struct vkms_composer *composer, int x, int y)
    50	{
    51		u8 *pixel_addr = packed_pixels_addr(composer, x, y);
    52	
    53		/*
    54		 * The same as the ARGB8888 but with the alpha channel as the
    55		 * maximum value as possible.
    56		 */
    57		return 0xffffllu << 48 |
    58		       ((u64)pixel_addr[2] * 257) << 32 |
    59		       ((u64)pixel_addr[1] * 257) << 16 |
    60		       ((u64)pixel_addr[0] * 257);
    61	}
    62	
  > 63	u64 get_ARGB16161616(struct vkms_composer *composer, int x, int y)
    64	{
    65		__le64 *pixel_addr = packed_pixels_addr(composer, x, y);
    66	
    67		/*
    68		 * Because the format byte order is in little-endian and this code
    69		 * needs to run on big-endian machines too, we need modify
    70		 * the byte order from little-endian to the CPU native byte order.
    71		 */
    72		return le64_to_cpu(*pixel_addr);
    73	}
    74	
    75	/*
    76	 * The following functions are used as blend operations. But unlike the
    77	 * `alpha_blend`, these functions take an ARGB16161616 pixel from the
    78	 * source, convert it to a specific format, and store it in the destination.
    79	 *
    80	 * They are used in the `compose_active_planes` and `write_wb_buffer` to
    81	 * copy and convert one pixel from/to the output buffer to/from
    82	 * another buffer (e.g. writeback buffer, primary plane buffer).
    83	 */
    84	
  > 85	void convert_to_ARGB8888(u64 argb_src1, u64 argb_src2, int x, int y,
    86				 struct vkms_composer *dst_composer)
    87	{
    88		u8 *pixel_addr = packed_pixels_addr(dst_composer, x, y);
    89	
    90		/*
    91		 * This sequence below is important because the format's byte order is
    92		 * in little-endian. In the case of the ARGB8888 the memory is
    93		 * organized this way:
    94		 *
    95		 * | Addr     | = blue channel
    96		 * | Addr + 1 | = green channel
    97		 * | Addr + 2 | = Red channel
    98		 * | Addr + 3 | = Alpha channel
    99		 */
   100		pixel_addr[0] = DIV_ROUND_UP(argb_src1 & 0xffffllu, 257);
   101		pixel_addr[1] = DIV_ROUND_UP((argb_src1 & (0xffffllu << 16)) >> 16, 257);
   102		pixel_addr[2] = DIV_ROUND_UP((argb_src1 & (0xffffllu << 32)) >> 32, 257);
   103		pixel_addr[3] = DIV_ROUND_UP((argb_src1 & (0xffffllu << 48)) >> 48, 257);
   104	}
   105	
 > 106	void convert_to_XRGB8888(u64 argb_src1, u64 argb_src2, int x, int y,
   107				 struct vkms_composer *dst_composer)
   108	{
   109		u8 *pixel_addr = packed_pixels_addr(dst_composer, x, y);
   110	
   111		pixel_addr[0] = DIV_ROUND_UP(argb_src1 & 0xffffllu, 257);
   112		pixel_addr[1] = DIV_ROUND_UP((argb_src1 & (0xffffllu << 16)) >> 16, 257);
   113		pixel_addr[2] = DIV_ROUND_UP((argb_src1 & (0xffffllu << 32)) >> 32, 257);
   114		pixel_addr[3] = 0xff;
   115	}
   116	
 > 117	void convert_to_ARGB16161616(u64 argb_src1, u64 argb_src2, int x, int y,
   118				     struct vkms_composer *dst_composer)
   119	{
   120		__le64 *pixel_addr = packed_pixels_addr(dst_composer, x, y);
   121	
   122		*pixel_addr = cpu_to_le64(argb_src1);
   123	}
   124	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
kernel test robot Oct. 5, 2021, 11:18 p.m. UTC | #2
Hi Igor,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on linus/master]
[also build test WARNING on v5.15-rc3 next-20210922]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Igor-Matheus-Andrade-Torrente/Refactor-the-vkms-to-accept-new-formats/20211006-042037
base:   https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git 02d5e016800d082058b3d3b7c3ede136cdc6ddcb
config: x86_64-randconfig-a003-20211004 (attached as .config)
compiler: clang version 14.0.0 (https://github.com/llvm/llvm-project c0039de2953d15815448b4b3c3bafb45607781e0)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/0day-ci/linux/commit/9cd34ac9858091dc06086b2024e8f5f111657d48
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Igor-Matheus-Andrade-Torrente/Refactor-the-vkms-to-accept-new-formats/20211006-042037
        git checkout 9cd34ac9858091dc06086b2024e8f5f111657d48
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 ARCH=x86_64 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

   In file included from drivers/gpu/drm/vkms/vkms_composer.c:12:
>> drivers/gpu/drm/vkms/vkms_formats.h:24:7: warning: no previous prototype for function 'packed_pixels_addr' [-Wmissing-prototypes]
   void *packed_pixels_addr(struct vkms_composer *composer, int x, int y)
         ^
   drivers/gpu/drm/vkms/vkms_formats.h:24:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
   void *packed_pixels_addr(struct vkms_composer *composer, int x, int y)
   ^
   static 
>> drivers/gpu/drm/vkms/vkms_formats.h:31:5: warning: no previous prototype for function 'ARGB8888_to_ARGB16161616' [-Wmissing-prototypes]
   u64 ARGB8888_to_ARGB16161616(struct vkms_composer *composer, int x, int y)
       ^
   drivers/gpu/drm/vkms/vkms_formats.h:31:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
   u64 ARGB8888_to_ARGB16161616(struct vkms_composer *composer, int x, int y)
   ^
   static 
>> drivers/gpu/drm/vkms/vkms_formats.h:49:5: warning: no previous prototype for function 'XRGB8888_to_ARGB16161616' [-Wmissing-prototypes]
   u64 XRGB8888_to_ARGB16161616(struct vkms_composer *composer, int x, int y)
       ^
   drivers/gpu/drm/vkms/vkms_formats.h:49:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
   u64 XRGB8888_to_ARGB16161616(struct vkms_composer *composer, int x, int y)
   ^
   static 
>> drivers/gpu/drm/vkms/vkms_formats.h:63:5: warning: no previous prototype for function 'get_ARGB16161616' [-Wmissing-prototypes]
   u64 get_ARGB16161616(struct vkms_composer *composer, int x, int y)
       ^
   drivers/gpu/drm/vkms/vkms_formats.h:63:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
   u64 get_ARGB16161616(struct vkms_composer *composer, int x, int y)
   ^
   static 
>> drivers/gpu/drm/vkms/vkms_formats.h:85:6: warning: no previous prototype for function 'convert_to_ARGB8888' [-Wmissing-prototypes]
   void convert_to_ARGB8888(u64 argb_src1, u64 argb_src2, int x, int y,
        ^
   drivers/gpu/drm/vkms/vkms_formats.h:85:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
   void convert_to_ARGB8888(u64 argb_src1, u64 argb_src2, int x, int y,
   ^
   static 
>> drivers/gpu/drm/vkms/vkms_formats.h:106:6: warning: no previous prototype for function 'convert_to_XRGB8888' [-Wmissing-prototypes]
   void convert_to_XRGB8888(u64 argb_src1, u64 argb_src2, int x, int y,
        ^
   drivers/gpu/drm/vkms/vkms_formats.h:106:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
   void convert_to_XRGB8888(u64 argb_src1, u64 argb_src2, int x, int y,
   ^
   static 
>> drivers/gpu/drm/vkms/vkms_formats.h:117:6: warning: no previous prototype for function 'convert_to_ARGB16161616' [-Wmissing-prototypes]
   void convert_to_ARGB16161616(u64 argb_src1, u64 argb_src2, int x, int y,
        ^
   drivers/gpu/drm/vkms/vkms_formats.h:117:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
   void convert_to_ARGB16161616(u64 argb_src1, u64 argb_src2, int x, int y,
   ^
   static 
   7 warnings generated.


vim +/packed_pixels_addr +24 drivers/gpu/drm/vkms/vkms_formats.h

     7	
     8	#define pixel_offset(composer, x, y) \
     9		((composer)->offset + ((y) * (composer)->pitch) + ((x) * (composer)->cpp))
    10	
    11	/*
    12	 * packed_pixels_addr - Get the pointer to pixel of a given pair of coordinates
    13	 *
    14	 * @composer: Buffer metadata
    15	 * @x: The x(width) coordinate of the 2D buffer
    16	 * @y: The y(Heigth) coordinate of the 2D buffer
    17	 *
    18	 * Takes the information stored in the composer, a pair of coordinates, and
    19	 * returns the address of the first color channel.
    20	 * This function assumes the channels are packed together, i.e. a color channel
    21	 * comes immediately after another. And therefore, this function doesn't work
    22	 * for YUV with chroma subsampling (e.g. YUV420 and NV21).
    23	 */
  > 24	void *packed_pixels_addr(struct vkms_composer *composer, int x, int y)
    25	{
    26		int offset = pixel_offset(composer, x, y);
    27	
    28		return (u8 *)composer->map[0].vaddr + offset;
    29	}
    30	
  > 31	u64 ARGB8888_to_ARGB16161616(struct vkms_composer *composer, int x, int y)
    32	{
    33		u8 *pixel_addr = packed_pixels_addr(composer, x, y);
    34	
    35		/*
    36		 * Organizes the channels in their respective positions and converts
    37		 * the 8 bits channel to 16.
    38		 * The 257 is the "conversion ratio". This number is obtained by the
    39		 * (2^16 - 1) / (2^8 - 1) division. Which, in this case, tries to get
    40		 * the best color value in a color space with more possibilities.
    41		 * And a similar idea applies to others RGB color conversions.
    42		 */
    43		return ((u64)pixel_addr[3] * 257) << 48 |
    44		       ((u64)pixel_addr[2] * 257) << 32 |
    45		       ((u64)pixel_addr[1] * 257) << 16 |
    46		       ((u64)pixel_addr[0] * 257);
    47	}
    48	
  > 49	u64 XRGB8888_to_ARGB16161616(struct vkms_composer *composer, int x, int y)
    50	{
    51		u8 *pixel_addr = packed_pixels_addr(composer, x, y);
    52	
    53		/*
    54		 * The same as the ARGB8888 but with the alpha channel as the
    55		 * maximum value as possible.
    56		 */
    57		return 0xffffllu << 48 |
    58		       ((u64)pixel_addr[2] * 257) << 32 |
    59		       ((u64)pixel_addr[1] * 257) << 16 |
    60		       ((u64)pixel_addr[0] * 257);
    61	}
    62	
  > 63	u64 get_ARGB16161616(struct vkms_composer *composer, int x, int y)
    64	{
    65		__le64 *pixel_addr = packed_pixels_addr(composer, x, y);
    66	
    67		/*
    68		 * Because the format byte order is in little-endian and this code
    69		 * needs to run on big-endian machines too, we need modify
    70		 * the byte order from little-endian to the CPU native byte order.
    71		 */
    72		return le64_to_cpu(*pixel_addr);
    73	}
    74	
    75	/*
    76	 * The following functions are used as blend operations. But unlike the
    77	 * `alpha_blend`, these functions take an ARGB16161616 pixel from the
    78	 * source, convert it to a specific format, and store it in the destination.
    79	 *
    80	 * They are used in the `compose_active_planes` and `write_wb_buffer` to
    81	 * copy and convert one pixel from/to the output buffer to/from
    82	 * another buffer (e.g. writeback buffer, primary plane buffer).
    83	 */
    84	
  > 85	void convert_to_ARGB8888(u64 argb_src1, u64 argb_src2, int x, int y,
    86				 struct vkms_composer *dst_composer)
    87	{
    88		u8 *pixel_addr = packed_pixels_addr(dst_composer, x, y);
    89	
    90		/*
    91		 * This sequence below is important because the format's byte order is
    92		 * in little-endian. In the case of the ARGB8888 the memory is
    93		 * organized this way:
    94		 *
    95		 * | Addr     | = blue channel
    96		 * | Addr + 1 | = green channel
    97		 * | Addr + 2 | = Red channel
    98		 * | Addr + 3 | = Alpha channel
    99		 */
   100		pixel_addr[0] = DIV_ROUND_UP(argb_src1 & 0xffffllu, 257);
   101		pixel_addr[1] = DIV_ROUND_UP((argb_src1 & (0xffffllu << 16)) >> 16, 257);
   102		pixel_addr[2] = DIV_ROUND_UP((argb_src1 & (0xffffllu << 32)) >> 32, 257);
   103		pixel_addr[3] = DIV_ROUND_UP((argb_src1 & (0xffffllu << 48)) >> 48, 257);
   104	}
   105	
 > 106	void convert_to_XRGB8888(u64 argb_src1, u64 argb_src2, int x, int y,
   107				 struct vkms_composer *dst_composer)
   108	{
   109		u8 *pixel_addr = packed_pixels_addr(dst_composer, x, y);
   110	
   111		pixel_addr[0] = DIV_ROUND_UP(argb_src1 & 0xffffllu, 257);
   112		pixel_addr[1] = DIV_ROUND_UP((argb_src1 & (0xffffllu << 16)) >> 16, 257);
   113		pixel_addr[2] = DIV_ROUND_UP((argb_src1 & (0xffffllu << 32)) >> 32, 257);
   114		pixel_addr[3] = 0xff;
   115	}
   116	
 > 117	void convert_to_ARGB16161616(u64 argb_src1, u64 argb_src2, int x, int y,
   118				     struct vkms_composer *dst_composer)
   119	{
   120		__le64 *pixel_addr = packed_pixels_addr(dst_composer, x, y);
   121	
   122		*pixel_addr = cpu_to_le64(argb_src1);
   123	}
   124	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
kernel test robot Oct. 5, 2021, 11:36 p.m. UTC | #3
Hi Igor,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on linus/master]
[also build test ERROR on v5.15-rc3 next-20210922]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Igor-Matheus-Andrade-Torrente/Refactor-the-vkms-to-accept-new-formats/20211006-042037
base:   https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git 02d5e016800d082058b3d3b7c3ede136cdc6ddcb
config: i386-buildonly-randconfig-r004-20211004 (attached as .config)
compiler: clang version 14.0.0 (https://github.com/llvm/llvm-project c0039de2953d15815448b4b3c3bafb45607781e0)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/0day-ci/linux/commit/9cd34ac9858091dc06086b2024e8f5f111657d48
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Igor-Matheus-Andrade-Torrente/Refactor-the-vkms-to-accept-new-formats/20211006-042037
        git checkout 9cd34ac9858091dc06086b2024e8f5f111657d48
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 ARCH=i386 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   In file included from drivers/gpu/drm/vkms/vkms_composer.c:12:
>> drivers/gpu/drm/vkms/vkms_formats.h:24:7: error: no previous prototype for function 'packed_pixels_addr' [-Werror,-Wmissing-prototypes]
   void *packed_pixels_addr(struct vkms_composer *composer, int x, int y)
         ^
   drivers/gpu/drm/vkms/vkms_formats.h:24:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
   void *packed_pixels_addr(struct vkms_composer *composer, int x, int y)
   ^
   static 
>> drivers/gpu/drm/vkms/vkms_formats.h:31:5: error: no previous prototype for function 'ARGB8888_to_ARGB16161616' [-Werror,-Wmissing-prototypes]
   u64 ARGB8888_to_ARGB16161616(struct vkms_composer *composer, int x, int y)
       ^
   drivers/gpu/drm/vkms/vkms_formats.h:31:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
   u64 ARGB8888_to_ARGB16161616(struct vkms_composer *composer, int x, int y)
   ^
   static 
>> drivers/gpu/drm/vkms/vkms_formats.h:49:5: error: no previous prototype for function 'XRGB8888_to_ARGB16161616' [-Werror,-Wmissing-prototypes]
   u64 XRGB8888_to_ARGB16161616(struct vkms_composer *composer, int x, int y)
       ^
   drivers/gpu/drm/vkms/vkms_formats.h:49:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
   u64 XRGB8888_to_ARGB16161616(struct vkms_composer *composer, int x, int y)
   ^
   static 
>> drivers/gpu/drm/vkms/vkms_formats.h:63:5: error: no previous prototype for function 'get_ARGB16161616' [-Werror,-Wmissing-prototypes]
   u64 get_ARGB16161616(struct vkms_composer *composer, int x, int y)
       ^
   drivers/gpu/drm/vkms/vkms_formats.h:63:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
   u64 get_ARGB16161616(struct vkms_composer *composer, int x, int y)
   ^
   static 
>> drivers/gpu/drm/vkms/vkms_formats.h:85:6: error: no previous prototype for function 'convert_to_ARGB8888' [-Werror,-Wmissing-prototypes]
   void convert_to_ARGB8888(u64 argb_src1, u64 argb_src2, int x, int y,
        ^
   drivers/gpu/drm/vkms/vkms_formats.h:85:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
   void convert_to_ARGB8888(u64 argb_src1, u64 argb_src2, int x, int y,
   ^
   static 
>> drivers/gpu/drm/vkms/vkms_formats.h:106:6: error: no previous prototype for function 'convert_to_XRGB8888' [-Werror,-Wmissing-prototypes]
   void convert_to_XRGB8888(u64 argb_src1, u64 argb_src2, int x, int y,
        ^
   drivers/gpu/drm/vkms/vkms_formats.h:106:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
   void convert_to_XRGB8888(u64 argb_src1, u64 argb_src2, int x, int y,
   ^
   static 
>> drivers/gpu/drm/vkms/vkms_formats.h:117:6: error: no previous prototype for function 'convert_to_ARGB16161616' [-Werror,-Wmissing-prototypes]
   void convert_to_ARGB16161616(u64 argb_src1, u64 argb_src2, int x, int y,
        ^
   drivers/gpu/drm/vkms/vkms_formats.h:117:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
   void convert_to_ARGB16161616(u64 argb_src1, u64 argb_src2, int x, int y,
   ^
   static 
   7 errors generated.


vim +/packed_pixels_addr +24 drivers/gpu/drm/vkms/vkms_formats.h

     7	
     8	#define pixel_offset(composer, x, y) \
     9		((composer)->offset + ((y) * (composer)->pitch) + ((x) * (composer)->cpp))
    10	
    11	/*
    12	 * packed_pixels_addr - Get the pointer to pixel of a given pair of coordinates
    13	 *
    14	 * @composer: Buffer metadata
    15	 * @x: The x(width) coordinate of the 2D buffer
    16	 * @y: The y(Heigth) coordinate of the 2D buffer
    17	 *
    18	 * Takes the information stored in the composer, a pair of coordinates, and
    19	 * returns the address of the first color channel.
    20	 * This function assumes the channels are packed together, i.e. a color channel
    21	 * comes immediately after another. And therefore, this function doesn't work
    22	 * for YUV with chroma subsampling (e.g. YUV420 and NV21).
    23	 */
  > 24	void *packed_pixels_addr(struct vkms_composer *composer, int x, int y)
    25	{
    26		int offset = pixel_offset(composer, x, y);
    27	
    28		return (u8 *)composer->map[0].vaddr + offset;
    29	}
    30	
  > 31	u64 ARGB8888_to_ARGB16161616(struct vkms_composer *composer, int x, int y)
    32	{
    33		u8 *pixel_addr = packed_pixels_addr(composer, x, y);
    34	
    35		/*
    36		 * Organizes the channels in their respective positions and converts
    37		 * the 8 bits channel to 16.
    38		 * The 257 is the "conversion ratio". This number is obtained by the
    39		 * (2^16 - 1) / (2^8 - 1) division. Which, in this case, tries to get
    40		 * the best color value in a color space with more possibilities.
    41		 * And a similar idea applies to others RGB color conversions.
    42		 */
    43		return ((u64)pixel_addr[3] * 257) << 48 |
    44		       ((u64)pixel_addr[2] * 257) << 32 |
    45		       ((u64)pixel_addr[1] * 257) << 16 |
    46		       ((u64)pixel_addr[0] * 257);
    47	}
    48	
  > 49	u64 XRGB8888_to_ARGB16161616(struct vkms_composer *composer, int x, int y)
    50	{
    51		u8 *pixel_addr = packed_pixels_addr(composer, x, y);
    52	
    53		/*
    54		 * The same as the ARGB8888 but with the alpha channel as the
    55		 * maximum value as possible.
    56		 */
    57		return 0xffffllu << 48 |
    58		       ((u64)pixel_addr[2] * 257) << 32 |
    59		       ((u64)pixel_addr[1] * 257) << 16 |
    60		       ((u64)pixel_addr[0] * 257);
    61	}
    62	
  > 63	u64 get_ARGB16161616(struct vkms_composer *composer, int x, int y)
    64	{
    65		__le64 *pixel_addr = packed_pixels_addr(composer, x, y);
    66	
    67		/*
    68		 * Because the format byte order is in little-endian and this code
    69		 * needs to run on big-endian machines too, we need modify
    70		 * the byte order from little-endian to the CPU native byte order.
    71		 */
    72		return le64_to_cpu(*pixel_addr);
    73	}
    74	
    75	/*
    76	 * The following functions are used as blend operations. But unlike the
    77	 * `alpha_blend`, these functions take an ARGB16161616 pixel from the
    78	 * source, convert it to a specific format, and store it in the destination.
    79	 *
    80	 * They are used in the `compose_active_planes` and `write_wb_buffer` to
    81	 * copy and convert one pixel from/to the output buffer to/from
    82	 * another buffer (e.g. writeback buffer, primary plane buffer).
    83	 */
    84	
  > 85	void convert_to_ARGB8888(u64 argb_src1, u64 argb_src2, int x, int y,
    86				 struct vkms_composer *dst_composer)
    87	{
    88		u8 *pixel_addr = packed_pixels_addr(dst_composer, x, y);
    89	
    90		/*
    91		 * This sequence below is important because the format's byte order is
    92		 * in little-endian. In the case of the ARGB8888 the memory is
    93		 * organized this way:
    94		 *
    95		 * | Addr     | = blue channel
    96		 * | Addr + 1 | = green channel
    97		 * | Addr + 2 | = Red channel
    98		 * | Addr + 3 | = Alpha channel
    99		 */
   100		pixel_addr[0] = DIV_ROUND_UP(argb_src1 & 0xffffllu, 257);
   101		pixel_addr[1] = DIV_ROUND_UP((argb_src1 & (0xffffllu << 16)) >> 16, 257);
   102		pixel_addr[2] = DIV_ROUND_UP((argb_src1 & (0xffffllu << 32)) >> 32, 257);
   103		pixel_addr[3] = DIV_ROUND_UP((argb_src1 & (0xffffllu << 48)) >> 48, 257);
   104	}
   105	
 > 106	void convert_to_XRGB8888(u64 argb_src1, u64 argb_src2, int x, int y,
   107				 struct vkms_composer *dst_composer)
   108	{
   109		u8 *pixel_addr = packed_pixels_addr(dst_composer, x, y);
   110	
   111		pixel_addr[0] = DIV_ROUND_UP(argb_src1 & 0xffffllu, 257);
   112		pixel_addr[1] = DIV_ROUND_UP((argb_src1 & (0xffffllu << 16)) >> 16, 257);
   113		pixel_addr[2] = DIV_ROUND_UP((argb_src1 & (0xffffllu << 32)) >> 32, 257);
   114		pixel_addr[3] = 0xff;
   115	}
   116	
 > 117	void convert_to_ARGB16161616(u64 argb_src1, u64 argb_src2, int x, int y,
   118				     struct vkms_composer *dst_composer)
   119	{
   120		__le64 *pixel_addr = packed_pixels_addr(dst_composer, x, y);
   121	
   122		*pixel_addr = cpu_to_le64(argb_src1);
   123	}
   124	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
Pekka Paalanen Oct. 18, 2021, 8:30 a.m. UTC | #4
On Tue,  5 Oct 2021 17:16:37 -0300
Igor Matheus Andrade Torrente <igormtorrente@gmail.com> wrote:

> Currently the blend function only accepts XRGB_8888 and ARGB_8888
> as a color input.
> 
> This patch refactors all the functions related to the plane composition
> to overcome this limitation.
> 
> Now the blend function receives a format handler for each plane and a
> blend function pointer. It will take two ARGB_16161616 pixels, one
> for each handler, and will use the blend function to calculate and
> store the final color in the output buffer.
> 
> These format handlers will receive the `vkms_composer` and a pair of
> coordinates. And they should return the respective pixel in the
> ARGB_16161616 format.
> 
> The blend function will receive two ARGB_16161616 pixels, x, y, and
> the vkms_composer of the output buffer. It should perform the
> blend operation and store the output in the aforementioned
> ARGB_16161616 format.

Hi,

here are some drive-by comments which you are free to take or leave.

To find more efficient ways to do this, you might want to research Pixman
library. It's MIT licensed.

IIRC, the elementary operations there operate on pixel lines (pointer +
length), not individual pixels (image, x, y). Input conversion function
reads and converts a whole line at a time. Blending function blends two
lines and writes the output into back one of the lines. Output
conversion function takes a line and converts it into destination
buffer. That way the elementary operations do not need to take stride
into account, and blending does not need to deal with varying memory
alignment FWIW. The inner loops involve much less function calls and
state, probably.

Pixman may not do exactly this, but it does something very similar.
Pixman also has multiple different levels of optimisations, which may
not be necessary for VKMS.

It's a trade-off between speed and temporary memory consumed. You need
temporary buffers for two lines, but not more (not a whole image in
full intermediate precision). Further optimisation could determine
whether to process whole image rows as lines, or split a row into
multiple lines to stay within CPU cache size.

Since it seems you are blending multiple planes into a single
destination which is assumed to be opaque, you might also be able to do
the multiple blends without convert-writing and read-converting to/from
the destination between every plane. I think that might be possible to
architect on top of the per-line operations still.

> Signed-off-by: Igor Matheus Andrade Torrente <igormtorrente@gmail.com>
> ---
>  drivers/gpu/drm/vkms/vkms_composer.c | 275 ++++++++++++++-------------
>  drivers/gpu/drm/vkms/vkms_formats.h  | 125 ++++++++++++
>  2 files changed, 271 insertions(+), 129 deletions(-)
>  create mode 100644 drivers/gpu/drm/vkms/vkms_formats.h

...

> +
> +u64 ARGB8888_to_ARGB16161616(struct vkms_composer *composer, int x, int y)
> +{
> +	u8 *pixel_addr = packed_pixels_addr(composer, x, y);
> +
> +	/*
> +	 * Organizes the channels in their respective positions and converts
> +	 * the 8 bits channel to 16.
> +	 * The 257 is the "conversion ratio". This number is obtained by the
> +	 * (2^16 - 1) / (2^8 - 1) division. Which, in this case, tries to get
> +	 * the best color value in a color space with more possibilities.

Pixel format, not color space.

> +	 * And a similar idea applies to others RGB color conversions.
> +	 */
> +	return ((u64)pixel_addr[3] * 257) << 48 |
> +	       ((u64)pixel_addr[2] * 257) << 32 |
> +	       ((u64)pixel_addr[1] * 257) << 16 |
> +	       ((u64)pixel_addr[0] * 257);

I wonder if the compiler is smart enough to choose between "mul 257"
and "(v << 8) | v" operations... but that's probably totally drowning
under the overhead of per (x,y) looping.

> +}
> +
> +u64 XRGB8888_to_ARGB16161616(struct vkms_composer *composer, int x, int y)
> +{
> +	u8 *pixel_addr = packed_pixels_addr(composer, x, y);
> +
> +	/*
> +	 * The same as the ARGB8888 but with the alpha channel as the
> +	 * maximum value as possible.
> +	 */
> +	return 0xffffllu << 48 |
> +	       ((u64)pixel_addr[2] * 257) << 32 |
> +	       ((u64)pixel_addr[1] * 257) << 16 |
> +	       ((u64)pixel_addr[0] * 257);
> +}
> +
> +u64 get_ARGB16161616(struct vkms_composer *composer, int x, int y)
> +{
> +	__le64 *pixel_addr = packed_pixels_addr(composer, x, y);
> +
> +	/*
> +	 * Because the format byte order is in little-endian and this code
> +	 * needs to run on big-endian machines too, we need modify
> +	 * the byte order from little-endian to the CPU native byte order.
> +	 */
> +	return le64_to_cpu(*pixel_addr);
> +}
> +
> +/*
> + * The following functions are used as blend operations. But unlike the
> + * `alpha_blend`, these functions take an ARGB16161616 pixel from the
> + * source, convert it to a specific format, and store it in the destination.
> + *
> + * They are used in the `compose_active_planes` and `write_wb_buffer` to
> + * copy and convert one pixel from/to the output buffer to/from
> + * another buffer (e.g. writeback buffer, primary plane buffer).
> + */
> +
> +void convert_to_ARGB8888(u64 argb_src1, u64 argb_src2, int x, int y,
> +			 struct vkms_composer *dst_composer)
> +{
> +	u8 *pixel_addr = packed_pixels_addr(dst_composer, x, y);
> +
> +	/*
> +	 * This sequence below is important because the format's byte order is
> +	 * in little-endian. In the case of the ARGB8888 the memory is
> +	 * organized this way:
> +	 *
> +	 * | Addr     | = blue channel
> +	 * | Addr + 1 | = green channel
> +	 * | Addr + 2 | = Red channel
> +	 * | Addr + 3 | = Alpha channel
> +	 */
> +	pixel_addr[0] = DIV_ROUND_UP(argb_src1 & 0xffffllu, 257);
> +	pixel_addr[1] = DIV_ROUND_UP((argb_src1 & (0xffffllu << 16)) >> 16, 257);
> +	pixel_addr[2] = DIV_ROUND_UP((argb_src1 & (0xffffllu << 32)) >> 32, 257);
> +	pixel_addr[3] = DIV_ROUND_UP((argb_src1 & (0xffffllu << 48)) >> 48, 257);

This could be potentially expensive if the compiler ends up using an
actual div instruction.

Btw. this would be shorter to write as

	(argb_src1 >> 16) & 0xffff

etc.

Thanks,
pq

> +}
> +
> +void convert_to_XRGB8888(u64 argb_src1, u64 argb_src2, int x, int y,
> +			 struct vkms_composer *dst_composer)
> +{
> +	u8 *pixel_addr = packed_pixels_addr(dst_composer, x, y);
> +
> +	pixel_addr[0] = DIV_ROUND_UP(argb_src1 & 0xffffllu, 257);
> +	pixel_addr[1] = DIV_ROUND_UP((argb_src1 & (0xffffllu << 16)) >> 16, 257);
> +	pixel_addr[2] = DIV_ROUND_UP((argb_src1 & (0xffffllu << 32)) >> 32, 257);
> +	pixel_addr[3] = 0xff;
> +}
> +
> +void convert_to_ARGB16161616(u64 argb_src1, u64 argb_src2, int x, int y,
> +			     struct vkms_composer *dst_composer)
> +{
> +	__le64 *pixel_addr = packed_pixels_addr(dst_composer, x, y);
> +
> +	*pixel_addr = cpu_to_le64(argb_src1);
> +}
> +
> +#endif /* _VKMS_FORMATS_H_ */
Igor Matheus Andrade Torrente Oct. 18, 2021, 7:26 p.m. UTC | #5
Hi Pekka,

On 10/18/21 5:30 AM, Pekka Paalanen wrote:
> On Tue,  5 Oct 2021 17:16:37 -0300
> Igor Matheus Andrade Torrente <igormtorrente@gmail.com> wrote:
> 
>> Currently the blend function only accepts XRGB_8888 and ARGB_8888
>> as a color input.
>>
>> This patch refactors all the functions related to the plane composition
>> to overcome this limitation.
>>
>> Now the blend function receives a format handler to each plane and a
>> blend function pointer. It will take two ARGB_1616161616 pixels, one
>> for each handler, and will use the blend function to calculate and
>> store the final color in the output buffer.
>>
>> These format handlers will receive the `vkms_composer` and a pair of
>> coordinates. And they should return the respective pixel in the
>> ARGB_16161616 format.
>>
>> The blend function will receive two ARGB_16161616 pixels, x, y, and
>> the vkms_composer of the output buffer. The method should perform the
>> blend operation and store output to the format aforementioned
>> ARGB_16161616.
> 
> Hi,
> 
> here are some drive-by comments which you are free to take or leave.
> 
> To find more efficient ways to do this, you might want research Pixman
> library. It's MIT licensed.
> 
> IIRC, the elementary operations there operate on pixel lines (pointer +
> length), not individual pixels (image, x, y). Input conversion function
> reads and converts a whole line a time. Blending function blends two
> lines and writes the output into back one of the lines. Output
> conversion function takes a line and converts it into destination
> buffer. That way the elementary operations do not need to take stride
> into account, and blending does not need to deal with varying memory
> alignment FWIW. The inner loops involve much less function calls and
> state, probably.

I was doing some rudimentary profiling, and I noticed that the code 
spends most of the time with the indirect system call overhead and not 
the actual computation. This can definitely help with it.

> 
> Pixman may not do exactly this, but it does something very similar.
> Pixman also has multiple different levels of optimisations, which may
> not be necessary for VKMS.
> 
> It's a trade-off between speed and temporary memory consumed. You need
> temporary buffers for two lines, but not more (not a whole image in
> full intermediate precision). Further optimisation could determine
> whether to process whole image rows as lines, or split a row into
> multiple lines to stay within CPU cache size.
> 

Sorry, I didn't understand the idea of the last sentence.

> Since it seems you are blending multiple planes into a single
> destination which is assumed to be opaque, you might also be able to do
> the multiple blends without convert-writing and read-converting to/from
> the destination between every plane. I think that might be possible to
> architect on top of the per-line operations still.

I tried it. But I don't know how to do this without looking like a mess. 
Does pixman perform something similar?

> 
>> Signed-off-by: Igor Matheus Andrade Torrente <igormtorrente@gmail.com>
>> ---
>>   drivers/gpu/drm/vkms/vkms_composer.c | 275 ++++++++++++++-------------
>>   drivers/gpu/drm/vkms/vkms_formats.h  | 125 ++++++++++++
>>   2 files changed, 271 insertions(+), 129 deletions(-)
>>   create mode 100644 drivers/gpu/drm/vkms/vkms_formats.h
> 
> ...
> 
>> +
>> +u64 ARGB8888_to_ARGB16161616(struct vkms_composer *composer, int x, int y)
>> +{
>> +	u8 *pixel_addr = packed_pixels_addr(composer, x, y);
>> +
>> +	/*
>> +	 * Organizes the channels in their respective positions and converts
>> +	 * the 8 bits channel to 16.
>> +	 * The 257 is the "conversion ratio". This number is obtained by the
>> +	 * (2^16 - 1) / (2^8 - 1) division. Which, in this case, tries to get
>> +	 * the best color value in a color space with more possibilities.
> 
> Pixel format, not color space. >
>> +	 * And a similar idea applies to others RGB color conversions.
>> +	 */
>> +	return ((u64)pixel_addr[3] * 257) << 48 |
>> +	       ((u64)pixel_addr[2] * 257) << 32 |
>> +	       ((u64)pixel_addr[1] * 257) << 16 |
>> +	       ((u64)pixel_addr[0] * 257);
> 
> I wonder if the compiler is smart enough to choose between "mul 257"
> and "(v << 8) | v" operations... but that's probably totally drowning
> under the overhead of per (x,y) looping.

I disassembled the code to check it. And it looks like the compiler is 
replacing the multiplication with shifts and additions.

ARGB8888_to_ARGB16161616:
    0xffffffff816ad660 <+0>:     imul   0x12c(%rdi),%edx
    0xffffffff816ad667 <+7>:     imul   0x130(%rdi),%esi
    0xffffffff816ad66e <+14>:    add    %edx,%esi
    0xffffffff816ad670 <+16>:    add    0x128(%rdi),%esi
    0xffffffff816ad676 <+22>:    movslq %esi,%rax
    0xffffffff816ad679 <+25>:    add    0xe8(%rdi),%rax
    0xffffffff816ad680 <+32>:    movzbl 0x3(%rax),%ecx
    0xffffffff816ad684 <+36>:    movzbl 0x2(%rax),%esi
    0xffffffff816ad688 <+40>:    mov    %rcx,%rdx
    0xffffffff816ad68b <+43>:    shl    $0x8,%rdx
    0xffffffff816ad68f <+47>:    add    %rcx,%rdx
    0xffffffff816ad692 <+50>:    mov    %rsi,%rcx
    0xffffffff816ad695 <+53>:    shl    $0x8,%rcx
    0xffffffff816ad699 <+57>:    shl    $0x30,%rdx
    0xffffffff816ad69d <+61>:    add    %rsi,%rcx
    0xffffffff816ad6a0 <+64>:    movzbl (%rax),%esi
    0xffffffff816ad6a3 <+67>:    shl    $0x20,%rcx
    0xffffffff816ad6a7 <+71>:    or     %rcx,%rdx
    0xffffffff816ad6aa <+74>:    mov    %rsi,%rcx
    0xffffffff816ad6ad <+77>:    shl    $0x8,%rcx
    0xffffffff816ad6b1 <+81>:    add    %rsi,%rcx
    0xffffffff816ad6b4 <+84>:    or     %rcx,%rdx
    0xffffffff816ad6b7 <+87>:    movzbl 0x1(%rax),%ecx
    0xffffffff816ad6bb <+91>:    mov    %rcx,%rax
    0xffffffff816ad6be <+94>:    shl    $0x8,%rax
    0xffffffff816ad6c2 <+98>:    add    %rcx,%rax
    0xffffffff816ad6c5 <+101>:   shl    $0x10,%rax
    0xffffffff816ad6c9 <+105>:   or     %rdx,%rax
    0xffffffff816ad6cc <+108>:   ret

> 
>> +}
>> +
>> +u64 XRGB8888_to_ARGB16161616(struct vkms_composer *composer, int x, int y)
>> +{
>> +	u8 *pixel_addr = packed_pixels_addr(composer, x, y);
>> +
>> +	/*
>> +	 * The same as the ARGB8888 but with the alpha channel as the
>> +	 * maximum value as possible.
>> +	 */
>> +	return 0xffffllu << 48 |
>> +	       ((u64)pixel_addr[2] * 257) << 32 |
>> +	       ((u64)pixel_addr[1] * 257) << 16 |
>> +	       ((u64)pixel_addr[0] * 257);
>> +}
>> +
>> +u64 get_ARGB16161616(struct vkms_composer *composer, int x, int y)
>> +{
>> +	__le64 *pixel_addr = packed_pixels_addr(composer, x, y);
>> +
>> +	/*
>> +	 * Because the format byte order is in little-endian and this code
>> +	 * needs to run on big-endian machines too, we need modify
>> +	 * the byte order from little-endian to the CPU native byte order.
>> +	 */
>> +	return le64_to_cpu(*pixel_addr);
>> +}
>> +
>> +/*
>> + * The following functions are used as blend operations. But unlike the
>> + * `alpha_blend`, these functions take an ARGB16161616 pixel from the
>> + * source, convert it to a specific format, and store it in the destination.
>> + *
>> + * They are used in the `compose_active_planes` and `write_wb_buffer` to
>> + * copy and convert one pixel from/to the output buffer to/from
>> + * another buffer (e.g. writeback buffer, primary plane buffer).
>> + */
>> +
>> +void convert_to_ARGB8888(u64 argb_src1, u64 argb_src2, int x, int y,
>> +			 struct vkms_composer *dst_composer)
>> +{
>> +	u8 *pixel_addr = packed_pixels_addr(dst_composer, x, y);
>> +
>> +	/*
>> +	 * This sequence below is important because the format's byte order is
>> +	 * in little-endian. In the case of the ARGB8888 the memory is
>> +	 * organized this way:
>> +	 *
>> +	 * | Addr     | = blue channel
>> +	 * | Addr + 1 | = green channel
>> +	 * | Addr + 2 | = Red channel
>> +	 * | Addr + 3 | = Alpha channel
>> +	 */
>> +	pixel_addr[0] = DIV_ROUND_UP(argb_src1 & 0xffffllu, 257);
>> +	pixel_addr[1] = DIV_ROUND_UP((argb_src1 & (0xffffllu << 16)) >> 16, 257);
>> +	pixel_addr[2] = DIV_ROUND_UP((argb_src1 & (0xffffllu << 32)) >> 32, 257);
>> +	pixel_addr[3] = DIV_ROUND_UP((argb_src1 & (0xffffllu << 48)) >> 48, 257);
> 
> This could be potentially expensive if the compiler ends up using an
> actual div instruction.
> 
Yes, I'm using DIV_ROUND_UP because I couldn't figure out how the 
"Faster div by 255" trick works in order to adapt it to 16 bits.

> Btw. this would be shorter to write as
> 
> 	(argb_src1 >> 16) & 0xffff
> 
> etc.
I will use it in the V2. Thanks.

> 
> Thanks,
> pq
> 
>> +}
>> +
>> +void convert_to_XRGB8888(u64 argb_src1, u64 argb_src2, int x, int y,
>> +			 struct vkms_composer *dst_composer)
>> +{
>> +	u8 *pixel_addr = packed_pixels_addr(dst_composer, x, y);
>> +
>> +	pixel_addr[0] = DIV_ROUND_UP(argb_src1 & 0xffffllu, 257);
>> +	pixel_addr[1] = DIV_ROUND_UP((argb_src1 & (0xffffllu << 16)) >> 16, 257);
>> +	pixel_addr[2] = DIV_ROUND_UP((argb_src1 & (0xffffllu << 32)) >> 32, 257);
>> +	pixel_addr[3] = 0xff;
>> +}
>> +
>> +void convert_to_ARGB16161616(u64 argb_src1, u64 argb_src2, int x, int y,
>> +			     struct vkms_composer *dst_composer)
>> +{
>> +	__le64 *pixel_addr = packed_pixels_addr(dst_composer, x, y);
>> +
>> +	*pixel_addr = cpu_to_le64(argb_src1);
>> +}
>> +
>> +#endif /* _VKMS_FORMATS_H_ */
>
Pekka Paalanen Oct. 19, 2021, 8:05 a.m. UTC | #6
On Mon, 18 Oct 2021 16:26:06 -0300
Igor Matheus Andrade Torrente <igormtorrente@gmail.com> wrote:

> Hi Pekka,
> 
> On 10/18/21 5:30 AM, Pekka Paalanen wrote:
> > On Tue,  5 Oct 2021 17:16:37 -0300
> > Igor Matheus Andrade Torrente <igormtorrente@gmail.com> wrote:
> >   
> >> Currently the blend function only accepts XRGB_8888 and ARGB_8888
> >> as a color input.
> >>
> >> This patch refactors all the functions related to the plane composition
> >> to overcome this limitation.
> >>
> >> Now the blend function receives a format handler to each plane and a
> >> blend function pointer. It will take two ARGB_1616161616 pixels, one
> >> for each handler, and will use the blend function to calculate and
> >> store the final color in the output buffer.
> >>
> >> These format handlers will receive the `vkms_composer` and a pair of
> >> coordinates. And they should return the respective pixel in the
> >> ARGB_16161616 format.
> >>
> >> The blend function will receive two ARGB_16161616 pixels, x, y, and
> >> the vkms_composer of the output buffer. The method should perform the
> >> blend operation and store output to the format aforementioned
> >> ARGB_16161616.  
> > 
> > Hi,
> > 
> > here are some drive-by comments which you are free to take or leave.
> > 
> > To find more efficient ways to do this, you might want research Pixman
> > library. It's MIT licensed.
> > 
> > IIRC, the elementary operations there operate on pixel lines (pointer +
> > length), not individual pixels (image, x, y). Input conversion function
> > reads and converts a whole line a time. Blending function blends two
> > lines and writes the output into back one of the lines. Output
> > conversion function takes a line and converts it into destination
> > buffer. That way the elementary operations do not need to take stride
> > into account, and blending does not need to deal with varying memory
> > alignment FWIW. The inner loops involve much less function calls and
> > state, probably.  
> 
> I was doing some rudimentary profiling, and I noticed that the code 
> spends most of the time with the indirect system call overhead and not 
> the actual computation. This can definitively help with it.

Hi,

I suppose you mean function (pointer) call, not system call?

Really good that you have already profiled things. All optimisations
should be guided by profiling, otherwise they are just guesses (and I
got lucky this time I guess).

> > Pixman may not do exactly this, but it does something very similar.
> > Pixman also has multiple different levels of optimisations, which may
> > not be necessary for VKMS.
> > 
> > It's a trade-off between speed and temporary memory consumed. You need
> > temporary buffers for two lines, but not more (not a whole image in
> > full intermediate precision). Further optimisation could determine
> > whether to process whole image rows as lines, or split a row into
> > multiple lines to stay within CPU cache size.
> >   
> 
> Sorry, I didn't understand the idea of the last sentence.

If an image is very wide, a single row could still be relatively large
in size (bytes). If it is large enough that the working set falls out
of a faster CPU cache into a slower CPU cache (or worse yet, into RAM
accesses), performance may suffer and become memory bandwidth limited
rather than ALU rate limited. Theoretically that can be worked around
by limiting the maximum size of a line, and splitting an image row into
multiple lines.

However, this is an optimisation one probably should not do until there
is performance profiling data showing that it actually is useful. The
optimal line size limit depends on the CPU model as well. So it's a bit
far out, but something you could keep in mind just in case.

> > Since it seems you are blending multiple planes into a single
> > destination which is assumed to be opaque, you might also be able to do
> > the multiple blends without convert-writing and read-converting to/from
> > the destination between every plane. I think that might be possible to
> > architect on top of the per-line operations still.  
> 
> I tried it. But I don't know how to do this without looking like a mess. 

Dedicate one of the temporary line buffers for the destination, and
instead of writing it out and loading it back for each input plane,
leave it in place over all planes and write it out just once at the end.

I do expect more state tracking needed. You need to iterate over the
list of planes for each output row, extract only the used part of an
input plane's buffer into the other temporary line buffer, and offset
the destination line buffer and length to match when passing those into
a blending function.

It's not an obvious win but a trade-off, so profiling is again needed.

Btw. the use of temporary line buffers should also help with
implementing scaling. You could do the filtering during reading of the
input buffer. If the filter is not nearest, meaning you need to access
more than one input pixel per pixel-for-blending, there are a few ways
to go about that. You could do the filtering during the input buffer
reading, or you could load two input buffer rows into temporary line
buffers and do filtering as a separate step into yet another line
buffer. As the composition advances from top to bottom, only one of the
input buffer rows will change at a time (during up-scaling) so you can
avoid re-loading a row by swapping the line buffers.

This is getting ahead of things, so don't worry about scaling or
filtering yet. The first thing is to see if you can make the line
buffer approach give you a significant speed-up.

> Does the pixman perform some similar?

No, Pixman composition operation has only three images: source,
mask, and destination. All those it can handle simultaneously, but
since there is no "multi-blending" API, it doesn't need to take care of
more.

IIRC, Pixman also has a form of optimised operations that do blending
and converting to destination in the same pass. The advantage of that
is that blending can work with less precision when it knows what
precision the output will be converted to and it saves some bandwidth
by not needing to write-after-blending and read-for-conversion
intermediate precision values. The disadvantage is that the number of
needed specialised blending functions explodes by the number of
possible destination formats. Pixman indeed makes those specialised
functions optional, falling back to more generic C code. I would hope
that VKMS does not need to go this far in optimisations though.

> >   
> >> Signed-off-by: Igor Matheus Andrade Torrente <igormtorrente@gmail.com>
> >> ---
> >>   drivers/gpu/drm/vkms/vkms_composer.c | 275 ++++++++++++++-------------
> >>   drivers/gpu/drm/vkms/vkms_formats.h  | 125 ++++++++++++
> >>   2 files changed, 271 insertions(+), 129 deletions(-)
> >>   create mode 100644 drivers/gpu/drm/vkms/vkms_formats.h  
> > 
> > ...
> >   
> >> +
> >> +u64 ARGB8888_to_ARGB16161616(struct vkms_composer *composer, int x, int y)
> >> +{
> >> +	u8 *pixel_addr = packed_pixels_addr(composer, x, y);
> >> +
> >> +	/*
> >> +	 * Organizes the channels in their respective positions and converts
> >> +	 * the 8 bits channel to 16.
> >> +	 * The 257 is the "conversion ratio". This number is obtained by the
> >> +	 * (2^16 - 1) / (2^8 - 1) division. Which, in this case, tries to get
> >> +	 * the best color value in a color space with more possibilities.  
> > 
> > Pixel format, not color space. >  
> >> +	 * And a similar idea applies to others RGB color conversions.
> >> +	 */
> >> +	return ((u64)pixel_addr[3] * 257) << 48 |
> >> +	       ((u64)pixel_addr[2] * 257) << 32 |
> >> +	       ((u64)pixel_addr[1] * 257) << 16 |
> >> +	       ((u64)pixel_addr[0] * 257);  
> > 
> > I wonder if the compiler is smart enough to choose between "mul 257"
> > and "(v << 8) | v" operations... but that's probably totally drowning
> > under the overhead of per (x,y) looping.  
> 
> I disassembled the code to check it. And looks like the compiler is 
> replacing the multiplication with shifts and additions.
> 
> ARGB8888_to_ARGB16161616:
>     0xffffffff816ad660 <+0>:     imul   0x12c(%rdi),%edx
>     0xffffffff816ad667 <+7>:     imul   0x130(%rdi),%esi
>     0xffffffff816ad66e <+14>:    add    %edx,%esi
>     0xffffffff816ad670 <+16>:    add    0x128(%rdi),%esi
>     0xffffffff816ad676 <+22>:    movslq %esi,%rax
>     0xffffffff816ad679 <+25>:    add    0xe8(%rdi),%rax
>     0xffffffff816ad680 <+32>:    movzbl 0x3(%rax),%ecx
>     0xffffffff816ad684 <+36>:    movzbl 0x2(%rax),%esi
>     0xffffffff816ad688 <+40>:    mov    %rcx,%rdx
>     0xffffffff816ad68b <+43>:    shl    $0x8,%rdx
>     0xffffffff816ad68f <+47>:    add    %rcx,%rdx
>     0xffffffff816ad692 <+50>:    mov    %rsi,%rcx
>     0xffffffff816ad695 <+53>:    shl    $0x8,%rcx
>     0xffffffff816ad699 <+57>:    shl    $0x30,%rdx
>     0xffffffff816ad69d <+61>:    add    %rsi,%rcx
>     0xffffffff816ad6a0 <+64>:    movzbl (%rax),%esi
>     0xffffffff816ad6a3 <+67>:    shl    $0x20,%rcx
>     0xffffffff816ad6a7 <+71>:    or     %rcx,%rdx
>     0xffffffff816ad6aa <+74>:    mov    %rsi,%rcx
>     0xffffffff816ad6ad <+77>:    shl    $0x8,%rcx
>     0xffffffff816ad6b1 <+81>:    add    %rsi,%rcx
>     0xffffffff816ad6b4 <+84>:    or     %rcx,%rdx
>     0xffffffff816ad6b7 <+87>:    movzbl 0x1(%rax),%ecx
>     0xffffffff816ad6bb <+91>:    mov    %rcx,%rax
>     0xffffffff816ad6be <+94>:    shl    $0x8,%rax
>     0xffffffff816ad6c2 <+98>:    add    %rcx,%rax
>     0xffffffff816ad6c5 <+101>:   shl    $0x10,%rax
>     0xffffffff816ad6c9 <+105>:   or     %rdx,%rax
>     0xffffffff816ad6cc <+108>:   ret

Nice!

> >   
> >> +}
> >> +
> >> +u64 XRGB8888_to_ARGB16161616(struct vkms_composer *composer, int x, int y)
> >> +{
> >> +	u8 *pixel_addr = packed_pixels_addr(composer, x, y);
> >> +
> >> +	/*
> >> +	 * The same as the ARGB8888 but with the alpha channel as the
> >> +	 * maximum value as possible.
> >> +	 */
> >> +	return 0xffffllu << 48 |
> >> +	       ((u64)pixel_addr[2] * 257) << 32 |
> >> +	       ((u64)pixel_addr[1] * 257) << 16 |
> >> +	       ((u64)pixel_addr[0] * 257);
> >> +}
> >> +
> >> +u64 get_ARGB16161616(struct vkms_composer *composer, int x, int y)
> >> +{
> >> +	__le64 *pixel_addr = packed_pixels_addr(composer, x, y);
> >> +
> >> +	/*
> >> +	 * Because the format byte order is in little-endian and this code
> >> +	 * needs to run on big-endian machines too, we need modify
> >> +	 * the byte order from little-endian to the CPU native byte order.
> >> +	 */
> >> +	return le64_to_cpu(*pixel_addr);
> >> +}
> >> +
> >> +/*
> >> + * The following functions are used as blend operations. But unlike the
> >> + * `alpha_blend`, these functions take an ARGB16161616 pixel from the
> >> + * source, convert it to a specific format, and store it in the destination.
> >> + *
> >> + * They are used in the `compose_active_planes` and `write_wb_buffer` to
> >> + * copy and convert one pixel from/to the output buffer to/from
> >> + * another buffer (e.g. writeback buffer, primary plane buffer).
> >> + */
> >> +
> >> +void convert_to_ARGB8888(u64 argb_src1, u64 argb_src2, int x, int y,
> >> +			 struct vkms_composer *dst_composer)
> >> +{
> >> +	u8 *pixel_addr = packed_pixels_addr(dst_composer, x, y);
> >> +
> >> +	/*
> >> +	 * This sequence below is important because the format's byte order is
> >> +	 * in little-endian. In the case of the ARGB8888 the memory is
> >> +	 * organized this way:
> >> +	 *
> >> +	 * | Addr     | = blue channel
> >> +	 * | Addr + 1 | = green channel
> >> +	 * | Addr + 2 | = Red channel
> >> +	 * | Addr + 3 | = Alpha channel
> >> +	 */
> >> +	pixel_addr[0] = DIV_ROUND_UP(argb_src1 & 0xffffllu, 257);
> >> +	pixel_addr[1] = DIV_ROUND_UP((argb_src1 & (0xffffllu << 16)) >> 16, 257);
> >> +	pixel_addr[2] = DIV_ROUND_UP((argb_src1 & (0xffffllu << 32)) >> 32, 257);
> >> +	pixel_addr[3] = DIV_ROUND_UP((argb_src1 & (0xffffllu << 48)) >> 48, 257);  
> > 
> > This could be potentially expensive if the compiler ends up using an
> > actual div instruction.
> >   
> Yes, I'm using the DIV_ROUND_UP because I couldn't figure out how the 
> "Faster div by 255" works to adapt to 16 bits.

But does the compiler actually do a div instruction? If not, then no
worries. If it does, then maybe something to look into, *if* this shows
up in profiling.


Thanks,
pq

> > Btw. this would be shorter to write as
> > 
> > 	(argb_src1 >> 16) & 0xffff
> > 
> > etc.  
> I will use it in the V2. Thanks.
> 
> > 
> > Thanks,
> > pq
> >   
> >> +}
> >> +
> >> +void convert_to_XRGB8888(u64 argb_src1, u64 argb_src2, int x, int y,
> >> +			 struct vkms_composer *dst_composer)
> >> +{
> >> +	u8 *pixel_addr = packed_pixels_addr(dst_composer, x, y);
> >> +
> >> +	pixel_addr[0] = DIV_ROUND_UP(argb_src1 & 0xffffllu, 257);
> >> +	pixel_addr[1] = DIV_ROUND_UP((argb_src1 & (0xffffllu << 16)) >> 16, 257);
> >> +	pixel_addr[2] = DIV_ROUND_UP((argb_src1 & (0xffffllu << 32)) >> 32, 257);
> >> +	pixel_addr[3] = 0xff;
> >> +}
> >> +
> >> +void convert_to_ARGB16161616(u64 argb_src1, u64 argb_src2, int x, int y,
> >> +			     struct vkms_composer *dst_composer)
> >> +{
> >> +	__le64 *pixel_addr = packed_pixels_addr(dst_composer, x, y);
> >> +
> >> +	*pixel_addr = cpu_to_le64(argb_src1);
> >> +}
> >> +
> >> +#endif /* _VKMS_FORMATS_H_ */  
> >
Igor Matheus Andrade Torrente Oct. 19, 2021, 9:10 p.m. UTC | #7
Hi Pekka,

On 10/19/21 5:05 AM, Pekka Paalanen wrote:
> On Mon, 18 Oct 2021 16:26:06 -0300
> Igor Matheus Andrade Torrente <igormtorrente@gmail.com> wrote:
> 
>> Hi Pekka,
>>
>> On 10/18/21 5:30 AM, Pekka Paalanen wrote:
>>> On Tue,  5 Oct 2021 17:16:37 -0300
>>> Igor Matheus Andrade Torrente <igormtorrente@gmail.com> wrote:
>>>    
>>>> Currently the blend function only accepts XRGB_8888 and ARGB_8888
>>>> as a color input.
>>>>
>>>> This patch refactors all the functions related to the plane composition
>>>> to overcome this limitation.
>>>>
>>>> Now the blend function receives a format handler to each plane and a
>>>> blend function pointer. It will take two ARGB_1616161616 pixels, one
>>>> for each handler, and will use the blend function to calculate and
>>>> store the final color in the output buffer.
>>>>
>>>> These format handlers will receive the `vkms_composer` and a pair of
>>>> coordinates. And they should return the respective pixel in the
>>>> ARGB_16161616 format.
>>>>
>>>> The blend function will receive two ARGB_16161616 pixels, x, y, and
>>>> the vkms_composer of the output buffer. The method should perform the
>>>> blend operation and store output to the format aforementioned
>>>> ARGB_16161616.
>>>
>>> Hi,
>>>
>>> here are some drive-by comments which you are free to take or leave.
>>>
>>> To find more efficient ways to do this, you might want research Pixman
>>> library. It's MIT licensed.
>>>
>>> IIRC, the elementary operations there operate on pixel lines (pointer +
>>> length), not individual pixels (image, x, y). Input conversion function
>>> reads and converts a whole line a time. Blending function blends two
>>> lines and writes the output into back one of the lines. Output
>>> conversion function takes a line and converts it into destination
>>> buffer. That way the elementary operations do not need to take stride
>>> into account, and blending does not need to deal with varying memory
>>> alignment FWIW. The inner loops involve much less function calls and
>>> state, probably.
>>
>> I was doing some rudimentary profiling, and I noticed that the code
>> spends most of the time with the indirect system call overhead and not
>> the actual computation. This can definitively help with it.
> 
> Hi,
> 
> I suppose you mean function (pointer) call, not system call?

Yes, I misspelled it.

> 
> Really good that you have already profiled things. All optimisations
> should be guided by profiling, otherwise they are just guesses (and I
> got lucky this time I guess).
> 
>>> Pixman may not do exactly this, but it does something very similar.
>>> Pixman also has multiple different levels of optimisations, which may
>>> not be necessary for VKMS.
>>>
>>> It's a trade-off between speed and temporary memory consumed. You need
>>> temporary buffers for two lines, but not more (not a whole image in
>>> full intermediate precision). Further optimisation could determine
>>> whether to process whole image rows as lines, or split a row into
>>> multiple lines to stay within CPU cache size.
>>>    
>>
>> Sorry, I didn't understand the idea of the last sentence.
> 
> If an image is very wide, a single row could still be relatively large
> in size (bytes). If it is large enough that the working set falls out
> of a faster CPU cache into a slower CPU cache (or worse yet, into RAM
> accesses), performance may suffer and become memory bandwidth limited
> rather than ALU rate limited. Theoretically that can be worked around
> by limiting the maximum size of a line, and splitting an image row into
> multiple lines.

Got it now, thanks.

> 
> However, this is an optimisation one probably should not do until there
> is performance profiling data showing that it actually is useful. The
> optimal line size limit depends on the CPU model as well. So it's a bit
> far out, but something you could keep in mind just in case.

OK. For now I will not deal with this complexity, and if necessary I
will revisit it later.

> 
>>> Since it seems you are blending multiple planes into a single
>>> destination which is assumed to be opaque, you might also be able to do
>>> the multiple blends without convert-writing and read-converting to/from
>>> the destination between every plane. I think that might be possible to
>>> architect on top of the per-line operations still.
>>
>> I tried it. But I don't know how to do this without looking like a mess.

I don't think it changes anything, but I forgot to mention that I tried
it with the blend per pixel and not per line.

> 
> Dedicate one of the temporary line buffers for the destination, and
> instead of writing it out and loading it back for each input plane,
> leave it in place over all planes and write it out just once at the end.
> 
> I do expect more state tracking needed. You need to iterate over the
> list of planes for each output row, extract only the used part of an
> input plane's buffer into the other temporary line buffer, and offset
> the destination line buffer and length to match when passing those into
> a blending function.

It's exactly this state tracking that was a mess when I was trying to
implement something similar. But this is one more thing that probably
I will have to revisit later.

> 
> It's not an obvious win but a trade-off, so profiling is again needed.
> 
> Btw. the use of temporary line buffers should also help with
> implementing scaling. You could do the filtering during reading of the
> input buffer. If the filter is not nearest, meaning you need to access
> more than one input pixel per pixel-for-blending, there are a few ways
> to go about that. You could do the filtering during the input buffer
> reading, or you could load two input buffer rows into temporary line
> buffers and do filtering as a separate step into yet another line
> buffer. As the composition advances from top to bottom, only one of the
> input buffer rows will change at a time (during up-scaling) so you can
> avoid re-loading a row by swapping the line buffers.
> 
> This is getting ahead of things, so don't worry about scaling or
> filtering yet. The first thing is to see if you can make the line
> buffer approach give you a significant speed-up.
> 
>> Does the pixman perform some similar?
> 
> No, Pixman composition operation has only three images: source,
> mask, and destination. All those it can handle simultaneously, but
> since there is no "multi-blending" API, it doesn't need to take care of
> more.
> 
> IIRC, Pixman also has a form of optimised operations that do blending
> and converting to destination in the same pass. The advantage of that
> is that blending can work with less precision when it knows what
> precision the output will be converted to and it saves some bandwidth
> by not needing to write-after-blending and read-for-conversion
> intermediate precision values. The disadvantage is that the number of
> needed specialised blending functions explodes by the number of
> possible destination formats. Pixman indeed makes those specialised
> functions optional, falling back to more generic C code. I would hope
> that VKMS does not need to go this far in optimisations though.

This should be plenty fast indeed. Maybe worth for formats that are
extremely common.

> 
>>>    
>>>> Signed-off-by: Igor Matheus Andrade Torrente <igormtorrente@gmail.com>
>>>> ---
>>>>    drivers/gpu/drm/vkms/vkms_composer.c | 275 ++++++++++++++-------------
>>>>    drivers/gpu/drm/vkms/vkms_formats.h  | 125 ++++++++++++
>>>>    2 files changed, 271 insertions(+), 129 deletions(-)
>>>>    create mode 100644 drivers/gpu/drm/vkms/vkms_formats.h
>>>
>>> ...
>>>    
>>>> +
>>>> +u64 ARGB8888_to_ARGB16161616(struct vkms_composer *composer, int x, int y)
>>>> +{
>>>> +	u8 *pixel_addr = packed_pixels_addr(composer, x, y);
>>>> +
>>>> +	/*
>>>> +	 * Organizes the channels in their respective positions and converts
>>>> +	 * the 8 bits channel to 16.
>>>> +	 * The 257 is the "conversion ratio". This number is obtained by the
>>>> +	 * (2^16 - 1) / (2^8 - 1) division. Which, in this case, tries to get
>>>> +	 * the best color value in a color space with more possibilities.
>>>
>>> Pixel format, not color space. >
>>>> +	 * And a similar idea applies to others RGB color conversions.
>>>> +	 */
>>>> +	return ((u64)pixel_addr[3] * 257) << 48 |
>>>> +	       ((u64)pixel_addr[2] * 257) << 32 |
>>>> +	       ((u64)pixel_addr[1] * 257) << 16 |
>>>> +	       ((u64)pixel_addr[0] * 257);
>>>
>>> I wonder if the compiler is smart enough to choose between "mul 257"
>>> and "(v << 8) | v" operations... but that's probably totally drowning
>>> under the overhead of per (x,y) looping.
>>
>> I disassembled the code to check it. And looks like the compiler is
>> replacing the multiplication with shifts and additions.
>>
>> ARGB8888_to_ARGB16161616:
>>      0xffffffff816ad660 <+0>:     imul   0x12c(%rdi),%edx
>>      0xffffffff816ad667 <+7>:     imul   0x130(%rdi),%esi
>>      0xffffffff816ad66e <+14>:    add    %edx,%esi
>>      0xffffffff816ad670 <+16>:    add    0x128(%rdi),%esi
>>      0xffffffff816ad676 <+22>:    movslq %esi,%rax
>>      0xffffffff816ad679 <+25>:    add    0xe8(%rdi),%rax
>>      0xffffffff816ad680 <+32>:    movzbl 0x3(%rax),%ecx
>>      0xffffffff816ad684 <+36>:    movzbl 0x2(%rax),%esi
>>      0xffffffff816ad688 <+40>:    mov    %rcx,%rdx
>>      0xffffffff816ad68b <+43>:    shl    $0x8,%rdx
>>      0xffffffff816ad68f <+47>:    add    %rcx,%rdx
>>      0xffffffff816ad692 <+50>:    mov    %rsi,%rcx
>>      0xffffffff816ad695 <+53>:    shl    $0x8,%rcx
>>      0xffffffff816ad699 <+57>:    shl    $0x30,%rdx
>>      0xffffffff816ad69d <+61>:    add    %rsi,%rcx
>>      0xffffffff816ad6a0 <+64>:    movzbl (%rax),%esi
>>      0xffffffff816ad6a3 <+67>:    shl    $0x20,%rcx
>>      0xffffffff816ad6a7 <+71>:    or     %rcx,%rdx
>>      0xffffffff816ad6aa <+74>:    mov    %rsi,%rcx
>>      0xffffffff816ad6ad <+77>:    shl    $0x8,%rcx
>>      0xffffffff816ad6b1 <+81>:    add    %rsi,%rcx
>>      0xffffffff816ad6b4 <+84>:    or     %rcx,%rdx
>>      0xffffffff816ad6b7 <+87>:    movzbl 0x1(%rax),%ecx
>>      0xffffffff816ad6bb <+91>:    mov    %rcx,%rax
>>      0xffffffff816ad6be <+94>:    shl    $0x8,%rax
>>      0xffffffff816ad6c2 <+98>:    add    %rcx,%rax
>>      0xffffffff816ad6c5 <+101>:   shl    $0x10,%rax
>>      0xffffffff816ad6c9 <+105>:   or     %rdx,%rax
>>      0xffffffff816ad6cc <+108>:   ret
> 
> Nice!
> 
>>>    
>>>> +}
>>>> +
>>>> +u64 XRGB8888_to_ARGB16161616(struct vkms_composer *composer, int x, int y)
>>>> +{
>>>> +	u8 *pixel_addr = packed_pixels_addr(composer, x, y);
>>>> +
>>>> +	/*
>>>> +	 * The same as the ARGB8888 but with the alpha channel as the
>>>> +	 * maximum value as possible.
>>>> +	 */
>>>> +	return 0xffffllu << 48 |
>>>> +	       ((u64)pixel_addr[2] * 257) << 32 |
>>>> +	       ((u64)pixel_addr[1] * 257) << 16 |
>>>> +	       ((u64)pixel_addr[0] * 257);
>>>> +}
>>>> +
>>>> +u64 get_ARGB16161616(struct vkms_composer *composer, int x, int y)
>>>> +{
>>>> +	__le64 *pixel_addr = packed_pixels_addr(composer, x, y);
>>>> +
>>>> +	/*
>>>> +	 * Because the format byte order is in little-endian and this code
>>>> +	 * needs to run on big-endian machines too, we need modify
>>>> +	 * the byte order from little-endian to the CPU native byte order.
>>>> +	 */
>>>> +	return le64_to_cpu(*pixel_addr);
>>>> +}
>>>> +
>>>> +/*
>>>> + * The following functions are used as blend operations. But unlike the
>>>> + * `alpha_blend`, these functions take an ARGB16161616 pixel from the
>>>> + * source, convert it to a specific format, and store it in the destination.
>>>> + *
>>>> + * They are used in the `compose_active_planes` and `write_wb_buffer` to
>>>> + * copy and convert one pixel from/to the output buffer to/from
>>>> + * another buffer (e.g. writeback buffer, primary plane buffer).
>>>> + */
>>>> +
>>>> +void convert_to_ARGB8888(u64 argb_src1, u64 argb_src2, int x, int y,
>>>> +			 struct vkms_composer *dst_composer)
>>>> +{
>>>> +	u8 *pixel_addr = packed_pixels_addr(dst_composer, x, y);
>>>> +
>>>> +	/*
>>>> +	 * This sequence below is important because the format's byte order is
>>>> +	 * in little-endian. In the case of the ARGB8888 the memory is
>>>> +	 * organized this way:
>>>> +	 *
>>>> +	 * | Addr     | = blue channel
>>>> +	 * | Addr + 1 | = green channel
>>>> +	 * | Addr + 2 | = Red channel
>>>> +	 * | Addr + 3 | = Alpha channel
>>>> +	 */
>>>> +	pixel_addr[0] = DIV_ROUND_UP(argb_src1 & 0xffffllu, 257);
>>>> +	pixel_addr[1] = DIV_ROUND_UP((argb_src1 & (0xffffllu << 16)) >> 16, 257);
>>>> +	pixel_addr[2] = DIV_ROUND_UP((argb_src1 & (0xffffllu << 32)) >> 32, 257);
>>>> +	pixel_addr[3] = DIV_ROUND_UP((argb_src1 & (0xffffllu << 48)) >> 48, 257);
>>>
>>> This could be potentially expensive if the compiler ends up using an
>>> actual div instruction.
>>>    
>> Yes, I'm using the DIV_ROUND_UP because I couldn't figure out how the
>> "Faster div by 255" works to adapt to 16 bits.
> 
> But does the compiler actually do a div instruction? If not, then no
> worries. If it does, then maybe something to look into, *if* this shows
> up in profiling.

GCC isn't issuing any div instruction here.

convert_to_ARGB8888:
    0xffffffff816ad770 <+0>:	imul   0x12c(%rcx),%edx
    0xffffffff816ad777 <+7>:	mov    %rcx,%rax
    0xffffffff816ad77a <+10>:	imul   0x130(%rcx),%esi
    0xffffffff816ad781 <+17>:	add    %edx,%esi
    0xffffffff816ad783 <+19>:	movzwl %di,%edx
    0xffffffff816ad786 <+22>:	add    0x128(%rcx),%esi
    0xffffffff816ad78c <+28>:	add    $0x100,%rdx
    0xffffffff816ad793 <+35>:	movslq %esi,%rcx
    0xffffffff816ad796 <+38>:	movabs $0xff00ff00ff00ff01,%rsi
    0xffffffff816ad7a0 <+48>:	add    0xe8(%rax),%rcx
    0xffffffff816ad7a7 <+55>:	mov    %rdx,%rax
    0xffffffff816ad7aa <+58>:	mul    %rsi
    0xffffffff816ad7ad <+61>:	shr    $0x8,%rdx
    0xffffffff816ad7b1 <+65>:	mov    %dl,(%rcx)
    0xffffffff816ad7b3 <+67>:	mov    %rdi,%rdx
    0xffffffff816ad7b6 <+70>:	shr    $0x10,%rdx
    0xffffffff816ad7ba <+74>:	movzwl %dx,%edx
    0xffffffff816ad7bd <+77>:	add    $0x100,%rdx
    0xffffffff816ad7c4 <+84>:	mov    %rdx,%rax
    0xffffffff816ad7c7 <+87>:	mul    %rsi
    0xffffffff816ad7ca <+90>:	shr    $0x8,%rdx
    0xffffffff816ad7ce <+94>:	mov    %dl,0x1(%rcx)
    0xffffffff816ad7d1 <+97>:	mov    %rdi,%rdx
    0xffffffff816ad7d4 <+100>:	shr    $0x30,%rdi
    0xffffffff816ad7d8 <+104>:	shr    $0x20,%rdx
    0xffffffff816ad7dc <+108>:	movzwl %dx,%edx
    0xffffffff816ad7df <+111>:	add    $0x100,%rdx
    0xffffffff816ad7e6 <+118>:	mov    %rdx,%rax
    0xffffffff816ad7e9 <+121>:	mul    %rsi
    0xffffffff816ad7ec <+124>:	shr    $0x8,%rdx
    0xffffffff816ad7f0 <+128>:	mov    %dl,0x2(%rcx)
    0xffffffff816ad7f3 <+131>:	lea    0x100(%rdi),%rdx
    0xffffffff816ad7fa <+138>:	mov    %rdx,%rax
    0xffffffff816ad7fd <+141>:	mul    %rsi
    0xffffffff816ad800 <+144>:	shr    $0x8,%rdx
    0xffffffff816ad804 <+148>:	mov    %dl,0x3(%rcx)
    0xffffffff816ad807 <+151>:	ret

> 
> 
> Thanks,
> pq
> 
>>> Btw. this would be shorter to write as
>>>
>>> 	(argb_src1 >> 16) & 0xffff
>>>
>>> etc.
>> I will use it in the V2. Thanks.
>>
>>>
>>> Thanks,
>>> pq
>>>    
>>>> +}
>>>> +
>>>> +void convert_to_XRGB8888(u64 argb_src1, u64 argb_src2, int x, int y,
>>>> +			 struct vkms_composer *dst_composer)
>>>> +{
>>>> +	u8 *pixel_addr = packed_pixels_addr(dst_composer, x, y);
>>>> +
>>>> +	pixel_addr[0] = DIV_ROUND_UP(argb_src1 & 0xffffllu, 257);
>>>> +	pixel_addr[1] = DIV_ROUND_UP((argb_src1 & (0xffffllu << 16)) >> 16, 257);
>>>> +	pixel_addr[2] = DIV_ROUND_UP((argb_src1 & (0xffffllu << 32)) >> 32, 257);
>>>> +	pixel_addr[3] = 0xff;
>>>> +}
>>>> +
>>>> +void convert_to_ARGB16161616(u64 argb_src1, u64 argb_src2, int x, int y,
>>>> +			     struct vkms_composer *dst_composer)
>>>> +{
>>>> +	__le64 *pixel_addr = packed_pixels_addr(dst_composer, x, y);
>>>> +
>>>> +	*pixel_addr = cpu_to_le64(argb_src1);
>>>> +}
>>>> +
>>>> +#endif /* _VKMS_FORMATS_H_ */
>>>    
>
Pekka Paalanen Oct. 20, 2021, 8:25 a.m. UTC | #8
On Tue, 19 Oct 2021 18:10:57 -0300
Igor Matheus Andrade Torrente <igormtorrente@gmail.com> wrote:

> Hi Pekka,
> 
> On 10/19/21 5:05 AM, Pekka Paalanen wrote:
> > On Mon, 18 Oct 2021 16:26:06 -0300
> > Igor Matheus Andrade Torrente <igormtorrente@gmail.com> wrote:
> >   
> >> Hi Pekka,
> >>
> >> On 10/18/21 5:30 AM, Pekka Paalanen wrote:  
> >>> On Tue,  5 Oct 2021 17:16:37 -0300
> >>> Igor Matheus Andrade Torrente <igormtorrente@gmail.com> wrote:
> >>>      
> >>>> Currently the blend function only accepts XRGB_8888 and ARGB_8888
> >>>> as a color input.
> >>>>
> >>>> This patch refactors all the functions related to the plane composition
> >>>> to overcome this limitation.
> >>>>
> >>>> Now the blend function receives a format handler to each plane and a
> >>>> blend function pointer. It will take two ARGB_1616161616 pixels, one
> >>>> for each handler, and will use the blend function to calculate and
> >>>> store the final color in the output buffer.
> >>>>
> >>>> These format handlers will receive the `vkms_composer` and a pair of
> >>>> coordinates. And they should return the respective pixel in the
> >>>> ARGB_16161616 format.
> >>>>
> >>>> The blend function will receive two ARGB_16161616 pixels, x, y, and
> >>>> the vkms_composer of the output buffer. The method should perform the
> >>>> blend operation and store output to the format aforementioned
> >>>> ARGB_16161616.  
> >>>
> >>> Hi,
> >>>
> >>> here are some drive-by comments which you are free to take or leave.
> >>>
> >>> To find more efficient ways to do this, you might want to research the
> >>> Pixman library. It's MIT licensed.
> >>>
> >>> IIRC, the elementary operations there operate on pixel lines (pointer +
> >>> length), not individual pixels (image, x, y). Input conversion function
> >>> reads and converts a whole line a time. Blending function blends two
> >>> lines and writes the output into back one of the lines. Output
> >>> conversion function takes a line and converts it into destination
> >>> buffer. That way the elementary operations do not need to take stride
> >>> into account, and blending does not need to deal with varying memory
> >>> alignment FWIW. The inner loops involve much less function calls and
> >>> state, probably.  
> >>
> >> I was doing some rudimentary profiling, and I noticed that the code
> >> spends most of the time with the indirect system call overhead and not
> >> the actual computation. This can definitively help with it.  
> > 
> > Hi,
> > 
> > I suppose you mean function (pointer) call, not system call?  
> 
> Yes, I misspelled it.
> 
> > 
> > Really good that you have already profiled things. All optimisations
> > should be guided by profiling, otherwise they are just guesses (and I
> > got lucky this time I guess).
> >   
> >>> Pixman may not do exactly this, but it does something very similar.
> >>> Pixman also has multiple different levels of optimisations, which may
> >>> not be necessary for VKMS.
> >>>
> >>> It's a trade-off between speed and temporary memory consumed. You need
> >>> temporary buffers for two lines, but not more (not a whole image in
> >>> full intermediate precision). Further optimisation could determine
> >>> whether to process whole image rows as lines, or split a row into
> >>> multiple lines to stay within CPU cache size.
> >>>      
> >>
> >> Sorry, I didn't understand the idea of the last sentence.  
> > 
> > If an image is very wide, a single row could still be relatively large
> > in size (bytes). If it is large enough that the working set falls out
> > of a faster CPU cache into a slower CPU cache (or worse yet, into RAM
> > accesses), performance may suffer and become memory bandwidth limited
> > rather than ALU rate limited. Theoretically that can be worked around
> > by limiting the maximum size of a line, and splitting an image row into
> > multiple lines.  
> 
> Got it now, thanks.
> 
> > 
> > However, this is an optimisation one probably should not do until there
> > is performance profiling data showing that it actually is useful. The
> > optimal line size limit depends on the CPU model as well. So it's a bit
> > far out, but something you could keep in mind just in case.  
> 
> OK. For now I will not deal with this complexity, and if necessary I
> will revisit it latter.
> 
> >   
> >>> Since it seems you are blending multiple planes into a single
> >>> destination which is assumed to be opaque, you might also be able to do
> >>> the multiple blends without convert-writing and read-converting to/from
> >>> the destination between every plane. I think that might be possible to
> >>> architect on top of the per-line operations still.  
> >>
> >> I tried it. But I don't know how to do this without looking like a mess.  
> 
> I don't think it changes anything, but I forgot to mention that I tried
> it with the blend per pixel and not a per line.

Hi,

I believe that can get messy per-pixel, yeah. You kind of want to keep
the per-pixel (inner loop) operations as fast as possible, and that
naturally tends to lead to contorted code as you try to optimise it
(prematurely perhaps).

I would expect per-line code to be cleaner, because there is less state
to be handled in the inner loop, and the outer loops spin rarely enough
that you can afford to write more clear code.

> > Dedicate one of the temporary line buffers for the destination, and
> > instead of writing it out and loading it back for each input plane,
> > leave it in place over all planes and write it out just once at the end.
> > 
> > I do expect more state tracking needed. You need to iterate over the
> > list of planes for each output row, extract only the used part of an
> > input plane's buffer into the other temporary line buffer, and offset
> > the destination line buffer and length to match when passing those into
> > a blending function.
> 
> It's exactly this state tracking that was a mess when I was trying to
> implement something similar. But this is one more thing that probably
> I will have to revisit later.

Mind the difference between state tracking in the inner loop vs. the
outer loops. Per-line, inner loop becomes simpler while the outer loops
become slightly more complicated, but it should be a net win, because
the outer loops spin fewer times.

When nesting gets too deep, code becomes a mess, or the number of local
variables in a function grows too far, it usually helps to split things
into more functions. The compiler will inline them for you
automatically when that is beneficial.

Function pointer calls cannot usually be inlined, hence they should be
kept out of the innermost loop.

> > It's not an obvious win but a trade-off, so profiling is again needed.
> > 
> > Btw. the use of temporary line buffers should also help with
> > implementing scaling. You could do the filtering during reading of the
> > input buffer. If the filter is not nearest, meaning you need to access
> > more than one input pixel per pixel-for-blending, there are a few ways
> > to go about that. You could do the filtering during the input buffer
> > reading, or you could load two input buffer rows into temporary line
> > buffers and do filtering as a separate step into yet another line
> > buffer. As the composition advances from top to bottom, only one of the
> > input buffer rows will change at a time (during up-scaling) so you can
> > avoid re-loading a row by swapping the line buffers.
> > 
> > This is getting ahead of things, so don't worry about scaling or
> > filtering yet. The first thing is to see if you can make the line
> > buffer approach give you a significant speed-up.
> >   
> >> Does the pixman perform some similar?  
> > 
> > No, Pixman composition operation has only three images: source,
> > mask, and destination. All those it can handle simultaneously, but
> > since there is no "multi-blending" API, it doesn't need to take care of
> > more.
> > 
> > IIRC, Pixman also has a form of optimised operations that do blending
> > and converting to destination in the same pass. The advantage of that
> > is that blending can work with less precision when it knows what
> > precision the output will be converted to and it saves some bandwidth
> > by not needing to write-after-blending and read-for-conversion
> > intermediate precision values. The disadvantage is that the number of
> > needed specialised blending functions explodes by the number of
> > possible destination formats. Pixman indeed makes those specialised
> > functions optional, falling back to more generic C code. I would hope
> > that VKMS does not need to go this far in optimisations though.  
> 
> This should be plenty fast indeed. Maybe worth for formats that are
> extremely common.

My feeling says that it would be better to not aim that far at first,
and see how the per-line approach works with separate pluggable
read-conversion, blending, and write-conversion functions.

Besides, combined blend-write operations would probably conflict with
the idea of blending all planes first and then writing destination once.

There are KMS properties for different blending functions, so that's a
reason to make the blending function pluggable as well. Not just for a
potential optimisation to skip the X channel on input completely.

If you want to support background color property, that might be handled
with a special read-conversion function which just fills the temporary
line buffer with the background color.

It is always possible to get faster by writing more specialised
functions that can take advantage of more invariants/assumptions, but
you get a combinatorial explosion of specialisations.


Thanks,
pq
diff mbox series

Patch

diff --git a/drivers/gpu/drm/vkms/vkms_composer.c b/drivers/gpu/drm/vkms/vkms_composer.c
index 82f79e508f81..1e7c10c02a52 100644
--- a/drivers/gpu/drm/vkms/vkms_composer.c
+++ b/drivers/gpu/drm/vkms/vkms_composer.c
@@ -9,18 +9,28 @@ 
 #include <drm/drm_vblank.h>
 
 #include "vkms_drv.h"
-
-static u32 get_pixel_from_buffer(int x, int y, const u8 *buffer,
-				 const struct vkms_composer *composer)
-{
-	u32 pixel;
-	int src_offset = composer->offset + (y * composer->pitch)
-				      + (x * composer->cpp);
-
-	pixel = *(u32 *)&buffer[src_offset];
-
-	return pixel;
-}
+#include "vkms_formats.h"
+
+#define get_output_vkms_composer(buffer_pointer, composer)		\
+	((struct vkms_composer) {					\
+		.fb = (struct drm_framebuffer) {			\
+			.format = &(struct drm_format_info) {		\
+				.format = DRM_FORMAT_ARGB16161616,	\
+			},						\
+		},							\
+		.map[0].vaddr = (buffer_pointer),			\
+		.src = (composer)->src,					\
+		.dst = (composer)->dst,					\
+		.cpp = sizeof(u64),					\
+		.pitch = drm_rect_width(&(composer)->dst) * sizeof(u64)	\
+	})
+
+struct vkms_pixel_composition_functions {
+	u64 (*get_src_pixel)(struct vkms_composer *composer, int x, int y);
+	u64 (*get_dst_pixel)(struct vkms_composer *composer, int x, int y);
+	void (*pixel_blend)(u64 argb_src1, u64 argb_src2, int x, int y,
+			    struct vkms_composer *dst_composer);
+};
 
 /**
  * compute_crc - Compute CRC value on output frame
@@ -31,42 +41,33 @@  static u32 get_pixel_from_buffer(int x, int y, const u8 *buffer,
  * returns CRC value computed using crc32 on the visible portion of
  * the final framebuffer at vaddr_out
  */
-static uint32_t compute_crc(const u8 *vaddr,
+static uint32_t compute_crc(const __le64 *vaddr,
 			    const struct vkms_composer *composer)
 {
-	int x, y;
-	u32 crc = 0, pixel = 0;
-	int x_src = composer->src.x1 >> 16;
-	int y_src = composer->src.y1 >> 16;
-	int h_src = drm_rect_height(&composer->src) >> 16;
-	int w_src = drm_rect_width(&composer->src) >> 16;
-
-	for (y = y_src; y < y_src + h_src; ++y) {
-		for (x = x_src; x < x_src + w_src; ++x) {
-			pixel = get_pixel_from_buffer(x, y, vaddr, composer);
-			crc = crc32_le(crc, (void *)&pixel, sizeof(u32));
-		}
-	}
+	int h = drm_rect_height(&composer->dst);
+	int w = drm_rect_width(&composer->dst);
 
-	return crc;
+	return crc32_le(0, (void *)vaddr, w * h * sizeof(u64));
 }
 
-static u8 blend_channel(u8 src, u8 dst, u8 alpha)
+static __le16 blend_channel(u16 src, u16 dst, u16 alpha)
 {
-	u32 pre_blend;
-	u8 new_color;
+	u64 pre_blend;
+	u16 new_color;
 
-	pre_blend = (src * 255 + dst * (255 - alpha));
+	pre_blend = (src * 0xffff + dst * (0xffff - alpha));
 
-	/* Faster div by 255 */
-	new_color = ((pre_blend + ((pre_blend + 257) >> 8)) >> 8);
+	new_color = DIV_ROUND_UP(pre_blend, 0xffff);
 
-	return new_color;
+	return cpu_to_le16(new_color);
 }
 
 /**
  * alpha_blend - alpha blending equation
- * @argb_src: src pixel on premultiplied alpha mode
+ * @argb_src1: pixel of the source plane on premultiplied alpha mode
+ * @argb_src2: pixel of the destiny planes on premultiplied alpha mode
+ * @x: The x coodinate(width) of the pixel
+ * @y: The y coodinate(heigth) of the pixel
  * @argb_dst: dst pixel completely opaque
  *
  * blend pixels using premultiplied blend formula. The current DRM assumption
@@ -74,50 +75,52 @@  static u8 blend_channel(u8 src, u8 dst, u8 alpha)
  * channel values. See more drm_plane_create_blend_mode_property(). Also, this
  * formula assumes a completely opaque background.
  */
-static void alpha_blend(const u8 *argb_src, u8 *argb_dst)
+static void alpha_blend(u64 argb_src1, u64 argb_src2, int y, int x,
+			struct vkms_composer *dst_composer)
 {
-	u8 alpha;
+	__le16 *output_pixel = packed_pixels_addr(dst_composer, y, x);
 
-	alpha = argb_src[3];
-	argb_dst[0] = blend_channel(argb_src[0], argb_dst[0], alpha);
-	argb_dst[1] = blend_channel(argb_src[1], argb_dst[1], alpha);
-	argb_dst[2] = blend_channel(argb_src[2], argb_dst[2], alpha);
-}
+	u16 src1_a = (argb_src1 & (0xffffllu << 48)) >> 48;
+	u16 src1_r = (argb_src1 & (0xffffllu << 32)) >> 32;
+	u16 src1_g = (argb_src1 & (0xffffllu << 16)) >> 16;
+	u16 src1_b = argb_src1 & 0xffffllu;
 
-/**
- * x_blend - blending equation that ignores the pixel alpha
- *
- * overwrites RGB color value from src pixel to dst pixel.
- */
-static void x_blend(const u8 *xrgb_src, u8 *xrgb_dst)
-{
-	memcpy(xrgb_dst, xrgb_src, sizeof(u8) * 3);
+	u16 src2_r = (argb_src2 & (0xffffllu << 32)) >> 32;
+	u16 src2_g = (argb_src2 & (0xffffllu << 16)) >> 16;
+	u16 src2_b = argb_src2 & 0xffffllu;
+
+	output_pixel[0] = blend_channel(src1_b, src2_b, src1_a);
+	output_pixel[1] = blend_channel(src1_g, src2_g, src1_a);
+	output_pixel[2] = blend_channel(src1_r, src2_r, src1_a);
+	output_pixel[3] = 0xffff;
 }
 
 /**
- * blend - blend value at vaddr_src with value at vaddr_dst
- * @vaddr_dst: destination address
- * @vaddr_src: source address
- * @dst_composer: destination framebuffer's metadata
  * @src_composer: source framebuffer's metadata
- * @pixel_blend: blending equation based on plane format
+ * @dst_composer: destiny framebuffer's metadata
+ * @funcs: A struct containing all the composition functions(get_src_pixel,
+ *         get_dst_pixel, and pixel_blend)
  *
- * Blend the vaddr_src value with the vaddr_dst value using a pixel blend
- * equation according to the supported plane formats DRM_FORMAT_(A/XRGB8888)
- * and clearing alpha channel to an completely opaque background. This function
- * uses buffer's metadata to locate the new composite values at vaddr_dst.
+ * Using the pixel_blend function passed as parameter, this function blends
+ * all pixels from src planes into a output buffer.
+ * Information of the output buffer is in the dst_composer parameter
+ * and the source plane in the src_composer.
+ * The get_src_pixel will use the src_composer to get the respective pixel,
+ * convert, and return it as ARGB_16161616.
+ * The same is true for the dst_composer and get_dst_pixel respectively.
+ * And finally, the blend function will receive the dst_composer, src,
+ * and dst pixels. Blend, and store thre result in the output using the
+ * dst_composer buffer information.
  *
  * TODO: completely clear the primary plane (a = 0xff) before starting to blend
  * pixel color values
  */
-static void blend(void *vaddr_dst, void *vaddr_src,
+static void blend(struct vkms_composer *src_composer,
 		  struct vkms_composer *dst_composer,
-		  struct vkms_composer *src_composer,
-		  void (*pixel_blend)(const u8 *, u8 *))
+		  struct vkms_pixel_composition_functions *funcs)
 {
 	int i, j, j_dst, i_dst;
-	int offset_src, offset_dst;
-	u8 *pixel_dst, *pixel_src;
+	u64 pixel_dst, pixel_src;
 
 	int x_src = src_composer->src.x1 >> 16;
 	int y_src = src_composer->src.y1 >> 16;
@@ -130,80 +133,101 @@  static void blend(void *vaddr_dst, void *vaddr_src,
 	int y_limit = y_src + h_dst;
 	int x_limit = x_src + w_dst;
 
-	for (i = y_src, i_dst = y_dst; i < y_limit; ++i) {
-		for (j = x_src, j_dst = x_dst; j < x_limit; ++j) {
-			offset_dst = dst_composer->offset
-				     + (i_dst * dst_composer->pitch)
-				     + (j_dst++ * dst_composer->cpp);
-			offset_src = src_composer->offset
-				     + (i * src_composer->pitch)
-				     + (j * src_composer->cpp);
-
-			pixel_src = (u8 *)(vaddr_src + offset_src);
-			pixel_dst = (u8 *)(vaddr_dst + offset_dst);
-			pixel_blend(pixel_src, pixel_dst);
-			/* clearing alpha channel (0xff)*/
-			pixel_dst[3] = 0xff;
+	for (i = y_src, i_dst = y_dst; i < y_limit; ++i, i_dst++) {
+		for (j = x_src, j_dst = x_dst; j < x_limit; ++j, j_dst++) {
+			pixel_src = funcs->get_src_pixel(src_composer, j, i);
+			pixel_dst = funcs->get_dst_pixel(dst_composer, j_dst, i_dst);
+
+			funcs->pixel_blend(pixel_src, pixel_dst, j_dst, i_dst,
+					   dst_composer);
 		}
-		i_dst++;
 	}
 }
 
-static void compose_plane(struct vkms_composer *primary_composer,
-			  struct vkms_composer *plane_composer,
-			  void *vaddr_out)
+static u64 ((*get_pixel_fmt_transform_function(u32 format))
+	    (struct vkms_composer *, int, int))
 {
-	struct drm_framebuffer *fb = &plane_composer->fb;
-	void *vaddr;
-	void (*pixel_blend)(const u8 *p_src, u8 *p_dst);
+	if (format == DRM_FORMAT_ARGB8888)
+		return &ARGB8888_to_ARGB16161616;
+	else if (format == DRM_FORMAT_ARGB16161616)
+		return &get_ARGB16161616;
+	else
+		return &XRGB8888_to_ARGB16161616;
+}
 
-	if (WARN_ON(dma_buf_map_is_null(&primary_composer->map[0])))
-		return;
+static void ((*get_pixel_blend_function(u32 format))
+	     (u64, u64, int, int, struct vkms_composer *))
+{
+	if (format == DRM_FORMAT_ARGB8888)
+		return &convert_to_ARGB8888;
+	else if (format == DRM_FORMAT_ARGB16161616)
+		return &convert_to_ARGB16161616;
+	else
+		return &convert_to_XRGB8888;
+}
 
-	vaddr = plane_composer->map[0].vaddr;
+static void compose_plane(struct vkms_composer *src_composer,
+			  struct vkms_composer *dst_composer,
+			  struct vkms_pixel_composition_functions *funcs)
+{
+	u32 src_format = src_composer->fb.format->format;
+	u32 dst_format = dst_composer->fb.format->format;
 
-	if (fb->format->format == DRM_FORMAT_ARGB8888)
-		pixel_blend = &alpha_blend;
-	else
-		pixel_blend = &x_blend;
+	funcs->get_src_pixel = get_pixel_fmt_transform_function(src_format);
+	funcs->get_dst_pixel = get_pixel_fmt_transform_function(dst_format);
 
-	blend(vaddr_out, vaddr, primary_composer, plane_composer, pixel_blend);
+	blend(src_composer, dst_composer, funcs);
 }
 
-static int compose_active_planes(void **vaddr_out,
-				 struct vkms_composer *primary_composer,
-				 struct vkms_crtc_state *crtc_state)
+static __le64 *compose_active_planes(struct vkms_composer *primary_composer,
+				     struct vkms_crtc_state *crtc_state)
 {
-	struct drm_framebuffer *fb = &primary_composer->fb;
-	struct drm_gem_object *gem_obj = drm_gem_fb_get_obj(fb, 0);
-	const void *vaddr;
+	struct vkms_plane_state **active_planes = crtc_state->active_planes;
+	int h = drm_rect_height(&primary_composer->dst);
+	int w = drm_rect_width(&primary_composer->dst);
+	struct vkms_pixel_composition_functions funcs;
+	struct vkms_composer dst_composer;
+	__le64 *vaddr_out;
 	int i;
 
-	if (!*vaddr_out) {
-		*vaddr_out = kvzalloc(gem_obj->size, GFP_KERNEL);
-		if (!*vaddr_out) {
-			DRM_ERROR("Cannot allocate memory for output frame.");
-			return -ENOMEM;
-		}
-	}
-
 	if (WARN_ON(dma_buf_map_is_null(&primary_composer->map[0])))
-		return -EINVAL;
+		return NULL;
 
-	vaddr = primary_composer->map[0].vaddr;
+	vaddr_out = kvzalloc(w * h * sizeof(__le64), GFP_KERNEL);
+	if (!vaddr_out) {
+		DRM_ERROR("Cannot allocate memory for output frame.");
+		return NULL;
+	}
 
-	memcpy(*vaddr_out, vaddr, gem_obj->size);
+	dst_composer = get_output_vkms_composer(vaddr_out, primary_composer);
+	funcs.pixel_blend = get_pixel_blend_function(DRM_FORMAT_ARGB16161616);
+	compose_plane(active_planes[0]->composer, &dst_composer, &funcs);
 
 	/* If there are other planes besides primary, we consider the active
 	 * planes should be in z-order and compose them associatively:
 	 * ((primary <- overlay) <- cursor)
 	 */
+	funcs.pixel_blend = alpha_blend;
 	for (i = 1; i < crtc_state->num_active_planes; i++)
-		compose_plane(primary_composer,
-			      crtc_state->active_planes[i]->composer,
-			      *vaddr_out);
+		compose_plane(active_planes[i]->composer, &dst_composer, &funcs);
 
-	return 0;
+	return vaddr_out;
+}
+
+static void write_wb_buffer(struct vkms_writeback_job *active_wb,
+			    struct vkms_composer *primary_composer,
+			    __le64 *vaddr_out)
+{
+	u32 dst_fb_format = active_wb->composer.fb.format->format;
+	struct vkms_pixel_composition_functions funcs;
+	struct vkms_composer src_composer;
+
+	src_composer = get_output_vkms_composer(vaddr_out, primary_composer);
+	funcs.pixel_blend = get_pixel_blend_function(dst_fb_format);
+	active_wb->composer.src = primary_composer->src;
+	active_wb->composer.dst = primary_composer->dst;
+
+	compose_plane(&src_composer, &active_wb->composer, &funcs);
 }
 
 /**
@@ -221,14 +245,14 @@  void vkms_composer_worker(struct work_struct *work)
 						struct vkms_crtc_state,
 						composer_work);
 	struct drm_crtc *crtc = crtc_state->base.crtc;
+	struct vkms_writeback_job *active_wb = crtc_state->active_writeback;
 	struct vkms_output *out = drm_crtc_to_vkms_output(crtc);
 	struct vkms_composer *primary_composer = NULL;
 	struct vkms_plane_state *act_plane = NULL;
 	bool crc_pending, wb_pending;
-	void *vaddr_out = NULL;
+	__le64 *vaddr_out = NULL;
 	u32 crc32 = 0;
 	u64 frame_start, frame_end;
-	int ret;
 
 	spin_lock_irq(&out->composer_lock);
 	frame_start = crtc_state->frame_start;
@@ -256,28 +280,21 @@  void vkms_composer_worker(struct work_struct *work)
 	if (!primary_composer)
 		return;
 
-	if (wb_pending)
-		vaddr_out = crtc_state->active_writeback->data[0].vaddr;
-
-	ret = compose_active_planes(&vaddr_out, primary_composer,
-				    crtc_state);
-	if (ret) {
-		if (ret == -EINVAL && !wb_pending)
-			kvfree(vaddr_out);
+	vaddr_out = compose_active_planes(primary_composer, crtc_state);
+	if (!vaddr_out)
 		return;
-	}
-
-	crc32 = compute_crc(vaddr_out, primary_composer);
 
 	if (wb_pending) {
+		write_wb_buffer(active_wb, primary_composer, vaddr_out);
 		drm_writeback_signal_completion(&out->wb_connector, 0);
 		spin_lock_irq(&out->composer_lock);
 		crtc_state->wb_pending = false;
 		spin_unlock_irq(&out->composer_lock);
-	} else {
-		kvfree(vaddr_out);
 	}
 
+	crc32 = compute_crc(vaddr_out, primary_composer);
+	kvfree(vaddr_out);
+
 	/*
 	 * The worker can fall behind the vblank hrtimer, make sure we catch up.
 	 */
diff --git a/drivers/gpu/drm/vkms/vkms_formats.h b/drivers/gpu/drm/vkms/vkms_formats.h
new file mode 100644
index 000000000000..60e21adbf68d
--- /dev/null
+++ b/drivers/gpu/drm/vkms/vkms_formats.h
@@ -0,0 +1,125 @@ 
+/* SPDX-License-Identifier: GPL-2.0+ */
+
+#ifndef _VKMS_FORMATS_H_
+#define _VKMS_FORMATS_H_
+
+#include <drm/drm_rect.h>
+
+#define pixel_offset(composer, x, y) \
+	((composer)->offset + ((y) * (composer)->pitch) + ((x) * (composer)->cpp))
+
+/*
+ * packed_pixels_addr - Get the pointer to pixel of a given pair of coordinates
+ *
+ * @composer: Buffer metadata
+ * @x: The x(width) coordinate of the 2D buffer
+ * @y: The y(Height) coordinate of the 2D buffer
+ *
+ * Takes the information stored in the composer, a pair of coordinates, and
+ * returns the address of the first color channel.
+ * This function assumes the channels are packed together, i.e. a color
+ * channel comes immediately after another, and therefore it doesn't work
+ * for YUV formats with chroma subsampling (e.g. YUV420 and NV21).
+ */
+void *packed_pixels_addr(struct vkms_composer *composer, int x, int y)
+{
+	int offset = pixel_offset(composer, x, y);
+
+	return (u8 *)composer->map[0].vaddr + offset;
+}
+
+u64 ARGB8888_to_ARGB16161616(struct vkms_composer *composer, int x, int y)
+{
+	u8 *pixel_addr = packed_pixels_addr(composer, x, y);
+
+	/*
+	 * Organizes the channels in their respective positions and converts
+	 * the 8 bits channel to 16.
+	 * The 257 is the "conversion ratio". This number is obtained by
+	 * the division (2^16 - 1) / (2^8 - 1), which maps each 8-bit value
+	 * to the closest equivalent in the larger 16-bit color space.
+	 * A similar idea applies to the other RGB color conversions.
+	 */
+	return ((u64)pixel_addr[3] * 257) << 48 |
+	       ((u64)pixel_addr[2] * 257) << 32 |
+	       ((u64)pixel_addr[1] * 257) << 16 |
+	       ((u64)pixel_addr[0] * 257);
+}
+
+u64 XRGB8888_to_ARGB16161616(struct vkms_composer *composer, int x, int y)
+{
+	u8 *pixel_addr = packed_pixels_addr(composer, x, y);
+
+	/*
+	 * The same as the ARGB8888 case, but with the alpha channel set
+	 * to the maximum possible value.
+	 */
+	return 0xffffllu << 48 |
+	       ((u64)pixel_addr[2] * 257) << 32 |
+	       ((u64)pixel_addr[1] * 257) << 16 |
+	       ((u64)pixel_addr[0] * 257);
+}
+
+u64 get_ARGB16161616(struct vkms_composer *composer, int x, int y)
+{
+	__le64 *pixel_addr = packed_pixels_addr(composer, x, y);
+
+	/*
+	 * Because the format byte order is little-endian and this code
+	 * needs to run on big-endian machines too, we need to convert
+	 * the byte order from little-endian to the CPU native byte order.
+	 */
+	return le64_to_cpu(*pixel_addr);
+}
+
+/*
+ * The following functions are used as blend operations. But unlike the
+ * `alpha_blend`, these functions take an ARGB16161616 pixel from the
+ * source, convert it to a specific format, and store it in the destination.
+ *
+ * They are used in the `compose_active_planes` and `write_wb_buffer` to
+ * copy and convert one pixel from/to the output buffer to/from
+ * another buffer (e.g. writeback buffer, primary plane buffer).
+ */
+
+void convert_to_ARGB8888(u64 argb_src1, u64 argb_src2, int x, int y,
+			 struct vkms_composer *dst_composer)
+{
+	u8 *pixel_addr = packed_pixels_addr(dst_composer, x, y);
+
+	/*
+	 * The sequence below is important because the format's byte order
+	 * is little-endian. In the case of ARGB8888 the memory is
+	 * organized this way:
+	 *
+	 * | Addr     | = blue channel
+	 * | Addr + 1 | = green channel
+	 * | Addr + 2 | = red channel
+	 * | Addr + 3 | = alpha channel
+	 */
+	pixel_addr[0] = DIV_ROUND_UP(argb_src1 & 0xffffllu, 257);
+	pixel_addr[1] = DIV_ROUND_UP((argb_src1 & (0xffffllu << 16)) >> 16, 257);
+	pixel_addr[2] = DIV_ROUND_UP((argb_src1 & (0xffffllu << 32)) >> 32, 257);
+	pixel_addr[3] = DIV_ROUND_UP((argb_src1 & (0xffffllu << 48)) >> 48, 257);
+}
+
+void convert_to_XRGB8888(u64 argb_src1, u64 argb_src2, int x, int y,
+			 struct vkms_composer *dst_composer)
+{
+	u8 *pixel_addr = packed_pixels_addr(dst_composer, x, y);
+
+	pixel_addr[0] = DIV_ROUND_UP(argb_src1 & 0xffffllu, 257);
+	pixel_addr[1] = DIV_ROUND_UP((argb_src1 & (0xffffllu << 16)) >> 16, 257);
+	pixel_addr[2] = DIV_ROUND_UP((argb_src1 & (0xffffllu << 32)) >> 32, 257);
+	pixel_addr[3] = 0xff;
+}
+
+void convert_to_ARGB16161616(u64 argb_src1, u64 argb_src2, int x, int y,
+			     struct vkms_composer *dst_composer)
+{
+	__le64 *pixel_addr = packed_pixels_addr(dst_composer, x, y);
+
+	*pixel_addr = cpu_to_le64(argb_src1);
+}
+
+#endif /* _VKMS_FORMATS_H_ */