diff mbox series

[V3,3/5] drm/vkms: Decouple crc operations from composer

Message ID 6e8a03dc7c666ee998ee36172a96cff39f8dd46d.1561491964.git.rodrigosiqueiramelo@gmail.com (mailing list archive)
State New, archived
Headers show
Series drm/vkms: Introduces writeback support | expand

Commit Message

Rodrigo Siqueira June 26, 2019, 1:37 a.m. UTC
In the vkms_composer.c, some of the functions related to CRC and compose
have interdependence between each other. This patch reworks some
functions inside vkms_composer to make crc and composer computation
decoupled.

This patch is preparation work for making vkms able to support new
features.

Signed-off-by: Rodrigo Siqueira <rodrigosiqueiramelo@gmail.com>
---
 drivers/gpu/drm/vkms/vkms_composer.c | 49 ++++++++++++++++------------
 1 file changed, 29 insertions(+), 20 deletions(-)

Comments

Daniel Vetter July 11, 2019, 8:19 a.m. UTC | #1
On Tue, Jun 25, 2019 at 10:37:58PM -0300, Rodrigo Siqueira wrote:
> In the vkms_composer.c, some of the functions related to CRC and compose
> have interdependence between each other. This patch reworks some
> functions inside vkms_composer to make crc and composer computation
> decoupled.
> 
> This patch is preparation work for making vkms able to support new
> features.
> 
> Signed-off-by: Rodrigo Siqueira <rodrigosiqueiramelo@gmail.com>
> ---
>  drivers/gpu/drm/vkms/vkms_composer.c | 49 ++++++++++++++++------------
>  1 file changed, 29 insertions(+), 20 deletions(-)
> 
> diff --git a/drivers/gpu/drm/vkms/vkms_composer.c b/drivers/gpu/drm/vkms/vkms_composer.c
> index eb7ea8be1f98..51a270514219 100644
> --- a/drivers/gpu/drm/vkms/vkms_composer.c
> +++ b/drivers/gpu/drm/vkms/vkms_composer.c
> @@ -105,35 +105,31 @@ static void compose_cursor(struct vkms_composer *cursor_composer,
>  	      primary_composer, cursor_composer);
>  }
>  
> -static uint32_t _vkms_get_crc(struct vkms_composer *primary_composer,
> -			      struct vkms_composer *cursor_composer)
> +static int compose_planes(void **vaddr_out,
> +			  struct vkms_composer *primary_composer,
> +			  struct vkms_composer *cursor_composer)
>  {
>  	struct drm_framebuffer *fb = &primary_composer->fb;
>  	struct drm_gem_object *gem_obj = drm_gem_fb_get_obj(fb, 0);
>  	struct vkms_gem_object *vkms_obj = drm_gem_to_vkms_gem(gem_obj);
> -	void *vaddr_out = kzalloc(vkms_obj->gem.size, GFP_KERNEL);
> -	u32 crc = 0;
>  
> -	if (!vaddr_out) {
> -		DRM_ERROR("Failed to allocate memory for output frame.");
> -		return 0;
> +	if (!*vaddr_out) {
> +		*vaddr_out = kzalloc(vkms_obj->gem.size, GFP_KERNEL);

Uh allocating memory here isn't great, since you effectily can't handle
the error at all. Also for big resolutions kzalloc will fall back to
kvmalloc I think, which is rather expensive to set up.

But I guess this is a preexisting issue, so welp.

What I would do is pull out the allocation at least, so that
compose_planes really only dos composing, can never fail because it
doesn't need to allocate memory.

> +		if (!*vaddr_out) {
> +			DRM_ERROR("Cannot allocate memory for output frame.");
> +			return -ENOMEM;
> +		}
>  	}
>  
> -	if (WARN_ON(!vkms_obj->vaddr)) {
> -		kfree(vaddr_out);
> -		return crc;
> -	}
> +	if (WARN_ON(!vkms_obj->vaddr))
> +		return -EINVAL;
>  
> -	memcpy(vaddr_out, vkms_obj->vaddr, vkms_obj->gem.size);
> +	memcpy(*vaddr_out, vkms_obj->vaddr, vkms_obj->gem.size);
>  
>  	if (cursor_composer)
> -		compose_cursor(cursor_composer, primary_composer, vaddr_out);
> +		compose_cursor(cursor_composer, primary_composer, *vaddr_out);
>  
> -	crc = compute_crc(vaddr_out, primary_composer);
> -
> -	kfree(vaddr_out);
> -
> -	return crc;
> +	return 0;
>  }
>  
>  /**
> @@ -154,9 +150,11 @@ void vkms_composer_worker(struct work_struct *work)
>  	struct vkms_output *out = drm_crtc_to_vkms_output(crtc);
>  	struct vkms_composer *primary_composer = NULL;
>  	struct vkms_composer *cursor_composer = NULL;
> +	void *vaddr_out = NULL;
>  	u32 crc32 = 0;
>  	u64 frame_start, frame_end;
>  	bool crc_pending;
> +	int ret;
>  
>  	spin_lock_irq(&out->composer_lock);
>  	frame_start = crtc_state->frame_start;
> @@ -180,14 +178,25 @@ void vkms_composer_worker(struct work_struct *work)
>  	if (crtc_state->num_active_planes == 2)
>  		cursor_composer = crtc_state->active_planes[1]->composer;
>  
> -	if (primary_composer)
> -		crc32 = _vkms_get_crc(primary_composer, cursor_composer);
> +	if (!primary_composer)
> +		return;
> +
> +	ret = compose_planes(&vaddr_out, primary_composer, cursor_composer);
> +	if (ret) {
> +		if (ret == -EINVAL)
> +			kfree(vaddr_out);
> +		return;
> +	}
> +
> +	crc32 = compute_crc(vaddr_out, primary_composer);
>  
>  	/*
>  	 * The worker can fall behind the vblank hrtimer, make sure we catch up.
>  	 */
>  	while (frame_start <= frame_end)
>  		drm_crtc_add_crc_entry(crtc, true, frame_start++, &crc32);
> +
> +	kfree(vaddr_out);

Especially since you're freeing the memory _outside_ of compose_planes.

Aside: This all kinda doesn't go in the right direction for
high-performance composing, so I guess I need to get started with typing
up what that should look like.
-Daniel

>  }
>  
>  static const char * const pipe_crc_sources[] = {"auto"};
> -- 
> 2.21.0
Simon Ser July 11, 2019, 8:23 a.m. UTC | #2
On Thursday, July 11, 2019 11:19 AM, Daniel Vetter <daniel@ffwll.ch> wrote:
> Aside: This all kinda doesn't go in the right direction for
> high-performance composing, so I guess I need to get started with typing
> up what that should look like.

Some related logs from IRC:

2019-07-10 19:42:49     danvet  ickle, is there an idiot guide to reasonable fast blending/composing with the cpu?
2019-07-10 19:43:13     danvet  for vkms, so maitainability is high on the wishlist, but it needs to be somewhat fast to be able to keep up
2019-07-10 19:44:02     ickle   pixman for rectangles
2019-07-10 19:45:00     ickle   if you want reasonably fast, you want simd
2019-07-10 19:45:16     ickle   so better to use some prebaked library
2019-07-10 19:45:38     ickle   but within vkms?
2019-07-10 19:45:48     ickle   or just for igt/vkms?
2019-07-10 19:46:40     ickle   within vkms for writeback blending I guess
2019-07-10 19:51:38     ickle   http://paste.debian.net/1091097/
2019-07-10 19:53:08     ickle   http://paste.debian.net/1091098/
2019-07-10 19:54:36     emersion        danvet: what are your plans for the compositor refactoring?
2019-07-10 19:55:21     emersion        are we still really using macros instead of functions
2019-07-10 19:56:22     <--     nvishwa1 (~nvishwa1@192.55.54.44) has quit (Remote host closed the connection)
2019-07-10 19:56:44     emersion        is this coming from pixman, ickle?
2019-07-10 19:56:50     -->     karolherbst (~kherbst@2a02:8308:b0be:6900:d9e8:6dcd:2f6a:6cb5) has joined #intel-gfx
2019-07-10 19:57:23     ickle   yes, take it as a reference on how to do abgr32 premultiplied alpha blending
2019-07-10 19:57:33     <--     amathes (ajmathes@nat/intel/x-ywywuftprrqndbaj) has quit (Ping timeout: 245 seconds)
2019-07-10 19:57:48     emersion        yeah, that's a good idea
2019-07-10 19:57:49     ickle   or argb32 actually
2019-07-10 19:58:01     emersion        (asking for the source because license)
2019-07-10 19:58:04     ickle   MIT
2019-07-10 19:58:07     emersion        nice
2019-07-10 20:00:12     <--     sandeep (sandeep@nat/intel/x-buwbzkgopszvwbcr) has quit (Remote host closed the connection)
2019-07-10 20:01:34     danvet  ickle, yeah vkms in the kernel
2019-07-10 20:02:41     danvet  also might need more than 8 bits ...
2019-07-10 20:02:57     danvet  and kinda hoped I could just tell gcc to simdify it for me
2019-07-10 20:03:23     danvet  emersion, compositor refactoring
2019-07-10 20:03:46     danvet  ickle, higher level I figured something like a fetch fifo in a standard format
2019-07-10 20:04:02     danvet  with some drm_format->standard format conversion tools
2019-07-10 20:04:08     danvet  and then one blender
2019-07-10 20:04:39     danvet  and then either add that to the crc and toss it (again maybe scanline-by-scanline, or whatever fits reasonable into l1$ all together)
2019-07-10 20:04:44     danvet  or dump it into writeback
2019-07-10 20:08:25     danvet  https://gcc.gnu.org/onlinedocs/gcc/Vector-Extensions.html <- this stuff essentially, using generics
2019-07-10 20:08:38     danvet  *generic intrinsics
2019-07-10 20:08:49     danvet  or is that going to be real awful?
2019-07-10 20:09:35     ickle   shouldn't be required for _basic_ blending
2019-07-10 20:09:52     danvet  yeah I think all we want is premultiplied alpha
2019-07-10 20:09:53     ickle   if all you need is an over operator, then gcc should be pretty good
2019-07-10 20:09:57     <--     sdutt (sdutt@nat/intel/x-zhquyrigdztslyqh) has quit (Ping timeout: 268 seconds)
2019-07-10 20:09:59     danvet  maybe some yuv->rgb
2019-07-10 20:10:19     danvet  expanding from whatever silly format we have to the right vector
2019-07-10 20:10:33     emersion        what would be your universal format?
2019-07-10 20:10:46     danvet  a16r16b16g16
2019-07-10 20:10:50     danvet  except if gcc barfs on that
2019-07-10 20:11:02     danvet  or maybe go all in on 4x float :-)
2019-07-10 20:11:10     emersion        eh
2019-07-10 20:11:29     emersion        fp16 would work, but would also mean rounding errors, probably?
2019-07-10 20:11:35     danvet  uint16 is not going to be awesome for hdr, but good enough for everything else
2019-07-10 20:11:42     danvet  no cpu has fp16
2019-07-10 20:11:52     emersion        ah, fp32
2019-07-10 20:11:57     krh     wait for avx1024
2019-07-10 20:12:01     emersion        seems kind of overkill
2019-07-10 20:12:01     danvet  fg32 is probably fastest option we can get on common hw
2019-07-10 20:12:12     danvet  krh, very much aiming for good enough here
2019-07-10 20:12:26     ickle   one plan is to sneak ksim into the kernel as generic gpu-on-x86
2019-07-10 20:12:52     krh     can the kernel use avx2?
2019-07-10 20:13:05     danvet  well, all stuff I'd need to figure out
2019-07-10 20:13:07     danvet  I hope so
2019-07-10 20:13:12     ickle   it can
2019-07-10 20:13:19     ickle   easier than mmx
2019-07-10 20:13:49     emersion        when you say gcc barfs on a16r16b16g16: why?
2019-07-10 20:13:57     danvet  kernel_fpu_begin/end + telling gcc to optimize the crap out of the file with the blending functions
2019-07-10 20:14:11     emersion        uint64 too hard for gcc to optimize?
2019-07-10 20:14:14     danvet  emersion, I haven't checked, but if it generates silly code then might be better to go with fp32
2019-07-10 20:14:28     vsyrjala        the compiler always generates silly code
2019-07-10 20:14:30     danvet  ideally it should all boil down to sse/avx
2019-07-10 20:14:32     emersion        ahah
2019-07-10 20:14:51     danvet  and ideally all with generic intrinsics so the arm folks don't freak out
2019-07-10 20:15:07     -->     sandeep (~sandeep@134.134.139.76) has joined #intel-gfx
2019-07-10 20:15:23     emersion        +1 for generic intrinsics
2019-07-10 20:15:42     danvet  the conversion from uint to fp32 might be hilarious
2019-07-10 20:15:50     emersion        yeah, probably
2019-07-10 20:15:52     danvet  perhaps the one place where we want to use an sse or avx intrinsic
2019-07-10 20:16:19     danvet  especially if we convert to simd16 instead of something like 4x4
2019-07-10 20:16:22     emersion        we should probably do some little experiments before doing anything
2019-07-10 20:16:24     danvet  simd4x4
2019-07-10 20:16:31     danvet  yeah
2019-07-10 20:16:41     danvet  that's why I'm asking here, since I have roughly 0 clue about this
2019-07-10 20:17:48     danvet  I don't think we ever need a dot or anything like that, so plain simd is propably best
2019-07-10 20:18:07     danvet  except the input is usually 4x or 3x vectors
2019-07-10 20:18:07     emersion        a dot?
2019-07-10 20:18:12     danvet  dot product
2019-07-10 20:18:14     emersion        ah
2019-07-10 20:18:15     danvet  for vertex shaders
2019-07-10 20:18:20     emersion        yeah, probably not
2019-07-10 20:18:21     bwidawsk        danvet› btw, I think "generic intrinsic" is an ARM thing
2019-07-10 20:18:29     bwidawsk        I think everywhere else, they just say intrinsic
2019-07-10 20:18:41     bwidawsk        at least, I've never heard the generic prefix other than ARM compiler
2019-07-10 20:19:19     vsyrjala        just make the max supported resolution 8x8 or something and speed shouldn't be a huge issue
2019-07-10 20:19:30     emersion        i think he meant generic intrinsic vs. sse/avx/whatever
2019-07-10 20:19:49     bwidawsk        emersion› yes, I figured out what he meant, I just mentioned it because it was a source of confusion
2019-07-10 20:19:52     emersion        vsyrjala, helpful as always :)
2019-07-10 20:20:32     danvet  oh gcc has all the casting implementing in the intrinsics too
2019-07-10 20:20:50     danvet  bwidawsk, gcc manpage also calls them generic intrinsics
2019-07-10 20:21:25     bwidawsk        danvet› not my gcc manpage
2019-07-10 20:21:40     danvet  well "generic vector operations"
2019-07-10 20:21:46     danvet  ^^ that what you meant?
2019-07-10 20:22:03     danvet  vs "machine-specific vector intrinsics"
2019-07-10 20:22:38     bwidawsk        I thought the generic intrinsic term came from ARM's proprietary compiler
2019-07-10 20:22:40     bwidawsk        but I might be wrong
diff mbox series

Patch

diff --git a/drivers/gpu/drm/vkms/vkms_composer.c b/drivers/gpu/drm/vkms/vkms_composer.c
index eb7ea8be1f98..51a270514219 100644
--- a/drivers/gpu/drm/vkms/vkms_composer.c
+++ b/drivers/gpu/drm/vkms/vkms_composer.c
@@ -105,35 +105,31 @@  static void compose_cursor(struct vkms_composer *cursor_composer,
 	      primary_composer, cursor_composer);
 }
 
-static uint32_t _vkms_get_crc(struct vkms_composer *primary_composer,
-			      struct vkms_composer *cursor_composer)
+static int compose_planes(void **vaddr_out,
+			  struct vkms_composer *primary_composer,
+			  struct vkms_composer *cursor_composer)
 {
 	struct drm_framebuffer *fb = &primary_composer->fb;
 	struct drm_gem_object *gem_obj = drm_gem_fb_get_obj(fb, 0);
 	struct vkms_gem_object *vkms_obj = drm_gem_to_vkms_gem(gem_obj);
-	void *vaddr_out = kzalloc(vkms_obj->gem.size, GFP_KERNEL);
-	u32 crc = 0;
 
-	if (!vaddr_out) {
-		DRM_ERROR("Failed to allocate memory for output frame.");
-		return 0;
+	if (!*vaddr_out) {
+		*vaddr_out = kzalloc(vkms_obj->gem.size, GFP_KERNEL);
+		if (!*vaddr_out) {
+			DRM_ERROR("Cannot allocate memory for output frame.");
+			return -ENOMEM;
+		}
 	}
 
-	if (WARN_ON(!vkms_obj->vaddr)) {
-		kfree(vaddr_out);
-		return crc;
-	}
+	if (WARN_ON(!vkms_obj->vaddr))
+		return -EINVAL;
 
-	memcpy(vaddr_out, vkms_obj->vaddr, vkms_obj->gem.size);
+	memcpy(*vaddr_out, vkms_obj->vaddr, vkms_obj->gem.size);
 
 	if (cursor_composer)
-		compose_cursor(cursor_composer, primary_composer, vaddr_out);
+		compose_cursor(cursor_composer, primary_composer, *vaddr_out);
 
-	crc = compute_crc(vaddr_out, primary_composer);
-
-	kfree(vaddr_out);
-
-	return crc;
+	return 0;
 }
 
 /**
@@ -154,9 +150,11 @@  void vkms_composer_worker(struct work_struct *work)
 	struct vkms_output *out = drm_crtc_to_vkms_output(crtc);
 	struct vkms_composer *primary_composer = NULL;
 	struct vkms_composer *cursor_composer = NULL;
+	void *vaddr_out = NULL;
 	u32 crc32 = 0;
 	u64 frame_start, frame_end;
 	bool crc_pending;
+	int ret;
 
 	spin_lock_irq(&out->composer_lock);
 	frame_start = crtc_state->frame_start;
@@ -180,14 +178,25 @@  void vkms_composer_worker(struct work_struct *work)
 	if (crtc_state->num_active_planes == 2)
 		cursor_composer = crtc_state->active_planes[1]->composer;
 
-	if (primary_composer)
-		crc32 = _vkms_get_crc(primary_composer, cursor_composer);
+	if (!primary_composer)
+		return;
+
+	ret = compose_planes(&vaddr_out, primary_composer, cursor_composer);
+	if (ret) {
+		if (ret == -EINVAL)
+			kfree(vaddr_out);
+		return;
+	}
+
+	crc32 = compute_crc(vaddr_out, primary_composer);
 
 	/*
 	 * The worker can fall behind the vblank hrtimer, make sure we catch up.
 	 */
 	while (frame_start <= frame_end)
 		drm_crtc_add_crc_entry(crtc, true, frame_start++, &crc32);
+
+	kfree(vaddr_out);
 }
 
 static const char * const pipe_crc_sources[] = {"auto"};