diff mbox series

[xf86-video-intel,v1] sna: Added AYUV format support for textured and sprite video adapters.

Message ID 20181002093853.2387-1-stanislav.lisovskiy@intel.com (mailing list archive)
State New, archived
Headers show
Series [xf86-video-intel,v1] sna: Added AYUV format support for textured and sprite video adapters. | expand

Commit Message

Stanislav Lisovskiy Oct. 2, 2018, 9:38 a.m. UTC
sna/gen9+: Had to split out wm_kernel from the sna_composite_op flags,
otherwise new shader kernels go beyond existing flags field.

Signed-off-by: Stanislav Lisovskiy <stanislav.lisovskiy@intel.com>
---
 src/render_program/Makefile.am                |   2 +
 .../exa_wm_src_sample_argb_ayuv.g8a           |  60 +++++++++
 .../exa_wm_src_sample_argb_ayuv.g8b           |   6 +
 src/sna/gen9_render.c                         |  62 +++++++--
 src/sna/sna_render.h                          |   8 ++
 src/sna/sna_video.c                           | 123 ++++++++++++++++++
 src/sna/sna_video.h                           |  20 +++
 src/sna/sna_video_sprite.c                    |  19 ++-
 src/sna/sna_video_textured.c                  |   8 ++
 9 files changed, 290 insertions(+), 18 deletions(-)
 create mode 100644 src/render_program/exa_wm_src_sample_argb_ayuv.g8a
 create mode 100644 src/render_program/exa_wm_src_sample_argb_ayuv.g8b

Comments

Chris Wilson Oct. 3, 2018, 11:29 a.m. UTC | #1
Quoting Stanislav Lisovskiy (2018-10-02 10:38:53)
> diff --git a/src/sna/sna_render.h b/src/sna/sna_render.h
> index 6669af9d..ef88d1f9 100644
> --- a/src/sna/sna_render.h
> +++ b/src/sna/sna_render.h
> @@ -139,20 +139,25 @@ struct sna_composite_op {
>  
>                 struct {
>                         uint32_t flags;
> +                       uint8_t wm_kernel;
>                 } gen6;
>  
>                 struct {
>                         uint32_t flags;
> +                       uint8_t wm_kernel;
>                 } gen7;
>  
>                 struct {
>                         uint32_t flags;
> +                       uint8_t wm_kernel;
>                 } gen8;
>  
>                 struct {
>                         uint32_t flags;
> +                       uint8_t wm_kernel;
>                 } gen9;
>         } u;
> +       unsigned long gen9_kernel;

Do you want to try again without the surplus changes? Maybe ask Ville
for his patches to base your work on?
-Chris
Ville Syrjälä Oct. 3, 2018, 12:28 p.m. UTC | #2
On Wed, Oct 03, 2018 at 12:29:53PM +0100, Chris Wilson wrote:
> Quoting Stanislav Lisovskiy (2018-10-02 10:38:53)
> > diff --git a/src/sna/sna_render.h b/src/sna/sna_render.h
> > index 6669af9d..ef88d1f9 100644
> > --- a/src/sna/sna_render.h
> > +++ b/src/sna/sna_render.h
> > @@ -139,20 +139,25 @@ struct sna_composite_op {
> >  
> >                 struct {
> >                         uint32_t flags;
> > +                       uint8_t wm_kernel;
> >                 } gen6;
> >  
> >                 struct {
> >                         uint32_t flags;
> > +                       uint8_t wm_kernel;
> >                 } gen7;
> >  
> >                 struct {
> >                         uint32_t flags;
> > +                       uint8_t wm_kernel;
> >                 } gen8;
> >  
> >                 struct {
> >                         uint32_t flags;
> > +                       uint8_t wm_kernel;
> >                 } gen9;
> >         } u;
> > +       unsigned long gen9_kernel;
> 
> Do you want to try again without the surplus changes? Maybe ask Ville
> for his patches to base your work on?

Unfortunately I still haven't managed to figure out why chrome
becomes a bit hangy on my ivb when I start to emit
3DSTATE_CONSTANT_* in the ddx.

The error state is somewhat peculiar BTW. It always hangs at the
start of a batch like so:

  ACTHD: 0x00000000 00efa014

batch (rcs0 (submitted by chrome [23031], ctx 2 [5], score 0)) at 0x00000000_00efa000
0x00efa000:      0x7a000003: PIPE_CONTROL
0x00efa004:      0x00105021:    qword write, cs stall, render target cache flush, DC flush, depth cache flush, 
0x00efa008:      0x00000000:    destination address
0x00efa00c:      0x00000000:    immediate dword low
0x00efa010:      0x00000000:    immediate dword high
0x00efa014:      0x61010008: STATE_BASE_ADDRESS
0x00efa018:      0x00000111:    general state base address 0x00000110
0x00efa01c:      0x00001001:    surface state base address 0x00001000
0x00efa020:      0x00001001:    dynamic state base address 0x00001000
0x00efa024:      0x00000001:    indirect state base address 0x00000000
0x00efa028:      0x00005001:    instruction state base address 0x00005000
0x00efa02c:      0x00000001:    general state upper bound disabled
0x00efa030:      0xfffff001:    dynamic state upper bound 0xfffff000
0x00efa034:      0x00000001:    indirect state upper bound disabled
0x00efa038:      0x00000001:    instruction state upper bound disabled
0x00efa03c:      0x7a000003: PIPE_CONTROL
0x00efa040:      0x00000c04:    no write, instruction cache invalidate, texture cache invalidate, state cache invalida>
0x00efa044:      0x00000000:    destination address
0x00efa048:      0x00000000:    immediate dword low
0x00efa04c:      0x00000000:    immediate dword high

No idea why there's an end of pipe flush as the first thing in the batch,
and no idea how that could possibly hang due to stuff that was done in
another batch/context.
Chris Wilson Oct. 3, 2018, 12:34 p.m. UTC | #3
Quoting Ville Syrjälä (2018-10-03 13:28:30)
> On Wed, Oct 03, 2018 at 12:29:53PM +0100, Chris Wilson wrote:
> > Quoting Stanislav Lisovskiy (2018-10-02 10:38:53)
> > > diff --git a/src/sna/sna_render.h b/src/sna/sna_render.h
> > > index 6669af9d..ef88d1f9 100644
> > > --- a/src/sna/sna_render.h
> > > +++ b/src/sna/sna_render.h
> > > @@ -139,20 +139,25 @@ struct sna_composite_op {
> > >  
> > >                 struct {
> > >                         uint32_t flags;
> > > +                       uint8_t wm_kernel;
> > >                 } gen6;
> > >  
> > >                 struct {
> > >                         uint32_t flags;
> > > +                       uint8_t wm_kernel;
> > >                 } gen7;
> > >  
> > >                 struct {
> > >                         uint32_t flags;
> > > +                       uint8_t wm_kernel;
> > >                 } gen8;
> > >  
> > >                 struct {
> > >                         uint32_t flags;
> > > +                       uint8_t wm_kernel;
> > >                 } gen9;
> > >         } u;
> > > +       unsigned long gen9_kernel;
> > 
> > Do you want to try again without the surplus changes? Maybe ask Ville
> > for his patches to base your work on?
> 
> Unfortunately I still haven't managed to figure out why chrome
> becomes a bit hangy on my ivb when I start to emit
> 3DSTATE_CONSTANT_* in the ddx.
> 
> The error state is somewhat peculiar BTW. It always hangs at the
> start of a batch like so:
> 
>   ACTHD: 0x00000000 00efa014
> 
> batch (rcs0 (submitted by chrome [23031], ctx 2 [5], score 0)) at 0x00000000_00efa000
> 0x00efa000:      0x7a000003: PIPE_CONTROL
> 0x00efa004:      0x00105021:    qword write, cs stall, render target cache flush, DC flush, depth cache flush, 
> 0x00efa008:      0x00000000:    destination address
> 0x00efa00c:      0x00000000:    immediate dword low
> 0x00efa010:      0x00000000:    immediate dword high
> 0x00efa014:      0x61010008: STATE_BASE_ADDRESS
> 0x00efa018:      0x00000111:    general state base address 0x00000110
> 0x00efa01c:      0x00001001:    surface state base address 0x00001000
> 0x00efa020:      0x00001001:    dynamic state base address 0x00001000
> 0x00efa024:      0x00000001:    indirect state base address 0x00000000
> 0x00efa028:      0x00005001:    instruction state base address 0x00005000
> 0x00efa02c:      0x00000001:    general state upper bound disabled
> 0x00efa030:      0xfffff001:    dynamic state upper bound 0xfffff000
> 0x00efa034:      0x00000001:    indirect state upper bound disabled
> 0x00efa038:      0x00000001:    instruction state upper bound disabled
> 0x00efa03c:      0x7a000003: PIPE_CONTROL
> 0x00efa040:      0x00000c04:    no write, instruction cache invalidate, texture cache invalidate, state cache invalida>
> 0x00efa044:      0x00000000:    destination address
> 0x00efa048:      0x00000000:    immediate dword low
> 0x00efa04c:      0x00000000:    immediate dword high
> 
> No idea why there's an end of pipe flush as the first thing in the batch,
> and no idea how that could possibly hang due to stuff that was done in
> another batch/context.

Yeah, that is suspect. :|

Waitasec qword write to 0? That seems fishy.
-Chris
Stanislav Lisovskiy Oct. 3, 2018, 1:17 p.m. UTC | #4
On Wed, 2018-10-03 at 12:29 +0100, Chris Wilson wrote:
> Quoting Stanislav Lisovskiy (2018-10-02 10:38:53)
> > diff --git a/src/sna/sna_render.h b/src/sna/sna_render.h
> > index 6669af9d..ef88d1f9 100644
> > --- a/src/sna/sna_render.h
> > +++ b/src/sna/sna_render.h
> > @@ -139,20 +139,25 @@ struct sna_composite_op {
> >  
> >                 struct {
> >                         uint32_t flags;
> > +                       uint8_t wm_kernel;
> >                 } gen6;
> >  
> >                 struct {
> >                         uint32_t flags;
> > +                       uint8_t wm_kernel;
> >                 } gen7;
> >  
> >                 struct {
> >                         uint32_t flags;
> > +                       uint8_t wm_kernel;
> >                 } gen8;
> >  
> >                 struct {
> >                         uint32_t flags;
> > +                       uint8_t wm_kernel;
> >                 } gen9;
> >         } u;
> > +       unsigned long gen9_kernel;
> 
> Do you want to try again without the surplus changes? Maybe ask Ville
> for his patches to base your work on?
> -Chris

Yep, I took part of Ville's patch for the flags issue, required for
gen9+.
Ville Syrjälä Oct. 3, 2018, 1:38 p.m. UTC | #5
On Wed, Oct 03, 2018 at 01:34:47PM +0100, Chris Wilson wrote:
> Quoting Ville Syrjälä (2018-10-03 13:28:30)
> > On Wed, Oct 03, 2018 at 12:29:53PM +0100, Chris Wilson wrote:
> > > Quoting Stanislav Lisovskiy (2018-10-02 10:38:53)
> > > > diff --git a/src/sna/sna_render.h b/src/sna/sna_render.h
> > > > index 6669af9d..ef88d1f9 100644
> > > > --- a/src/sna/sna_render.h
> > > > +++ b/src/sna/sna_render.h
> > > > @@ -139,20 +139,25 @@ struct sna_composite_op {
> > > >  
> > > >                 struct {
> > > >                         uint32_t flags;
> > > > +                       uint8_t wm_kernel;
> > > >                 } gen6;
> > > >  
> > > >                 struct {
> > > >                         uint32_t flags;
> > > > +                       uint8_t wm_kernel;
> > > >                 } gen7;
> > > >  
> > > >                 struct {
> > > >                         uint32_t flags;
> > > > +                       uint8_t wm_kernel;
> > > >                 } gen8;
> > > >  
> > > >                 struct {
> > > >                         uint32_t flags;
> > > > +                       uint8_t wm_kernel;
> > > >                 } gen9;
> > > >         } u;
> > > > +       unsigned long gen9_kernel;
> > > 
> > > Do you want to try again without the surplus changes? Maybe ask Ville
> > > for his patches to base your work on?
> > 
> > > Unfortunately I still haven't managed to figure out why chrome
> > becomes a bit hangy on my ivb when I start to emit
> > 3DSTATE_CONSTANT_* in the ddx.
> > 
> > The error state is somewhat peculiar BTW. It always hangs at the
> > start of a batch like so:
> > 
> >   ACTHD: 0x00000000 00efa014
> > 
> > batch (rcs0 (submitted by chrome [23031], ctx 2 [5], score 0)) at 0x00000000_00efa000
> > 0x00efa000:      0x7a000003: PIPE_CONTROL
> > 0x00efa004:      0x00105021:    qword write, cs stall, render target cache flush, DC flush, depth cache flush, 
> > 0x00efa008:      0x00000000:    destination address
> > 0x00efa00c:      0x00000000:    immediate dword low
> > 0x00efa010:      0x00000000:    immediate dword high
> > 0x00efa014:      0x61010008: STATE_BASE_ADDRESS
> > 0x00efa018:      0x00000111:    general state base address 0x00000110
> > 0x00efa01c:      0x00001001:    surface state base address 0x00001000
> > 0x00efa020:      0x00001001:    dynamic state base address 0x00001000
> > 0x00efa024:      0x00000001:    indirect state base address 0x00000000
> > 0x00efa028:      0x00005001:    instruction state base address 0x00005000
> > 0x00efa02c:      0x00000001:    general state upper bound disabled
> > 0x00efa030:      0xfffff001:    dynamic state upper bound 0xfffff000
> > 0x00efa034:      0x00000001:    indirect state upper bound disabled
> > 0x00efa038:      0x00000001:    instruction state upper bound disabled
> > 0x00efa03c:      0x7a000003: PIPE_CONTROL
> > 0x00efa040:      0x00000c04:    no write, instruction cache invalidate, texture cache invalidate, state cache invalida>
> > 0x00efa044:      0x00000000:    destination address
> > 0x00efa048:      0x00000000:    immediate dword low
> > 0x00efa04c:      0x00000000:    immediate dword high
> > 
> > No idea why there's an end of pipe flush as the first thing in the batch,
> > and no idea how that could possibly hang due to stuff that was done in
> > another batch/context.
> 
> Yeah, that is suspect. :|
> 
> Waitasec qword write to 0? That seems fishy.

Yeah that one looked a bit odd to me as well, however looks like there
is something there:

Active (rcs0) [18]:
    00000000_03085000    20480 3f 00 00 dirty LLC
    00000000_00000000     4096 3e 02 00 dirty LLC

Full error state attached, in case you're curious about other details.
diff mbox series

Patch

diff --git a/src/render_program/Makefile.am b/src/render_program/Makefile.am
index dc58138f..e35ffa52 100644
--- a/src/render_program/Makefile.am
+++ b/src/render_program/Makefile.am
@@ -196,6 +196,7 @@  INTEL_G7B =				\
 INTEL_G8A =				\
 	exa_wm_src_affine.g8a 		\
 	exa_wm_src_sample_argb.g8a 	\
+	exa_wm_src_sample_argb_ayuv.g8a \
 	exa_wm_src_sample_nv12.g8a 	\
 	exa_wm_src_sample_planar.g8a 	\
 	exa_wm_write.g8a 		\
@@ -205,6 +206,7 @@  INTEL_G8A =				\
 
 INTEL_G8B =				\
 	exa_wm_src_affine.g8b 		\
+	exa_wm_src_sample_argb_ayuv.g8b \
 	exa_wm_src_sample_argb.g8b 	\
 	exa_wm_src_sample_nv12.g8b 	\
 	exa_wm_src_sample_planar.g8b 	\
diff --git a/src/render_program/exa_wm_src_sample_argb_ayuv.g8a b/src/render_program/exa_wm_src_sample_argb_ayuv.g8a
new file mode 100644
index 00000000..d79840ac
--- /dev/null
+++ b/src/render_program/exa_wm_src_sample_argb_ayuv.g8a
@@ -0,0 +1,60 @@ 
+/*
+ * Copyright © 2006 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Wang Zhenyu <zhenyu.z.wang@intel.com>
+ *    Keith Packard <keithp@keithp.com>
+ */
+
+/* Sample the src surface */
+
+include(`exa_wm.g4i')
+
+undefine(`src_msg')
+undefine(`src_msg_ind')
+
+define(`src_msg',       `g65')
+define(`src_msg_ind',   `65')
+
+/* prepare sampler read back gX register, which would be written back to output */
+
+/* use simd16 sampler, param 0 is u, param 1 is v. */
+/* 'payload' loading, assuming tex coord start from g4 */
+
+/* load argb */
+mov (1) g0.8<1>UD	0x00000000UD { align1 mask_disable };
+mov (8) src_msg<1>UD g0<8,8,1>UD { align1 }; /* copy to msg start reg*/
+
+/* src_msg will be copied with g0, as it contains send desc */
+/* emit sampler 'send' cmd */
+send (16) src_msg_ind	/* msg reg index */
+	src_sample_base<1>UW /* readback */
+	null
+	sampler (1,0,F)	/* sampler message description, (binding_table,sampler_index,datatype)
+				/* here(src->dst) we should use src_sampler and src_surface */
+	mlen 5 rlen 8 { align1 };   /* required message len 5, readback len 8 */
+
+/* Put CbCr into right places */
+mov (16) src_sample_b<1>UD src_sample_r<1>UD  { align1 };
+mov (16) src_sample_r<1>UD src_sample_a<1>UD  { align1 };
+mov (16) src_sample_a<1>F 1.0F;
+
diff --git a/src/render_program/exa_wm_src_sample_argb_ayuv.g8b b/src/render_program/exa_wm_src_sample_argb_ayuv.g8b
new file mode 100644
index 00000000..4f439141
--- /dev/null
+++ b/src/render_program/exa_wm_src_sample_argb_ayuv.g8b
@@ -0,0 +1,6 @@ 
+   { 0x00000001, 0x2008060c, 0x00000000, 0x00000000 },
+   { 0x00600001, 0x28200208, 0x008d0000, 0x00000000 },
+   { 0x02800031, 0x21c00a48, 0x06000820, 0x0a8c0001 },
+   { 0x00800001, 0x22400208, 0x002001c0, 0x00000000 },
+   { 0x00800001, 0x21c00208, 0x00200280, 0x00000000 },
+   { 0x00800001, 0x22803ee8, 0x38000000, 0x3f800000 },
diff --git a/src/sna/gen9_render.c b/src/sna/gen9_render.c
index 505b98af..29447d25 100644
--- a/src/sna/gen9_render.c
+++ b/src/sna/gen9_render.c
@@ -129,6 +129,20 @@  static const uint32_t ps_kernel_planar_bt709[][4] = {
 #include "exa_wm_write.g8b"
 };
 
+static const uint32_t ps_kernel_packed_ayuv_bt601[][4] = {
+#include "exa_wm_src_affine.g8b"
+#include "exa_wm_src_sample_argb_ayuv.g8b"
+#include "exa_wm_yuv_rgb_bt601.g8b"
+#include "exa_wm_write.g8b"
+};
+
+static const uint32_t ps_kernel_packed_ayuv_bt709[][4] = {
+#include "exa_wm_src_affine.g8b"
+#include "exa_wm_src_sample_argb_ayuv.g8b"
+#include "exa_wm_yuv_rgb_bt709.g8b"
+#include "exa_wm_write.g8b"
+};
+
 static const uint32_t ps_kernel_nv12_bt709[][4] = {
 #include "exa_wm_src_affine.g8b"
 #include "exa_wm_src_sample_nv12.g8b"
@@ -177,6 +191,8 @@  static const struct wm_kernel_info {
 	KERNEL(VIDEO_PLANAR_BT709, ps_kernel_planar_bt709, 7),
 	KERNEL(VIDEO_NV12_BT709, ps_kernel_nv12_bt709, 7),
 	KERNEL(VIDEO_PACKED_BT709, ps_kernel_packed_bt709, 2),
+	KERNEL(VIDEO_PACKED_AYUV_BT601, ps_kernel_packed_ayuv_bt601, 2),
+	KERNEL(VIDEO_PACKED_AYUV_BT709, ps_kernel_packed_ayuv_bt709, 2),
 	KERNEL(VIDEO_RGB, ps_kernel_rgb, 2),
 #endif
 };
@@ -226,19 +242,18 @@  static const struct blendinfo {
 
 #define COPY_SAMPLER 0
 #define COPY_VERTEX VERTEX_2s2s
-#define COPY_FLAGS(a) GEN9_SET_FLAGS(COPY_SAMPLER, (a) == GXcopy ? NO_BLEND : CLEAR, GEN9_WM_KERNEL_NOMASK, COPY_VERTEX)
+#define COPY_FLAGS(a) GEN9_SET_FLAGS(COPY_SAMPLER, (a) == GXcopy ? NO_BLEND : CLEAR, COPY_VERTEX)
 
 #define FILL_SAMPLER 1
 #define FILL_VERTEX VERTEX_2s2s
-#define FILL_FLAGS(op, format) GEN9_SET_FLAGS(FILL_SAMPLER, gen9_get_blend((op), false, (format)), GEN9_WM_KERNEL_NOMASK, FILL_VERTEX)
-#define FILL_FLAGS_NOBLEND GEN9_SET_FLAGS(FILL_SAMPLER, NO_BLEND, GEN9_WM_KERNEL_NOMASK, FILL_VERTEX)
+#define FILL_FLAGS(op, format) GEN9_SET_FLAGS(FILL_SAMPLER, gen9_get_blend((op), false, (format)), FILL_VERTEX)
+#define FILL_FLAGS_NOBLEND GEN9_SET_FLAGS(FILL_SAMPLER, NO_BLEND, FILL_VERTEX)
 
 #define GEN9_SAMPLER(f) (((f) >> 20) & 0xfff)
 #define GEN9_BLEND(f) (((f) >> 4) & 0x7ff)
 #define GEN9_READS_DST(f) (((f) >> 15) & 1)
-#define GEN9_KERNEL(f) (((f) >> 16) & 0xf)
 #define GEN9_VERTEX(f) (((f) >> 0) & 0xf)
-#define GEN9_SET_FLAGS(S, B, K, V)  ((S) << 20 | (K) << 16 | (B) | (V))
+#define GEN9_SET_FLAGS(S, B, V)  ((S) << 20 | (B) | (V))
 
 #define OUT_BATCH(v) batch_emit(sna, v)
 #define OUT_BATCH64(v) batch_emit64(sna, v)
@@ -1349,7 +1364,7 @@  gen9_emit_state(struct sna *sna,
 	gen9_emit_cc(sna, GEN9_BLEND(op->u.gen9.flags));
 	gen9_emit_sampler(sna, GEN9_SAMPLER(op->u.gen9.flags));
 	gen9_emit_sf(sna, GEN9_VERTEX(op->u.gen9.flags) >> 2);
-	gen9_emit_wm(sna, GEN9_KERNEL(op->u.gen9.flags));
+	gen9_emit_wm(sna, op->u.gen9.wm_kernel);
 	gen9_emit_vertex_elements(sna, op);
 	gen9_emit_binding_table(sna, wm_binding_table);
 
@@ -1618,7 +1633,7 @@  static int gen9_get_rectangles__flush(struct sna *sna,
 		if (gen9_magic_ca_pass(sna, op)) {
 			gen9_emit_pipe_invalidate(sna);
 			gen9_emit_cc(sna, GEN9_BLEND(op->u.gen9.flags));
-			gen9_emit_wm(sna, GEN9_KERNEL(op->u.gen9.flags));
+			gen9_emit_wm(sna, op->u.gen9.wm_kernel);
 		}
 	}
 
@@ -2548,12 +2563,16 @@  gen9_render_composite(struct sna *sna,
 			       gen9_get_blend(tmp->op,
 					      tmp->has_component_alpha,
 					      tmp->dst.format),
-			       gen9_choose_composite_kernel(tmp->op,
-							    tmp->mask.bo != NULL,
-							    tmp->has_component_alpha,
-							    tmp->is_affine),
 			       gen4_choose_composite_emitter(sna, tmp));
+	tmp->u.gen9.wm_kernel = gen9_choose_composite_kernel(tmp->op,
+							     tmp->mask.bo != NULL,
+							     tmp->has_component_alpha,
+							     tmp->is_affine);
 
+        tmp->gen9_kernel = gen9_choose_composite_kernel(tmp->op,
+							    tmp->mask.bo != NULL,
+							    tmp->has_component_alpha,
+							    tmp->is_affine);
 	tmp->blt   = gen9_render_composite_blt;
 	tmp->box   = gen9_render_composite_box;
 	tmp->boxes = gen9_render_composite_boxes__blt;
@@ -2781,9 +2800,11 @@  gen9_render_composite_spans(struct sna *sna,
 					      SAMPLER_FILTER_NEAREST,
 					      SAMPLER_EXTEND_PAD),
 			       gen9_get_blend(tmp->base.op, false, tmp->base.dst.format),
-			       GEN9_WM_KERNEL_OPACITY | !tmp->base.is_affine,
 			       gen4_choose_spans_emitter(sna, tmp));
+	tmp->base.u.gen9.wm_kernel =
+		GEN9_WM_KERNEL_OPACITY | !tmp->base.is_affine;
 
+	tmp->base.gen9_kernel = GEN9_WM_KERNEL_OPACITY | !tmp->base.is_affine;
 	tmp->box   = gen9_render_composite_spans_box;
 	tmp->boxes = gen9_render_composite_spans_boxes;
 	if (tmp->emit_boxes)
@@ -3045,6 +3066,7 @@  fallback_blt:
 	tmp.need_magic_ca_pass = 0;
 
 	tmp.u.gen9.flags = COPY_FLAGS(alu);
+	tmp.u.gen9.wm_kernel = GEN9_WM_KERNEL_NOMASK;
 
 	kgem_set_mode(&sna->kgem, KGEM_RENDER, tmp.dst.bo);
 	if (!kgem_check_bo(&sna->kgem, tmp.dst.bo, tmp.src.bo, NULL)) {
@@ -3214,6 +3236,7 @@  fallback:
 	op->base.floats_per_rect = 6;
 
 	op->base.u.gen9.flags = COPY_FLAGS(alu);
+	op->base.u.gen9.wm_kernel = GEN9_WM_KERNEL_NOMASK;
 
 	kgem_set_mode(&sna->kgem, KGEM_RENDER, dst_bo);
 	if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL)) {
@@ -3366,6 +3389,7 @@  gen9_render_fill_boxes(struct sna *sna,
 	tmp.need_magic_ca_pass = false;
 
 	tmp.u.gen9.flags = FILL_FLAGS(op, format);
+	tmp.u.gen9.wm_kernel = GEN9_WM_KERNEL_NOMASK;
 
 	kgem_set_mode(&sna->kgem, KGEM_RENDER, dst_bo);
 	if (!kgem_check_bo(&sna->kgem, dst_bo, NULL)) {
@@ -3552,6 +3576,7 @@  gen9_render_fill(struct sna *sna, uint8_t alu,
 	op->base.floats_per_rect = 6;
 
 	op->base.u.gen9.flags = FILL_FLAGS_NOBLEND;
+	op->base.u.gen9.wm_kernel = GEN9_WM_KERNEL_NOMASK;
 
 	kgem_set_mode(&sna->kgem, KGEM_RENDER, dst_bo);
 	if (!kgem_check_bo(&sna->kgem, dst_bo, NULL)) {
@@ -3637,6 +3662,7 @@  gen9_render_fill_one(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo,
 	tmp.need_magic_ca_pass = false;
 
 	tmp.u.gen9.flags = FILL_FLAGS_NOBLEND;
+	tmp.u.gen9.wm_kernel = GEN9_WM_KERNEL_NOMASK;
 
 	kgem_set_mode(&sna->kgem, KGEM_RENDER, bo);
 	if (!kgem_check_bo(&sna->kgem, bo, NULL)) {
@@ -3723,6 +3749,7 @@  gen9_render_clear(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo)
 	tmp.need_magic_ca_pass = false;
 
 	tmp.u.gen9.flags = FILL_FLAGS_NOBLEND;
+	tmp.u.gen9.wm_kernel = GEN9_WM_KERNEL_NOMASK;
 
 	kgem_set_mode(&sna->kgem, KGEM_RENDER, bo);
 	if (!kgem_check_bo(&sna->kgem, bo, NULL)) {
@@ -3847,6 +3874,8 @@  static void gen9_emit_video_state(struct sna *sna,
 			src_surf_format[0] = SURFACEFORMAT_B8G8R8X8_UNORM;
 		else if (frame->id == FOURCC_UYVY)
 			src_surf_format[0] = SURFACEFORMAT_YCRCB_SWAPY;
+		else if (is_ayuv_fourcc(frame->id))
+			src_surf_format[0] = SURFACEFORMAT_B8G8R8A8_UNORM;
 		else
 			src_surf_format[0] = SURFACEFORMAT_YCRCB_NORMAL;
 
@@ -3897,6 +3926,9 @@  static unsigned select_video_kernel(const struct sna_video *video,
 	case FOURCC_RGB565:
 		return GEN9_WM_KERNEL_VIDEO_RGB;
 
+	case FOURCC_AYUV:
+		return GEN9_WM_KERNEL_VIDEO_PACKED_AYUV_BT601;
+
 	default:
 		return video->colorspace ?
 			GEN9_WM_KERNEL_VIDEO_PACKED_BT709 :
@@ -3964,8 +3996,8 @@  gen9_render_video(struct sna *sna,
 		GEN9_SET_FLAGS(SAMPLER_OFFSET(filter, SAMPLER_EXTEND_PAD,
 					      SAMPLER_FILTER_NEAREST, SAMPLER_EXTEND_NONE),
 			       NO_BLEND,
-			       select_video_kernel(video, frame),
 			       2);
+	tmp.u.gen9.wm_kernel = select_video_kernel(video, frame);
 	tmp.priv = frame;
 
 	kgem_set_mode(&sna->kgem, KGEM_RENDER, tmp.dst.bo);
@@ -4074,6 +4106,7 @@  static void gen9_render_fini(struct sna *sna)
 	kgem_bo_destroy(&sna->kgem, sna->render_state.gen9.general_bo);
 }
 
+
 static bool gen9_render_setup(struct sna *sna)
 {
 	struct gen9_render_state *state = &sna->render_state.gen9;
@@ -4135,6 +4168,9 @@  static bool gen9_render_setup(struct sna *sna)
 		assert(state->wm_kernel[m][0]|state->wm_kernel[m][1]|state->wm_kernel[m][2]);
 	}
 
+	COMPILE_TIME_ASSERT(GEN9_WM_KERNEL_COUNT <=
+			    1 << (sizeof(((struct sna_composite_op *)NULL)->u.gen9.wm_kernel) * 8));
+
 	COMPILE_TIME_ASSERT(SAMPLER_OFFSET(FILTER_COUNT, EXTEND_COUNT, FILTER_COUNT, EXTEND_COUNT) <= 0x7ff);
 	ss = sna_static_stream_map(&general,
 				   2 * sizeof(*ss) *
diff --git a/src/sna/sna_render.h b/src/sna/sna_render.h
index 6669af9d..ef88d1f9 100644
--- a/src/sna/sna_render.h
+++ b/src/sna/sna_render.h
@@ -139,20 +139,25 @@  struct sna_composite_op {
 
 		struct {
 			uint32_t flags;
+			uint8_t wm_kernel;
 		} gen6;
 
 		struct {
 			uint32_t flags;
+			uint8_t wm_kernel;
 		} gen7;
 
 		struct {
 			uint32_t flags;
+			uint8_t wm_kernel;
 		} gen8;
 
 		struct {
 			uint32_t flags;
+			uint8_t wm_kernel;
 		} gen9;
 	} u;
+	unsigned long gen9_kernel;
 
 	void *priv;
 };
@@ -616,6 +621,9 @@  enum {
 	GEN9_WM_KERNEL_VIDEO_NV12_BT709,
 	GEN9_WM_KERNEL_VIDEO_PACKED_BT709,
 
+	GEN9_WM_KERNEL_VIDEO_PACKED_AYUV_BT601,
+	GEN9_WM_KERNEL_VIDEO_PACKED_AYUV_BT709,
+
 	GEN9_WM_KERNEL_VIDEO_RGB,
 	GEN9_WM_KERNEL_COUNT
 };
diff --git a/src/sna/sna_video.c b/src/sna/sna_video.c
index 55405f81..1504b304 100644
--- a/src/sna/sna_video.c
+++ b/src/sna/sna_video.c
@@ -281,6 +281,7 @@  sna_video_frame_set_rotation(struct sna_video *video,
 	} else {
 		switch (frame->id) {
 		case FOURCC_RGB888:
+		case FOURCC_AYUV:
 			if (rotation & (RR_Rotate_90 | RR_Rotate_270)) {
 				frame->pitch[0] = ALIGN((height << 2), align);
 				frame->size = (int)frame->pitch[0] * width;
@@ -584,6 +585,125 @@  sna_copy_packed_data(struct sna_video *video,
 	}
 }
 
+/* Reverse the byte order of one 32-bit pixel (byte 0 <-> 3, byte 1 <-> 2). */
+static inline uint32_t
+sna_ayuv_reverse_bytes(uint32_t dw)
+{
+	return ((dw & 0x000000ff) << 24) |
+	       ((dw & 0x0000ff00) << 8) |
+	       ((dw & 0x00ff0000) >> 8) |
+	       (dw >> 24);
+}
+
+/*
+ * Copy a packed 32bpp AYUV image from buf into dst, applying frame->rotation.
+ *
+ * video: textured video copies the whole frame and keeps the byte order
+ *        (byte reversing is left to the shader); otherwise only the
+ *        frame->image extents are copied and each word is byte-reversed.
+ * frame: supplies width/height, the image extents, pitch[0] and rotation.
+ * buf:   source pixels; the start offset assumes 4 * frame->width bytes/row.
+ * dst:   destination, written one 32-bit word at a time.
+ *
+ * Rotation values other than 0/90/180/270 copy nothing.
+ *
+ * NOTE(review): rows of buf are stepped with frame->pitch[0] although the
+ * start offset uses 4 * frame->width; confirm the two always agree.
+ * NOTE(review): the 90/270 paths build each word from bytes of four
+ * consecutive rows; kept as posted -- verify against the intended layout.
+ * NOTE(review): the uint32_t accesses presume suitably aligned buffers.
+ */
+static void
+sna_copy_packed_data_ayuv(struct sna_video *video,
+		     const struct sna_video_frame *frame,
+		     const uint8_t *buf,
+		     uint8_t *dst)
+{
+	const int pitch = frame->width << 2;
+	const uint8_t *src;
+	int x, y, w, h;
+	int i, j;
+
+	if (video->textured) {
+		/* XXX support copying cropped extents */
+		x = y = 0;
+		w = frame->width;
+		h = frame->height;
+	} else {
+		x = frame->image.x1;
+		y = frame->image.y1;
+		w = frame->image.x2 - frame->image.x1;
+		h = frame->image.y2 - frame->image.y1;
+	}
+
+	src = buf + (y * pitch) + (x << 2);
+	switch (frame->rotation) {
+	case RR_Rotate_0:
+		/* From here on w is the row length in bytes. */
+		w <<= 2;
+		for (i = 0; i < h; i++) {
+			for (j = 0; j < w; j += 4) {
+				uint32_t dw = *(const uint32_t *)&src[i * frame->pitch[0] + j];
+
+				if (!video->textured)
+					dw = sna_ayuv_reverse_bytes(dw);
+				*(uint32_t *)&dst[i * frame->pitch[0] + j] = dw;
+			}
+		}
+		break;
+	case RR_Rotate_90:
+		h <<= 2;
+		for (i = 0; i < h; i += 4) {
+			for (j = 0; j < w; j++) {
+				uint32_t dw = src[i * frame->pitch[0] + j];
+
+				dw |= (uint32_t)src[(i + 1) * frame->pitch[0] + j] << 8;
+				dw |= (uint32_t)src[(i + 2) * frame->pitch[0] + j] << 16;
+				dw |= (uint32_t)src[(i + 3) * frame->pitch[0] + j] << 24;
+				if (!video->textured)
+					dw = sna_ayuv_reverse_bytes(dw);
+				*(uint32_t *)&dst[(w - j - 1) * h + i] = dw;
+			}
+		}
+		break;
+	case RR_Rotate_180:
+		w <<= 2;
+		for (i = 0; i < h; i++) {
+			for (j = 0; j < w; j += 4) {
+				const uint8_t *px = &src[i * frame->pitch[0] + j];
+				uint32_t dw = px[3];
+
+				dw |= (uint32_t)px[2] << 8;
+				dw |= (uint32_t)px[1] << 16;
+				/*
+				 * Byte 0 of this pixel: without "+ j" every
+				 * pixel would reuse the row's first byte.
+				 */
+				dw |= (uint32_t)px[0] << 24;
+				if (!video->textured)
+					dw = sna_ayuv_reverse_bytes(dw);
+				*(uint32_t *)&dst[(h - i - 1) * w + (w - j - 4)] = dw;
+			}
+		}
+		break;
+	case RR_Rotate_270:
+		h <<= 2;
+		for (i = 0; i < h; i += 4) {
+			for (j = 0; j < w; j++) {
+				uint32_t dw = src[(i + 3) * frame->pitch[0] + j];
+
+				dw |= (uint32_t)src[(i + 2) * frame->pitch[0] + j] << 8;
+				dw |= (uint32_t)src[(i + 1) * frame->pitch[0] + j] << 16;
+				dw |= (uint32_t)src[i * frame->pitch[0] + j] << 24;
+				if (!video->textured)
+					dw = sna_ayuv_reverse_bytes(dw);
+				*(uint32_t *)&dst[j * h + (h - i - 4)] = dw;
+			}
+		}
+		break;
+	}
+}
+
 bool
 sna_video_copy_data(struct sna_video *video,
 		    struct sna_video_frame *frame,
@@ -709,6 +829,9 @@  use_gtt: /* copy data, must use GTT so that we keep the overlay uncached */
 		sna_copy_nv12_data(video, frame, buf, dst);
 	else if (is_planar_fourcc(frame->id))
 		sna_copy_planar_data(video, frame, buf, dst);
+	else if (is_ayuv_fourcc(frame->id))
+		/* Some hardcoding is done in default sna_copy_packed_data, so added a specific function */
+		sna_copy_packed_data_ayuv(video, frame, buf, dst);
 	else
 		sna_copy_packed_data(video, frame, buf, dst);
 
diff --git a/src/sna/sna_video.h b/src/sna/sna_video.h
index bbd3f0fd..d18c79e5 100644
--- a/src/sna/sna_video.h
+++ b/src/sna/sna_video.h
@@ -39,6 +39,7 @@  THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define FOURCC_RGB565 ((16 << 24) + ('B' << 16) + ('G' << 8) + 'R')
 #define FOURCC_RGB888 ((24 << 24) + ('B' << 16) + ('G' << 8) + 'R')
 #define FOURCC_NV12 (('2' << 24) + ('1' << 16) + ('V' << 8) + 'N')
+#define FOURCC_AYUV (('V' << 24) + ('U' << 16) + ('Y' << 8) + 'A')
 
 /*
  * Below, a dummy picture type that is used in XvPutImage
@@ -79,6 +80,15 @@  THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 	XvTopToBottom \
 }
 
+#define XVIMAGE_AYUV { \
+	FOURCC_AYUV, XvYUV, LSBFirst, \
+	{'P', 'A', 'S', 'S', 'T', 'H', 'R', 'O', 'U', 'G', 'H', ' ', 'A', 'Y', 'U', 'V'}, \
+	32, XvPacked, 1, 24, 0xff<<16, 0xff<<8, 0xff<<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
+	{'V', 'U', 'Y', 'X', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, \
+	XvTopToBottom \
+}
+
+
 struct sna_video {
 	struct sna *sna;
 
@@ -189,6 +199,16 @@  static inline int is_nv12_fourcc(int id)
 	}
 }
 
+/* Tell whether id names the packed AYUV format: 1 for FOURCC_AYUV, else 0. */
+static inline int is_ayuv_fourcc(int id)
+{
+	/*
+	 * Only one fourcc is recognised here, so a plain
+	 * equality test is all that is needed.
+	 */
+	return id == FOURCC_AYUV ? 1 : 0;
+}
+
 bool
 sna_video_clip_helper(struct sna_video *video,
 		      struct sna_video_frame *frame,
diff --git a/src/sna/sna_video_sprite.c b/src/sna/sna_video_sprite.c
index 8b7ae8ae..b6882195 100644
--- a/src/sna/sna_video_sprite.c
+++ b/src/sna/sna_video_sprite.c
@@ -47,7 +47,7 @@ 
 #define DRM_FORMAT_YUYV         fourcc_code('Y', 'U', 'Y', 'V') /* [31:0] Cr0:Y1:Cb0:Y0 8:8:8:8 little endian */
 #define DRM_FORMAT_UYVY         fourcc_code('U', 'Y', 'V', 'Y') /* [31:0] Y1:Cr0:Y0:Cb0 8:8:8:8 little endian */
 #define DRM_FORMAT_NV12         fourcc_code('N', 'V', '1', '2') /* 2x2 subsampled Cr:Cb plane */
-
+#define DRM_FORMAT_XYUV         fourcc_code('X', 'Y', 'U', 'V') /* [31:0] X:Y:Cb:Cr 8:8:8:8 little endian */
 #define has_hw_scaling(sna, video) ((sna)->kgem.gen < 071 || \
 				    (sna)->kgem.gen >= 0110)
 
@@ -74,11 +74,11 @@  static Atom xvColorKey, xvAlwaysOnTop, xvSyncToVblank, xvColorspace;
 
 static XvFormatRec formats[] = { {15}, {16}, {24} };
 static const XvImageRec images[] = { XVIMAGE_YUY2, XVIMAGE_UYVY,
-				     XVMC_RGB888 };
+				     XVMC_RGB888, XVIMAGE_AYUV };
 static const XvImageRec images_rgb565[] = { XVIMAGE_YUY2, XVIMAGE_UYVY,
-					    XVMC_RGB888, XVMC_RGB565 };
+					    XVMC_RGB888, XVMC_RGB565, XVIMAGE_AYUV };
 static const XvImageRec images_nv12[] = { XVIMAGE_YUY2, XVIMAGE_UYVY,
-					  XVIMAGE_NV12, XVMC_RGB888, XVMC_RGB565 };
+					  XVIMAGE_NV12, XVMC_RGB888, XVMC_RGB565, XVIMAGE_AYUV };
 static const XvAttributeRec attribs[] = {
 	{ XvSettable | XvGettable, 0, 1, (char *)"XV_COLORSPACE" }, /* BT.601, BT.709 */
 	{ XvSettable | XvGettable, 0, 0xffffff, (char *)"XV_COLORKEY" },
@@ -364,6 +364,10 @@  sna_video_sprite_show(struct sna *sna,
 		case FOURCC_UYVY:
 			f.pixel_format = DRM_FORMAT_UYVY;
 			break;
+		case FOURCC_AYUV:
+			/* i915 doesn't support alpha, so we use XYUV */
+			f.pixel_format = DRM_FORMAT_XYUV;
+			break;
 		case FOURCC_YUY2:
 		default:
 			f.pixel_format = DRM_FORMAT_YUYV;
@@ -705,7 +709,12 @@  static int sna_video_sprite_query(ddQueryImageAttributes_ARGS)
 		tmp *= (*h >> 1);
 		size += tmp;
 		break;
-
+	case FOURCC_AYUV:
+		tmp = *w << 2;
+		if (pitches)
+			pitches[0] = tmp;
+		size = *h * tmp;
+		break;
 	default:
 		*w = (*w + 1) & ~1;
 		*h = (*h + 1) & ~1;
diff --git a/src/sna/sna_video_textured.c b/src/sna/sna_video_textured.c
index a784fe2e..21c5f379 100644
--- a/src/sna/sna_video_textured.c
+++ b/src/sna/sna_video_textured.c
@@ -68,6 +68,8 @@  static const XvImageRec gen4_Images[] = {
 	XVIMAGE_I420,
 	XVIMAGE_NV12,
 	XVIMAGE_UYVY,
+	XVIMAGE_AYUV,
+	XVMC_RGB888,
 	XVMC_YUV,
 };
 
@@ -337,6 +339,12 @@  sna_video_textured_query(ddQueryImageAttributes_ARGS)
 			pitches[0] = size;
 		size *= *h;
 		break;
+	case FOURCC_AYUV:
+		size = *w << 2;
+		if (pitches)
+			pitches[0] = size;
+		size *= *h;
+		break;
 	case FOURCC_XVMC:
 		*h = (*h + 1) & ~1;
 		size = sizeof(uint32_t);