diff mbox series

drm/i915: Check ppgtt validity for GVT GEM context

Message ID 1539065300-6345-1-git-send-email-xiong.y.zhang@intel.com (mailing list archive)
State New, archived
Headers show
Series drm/i915: Check ppgtt validity for GVT GEM context | expand

Commit Message

Zhang, Xiong Y Oct. 9, 2018, 6:08 a.m. UTC
The guest couldn't boot up under GVT-g environment as the following call
trace exists:
[  272.504762] BUG: unable to handle kernel NULL pointer dereference at 0000000000000100
[  272.504834] Call Trace:
[  272.504852]  execlists_context_pin+0x2b2/0x520 [i915]
[  272.504869]  intel_gvt_scan_and_shadow_workload+0x50/0x4d0 [i915]
[  272.504887]  intel_vgpu_create_workload+0x3e2/0x570 [i915]
[  272.504901]  intel_vgpu_submit_execlist+0xc0/0x2a0 [i915]
[  272.504916]  elsp_mmio_write+0xc7/0x130 [i915]
[  272.504930]  intel_vgpu_mmio_reg_rw+0x24a/0x4c0 [i915]
[  272.504944]  intel_vgpu_emulate_mmio_write+0xac/0x240 [i915]
[  272.504947]  intel_vgpu_rw+0x22d/0x270 [kvmgt]
[  272.504949]  intel_vgpu_write+0x164/0x1f0 [kvmgt]

GVT GEM context is created by i915_gem_context_create_gvt() which doesn't
allocate ppgtt. So GVT GEM context structure doesn't have a valid
i915_hw_ppgtt.

GVT maintain a shadow ppgtt for each guest ppgtt, and attach this shadow
ppgtt to gpu when the corresponding guest ppgtt will be scheduled onto gpu.
The structure of shadow ppgtt is different from i915_hw_ppgtt, a lot of gvt
refactor work should be done in order to use i915_hw_ppgtt structure.
So let's fix the regression first.

Fixes:4a3d3f6785be("drm/i915: Match code to comment and enforce ppgtt for execlists")

Signed-off-by: Xiong Zhang <xiong.y.zhang@intel.com>
---
 drivers/gpu/drm/i915/intel_lrc.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

Comments

Zhenyu Wang Oct. 11, 2018, 2:45 a.m. UTC | #1
On 2018.10.09 14:08:20 +0800, Xiong Zhang wrote:
> The guest couldn't boot up under GVT-g environment as the following call
> trace exists:
> [  272.504762] BUG: unable to handle kernel NULL pointer dereference at 0000000000000100
> [  272.504834] Call Trace:
> [  272.504852]  execlists_context_pin+0x2b2/0x520 [i915]
> [  272.504869]  intel_gvt_scan_and_shadow_workload+0x50/0x4d0 [i915]
> [  272.504887]  intel_vgpu_create_workload+0x3e2/0x570 [i915]
> [  272.504901]  intel_vgpu_submit_execlist+0xc0/0x2a0 [i915]
> [  272.504916]  elsp_mmio_write+0xc7/0x130 [i915]
> [  272.504930]  intel_vgpu_mmio_reg_rw+0x24a/0x4c0 [i915]
> [  272.504944]  intel_vgpu_emulate_mmio_write+0xac/0x240 [i915]
> [  272.504947]  intel_vgpu_rw+0x22d/0x270 [kvmgt]
> [  272.504949]  intel_vgpu_write+0x164/0x1f0 [kvmgt]
> 
> GVT GEM context is created by i915_gem_context_create_gvt() which doesn't
> allocate ppgtt. So GVT GEM context structure doesn't have a valid
> i915_hw_ppgtt.
> 
> GVT maintain a shadow ppgtt for each guest ppgtt, and attach this shadow
> ppgtt to gpu when the corresponding guest ppgtt will be scheduled onto gpu.
> The structure of shadow ppgtt is different from i915_hw_ppgtt, a lot of gvt
> refactor work should be done in order to use i915_hw_ppgtt structure.
> So let's fix the regression first.
> 
> Fixes:4a3d3f6785be("drm/i915: Match code to comment and enforce ppgtt for execlists")
> 
> Signed-off-by: Xiong Zhang <xiong.y.zhang@intel.com>
> ---

Chris, any comment? Without this, current gvt breaks on drm tip with kernel oops.
Could we fix the regression first then check gvt ctx stub ppgtt handling later?
Which would require more change on gvt side, not sure if need any more helper from
i915 ppgtt code with a clean fix..

Thanks

>  drivers/gpu/drm/i915/intel_lrc.c | 5 ++---
>  1 file changed, 2 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index ff0e2b3..d604d8a 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -385,7 +385,7 @@ static u64 execlists_update_context(struct i915_request *rq)
>  	 * PML4 is allocated during ppgtt init, so this is not needed
>  	 * in 48-bit mode.
>  	 */
> -	if (!i915_vm_is_48bit(&ppgtt->vm))
> +	if (ppgtt && !i915_vm_is_48bit(&ppgtt->vm))
>  		execlists_update_context_pdps(ppgtt, reg_state);
>  
>  	return ce->lrc_desc;
> @@ -1210,7 +1210,6 @@ execlists_context_pin(struct intel_engine_cs *engine,
>  	struct intel_context *ce = to_intel_context(ctx, engine);
>  
>  	lockdep_assert_held(&ctx->i915->drm.struct_mutex);
> -	GEM_BUG_ON(!ctx->ppgtt);
>  
>  	if (likely(ce->pin_count++))
>  		return ce;
> @@ -2538,7 +2537,7 @@ static void execlists_init_reg_state(u32 *regs,
>  	CTX_REG(regs, CTX_PDP0_UDW, GEN8_RING_PDP_UDW(engine, 0), 0);
>  	CTX_REG(regs, CTX_PDP0_LDW, GEN8_RING_PDP_LDW(engine, 0), 0);
>  
> -	if (i915_vm_is_48bit(&ctx->ppgtt->vm)) {
> +	if (ctx->ppgtt && i915_vm_is_48bit(&ctx->ppgtt->vm)) {
>  		/* 64b PPGTT (48bit canonical)
>  		 * PDP0_DESCRIPTOR contains the base address to PML4 and
>  		 * other PDP Descriptors are ignored.
> -- 
> 2.7.4
>
Chris Wilson Oct. 11, 2018, 6:23 a.m. UTC | #2
Quoting Zhenyu Wang (2018-10-11 03:45:07)
> On 2018.10.09 14:08:20 +0800, Xiong Zhang wrote:
> > The guest couldn't boot up under GVT-g environment as the following call
> > trace exists:
> > [  272.504762] BUG: unable to handle kernel NULL pointer dereference at 0000000000000100
> > [  272.504834] Call Trace:
> > [  272.504852]  execlists_context_pin+0x2b2/0x520 [i915]
> > [  272.504869]  intel_gvt_scan_and_shadow_workload+0x50/0x4d0 [i915]
> > [  272.504887]  intel_vgpu_create_workload+0x3e2/0x570 [i915]
> > [  272.504901]  intel_vgpu_submit_execlist+0xc0/0x2a0 [i915]
> > [  272.504916]  elsp_mmio_write+0xc7/0x130 [i915]
> > [  272.504930]  intel_vgpu_mmio_reg_rw+0x24a/0x4c0 [i915]
> > [  272.504944]  intel_vgpu_emulate_mmio_write+0xac/0x240 [i915]
> > [  272.504947]  intel_vgpu_rw+0x22d/0x270 [kvmgt]
> > [  272.504949]  intel_vgpu_write+0x164/0x1f0 [kvmgt]
> > 
> > GVT GEM context is created by i915_gem_context_create_gvt() which doesn't
> > allocate ppgtt. So GVT GEM context structure doesn't have a valid
> > i915_hw_ppgtt.
> > 
> > GVT maintain a shadow ppgtt for each guest ppgtt, and attach this shadow
> > ppgtt to gpu when the corresponding guest ppgtt will be scheduled onto gpu.
> > The structure of shadow ppgtt is different from i915_hw_ppgtt, a lot of gvt
> > refactor work should be done in order to use i915_hw_ppgtt structure.
> > So let's fix the regression first.
> > 
> > Fixes:4a3d3f6785be("drm/i915: Match code to comment and enforce ppgtt for execlists")
> > 
> > Signed-off-by: Xiong Zhang <xiong.y.zhang@intel.com>
> > ---
> 
> Chris, any comment? Without this, current gvt breaks on drm tip with kernel oops.
> Could we fix the regression first then check gvt ctx stub ppgtt handling later?
> Which would require more change on gvt side, not sure if need any more helper from
> i915 ppgtt code with a clean fix..

We both thought this wasn't the approach we wanted to take. My position
is that this leaves the registers ostensibly programmed to garbage.
-Chris
Zhenyu Wang Oct. 11, 2018, 9:01 a.m. UTC | #3
On 2018.10.11 07:23:11 +0100, Chris Wilson wrote:
> Quoting Zhenyu Wang (2018-10-11 03:45:07)
> > On 2018.10.09 14:08:20 +0800, Xiong Zhang wrote:
> > > The guest couldn't boot up under GVT-g environment as the following call
> > > trace exists:
> > > [  272.504762] BUG: unable to handle kernel NULL pointer dereference at 0000000000000100
> > > [  272.504834] Call Trace:
> > > [  272.504852]  execlists_context_pin+0x2b2/0x520 [i915]
> > > [  272.504869]  intel_gvt_scan_and_shadow_workload+0x50/0x4d0 [i915]
> > > [  272.504887]  intel_vgpu_create_workload+0x3e2/0x570 [i915]
> > > [  272.504901]  intel_vgpu_submit_execlist+0xc0/0x2a0 [i915]
> > > [  272.504916]  elsp_mmio_write+0xc7/0x130 [i915]
> > > [  272.504930]  intel_vgpu_mmio_reg_rw+0x24a/0x4c0 [i915]
> > > [  272.504944]  intel_vgpu_emulate_mmio_write+0xac/0x240 [i915]
> > > [  272.504947]  intel_vgpu_rw+0x22d/0x270 [kvmgt]
> > > [  272.504949]  intel_vgpu_write+0x164/0x1f0 [kvmgt]
> > > 
> > > GVT GEM context is created by i915_gem_context_create_gvt() which doesn't
> > > allocate ppgtt. So GVT GEM context structure doesn't have a valid
> > > i915_hw_ppgtt.
> > > 
> > > GVT maintain a shadow ppgtt for each guest ppgtt, and attach this shadow
> > > ppgtt to gpu when the corresponding guest ppgtt will be scheduled onto gpu.
> > > The structure of shadow ppgtt is different from i915_hw_ppgtt, a lot of gvt
> > > refactor work should be done in order to use i915_hw_ppgtt structure.
> > > So let's fix the regression first.
> > > 
> > > Fixes:4a3d3f6785be("drm/i915: Match code to comment and enforce ppgtt for execlists")
> > > 
> > > Signed-off-by: Xiong Zhang <xiong.y.zhang@intel.com>
> > > ---
> > 
> > Chris, any comment? Without this, current gvt breaks on drm tip with kernel oops.
> > Could we fix the regression first then check gvt ctx stub ppgtt handling later?
> > Which would require more change on gvt side, not sure if need any more helper from
> > i915 ppgtt code with a clean fix..
> 
> We both thought this wasn't the approach we wanted to take. My position
> is that this leaves the registers ostensibly programmed to garbage.

yeah, I agree that's not ideal, my point is that would bring more changes e.g
gvt ctx create function need to be changed accordingly, etc. and I'm
disappointed why CI didn't catch this in first place, as Terrence has
contributed GVT guest boot test in igt, we need to add that for list patch
checking!

Current gvt nightly test is using this one to kick, we'll see if a quick
mitigation can be made.
Chris Wilson Oct. 11, 2018, 9:13 a.m. UTC | #4
Quoting Zhenyu Wang (2018-10-11 10:01:26)
> On 2018.10.11 07:23:11 +0100, Chris Wilson wrote:
> > Quoting Zhenyu Wang (2018-10-11 03:45:07)
> > > On 2018.10.09 14:08:20 +0800, Xiong Zhang wrote:
> > > > The guest couldn't boot up under GVT-g environment as the following call
> > > > trace exists:
> > > > [  272.504762] BUG: unable to handle kernel NULL pointer dereference at 0000000000000100
> > > > [  272.504834] Call Trace:
> > > > [  272.504852]  execlists_context_pin+0x2b2/0x520 [i915]
> > > > [  272.504869]  intel_gvt_scan_and_shadow_workload+0x50/0x4d0 [i915]
> > > > [  272.504887]  intel_vgpu_create_workload+0x3e2/0x570 [i915]
> > > > [  272.504901]  intel_vgpu_submit_execlist+0xc0/0x2a0 [i915]
> > > > [  272.504916]  elsp_mmio_write+0xc7/0x130 [i915]
> > > > [  272.504930]  intel_vgpu_mmio_reg_rw+0x24a/0x4c0 [i915]
> > > > [  272.504944]  intel_vgpu_emulate_mmio_write+0xac/0x240 [i915]
> > > > [  272.504947]  intel_vgpu_rw+0x22d/0x270 [kvmgt]
> > > > [  272.504949]  intel_vgpu_write+0x164/0x1f0 [kvmgt]
> > > > 
> > > > GVT GEM context is created by i915_gem_context_create_gvt() which doesn't
> > > > allocate ppgtt. So GVT GEM context structure doesn't have a valid
> > > > i915_hw_ppgtt.
> > > > 
> > > > GVT maintain a shadow ppgtt for each guest ppgtt, and attach this shadow
> > > > ppgtt to gpu when the corresponding guest ppgtt will be scheduled onto gpu.
> > > > The structure of shadow ppgtt is different from i915_hw_ppgtt, a lot of gvt
> > > > refactor work should be done in order to use i915_hw_ppgtt structure.
> > > > So let's fix the regression first.
> > > > 
> > > > Fixes:4a3d3f6785be("drm/i915: Match code to comment and enforce ppgtt for execlists")
> > > > 
> > > > Signed-off-by: Xiong Zhang <xiong.y.zhang@intel.com>
> > > > ---
> > > 
> > > Chris, any comment? Without this, current gvt breaks on drm tip with kernel oops.
> > > Could we fix the regression first then check gvt ctx stub ppgtt handling later?
> > > Which would require more change on gvt side, not sure if need any more helper from
> > > i915 ppgtt code with a clean fix..
> > 
> > We both thought this wasn't the approach we wanted to take. My position
> > is that this leaves the registers ostensibly programmed to garbage.
> 
> yeah, I agree that's not ideal, my point is that would bring more changes e.g
> gvt ctx create function need to be changed accordingly, etc. and I'm
> disappointed why CI didn't catch this in first place, as Terrence has
> contributed GVT guest boot test in igt, we need to add that for list patch
> checking!
> 
> Current gvt nightly test is using this one to kick, we'll see if a quick
> mitigation can be made.

Easiest is just to allocate the ctx->ppgtt. We waste a few pages, once.
Later on we can fine tune the integration.
-Chris
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index ff0e2b3..d604d8a 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -385,7 +385,7 @@  static u64 execlists_update_context(struct i915_request *rq)
 	 * PML4 is allocated during ppgtt init, so this is not needed
 	 * in 48-bit mode.
 	 */
-	if (!i915_vm_is_48bit(&ppgtt->vm))
+	if (ppgtt && !i915_vm_is_48bit(&ppgtt->vm))
 		execlists_update_context_pdps(ppgtt, reg_state);
 
 	return ce->lrc_desc;
@@ -1210,7 +1210,6 @@  execlists_context_pin(struct intel_engine_cs *engine,
 	struct intel_context *ce = to_intel_context(ctx, engine);
 
 	lockdep_assert_held(&ctx->i915->drm.struct_mutex);
-	GEM_BUG_ON(!ctx->ppgtt);
 
 	if (likely(ce->pin_count++))
 		return ce;
@@ -2538,7 +2537,7 @@  static void execlists_init_reg_state(u32 *regs,
 	CTX_REG(regs, CTX_PDP0_UDW, GEN8_RING_PDP_UDW(engine, 0), 0);
 	CTX_REG(regs, CTX_PDP0_LDW, GEN8_RING_PDP_LDW(engine, 0), 0);
 
-	if (i915_vm_is_48bit(&ctx->ppgtt->vm)) {
+	if (ctx->ppgtt && i915_vm_is_48bit(&ctx->ppgtt->vm)) {
 		/* 64b PPGTT (48bit canonical)
 		 * PDP0_DESCRIPTOR contains the base address to PML4 and
 		 * other PDP Descriptors are ignored.