Message ID | 1252853462-9236-6-git-send-email-bgamari.foss@gmail.com (mailing list archive) |
---|---|
State | Superseded |
Headers | show |
A couple of things just caught my eye while looking through the patch, perhaps to consider tweaking? On Sun, Sep 13, 2009 at 3:51 PM, Ben Gamari <bgamari.foss@gmail.com> wrote: > This patch puts in place the machinery to attempt to reset the GPU. This > will be used when attempting to recover from a GPU hang. > > Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org> > Signed-off-by: Ben Gamari <bgamari.foss@gmail.com> > --- >  drivers/gpu/drm/i915/i915_dma.c |   8 ++ >  drivers/gpu/drm/i915/i915_drv.c |  141 +++++++++++++++++++++++++++++++++++++++ >  drivers/gpu/drm/i915/i915_drv.h |   1 + >  drivers/gpu/drm/i915/i915_irq.c |   8 ++ >  drivers/gpu/drm/i915/i915_reg.h |   4 + >  5 files changed, 162 insertions(+), 0 deletions(-) > > diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c > index aabe41b..25f8e75 100644 > --- a/drivers/gpu/drm/i915/i915_dma.c > +++ b/drivers/gpu/drm/i915/i915_dma.c > @@ -1173,6 +1173,9 @@ static int i915_load_modeset_init(struct drm_device *dev, >     drm_mm_init(&dev_priv->vram, 0, prealloc_size); >     DRM_INFO("set up %ldM of stolen space\n", prealloc_size / (1024*1024)); > > +    /* We're off and running w/KMS */ > +    dev_priv->mm.suspended = 0; > + >     /* Let GEM Manage from end of prealloc space to end of aperture. >     * >     * However, leave one page at the end still bound to the scratch page. > @@ -1184,7 +1187,9 @@ static int i915_load_modeset_init(struct drm_device *dev, >     */ >     i915_gem_do_init(dev, prealloc_size, agp_size - 4096); > > +    mutex_lock(&dev->struct_mutex); >     ret = i915_gem_init_ringbuffer(dev); > +    mutex_unlock(&dev->struct_mutex); >     if (ret) >         goto out; > > @@ -1433,6 +1438,9 @@ int i915_driver_load(struct drm_device *dev, unsigned long flags) >         return ret; >     } > > +    /* Start out suspended */ > +    dev_priv->mm.suspended = 1; > + >     if (drm_core_check_feature(dev, DRIVER_MODESET)) { >         ret = i915_load_modeset_init(dev, prealloc_start, >                       prealloc_size, agp_size); > diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c > index dbe568c..0c03a2a 100644 > --- a/drivers/gpu/drm/i915/i915_drv.c > +++ b/drivers/gpu/drm/i915/i915_drv.c > @@ -127,6 +127,147 @@ static int i915_resume(struct drm_device *dev) >     return ret; >  } > > +/** > + * i965_reset - reset chip after a hang > + * @dev: drm device to reset > + * @flags: reset domains > + * > + * Reset the chip.  Useful if a hang is detected. > + * > + * Procedure is fairly simple: > + *  - reset the chip using the reset reg > + *  - re-init context state > + *  - re-init hardware status page > + *  - re-init ring buffer > + *  - re-init interrupt state > + *  - re-init display > + */ > +void i965_reset(struct drm_device *dev, u8 flags) > +{ > +    drm_i915_private_t *dev_priv = dev->dev_private; > +    unsigned long timeout; > +    u8 gdrst; > +    bool need_display = true; //!(flags & (GDRST_RENDER | GDRST_MEDIA)); As the kernel's coding style isn't C99, is it worth using /* foo */ here to be compliant? > + > +#if defined(CONFIG_SMP) > +    timeout = jiffies + msecs_to_jiffies(500); > +    do { > +        udelay(100); > +    } while (mutex_is_locked(&dev->struct_mutex) && time_after(timeout, jiffies)); > + > +    if (mutex_is_locked(&dev->struct_mutex)) { > +#if 1 > +        DRM_ERROR("i915 struct_mutex lock is still held by %s. Giving on up reset.\n", dev->struct_mutex.owner->task->comm); > +        return; > +#else > +        struct task_struct *task = dev->struct_mutex.owner->task; > +        DRM_ERROR("Killing process %d (%s) for holding i915 device mutex\n", > +                task->pid, task->comm); > +        force_sig(SIGILL, task); Should be SIGKILL here? > +#endif > +    } > +#else > +    BUG_ON(mutex_is_locked(&dev->struct_mutex)); > +#endif > + > +    debug_show_all_locks(); > +    mutex_lock(&dev->struct_mutex); > + > +    /* > +     * Clear request list > +     */ > +    i915_gem_retire_requests(dev); > + > +    if (need_display) > +        i915_save_display(dev); > + > +    if (IS_I965G(dev) || IS_G4X(dev)) { > +        /* > +         * Set the domains we want to reset, then the reset bit (bit 0). > +         * Clear the reset bit after a while and wait for hardware status > +         * bit (bit 1) to be set > +         */ > +        pci_read_config_byte(dev->pdev, GDRST, &gdrst); > +        //TODO: Set domains Consider using kernel coding style perhaps? > +        pci_write_config_byte(dev->pdev, GDRST, gdrst | flags | ((flags == GDRST_FULL) ? 0x1 : 0x0)); > +        udelay(50); > +        pci_write_config_byte(dev->pdev, GDRST, gdrst & 0xfe); > + > +        /* ...we don't want to loop forever though, 500ms should be plenty */ > +        timeout = jiffies + msecs_to_jiffies(500); > +        do { > +            udelay(100); > +            pci_read_config_byte(dev->pdev, GDRST, &gdrst); > +        } while ((gdrst & 0x1) && time_after(timeout, jiffies)); > + > +        if (gdrst & 0x1) { > +            DRM_ERROR("Failed to reset chip. You're screwed."); Perhaps drop the second sentence, to keep things looking neat. > +            mutex_unlock(&dev->struct_mutex); > +            return; > +        } > +    } else { > +        DRM_ERROR("Error occurred. Don't know how to reset this chip.\n"); > +        return; > +    } > + > +    /* Ok, now get things going again... */ > + > +    /* > +     * Everything depends on having the GTT running, so we need to start > +     * there.  Fortunately we don't need to do this unless we reset the > +     * chip at a PCI level. > +     * > +     * Next we need to restore the context, but we don't use those > +     * yet either... > +     * > +     * Ring buffer needs to be re-initialized in the KMS case, or if X > +     * was running at the time of the reset (i.e. we weren't VT > +     * switched away). > +     */ > +    if (drm_core_check_feature(dev, DRIVER_MODESET) || > +      !dev_priv->mm.suspended) { > +        drm_i915_ring_buffer_t *ring = &dev_priv->ring; > +        struct drm_gem_object *obj = ring->ring_obj; > +        struct drm_i915_gem_object *obj_priv = obj->driver_private; > +        dev_priv->mm.suspended = 0; > + > +        /* Stop the ring if it's running. */ > +        I915_WRITE(PRB0_CTL, 0); > +        I915_WRITE(PRB0_TAIL, 0); > +        I915_WRITE(PRB0_HEAD, 0); > + > +        /* Initialize the ring. */ > +        I915_WRITE(PRB0_START, obj_priv->gtt_offset); > +        I915_WRITE(PRB0_CTL, > +              ((obj->size - 4096) & RING_NR_PAGES) | > +              RING_NO_REPORT | > +              RING_VALID); > +        if (!drm_core_check_feature(dev, DRIVER_MODESET)) > +            i915_kernel_lost_context(dev); > +        else { > +            ring->head = I915_READ(PRB0_HEAD) & HEAD_ADDR; > +            ring->tail = I915_READ(PRB0_TAIL) & TAIL_ADDR; > +            ring->space = ring->head - (ring->tail + 8); > +            if (ring->space < 0) > +                ring->space += ring->Size; > +        } > + > +        mutex_unlock(&dev->struct_mutex); > +        drm_irq_uninstall(dev); > +        drm_irq_install(dev); > +        mutex_lock(&dev->struct_mutex); > +    } > + > +    /* > +     * Display needs restore too... > +     */ > +    if (need_display) > +        i915_restore_display(dev); > + > +    mutex_unlock(&dev->struct_mutex); > +} > + > + >  static int __devinit >  i915_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent) >  { > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h > index afbcaa9..8797777 100644 > --- a/drivers/gpu/drm/i915/i915_drv.h > +++ b/drivers/gpu/drm/i915/i915_drv.h > @@ -624,6 +624,7 @@ extern long i915_compat_ioctl(struct file *filp, unsigned int cmd, >  extern int i915_emit_box(struct drm_device *dev, >             struct drm_clip_rect *boxes, >             int i, int DR1, int DR4); > +extern void i965_reset(struct drm_device *dev, u8 flags); > >  /* i915_irq.c */ >  void i915_hangcheck_elapsed(unsigned long data); > diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c > index 126696a..dbfcf0a 100644 > --- a/drivers/gpu/drm/i915/i915_irq.c > +++ b/drivers/gpu/drm/i915/i915_irq.c > @@ -482,6 +482,14 @@ static void i915_handle_error(struct drm_device *dev) >         I915_WRITE(IIR, I915_RENDER_COMMAND_PARSER_ERROR_INTERRUPT); >     } > > +    if (dev_priv->mm.wedged) { > +        /* > +         * Wakeup waiting processes so they don't hang > +         */ > +        printk("i915: Waking up sleeping processes\n"); > +        DRM_WAKEUP(&dev_priv->irq_queue); > +    } > + >     queue_work(dev_priv->wq, &dev_priv->error_work); >  } > > diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h > index f7c2de8..99981a0 100644 > --- a/drivers/gpu/drm/i915/i915_reg.h > +++ b/drivers/gpu/drm/i915/i915_reg.h > @@ -85,6 +85,10 @@ >  #define  I915_GC_RENDER_CLOCK_200_MHZ (1 << 0) >  #define  I915_GC_RENDER_CLOCK_333_MHZ (4 << 0) >  #define LBB   0xf4 > +#define GDRST 0xc0 > +#define  GDRST_FULL   (0<<2) > +#define  GDRST_RENDER  (1<<2) > +#define  GDRST_MEDIA  (3<<2) > >  /* VGA stuff */ >
On Sun, 13 Sep 2009 18:27:48 +0100 Daniel J Blueman <daniel.blueman@gmail.com> wrote: > A couple of things just caught my eye while looking through the patch, > perhaps to consider tweaking? Thanks for looking. > > +void i965_reset(struct drm_device *dev, u8 flags) > > +{ > > + Â Â Â drm_i915_private_t *dev_priv = dev->dev_private; > > + Â Â Â unsigned long timeout; > > + Â Â Â u8 gdrst; > > + Â Â Â bool need_display = true; //!(flags & (GDRST_RENDER | > > GDRST_MEDIA)); > > As the kernel's coding style isn't C99, is it worth using /* foo */ > here to be compliant? Or just drop it and add a comment about the other two. We'd need to experiment some more before trying to use them. > > +#if 1 > > + Â Â Â Â Â Â Â DRM_ERROR("i915 struct_mutex lock is still held by > > %s. Giving on up reset.\n", dev->struct_mutex.owner->task->comm); > > + Â Â Â Â Â Â Â return; > > +#else > > + Â Â Â Â Â Â Â struct task_struct *task = > > dev->struct_mutex.owner->task; > > + Â Â Â Â Â Â Â DRM_ERROR("Killing process %d (%s) for holding i915 > > device mutex\n", > > + Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â task->pid, task->comm); > > + Â Â Â Â Â Â Â force_sig(SIGILL, task); > > Should be SIGKILL here? We should just decide whether we want the warning here or the kill. Ben, I assume you tested the warning mostly? Killing a waiter will probably just kill the unlucky task who happened to be waiting for the hung batch; might be friendlier to let it continue, but it also makes the reset less reliable. > > + Â Â Â Â Â Â Â pci_read_config_byte(dev->pdev, GDRST, &gdrst); > > + Â Â Â Â Â Â Â //TODO: Set domains > > Consider using kernel coding style perhaps? Yeah, or just drop since domains are the render, media etc mentioned above. > > > + Â Â Â Â Â Â Â pci_write_config_byte(dev->pdev, GDRST, gdrst | > > flags | ((flags == GDRST_FULL) ? 0x1 : 0x0)); > > + Â Â Â Â Â Â Â udelay(50); > > + Â Â Â Â Â Â Â pci_write_config_byte(dev->pdev, GDRST, gdrst & > > 0xfe); + > > + Â Â Â Â Â Â Â /* ...we don't want to loop forever though, 500ms > > should be plenty */ > > + Â Â Â Â Â Â Â timeout = jiffies + msecs_to_jiffies(500); > > + Â Â Â Â Â Â Â do { > > + Â Â Â Â Â Â Â Â Â Â Â udelay(100); > > + Â Â Â Â Â Â Â Â Â Â Â pci_read_config_byte(dev->pdev, GDRST, > > &gdrst); > > + Â Â Â Â Â Â Â } while ((gdrst & 0x1) && time_after(timeout, > > jiffies)); + > > + Â Â Â Â Â Â Â if (gdrst & 0x1) { > > + Â Â Â Â Â Â Â Â Â Â Â DRM_ERROR("Failed to reset chip. You're > > screwed."); > > Perhaps drop the second sentence, to keep things looking neat. We might want to panic in that case?
On Sun, Sep 13, 2009 at 11:21:35AM -0700, Jesse Barnes wrote: > On Sun, 13 Sep 2009 18:27:48 +0100 > Daniel J Blueman <daniel.blueman@gmail.com> wrote: > > > A couple of things just caught my eye while looking through the patch, > > perhaps to consider tweaking? > > Thanks for looking. > > > > +void i965_reset(struct drm_device *dev, u8 flags) > > > +{ > > > + Â Â Â drm_i915_private_t *dev_priv = dev->dev_private; > > > + Â Â Â unsigned long timeout; > > > + Â Â Â u8 gdrst; > > > + Â Â Â bool need_display = true; //!(flags & (GDRST_RENDER | > > > GDRST_MEDIA)); > > > > As the kernel's coding style isn't C99, is it worth using /* foo */ > > here to be compliant? > > Or just drop it and add a comment about the other two. We'd need to > experiment some more before trying to use them. Yeah, I think this is the right thing to do. > > > > +#if 1 > > > + Â Â Â Â Â Â Â DRM_ERROR("i915 struct_mutex lock is still held by > > > %s. Giving on up reset.\n", dev->struct_mutex.owner->task->comm); > > > + Â Â Â Â Â Â Â return; > > > +#else > > > + Â Â Â Â Â Â Â struct task_struct *task = > > > dev->struct_mutex.owner->task; > > > + Â Â Â Â Â Â Â DRM_ERROR("Killing process %d (%s) for holding i915 > > > device mutex\n", > > > + Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â task->pid, task->comm); > > > + Â Â Â Â Â Â Â force_sig(SIGILL, task); > > > > Should be SIGKILL here? > > We should just decide whether we want the warning here or the kill. > Ben, I assume you tested the warning mostly? Killing a waiter will > probably just kill the unlucky task who happened to be waiting for the > hung batch; might be friendlier to let it continue, but it also makes > the reset less reliable. We do need to decide what to do with the task that gave us the bad batch. This particular code, however, doesn't handle that case. By the time we get here we have already woken up the waiters with the wedged flag set so in theory no one should be holding the lock. In the event that someone is, we really have three options, 1) Try waking it up again and hope it doesn't take it this time 2) Wait for it to release it (probably will never happen) 3) Kill the process I opted to kill the process. > > > > + Â Â Â Â Â Â Â pci_read_config_byte(dev->pdev, GDRST, &gdrst); > > > + Â Â Â Â Â Â Â //TODO: Set domains > > > > Consider using kernel coding style perhaps? > > Yeah, or just drop since domains are the render, media etc mentioned > above. Yep, this definitely slipped through. > > > > > > + Â Â Â Â Â Â Â pci_write_config_byte(dev->pdev, GDRST, gdrst | > > > flags | ((flags == GDRST_FULL) ? 0x1 : 0x0)); > > > + Â Â Â Â Â Â Â udelay(50); > > > + Â Â Â Â Â Â Â pci_write_config_byte(dev->pdev, GDRST, gdrst & > > > 0xfe); + > > > + Â Â Â Â Â Â Â /* ...we don't want to loop forever though, 500ms > > > should be plenty */ > > > + Â Â Â Â Â Â Â timeout = jiffies + msecs_to_jiffies(500); > > > + Â Â Â Â Â Â Â do { > > > + Â Â Â Â Â Â Â Â Â Â Â udelay(100); > > > + Â Â Â Â Â Â Â Â Â Â Â pci_read_config_byte(dev->pdev, GDRST, > > > &gdrst); > > > + Â Â Â Â Â Â Â } while ((gdrst & 0x1) && time_after(timeout, > > > jiffies)); + > > > + Â Â Â Â Â Â Â if (gdrst & 0x1) { > > > + Â Â Â Â Â Â Â Â Â Â Â DRM_ERROR("Failed to reset chip. You're > > > screwed."); > > > > Perhaps drop the second sentence, to keep things looking neat. > > We might want to panic in that case? Meh, this seems a little much. Perhaps just WARN? Thanks for looking this over. - Ben
On Sun, Sep 13, 2009 at 03:24:03PM -0400, Ben Gamari wrote: > On Sun, Sep 13, 2009 at 11:21:35AM -0700, Jesse Barnes wrote: > > On Sun, 13 Sep 2009 18:27:48 +0100 > > Daniel J Blueman <daniel.blueman@gmail.com> wrote: > > > > > A couple of things just caught my eye while looking through the patch, > > > perhaps to consider tweaking? > > > > Thanks for looking. > > > > > > +void i965_reset(struct drm_device *dev, u8 flags) > > > > +{ > > > > + ? ? ? drm_i915_private_t *dev_priv = dev->dev_private; > > > > + ? ? ? unsigned long timeout; > > > > + ? ? ? u8 gdrst; > > > > + ? ? ? bool need_display = true; //!(flags & (GDRST_RENDER | > > > > GDRST_MEDIA)); > > > > > > As the kernel's coding style isn't C99, is it worth using /* foo */ > > > here to be compliant? > > > > Or just drop it and add a comment about the other two. We'd need to > > experiment some more before trying to use them. > > Yeah, I think this is the right thing to do. > > > > > > > +#if 1 > > > > + ? ? ? ? ? ? ? DRM_ERROR("i915 struct_mutex lock is still held by > > > > %s. Giving on up reset.\n", dev->struct_mutex.owner->task->comm); > > > > + ? ? ? ? ? ? ? return; > > > > +#else > > > > + ? ? ? ? ? ? ? struct task_struct *task = > > > > dev->struct_mutex.owner->task; > > > > + ? ? ? ? ? ? ? DRM_ERROR("Killing process %d (%s) for holding i915 > > > > device mutex\n", > > > > + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? task->pid, task->comm); > > > > + ? ? ? ? ? ? ? force_sig(SIGILL, task); > > > > > > Should be SIGKILL here? > > > > We should just decide whether we want the warning here or the kill. > > Ben, I assume you tested the warning mostly? Killing a waiter will > > probably just kill the unlucky task who happened to be waiting for the > > hung batch; might be friendlier to let it continue, but it also makes > > the reset less reliable. > > We do need to decide what to do with the task that gave us the bad > batch. This particular code, however, doesn't handle that case. By the > time we get here we have already woken up the waiters with the wedged > flag set so in theory no one should be holding the lock. In the event > that someone is, we really have three options, > > 1) Try waking it up again and hope it doesn't take it this time > 2) Wait for it to release it (probably will never happen) > 3) Kill the process > > I opted to kill the process. my version of the code doesn't actually do this and it managed fine. where we'll be locked (and blocking) is essentially whenever we hit i915_wait_request. now, that fucntion in it's sleeping loop does check whether wedged is set, if so it EIO's out of there. So we set wedged (we MUST do that first), then we wakeup all sleepers. Now, anyone sleeping (and since they always sleep with the lock) will check the condition and return EIO, where they're release the lock and dump that return back to userland. Anything else means that something else is buggy. such contention problems on openbsd i've not seen. The only other option I can see is in wait_ring, where we should really also check for wedged and if so cede the lock and try again, but that's a different problem, really and has been a problem for $LONG_TIME -0-
On Sun, Sep 13, 2009 at 09:40:11PM +0100, Owain Ainsworth wrote: > > my version of the code doesn't actually do this and it managed fine. > > where we'll be locked (and blocking) is essentially whenever we hit > i915_wait_request. now, that fucntion in it's sleeping loop does check > whether wedged is set, if so it EIO's out of there. So we set wedged (we > MUST do that first), then we wakeup all sleepers. Now, anyone sleeping > (and since they always sleep with the lock) will check the condition and > return EIO, where they're release the lock and dump that return back to > userland. Anything else means that something else is buggy. > > such contention problems on openbsd i've not seen. The only other option > I can see is in wait_ring, where we should really also check for wedged > and if so cede the lock and try again, but that's a different problem, > really and has been a problem for $LONG_TIME Certainly. This code should be redundant in almost all cases. While debugging the patch I saw contention issues but these were probably triggered by bugs in my code. I kept the code, however, for cases such as the one you mention (and because I didn't think to remove it). While theoretically the locking semantics should preclude these cases, one never knows. If we can agree to clean up all of the locking paths to honor .wedged, then I agree that this code serves no purpose. - Ben
diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c index aabe41b..25f8e75 100644 --- a/drivers/gpu/drm/i915/i915_dma.c +++ b/drivers/gpu/drm/i915/i915_dma.c @@ -1173,6 +1173,9 @@ static int i915_load_modeset_init(struct drm_device *dev, drm_mm_init(&dev_priv->vram, 0, prealloc_size); DRM_INFO("set up %ldM of stolen space\n", prealloc_size / (1024*1024)); + /* We're off and running w/KMS */ + dev_priv->mm.suspended = 0; + /* Let GEM Manage from end of prealloc space to end of aperture. * * However, leave one page at the end still bound to the scratch page. @@ -1184,7 +1187,9 @@ static int i915_load_modeset_init(struct drm_device *dev, */ i915_gem_do_init(dev, prealloc_size, agp_size - 4096); + mutex_lock(&dev->struct_mutex); ret = i915_gem_init_ringbuffer(dev); + mutex_unlock(&dev->struct_mutex); if (ret) goto out; @@ -1433,6 +1438,9 @@ int i915_driver_load(struct drm_device *dev, unsigned long flags) return ret; } + /* Start out suspended */ + dev_priv->mm.suspended = 1; + if (drm_core_check_feature(dev, DRIVER_MODESET)) { ret = i915_load_modeset_init(dev, prealloc_start, prealloc_size, agp_size); diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c index dbe568c..0c03a2a 100644 --- a/drivers/gpu/drm/i915/i915_drv.c +++ b/drivers/gpu/drm/i915/i915_drv.c @@ -127,6 +127,147 @@ static int i915_resume(struct drm_device *dev) return ret; } +/** + * i965_reset - reset chip after a hang + * @dev: drm device to reset + * @flags: reset domains + * + * Reset the chip. Useful if a hang is detected. + * + * Procedure is fairly simple: + * - reset the chip using the reset reg + * - re-init context state + * - re-init hardware status page + * - re-init ring buffer + * - re-init interrupt state + * - re-init display + */ +void i965_reset(struct drm_device *dev, u8 flags) +{ + drm_i915_private_t *dev_priv = dev->dev_private; + unsigned long timeout; + u8 gdrst; + bool need_display = true; //!(flags & (GDRST_RENDER | GDRST_MEDIA)); + +#if defined(CONFIG_SMP) + timeout = jiffies + msecs_to_jiffies(500); + do { + udelay(100); + } while (mutex_is_locked(&dev->struct_mutex) && time_after(timeout, jiffies)); + + if (mutex_is_locked(&dev->struct_mutex)) { +#if 1 + DRM_ERROR("i915 struct_mutex lock is still held by %s. Giving on up reset.\n", dev->struct_mutex.owner->task->comm); + return; +#else + struct task_struct *task = dev->struct_mutex.owner->task; + DRM_ERROR("Killing process %d (%s) for holding i915 device mutex\n", + task->pid, task->comm); + force_sig(SIGILL, task); +#endif + } +#else + BUG_ON(mutex_is_locked(&dev->struct_mutex)); +#endif + + debug_show_all_locks(); + mutex_lock(&dev->struct_mutex); + + /* + * Clear request list + */ + i915_gem_retire_requests(dev); + + if (need_display) + i915_save_display(dev); + + if (IS_I965G(dev) || IS_G4X(dev)) { + /* + * Set the domains we want to reset, then the reset bit (bit 0). + * Clear the reset bit after a while and wait for hardware status + * bit (bit 1) to be set + */ + pci_read_config_byte(dev->pdev, GDRST, &gdrst); + //TODO: Set domains + pci_write_config_byte(dev->pdev, GDRST, gdrst | flags | ((flags == GDRST_FULL) ? 0x1 : 0x0)); + udelay(50); + pci_write_config_byte(dev->pdev, GDRST, gdrst & 0xfe); + + /* ...we don't want to loop forever though, 500ms should be plenty */ + timeout = jiffies + msecs_to_jiffies(500); + do { + udelay(100); + pci_read_config_byte(dev->pdev, GDRST, &gdrst); + } while ((gdrst & 0x1) && time_after(timeout, jiffies)); + + if (gdrst & 0x1) { + DRM_ERROR("Failed to reset chip. You're screwed."); + mutex_unlock(&dev->struct_mutex); + return; + } + } else { + DRM_ERROR("Error occurred. Don't know how to reset this chip.\n"); + return; + } + + /* Ok, now get things going again... */ + + /* + * Everything depends on having the GTT running, so we need to start + * there. Fortunately we don't need to do this unless we reset the + * chip at a PCI level. + * + * Next we need to restore the context, but we don't use those + * yet either... + * + * Ring buffer needs to be re-initialized in the KMS case, or if X + * was running at the time of the reset (i.e. we weren't VT + * switched away). + */ + if (drm_core_check_feature(dev, DRIVER_MODESET) || + !dev_priv->mm.suspended) { + drm_i915_ring_buffer_t *ring = &dev_priv->ring; + struct drm_gem_object *obj = ring->ring_obj; + struct drm_i915_gem_object *obj_priv = obj->driver_private; + dev_priv->mm.suspended = 0; + + /* Stop the ring if it's running. */ + I915_WRITE(PRB0_CTL, 0); + I915_WRITE(PRB0_TAIL, 0); + I915_WRITE(PRB0_HEAD, 0); + + /* Initialize the ring. */ + I915_WRITE(PRB0_START, obj_priv->gtt_offset); + I915_WRITE(PRB0_CTL, + ((obj->size - 4096) & RING_NR_PAGES) | + RING_NO_REPORT | + RING_VALID); + if (!drm_core_check_feature(dev, DRIVER_MODESET)) + i915_kernel_lost_context(dev); + else { + ring->head = I915_READ(PRB0_HEAD) & HEAD_ADDR; + ring->tail = I915_READ(PRB0_TAIL) & TAIL_ADDR; + ring->space = ring->head - (ring->tail + 8); + if (ring->space < 0) + ring->space += ring->Size; + } + + mutex_unlock(&dev->struct_mutex); + drm_irq_uninstall(dev); + drm_irq_install(dev); + mutex_lock(&dev->struct_mutex); + } + + /* + * Display needs restore too... + */ + if (need_display) + i915_restore_display(dev); + + mutex_unlock(&dev->struct_mutex); +} + + static int __devinit i915_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent) { diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index afbcaa9..8797777 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -624,6 +624,7 @@ extern long i915_compat_ioctl(struct file *filp, unsigned int cmd, extern int i915_emit_box(struct drm_device *dev, struct drm_clip_rect *boxes, int i, int DR1, int DR4); +extern void i965_reset(struct drm_device *dev, u8 flags); /* i915_irq.c */ void i915_hangcheck_elapsed(unsigned long data); diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c index 126696a..dbfcf0a 100644 --- a/drivers/gpu/drm/i915/i915_irq.c +++ b/drivers/gpu/drm/i915/i915_irq.c @@ -482,6 +482,14 @@ static void i915_handle_error(struct drm_device *dev) I915_WRITE(IIR, I915_RENDER_COMMAND_PARSER_ERROR_INTERRUPT); } + if (dev_priv->mm.wedged) { + /* + * Wakeup waiting processes so they don't hang + */ + printk("i915: Waking up sleeping processes\n"); + DRM_WAKEUP(&dev_priv->irq_queue); + } + queue_work(dev_priv->wq, &dev_priv->error_work); } diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h index f7c2de8..99981a0 100644 --- a/drivers/gpu/drm/i915/i915_reg.h +++ b/drivers/gpu/drm/i915/i915_reg.h @@ -85,6 +85,10 @@ #define I915_GC_RENDER_CLOCK_200_MHZ (1 << 0) #define I915_GC_RENDER_CLOCK_333_MHZ (4 << 0) #define LBB 0xf4 +#define GDRST 0xc0 +#define GDRST_FULL (0<<2) +#define GDRST_RENDER (1<<2) +#define GDRST_MEDIA (3<<2) /* VGA stuff */