diff mbox

solo6010 modprobe lockup since e1ceb25a (v4.3 regression)

Message ID 20160915130441.ji3f3jiiebsnsbct@acer (mailing list archive)
State New, archived
Headers show

Commit Message

Andrey Utkin Sept. 15, 2016, 1:04 p.m. UTC
Hi Krzysztof,

Me and one more solo6010 board user experience machine lockup when
solo6x10 module is loaded on kernel series starting with 4.3 (despite
solo6110 board probes just fine on all kernels). That is, 3.16, 3.18,
4.1 and 4.2 are tested and fine, and 4.3, 4.4, and others up to current
linux-next are bad.
So regression slipped in between 4.2 and 4.3. The diff between
stable/linux-4.2.y and ...-4.3.y (which were tested) is not large, and
my suspect fell on ripoff of register writing procedures complexity,
which was introduced in e1ceb25a (see below). Reversion of that fixes
lockup.  However, if, on top of reversion of e1ceb25a, i drop barrier
stuff and pci_read_config... (see
https://github.com/bluecherrydvr/linux/commit/d59aaf3), leaving the
spinlock stuff, it locks up again.  This is a matter in which I'm not
quite qualified, so I have no idea what that code copes with and why
this workaround works for solo6010.  For now I think I'll tell the
customer to use kernel with e1ceb25a reverted, but for upstream fix, I'm
interested in more in-depth investigation. I'll be able to provide dmesg
logs a bit later.

The breaking commit is quoted below.

commit e1ceb25a1569ce5b61b9c496dd32d038ba8cb936
Author: Krzysztof Hałasa <khalasa@piap.pl>
Date:   Mon Jun 8 10:42:24 2015 -0300

    [media] SOLO6x10: remove unneeded register locking and barriers
    
    readl() and writel() are atomic, we don't need the spin lock.
    Also, flushing posted write buffer isn't required. Especially on read :-)
    
    Signed-off-by: Krzysztof Ha?asa <khalasa@piap.pl>
    Signed-off-by: Hans Verkuil <hans.verkuil@cisco.com>
    Signed-off-by: Mauro Carvalho Chehab <mchehab@osg.samsung.com>

--
To unsubscribe from this list: send the line "unsubscribe linux-media" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Hans Verkuil Sept. 15, 2016, 1:15 p.m. UTC | #1
It could be related to the fact that a PCI write may be delayed unless
it is followed by a read (see also the comments in drivers/media/pci/ivtv/ivtv-driver.h).

That was probably the reason for the pci_read_config_word in the reg_write
code. Try putting that back (and just that).

Regards,

	Hans

On 09/15/2016 03:04 PM, Andrey Utkin wrote:
> Hi Krzysztof,
> 
> Me and one more solo6010 board user experience machine lockup when
> solo6x10 module is loaded on kernel series starting with 4.3 (despite
> solo6110 board probes just fine on all kernels). That is, 3.16, 3.18,
> 4.1 and 4.2 are tested and fine, and 4.3, 4.4, and others up to current
> linux-next are bad.
> So regression slipped in between 4.2 and 4.3. The diff between
> stable/linux-4.2.y and ...-4.3.y (which were tested) is not large, and
> my suspect fell on ripoff of register writing procedures complexity,
> which was introduced in e1ceb25a (see below). Reversion of that fixes
> lockup.  However, if, on top of reversion of e1ceb25a, i drop barrier
> stuff and pci_read_config... (see
> https://github.com/bluecherrydvr/linux/commit/d59aaf3), leaving the
> spinlock stuff, it locks up again.  This is a matter in which I'm not
> quite qualified, so I have no idea what that code copes with and why
> this workaround works for solo6010.  For now I think I'll tell the
> customer to use kernel with e1ceb25a reverted, but for upstream fix, I'm
> interested in more in-depth investigation. I'll be able to provide dmesg
> logs a bit later.
> 
> The breaking commit is quoted below.
> 
> commit e1ceb25a1569ce5b61b9c496dd32d038ba8cb936
> Author: Krzysztof Hałasa <khalasa@piap.pl>
> Date:   Mon Jun 8 10:42:24 2015 -0300
> 
>     [media] SOLO6x10: remove unneeded register locking and barriers
>     
>     readl() and writel() are atomic, we don't need the spin lock.
>     Also, flushing posted write buffer isn't required. Especially on read :-)
>     
>     Signed-off-by: Krzysztof Ha?asa <khalasa@piap.pl>
>     Signed-off-by: Hans Verkuil <hans.verkuil@cisco.com>
>     Signed-off-by: Mauro Carvalho Chehab <mchehab@osg.samsung.com>
> 
> diff --git a/drivers/media/pci/solo6x10/solo6x10-core.c b/drivers/media/pci/solo6x10/solo6x10-core.c
> index 84627e6..9c948b1 100644
> --- a/drivers/media/pci/solo6x10/solo6x10-core.c
> +++ b/drivers/media/pci/solo6x10/solo6x10-core.c
> @@ -483,7 +483,6 @@ static int solo_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
>  
>  	solo_dev->type = id->driver_data;
>  	solo_dev->pdev = pdev;
> -	spin_lock_init(&solo_dev->reg_io_lock);
>  	ret = v4l2_device_register(&pdev->dev, &solo_dev->v4l2_dev);
>  	if (ret)
>  		goto fail_probe;
> diff --git a/drivers/media/pci/solo6x10/solo6x10.h b/drivers/media/pci/solo6x10/solo6x10.h
> index 1ca54b0..27423d7 100644
> --- a/drivers/media/pci/solo6x10/solo6x10.h
> +++ b/drivers/media/pci/solo6x10/solo6x10.h
> @@ -199,7 +199,6 @@ struct solo_dev {
>  	int			nr_ext;
>  	u32			irq_mask;
>  	u32			motion_mask;
> -	spinlock_t		reg_io_lock;
>  	struct v4l2_device	v4l2_dev;
>  
>  	/* tw28xx accounting */
> @@ -281,36 +280,13 @@ struct solo_dev {
>  
>  static inline u32 solo_reg_read(struct solo_dev *solo_dev, int reg)
>  {
> -	unsigned long flags;
> -	u32 ret;
> -	u16 val;
> -
> -	spin_lock_irqsave(&solo_dev->reg_io_lock, flags);
> -
> -	ret = readl(solo_dev->reg_base + reg);
> -	rmb();
> -	pci_read_config_word(solo_dev->pdev, PCI_STATUS, &val);
> -	rmb();
> -
> -	spin_unlock_irqrestore(&solo_dev->reg_io_lock, flags);
> -
> -	return ret;
> +	return readl(solo_dev->reg_base + reg);
>  }
>  
>  static inline void solo_reg_write(struct solo_dev *solo_dev, int reg,
>  				  u32 data)
>  {
> -	unsigned long flags;
> -	u16 val;
> -
> -	spin_lock_irqsave(&solo_dev->reg_io_lock, flags);
> -
>  	writel(data, solo_dev->reg_base + reg);
> -	wmb();
> -	pci_read_config_word(solo_dev->pdev, PCI_STATUS, &val);
> -	rmb();
> -
> -	spin_unlock_irqrestore(&solo_dev->reg_io_lock, flags);
>  }
>  
>  static inline void solo_irq_on(struct solo_dev *dev, u32 mask)
> --
> To unsubscribe from this list: send the line "unsubscribe linux-media" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-media" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andrey Utkin Sept. 15, 2016, 1:19 p.m. UTC | #2
On Thu, Sep 15, 2016 at 03:15:53PM +0200, Hans Verkuil wrote:
> It could be related to the fact that a PCI write may be delayed unless
> it is followed by a read (see also the comments in drivers/media/pci/ivtv/ivtv-driver.h).

Thanks for explanation!

> That was probably the reason for the pci_read_config_word in the reg_write
> code. Try putting that back (and just that).

In this case reg_write becomes not atomic, thus spinlock would be
required again here, right?
--
To unsubscribe from this list: send the line "unsubscribe linux-media" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Hans Verkuil Sept. 15, 2016, 1:25 p.m. UTC | #3
On 09/15/2016 03:19 PM, Andrey Utkin wrote:
> On Thu, Sep 15, 2016 at 03:15:53PM +0200, Hans Verkuil wrote:
>> It could be related to the fact that a PCI write may be delayed unless
>> it is followed by a read (see also the comments in drivers/media/pci/ivtv/ivtv-driver.h).
> 
> Thanks for explanation!
> 
>> That was probably the reason for the pci_read_config_word in the reg_write
>> code. Try putting that back (and just that).
> 
> In this case reg_write becomes not atomic, thus spinlock would be
> required again here, right?

That depends on whether you can have calls to this function in parallel.

But I get the feeling that it might be easier to just revert the patch.

Regards,

	Hans
--
To unsubscribe from this list: send the line "unsubscribe linux-media" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alan Cox Sept. 15, 2016, 1:58 p.m. UTC | #4
On Thu, 15 Sep 2016 16:19:52 +0300
Andrey Utkin <andrey_utkin@fastmail.com> wrote:

> On Thu, Sep 15, 2016 at 03:15:53PM +0200, Hans Verkuil wrote:
> > It could be related to the fact that a PCI write may be delayed unless
> > it is followed by a read (see also the comments in drivers/media/pci/ivtv/ivtv-driver.h).  
> 
> Thanks for explanation!
> 
> > That was probably the reason for the pci_read_config_word in the reg_write
> > code. Try putting that back (and just that).  
> 
> In this case reg_write becomes not atomic, thus spinlock would be
> required again here, right?

No - PCI writes are ordered but may not complete until the next read or
config access. That ordering isn't affected by things like spin locking
as it is a property of the bus.

Usually this only matters in obscure cases - timing is one of them, and
the other is when freeing memory because writes are posted both ways so
you need to write to stop the transfer, read to ensure the transfer has
completed and then free the target memory.

Alan
--
To unsubscribe from this list: send the line "unsubscribe linux-media" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Krzysztof Hałasa Sept. 21, 2016, 1:16 p.m. UTC | #5
Hans Verkuil <hverkuil@xs4all.nl> writes:

> That was probably the reason for the pci_read_config_word in the reg_write
> code. Try putting that back (and just that).

Yes. I guess a single pci_read_config_word() would suffice.

Though it would obviously be much better to identify the place in the
driver which needs to have the write buffers flushed, and add a read()
just there.

The interrupt handler maybe (e.g. just before the return IRQ_HANDLED)?

OTOH this may be some sort of timing problem, I mean the faster code may
put too much stress on the SOLO chip.

Doesn't happen here so I can't test the cure.
Andrey Utkin Sept. 21, 2016, 1:45 p.m. UTC | #6
On Wed, Sep 21, 2016 at 03:16:57PM +0200, Krzysztof Hałasa wrote:
> Hans Verkuil <hverkuil@xs4all.nl> writes:
> 
> > That was probably the reason for the pci_read_config_word in the reg_write
> > code. Try putting that back (and just that).
> 
> Yes. I guess a single pci_read_config_word() would suffice.
> 
> Though it would obviously be much better to identify the place in the
> driver which needs to have the write buffers flushed, and add a read()
> just there.
> 
> The interrupt handler maybe (e.g. just before the return IRQ_HANDLED)?
> 
> OTOH this may be some sort of timing problem, I mean the faster code may
> put too much stress on the SOLO chip.
> 
> Doesn't happen here so I can't test the cure.

It happens in solo_disp_init at uploading default motion thresholds
array.

I've got a prints trace with solo6010-fix-lockup branch
https://github.com/bluecherrydvr/linux/tree/solo6010-fix-lockup/drivers/media/pci/solo6x10
the trace itself in jpg:
https://decent.im:5281/upload/3793f393-e285-4514-83dd-bf08d1c8b4a2/e7ad898b-515b-4522-86a9-553daaeb0860.jpg

Indeed, targeted fixing would be more reasonable than making register
r/w routines follow blocking fashion. But the driver is already complete
and was known to be working, and I seems all places in code assume the
blocking fashion of reg r/w, and changing that assumption may lead to
covert bugs anywhere else, not just at probing, which may be hard to
nail down.

For now, I'll try setting pci_read_config_word() back instead of full
revert. Does it need to be just in reg_write? No need for it in
reg_read, right?
--
To unsubscribe from this list: send the line "unsubscribe linux-media" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/media/pci/solo6x10/solo6x10-core.c b/drivers/media/pci/solo6x10/solo6x10-core.c
index 84627e6..9c948b1 100644
--- a/drivers/media/pci/solo6x10/solo6x10-core.c
+++ b/drivers/media/pci/solo6x10/solo6x10-core.c
@@ -483,7 +483,6 @@  static int solo_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 
 	solo_dev->type = id->driver_data;
 	solo_dev->pdev = pdev;
-	spin_lock_init(&solo_dev->reg_io_lock);
 	ret = v4l2_device_register(&pdev->dev, &solo_dev->v4l2_dev);
 	if (ret)
 		goto fail_probe;
diff --git a/drivers/media/pci/solo6x10/solo6x10.h b/drivers/media/pci/solo6x10/solo6x10.h
index 1ca54b0..27423d7 100644
--- a/drivers/media/pci/solo6x10/solo6x10.h
+++ b/drivers/media/pci/solo6x10/solo6x10.h
@@ -199,7 +199,6 @@  struct solo_dev {
 	int			nr_ext;
 	u32			irq_mask;
 	u32			motion_mask;
-	spinlock_t		reg_io_lock;
 	struct v4l2_device	v4l2_dev;
 
 	/* tw28xx accounting */
@@ -281,36 +280,13 @@  struct solo_dev {
 
 static inline u32 solo_reg_read(struct solo_dev *solo_dev, int reg)
 {
-	unsigned long flags;
-	u32 ret;
-	u16 val;
-
-	spin_lock_irqsave(&solo_dev->reg_io_lock, flags);
-
-	ret = readl(solo_dev->reg_base + reg);
-	rmb();
-	pci_read_config_word(solo_dev->pdev, PCI_STATUS, &val);
-	rmb();
-
-	spin_unlock_irqrestore(&solo_dev->reg_io_lock, flags);
-
-	return ret;
+	return readl(solo_dev->reg_base + reg);
 }
 
 static inline void solo_reg_write(struct solo_dev *solo_dev, int reg,
 				  u32 data)
 {
-	unsigned long flags;
-	u16 val;
-
-	spin_lock_irqsave(&solo_dev->reg_io_lock, flags);
-
 	writel(data, solo_dev->reg_base + reg);
-	wmb();
-	pci_read_config_word(solo_dev->pdev, PCI_STATUS, &val);
-	rmb();
-
-	spin_unlock_irqrestore(&solo_dev->reg_io_lock, flags);
 }
 
 static inline void solo_irq_on(struct solo_dev *dev, u32 mask)