
libnvdimm: rework region badblocks clearing

Message ID 149355594185.9917.1577772489949690281.stgit@dwillia2-desk3.amr.corp.intel.com (mailing list archive)
State Accepted
Commit 23f498448362

Commit Message

Dan Williams April 30, 2017, 12:39 p.m. UTC
Toshi noticed that the new support for region-level badblocks missed
the case where errors are cleared due to BTT I/O.

An initial attempt to fix this ran into a "sleeping while atomic"
warning due to taking the nvdimm_bus_lock() in the BTT I/O path to
satisfy the locking requirements of __nvdimm_bus_badblocks_clear().
However, that lock is not needed since we are not acting on any data
that is subject to change due to a change of state of the bus /
region. The badblocks instance has its own internal lock to handle
mutations of the error list.

So, to make it clear that we are just acting on region devices and
don't need the lock, rename __nvdimm_bus_badblocks_clear() to
nvdimm_clear_badblocks_regions(). Eliminate the lock and consolidate all
routines in drivers/nvdimm/bus.c. Also, make some cleanups to remove
unnecessary casts, make the calling convention of
nvdimm_clear_badblocks_regions() clearer by replacing struct resource
with the minimal struct clear_badblocks_context, and use the DEVICE_ATTR
macro.

Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Vishal Verma <vishal.l.verma@intel.com>
Reported-by: Toshi Kani <toshi.kani@hpe.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/nvdimm/bus.c         |   76 ++++++++++++++++++++++++++++++------------
 drivers/nvdimm/region.c      |   25 --------------
 drivers/nvdimm/region_devs.c |   15 +++-----
 include/linux/libnvdimm.h    |    3 --
 4 files changed, 59 insertions(+), 60 deletions(-)
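
For context, the "internal lock" the commit message leans on is the seqlock embedded in struct badblocks: mutators such as badblocks_clear() take it internally with write_seqlock_irqsave(), so they need no bus-level serialization and are safe to call from atomic context (e.g. under a held BTT lane). A rough sketch of the relevant pieces, abridged from include/linux/badblocks.h and block/badblocks.c of this era:

/* Abridged sketch, not the full definitions: struct badblocks carries its
 * own seqlock, and badblocks_clear() serializes on bb->lock internally, so
 * callers need no extra locking and may be in atomic context. */
#include <linux/seqlock.h>
#include <linux/types.h>

struct badblocks {
	int count;	/* count of bad blocks */
	int shift;	/* shift from sectors to block size */
	u64 *page;	/* packed array of bad ranges */
	seqlock_t lock;	/* protects mutations of the list */
	/* ... */
};

int badblocks_clear(struct badblocks *bb, sector_t s, int sectors);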

Comments

Kani, Toshi May 1, 2017, 3:34 p.m. UTC | #1
On Sun, 2017-04-30 at 05:39 -0700, Dan Williams wrote:
> Toshi noticed that the new support for region-level badblocks
> missed the case where errors are cleared due to BTT I/O.
> 
> An initial attempt to fix this ran into a "sleeping while atomic"
> warning due to taking the nvdimm_bus_lock() in the BTT I/O path to
> satisfy the locking requirements of __nvdimm_bus_badblocks_clear().
> However, that lock is not needed since we are not acting on any data
> that is subject to change due to a change of state of the bus /
> region. The badblocks instance has its own internal lock to handle
> mutations of the error list.
> 
> So, to make it clear that we are just acting on region devices and
> don't need the lock, rename __nvdimm_bus_badblocks_clear() to
> nvdimm_clear_badblocks_regions(). Eliminate the lock and consolidate
> all routines in drivers/nvdimm/bus.c. Also, make some cleanups to
> remove unnecessary casts, make the calling convention of
> nvdimm_clear_badblocks_regions() clearer by replacing struct resource
> with the minimal struct clear_badblocks_context, and use the
> DEVICE_ATTR macro.

Hi Dan,

I was testing the change with CONFIG_DEBUG_ATOMIC_SLEEP set this time,
and hit the following BUG with BTT.  This is a separate issue (not
introduced by this patch), but it shows that we have an issue with the
DSM call path as well.

[ 1279.712933] nfit ACPI0012:00: acpi_nfit_ctl:bus cmd: 1: func: 1 input length: 16
[ 1279.721111] nvdimm in  00000000: 60000000 00000002 00001000 00000000  ...`............
[ 1279.729799] BUG: sleeping function called from invalid context at mm/slab.h:432
[ 1279.738005] in_atomic(): 1, irqs_disabled(): 0, pid: 13353, name: dd
[ 1279.745187] INFO: lockdep is turned off.
 :
[ 1279.767908] Call Trace:
[ 1279.771116]  dump_stack+0x86/0xc3
[ 1279.775201]  ___might_sleep+0x17d/0x250
[ 1279.779808]  __might_sleep+0x4a/0x80
[ 1279.784214]  __kmalloc+0x1c0/0x2e0
[ 1279.788388]  acpi_os_allocate_zeroed+0x2d/0x2f
[ 1279.793604]  acpi_evaluate_object+0x59/0x3b1
[ 1279.798640]  acpi_evaluate_dsm+0xbd/0x10c
[ 1279.803458]  acpi_nfit_ctl+0x1ef/0x7c0 [nfit]
[ 1279.808584]  ? nsio_rw_bytes+0x152/0x280
[ 1279.813258]  nvdimm_clear_poison+0x77/0x140
[ 1279.818193]  nsio_rw_bytes+0x18f/0x280
[ 1279.822684]  btt_write_pg+0x1d4/0x3d0 [nd_btt]
[ 1279.827869]  btt_make_request+0x119/0x2d0 [nd_btt]
[ 1279.833398]  ? generic_make_request+0xef/0x3b0
[ 1279.838575]  generic_make_request+0x122/0x3b0
[ 1279.843661]  ? iov_iter_get_pages+0xbd/0x380
[ 1279.848666]  submit_bio+0x73/0x150
[ 1279.852801]  ? bio_iov_iter_get_pages+0xd7/0x120
[ 1279.858166]  ? __blkdev_direct_IO_simple+0x17b/0x340
[ 1279.863877]  __blkdev_direct_IO_simple+0x177/0x340
[ 1279.869453]  ? bdput+0x20/0x20
[ 1279.873231]  blkdev_direct_IO+0x3b1/0x3c0
[ 1279.877963]  ? current_time+0x18/0x70
[ 1279.882344]  generic_file_direct_write+0xba/0x180
[ 1279.887765]  __generic_file_write_iter+0xc0/0x1c0
[ 1279.893185]  ? __clear_user+0x23/0x70
[ 1279.897550]  blkdev_write_iter+0x8b/0x100
[ 1279.902258]  ? __might_sleep+0x4a/0x80
[ 1279.906699]  __vfs_write+0xe8/0x160
[ 1279.910876]  vfs_write+0xcb/0x1f0
[ 1279.914867]  SyS_write+0x58/0xc0
[ 1279.918773]  do_syscall_64+0x6c/0x1f0
[ 1279.923120]  entry_SYSCALL64_slow_path+0x25/0x25

Thanks,
-Toshi
Dan Williams May 1, 2017, 3:43 p.m. UTC | #2
On Mon, May 1, 2017 at 8:34 AM, Kani, Toshimitsu <toshi.kani@hpe.com> wrote:
> On Sun, 2017-04-30 at 05:39 -0700, Dan Williams wrote:
>> Toshi noticed that the new support for region-level badblocks
>> missed the case where errors are cleared due to BTT I/O.
>>
>> An initial attempt to fix this ran into a "sleeping while atomic"
>> warning due to taking the nvdimm_bus_lock() in the BTT I/O path to
>> satisfy the locking requirements of __nvdimm_bus_badblocks_clear().
>> However, that lock is not needed since we are not acting on any data
>> that is subject to change due to a change of state of the bus /
>> region. The badblocks instance has its own internal lock to handle
>> mutations of the error list.
>>
>> So, to make it clear that we are just acting on region devices and
>> don't need the lock, rename __nvdimm_bus_badblocks_clear() to
>> nvdimm_clear_badblocks_regions(). Eliminate the lock and consolidate
>> all routines in drivers/nvdimm/bus.c. Also, make some cleanups to
>> remove unnecessary casts, make the calling convention of
>> nvdimm_clear_badblocks_regions() clearer by replacing struct resource
>> with the minimal struct clear_badblocks_context, and use the
>> DEVICE_ATTR macro.
>
> Hi Dan,
>
> I was testing the change with CONFIG_DEBUG_ATOMIC_SLEEP set this time,
> and hit the following BUG with BTT.  This is a separate issue (not
> introduced by this patch), but it shows that we have an issue with the
> DSM call path as well.

Ah, great find, thanks! We don't see this in the unit tests because
the nfit_test infrastructure takes no sleeping actions in its
simulated DSM path. Outside of converting btt to use sleeping locks
I'm not sure I see a path forward. I wonder how bad the performance
impact of that would be? Perhaps with opportunistic spinning it won't
be so bad, but I don't see another choice.
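
A minimal sketch of the sleeping-lock conversion being floated here (hypothetical; the names and structure are illustrative, not the actual BTT code). Linux mutexes already spin optimistically while the lock owner is on-CPU, so the lightly contended cost may stay close to a spinlock's:

/* Hypothetical: protect a BTT lane with a mutex instead of a spinlock so
 * that nvdimm_clear_poison() -> acpi_nfit_ctl(), which may sleep, can be
 * called with the lane held. */
#include <linux/mutex.h>
#include <linux/smp.h>

struct btt_lane {
	struct mutex lock;	/* was: spinlock_t (+ preemption disabled) */
};

/* The CPU id is only a hash for picking a lane; without get_cpu() the task
 * may migrate afterwards, which is harmless because the mutex, not CPU
 * pinning, now provides the exclusion. */
static unsigned int btt_acquire_lane(struct btt_lane *lanes, unsigned int nr)
{
	unsigned int lane = raw_smp_processor_id() % nr;

	mutex_lock(&lanes[lane].lock);	/* may sleep */
	return lane;
}

static void btt_release_lane(struct btt_lane *lanes, unsigned int lane)
{
	mutex_unlock(&lanes[lane].lock);
}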
Dan Williams May 1, 2017, 3:52 p.m. UTC | #3
On Mon, May 1, 2017 at 8:43 AM, Dan Williams <dan.j.williams@intel.com> wrote:
> On Mon, May 1, 2017 at 8:34 AM, Kani, Toshimitsu <toshi.kani@hpe.com> wrote:
>> On Sun, 2017-04-30 at 05:39 -0700, Dan Williams wrote:
>>> Toshi noticed that the new support for region-level badblocks
>>> missed the case where errors are cleared due to BTT I/O.
>>>
>>> An initial attempt to fix this ran into a "sleeping while atomic"
>>> warning due to taking the nvdimm_bus_lock() in the BTT I/O path to
>>> satisfy the locking requirements of __nvdimm_bus_badblocks_clear().
>>> However, that lock is not needed since we are not acting on any data
>>> that is subject to change due to a change of state of the bus /
>>> region. The badblocks instance has its own internal lock to handle
>>> mutations of the error list.
>>>
>>> So, to make it clear that we are just acting on region devices and
>>> don't need the lock, rename __nvdimm_bus_badblocks_clear() to
>>> nvdimm_clear_badblocks_regions(). Eliminate the lock and consolidate
>>> all routines in drivers/nvdimm/bus.c. Also, make some cleanups to
>>> remove unnecessary casts, make the calling convention of
>>> nvdimm_clear_badblocks_regions() clearer by replacing struct resource
>>> with the minimal struct clear_badblocks_context, and use the
>>> DEVICE_ATTR macro.
>>
>> Hi Dan,
>>
>> I was testing the change with CONFIG_DEBUG_ATOMIC_SLEEP set this time,
>> and hit the following BUG with BTT.  This is a separate issue (not
>> introduced by this patch), but it shows that we have an issue with the
>> DSM call path as well.
>
> Ah, great find, thanks! We don't see this in the unit tests because
> the nfit_test infrastructure takes no sleeping actions in its
> simulated DSM path. Outside of converting btt to use sleeping locks
> I'm not sure I see a path forward. I wonder how bad the performance
> impact of that would be? Perhaps with opportunistic spinning it won't
> be so bad, but I don't see another choice.

It's worse than that. Part of the performance optimization of BTT I/O
was to avoid locking altogether when we could rely on a BTT lane
percpu, so that would also need to be removed.
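
For reference, the lane code in question (drivers/nvdimm/region_devs.c, roughly as of this era) pins the CPU with get_cpu(), which disables preemption for as long as the lane is held, so even the lock-free one-lane-per-CPU fast path puts the whole I/O under atomic context:

/* Roughly the 4.11-era nd_region_acquire_lane(): when every CPU has a
 * private lane no lock is taken at all; when lanes are shared, a spinlock
 * guards the shared lane. Either way get_cpu() has disabled preemption,
 * which is why a sleeping DSM call under a lane triggers the BUG above. */
unsigned int nd_region_acquire_lane(struct nd_region *nd_region)
{
	unsigned int cpu, lane;

	cpu = get_cpu();	/* disables preemption */
	if (nd_region->num_lanes < nr_cpu_ids) {
		unsigned int idx = cpu % nd_region->num_lanes;
		struct nd_percpu_lane *ndl_lock, *ndl_count;

		ndl_count = per_cpu_ptr(nd_region->lane, cpu);
		ndl_lock = per_cpu_ptr(nd_region->lane, idx);
		if (ndl_count->count++ == 0)
			spin_lock(&ndl_lock->lock);
		lane = idx;
	} else
		lane = cpu;	/* private lane per CPU: no locking needed */

	return lane;
}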
Kani, Toshi May 1, 2017, 4:12 p.m. UTC | #4
On Mon, 2017-05-01 at 08:52 -0700, Dan Williams wrote:
> On Mon, May 1, 2017 at 8:43 AM, Dan Williams <dan.j.williams@intel.com> wrote:
> > On Mon, May 1, 2017 at 8:34 AM, Kani, Toshimitsu <toshi.kani@hpe.com> wrote:
> > > On Sun, 2017-04-30 at 05:39 -0700, Dan Williams wrote:
 :
> > > 
> > > Hi Dan,
> > > 
> > > I was testing the change with CONFIG_DEBUG_ATOMIC_SLEEP set this
> > > time, and hit the following BUG with BTT.  This is a separate
> > > issue (not introduced by this patch), but it shows that we have
> > > an issue with the DSM call path as well.
> > 
> > Ah, great find, thanks! We don't see this in the unit tests because
> > the nfit_test infrastructure takes no sleeping actions in its
> > simulated DSM path. Outside of converting btt to use sleeping locks
> > I'm not sure I see a path forward. I wonder how bad the performance
> > impact of that would be? Perhaps with opportunistic spinning it
> > won't be so bad, but I don't see another choice.
> 
> It's worse than that. Part of the performance optimization of BTT I/O
> was to avoid locking altogether when we could rely on a BTT lane
> percpu, so that would also need to be removed.

I do not have a good idea either, but I'd rather disable this clearing
in the regular BTT write path than add sleeping locks to BTT.
Clearing a bad block in the BTT write path is difficult/challenging
since it allocates a new block.

Thanks,
-Toshi
Dan Williams May 1, 2017, 4:16 p.m. UTC | #5
On Mon, May 1, 2017 at 9:12 AM, Kani, Toshimitsu <toshi.kani@hpe.com> wrote:
> On Mon, 2017-05-01 at 08:52 -0700, Dan Williams wrote:
>> On Mon, May 1, 2017 at 8:43 AM, Dan Williams <dan.j.williams@intel.com> wrote:
>> > On Mon, May 1, 2017 at 8:34 AM, Kani, Toshimitsu <toshi.kani@hpe.com> wrote:
>> > > On Sun, 2017-04-30 at 05:39 -0700, Dan Williams wrote:
>  :
>> > >
>> > > Hi Dan,
>> > >
>> > > I was testing the change with CONFIG_DEBUG_ATOMIC_SLEEP set this
>> > > time, and hit the following BUG with BTT.  This is a separate
>> > > issue (not introduced by this patch), but it shows that we have
>> > > an issue with the DSM call path as well.
>> >
>> > Ah, great find, thanks! We don't see this in the unit tests because
>> > the nfit_test infrastructure takes no sleeping actions in its
>> > simulated DSM path. Outside of converting btt to use sleeping locks
>> > I'm not sure I see a path forward. I wonder how bad the performance
>> > impact of that would be? Perhaps with opportunistic spinning it
>> > won't be so bad, but I don't see another choice.
>>
>> It's worse than that. Part of the performance optimization of BTT I/O
>> was to avoid locking altogether when we could rely on a BTT lane
>> percpu, so that would also need to be removed.
>
> I do not have a good idea either, but I'd rather disable this clearing
> in the regular BTT write path than add sleeping locks to BTT.
> Clearing a bad block in the BTT write path is difficult/challenging
> since it allocates a new block.

Actually, that may make things easier. Can we teach BTT to track error
blocks and clear them before they are reassigned?
Kani, Toshi May 1, 2017, 4:20 p.m. UTC | #6
On Mon, 2017-05-01 at 09:16 -0700, Dan Williams wrote:
> On Mon, May 1, 2017 at 9:12 AM, Kani, Toshimitsu <toshi.kani@hpe.com>
> wrote:
> > On Mon, 2017-05-01 at 08:52 -0700, Dan Williams wrote:
> > > On Mon, May 1, 2017 at 8:43 AM, Dan Williams <dan.j.williams@intel.com> wrote:
> > > > On Mon, May 1, 2017 at 8:34 AM, Kani, Toshimitsu <toshi.kani@hpe.com> wrote:
> > > > > On Sun, 2017-04-30 at 05:39 -0700, Dan Williams wrote:
> > 
> >  :
> > > > > 
> > > > > Hi Dan,
> > > > > 
> > > > > I was testing the change with CONFIG_DEBUG_ATOMIC_SLEEP set
> > > > > this time, and hit the following BUG with BTT.  This is a
> > > > > separate issue (not introduced by this patch), but it shows
> > > > > that we have an issue with the DSM call path as well.
> > > > 
> > > > Ah, great find, thanks! We don't see this in the unit tests
> > > > because the nfit_test infrastructure takes no sleeping actions
> > > > in its simulated DSM path. Outside of converting btt to use
> > > > sleeping locks I'm not sure I see a path forward. I wonder how
> > > > bad the performance impact of that would be? Perhaps with
> > > > opportunistic spinning it won't be so bad, but I don't see
> > > > another choice.
> > > 
> > > It's worse than that. Part of the performance optimization of BTT
> > > I/O was to avoid locking altogether when we could rely on a BTT
> > > lane percpu, so that would also need to be removed.
> > 
> > I do not have a good idea either, but I'd rather disable this
> > clearing in the regular BTT write path than add sleeping locks
> > to BTT. Clearing a bad block in the BTT write path is
> > difficult/challenging since it allocates a new block.
> 
> Actually, that may make things easier. Can we teach BTT to track
> error blocks and clear them before they are reassigned?

I was thinking the same after sending it.  I think we should be able to
do that.

Thanks,
-Toshi
Dan Williams May 1, 2017, 4:38 p.m. UTC | #7
On Mon, May 1, 2017 at 9:20 AM, Kani, Toshimitsu <toshi.kani@hpe.com> wrote:
> On Mon, 2017-05-01 at 09:16 -0700, Dan Williams wrote:
>> On Mon, May 1, 2017 at 9:12 AM, Kani, Toshimitsu <toshi.kani@hpe.com>
>> wrote:
>> > On Mon, 2017-05-01 at 08:52 -0700, Dan Williams wrote:
>> > > On Mon, May 1, 2017 at 8:43 AM, Dan Williams <dan.j.williams@intel.com> wrote:
>> > > > On Mon, May 1, 2017 at 8:34 AM, Kani, Toshimitsu <toshi.kani@hpe.com> wrote:
>> > > > > On Sun, 2017-04-30 at 05:39 -0700, Dan Williams wrote:
>> >
>> >  :
>> > > > >
>> > > > > Hi Dan,
>> > > > >
>> > > > > I was testing the change with CONFIG_DEBUG_ATOMIC_SLEEP set
>> > > > > this time, and hit the following BUG with BTT.  This is a
>> > > > > separate issue (not introduced by this patch), but it shows
>> > > > > that we have an issue with the DSM call path as well.
>> > > >
>> > > > Ah, great find, thanks! We don't see this in the unit tests
>> > > > because the nfit_test infrastructure takes no sleeping actions
>> > > > in its simulated DSM path. Outside of converting btt to use
>> > > > sleeping locks I'm not sure I see a path forward. I wonder how
>> > > > bad the performance impact of that would be? Perhaps with
>> > > > opportunistic spinning it won't be so bad, but I don't see
>> > > > another choice.
>> > >
>> > > It's worse than that. Part of the performance optimization of BTT
>> > > I/O was to avoid locking altogether when we could rely on a BTT
>> > > lane percpu, so that would also need to be removed.
>> >
>> > I do not have a good idea either, but I'd rather disable this
>> > clearing in the regular BTT write path than add sleeping locks
>> > to BTT. Clearing a bad block in the BTT write path is
>> > difficult/challenging since it allocates a new block.
>>
>> Actually, that may make things easier. Can we teach BTT to track
>> error blocks and clear them before they are reassigned?
>
> I was thinking the same after sending it.  I think we should be able to
> do that.

Ok, but we obviously can't develop something that detailed while the
merge window is open, so I think that means we need to revert commit
e88da7998d7d "Revert 'libnvdimm: band aid btt vs clear poison
locking'" and leave BTT I/O-error-clearing disabled for this cycle and
try again for 4.13.
Verma, Vishal L May 1, 2017, 4:42 p.m. UTC | #8
On Mon, 2017-05-01 at 09:38 -0700, Dan Williams wrote:
> On Mon, May 1, 2017 at 9:20 AM, Kani, Toshimitsu <toshi.kani@hpe.com>
> wrote:
> > On Mon, 2017-05-01 at 09:16 -0700, Dan Williams wrote:
> > > On Mon, May 1, 2017 at 9:12 AM, Kani, Toshimitsu <toshi.kani@hpe.com>
> > > wrote:
> > > > On Mon, 2017-05-01 at 08:52 -0700, Dan Williams wrote:
> > > > > On Mon, May 1, 2017 at 8:43 AM, Dan Williams <dan.j.williams@intel.com> wrote:
> > > > > > On Mon, May 1, 2017 at 8:34 AM, Kani, Toshimitsu <toshi.kani@hpe.com> wrote:
> > > > > > > On Sun, 2017-04-30 at 05:39 -0700, Dan Williams wrote:
> > > > 
> > > >  :
> > > > > > > 
> > > > > > > Hi Dan,
> > > > > > > 
> > > > > > > I was testing the change with CONFIG_DEBUG_ATOMIC_SLEEP set
> > > > > > > this time, and hit the following BUG with BTT.  This is a
> > > > > > > separate issue (not introduced by this patch), but it shows
> > > > > > > that we have an issue with the DSM call path as well.
> > > > > > 
> > > > > > Ah, great find, thanks! We don't see this in the unit tests
> > > > > > because the nfit_test infrastructure takes no sleeping actions
> > > > > > in its simulated DSM path. Outside of converting btt to use
> > > > > > sleeping locks I'm not sure I see a path forward. I wonder how
> > > > > > bad the performance impact of that would be? Perhaps with
> > > > > > opportunistic spinning it won't be so bad, but I don't see
> > > > > > another choice.
> > > > > 
> > > > > It's worse than that. Part of the performance optimization of
> > > > > BTT I/O was to avoid locking altogether when we could rely on a
> > > > > BTT lane percpu, so that would also need to be removed.
> > > > 
> > > > I do not have a good idea either, but I'd rather disable this
> > > > clearing in the regular BTT write path than add sleeping
> > > > locks to BTT. Clearing a bad block in the BTT write path is
> > > > difficult/challenging since it allocates a new block.
> > > 
> > > Actually, that may make things easier. Can we teach BTT to track
> > > error blocks and clear them before they are reassigned?
> > 
> > I was thinking the same after sending it.  I think we should be
> > able to do that.
> 
> Ok, but we obviously can't develop something that detailed while the
> merge window is open, so I think that means we need to revert commit
> e88da7998d7d "Revert 'libnvdimm: band aid btt vs clear poison
> locking'" and leave BTT I/O-error-clearing disabled for this cycle
> and try again for 4.13.

Agreed, I'll work on something to track badblocks and clear them
outside the IO path.
Kani, Toshi May 1, 2017, 4:45 p.m. UTC | #9
On Mon, 2017-05-01 at 16:42 +0000, Verma, Vishal L wrote:
> On Mon, 2017-05-01 at 09:38 -0700, Dan Williams wrote:
> > On Mon, May 1, 2017 at 9:20 AM, Kani, Toshimitsu
> > <toshi.kani@hpe.com>
 :
> > > > > > > > Hi Dan,
> > > > > > > > 
> > > > > > > > I was testing the change with CONFIG_DEBUG_ATOMIC_SLEEP
> > > > > > > > set this time, and hit the following BUG with
> > > > > > > > BTT.  This is a separate issue (not introduced by this
> > > > > > > > patch), but it shows that we have an issue with the DSM
> > > > > > > > call path as well.
> > > > > > > 
> > > > > > > Ah, great find, thanks! We don't see this in the unit
> > > > > > > tests because the nfit_test infrastructure takes no
> > > > > > > sleeping actions in its simulated DSM path. Outside of
> > > > > > > converting btt to use sleeping locks I'm not sure I see a
> > > > > > > path forward. I wonder how bad the performance impact of
> > > > > > > that would be? Perhaps with opportunistic spinning it
> > > > > > > won't be so bad, but I don't see another choice.
> > > > > > 
> > > > > > It's worse than that. Part of the performance optimization
> > > > > > of BTT I/O was to avoid locking altogether when we could
> > > > > > rely on a BTT lane percpu, so that would also need to be
> > > > > > removed.
> > > > > 
> > > > > I do not have a good idea either, but I'd rather disable this
> > > > > clearing in the regular BTT write path than add sleeping
> > > > > locks to BTT. Clearing a bad block in the BTT write path is
> > > > > difficult/challenging since it allocates a new block.
> > > > 
> > > > Actually, that may make things easier. Can we teach BTT to
> > > > track error blocks and clear them before they are reassigned?
> > > 
> > > I was thinking the same after sending it.  I think we should be
> > > able to do that.
> > 
> > Ok, but we obviously can't develop something that detailed while
> > the merge window is open, so I think that means we need to revert
> > commit e88da7998d7d "Revert 'libnvdimm: band aid btt vs clear
> > poison locking'" and leave BTT I/O-error-clearing disabled for this
> > cycle and try again for 4.13.
> 
> Agreed, I'll work on something to track badblocks and clear them
> outside the IO path.

Great!  Thanks Vishal!
-Toshi
Kani, Toshi May 1, 2017, 9:26 p.m. UTC | #10
On Sun, 2017-04-30 at 05:39 -0700, Dan Williams wrote:
> Toshi noticed that the new support for region-level badblocks
> missed the case where errors are cleared due to BTT I/O.
> 
> An initial attempt to fix this ran into a "sleeping while atomic"
> warning due to taking the nvdimm_bus_lock() in the BTT I/O path to
> satisfy the locking requirements of __nvdimm_bus_badblocks_clear().
> However, that lock is not needed since we are not acting on any data
> that is subject to change due to a change of state of the bus /
> region. The badblocks instance has its own internal lock to handle
> mutations of the error list.
> 
> So, to make it clear that we are just acting on region devices and
> don't need the lock, rename __nvdimm_bus_badblocks_clear() to
> nvdimm_clear_badblocks_regions(). Eliminate the lock and consolidate
> all routines in drivers/nvdimm/bus.c. Also, make some cleanups to
> remove unnecessary casts, make the calling convention of
> nvdimm_clear_badblocks_regions() clearer by replacing struct resource
> with the minimal struct clear_badblocks_context, and use the
> DEVICE_ATTR macro.
> 
> Cc: Dave Jiang <dave.jiang@intel.com>
> Cc: Vishal Verma <vishal.l.verma@intel.com>
> Reported-by: Toshi Kani <toshi.kani@hpe.com>
> Signed-off-by: Dan Williams <dan.j.williams@intel.com>

Confirmed that the clear path to the region-level badblocks works with
the btt workaround and ndctl fix.

Tested-by: Toshi Kani <toshi.kani@hpe.com>

Thanks!
-Toshi
Dan Williams May 1, 2017, 11:09 p.m. UTC | #11
On Mon, May 1, 2017 at 2:26 PM, Kani, Toshimitsu <toshi.kani@hpe.com> wrote:
> On Sun, 2017-04-30 at 05:39 -0700, Dan Williams wrote:
>> Toshi noticed that the new support for region-level badblocks
>> missed the case where errors are cleared due to BTT I/O.
>>
>> An initial attempt to fix this ran into a "sleeping while atomic"
>> warning due to taking the nvdimm_bus_lock() in the BTT I/O path to
>> satisfy the locking requirements of __nvdimm_bus_badblocks_clear().
>> However, that lock is not needed since we are not acting on any data
>> that is subject to change due to a change of state of the bus /
>> region. The badblocks instance has its own internal lock to handle
>> mutations of the error list.
>>
>> So, to make it clear that we are just acting on region devices and
>> don't need the lock, rename __nvdimm_bus_badblocks_clear() to
>> nvdimm_clear_badblocks_regions(). Eliminate the lock and consolidate
>> all routines in drivers/nvdimm/bus.c. Also, make some cleanups to
>> remove unnecessary casts, make the calling convention of
>> nvdimm_clear_badblocks_regions() clearer by replacing struct resource
>> with the minimal struct clear_badblocks_context, and use the
>> DEVICE_ATTR macro.
>>
>> Cc: Dave Jiang <dave.jiang@intel.com>
>> Cc: Vishal Verma <vishal.l.verma@intel.com>
>> Reported-by: Toshi Kani <toshi.kani@hpe.com>
>> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
>
> Confirmed that the clear path to the region-level badblocks works with
> the btt workaround and ndctl fix.
>
> Tested-by: Toshi Kani <toshi.kani@hpe.com>

Thanks Toshi!

Patch

diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index 43ddfd487c85..e9361bffe5ee 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -172,6 +172,57 @@  void nvdimm_region_notify(struct nd_region *nd_region, enum nvdimm_event event)
 }
 EXPORT_SYMBOL_GPL(nvdimm_region_notify);
 
+struct clear_badblocks_context {
+	resource_size_t phys, cleared;
+};
+
+static int nvdimm_clear_badblocks_region(struct device *dev, void *data)
+{
+	struct clear_badblocks_context *ctx = data;
+	struct nd_region *nd_region;
+	resource_size_t ndr_end;
+	sector_t sector;
+
+	/* make sure device is a region */
+	if (!is_nd_pmem(dev))
+		return 0;
+
+	nd_region = to_nd_region(dev);
+	ndr_end = nd_region->ndr_start + nd_region->ndr_size - 1;
+
+	/* make sure we are in the region */
+	if (ctx->phys < nd_region->ndr_start
+			|| (ctx->phys + ctx->cleared) > ndr_end)
+		return 0;
+
+	sector = (ctx->phys - nd_region->ndr_start) / 512;
+	badblocks_clear(&nd_region->bb, sector, ctx->cleared / 512);
+
+	return 0;
+}
+
+static void nvdimm_clear_badblocks_regions(struct nvdimm_bus *nvdimm_bus,
+		phys_addr_t phys, u64 cleared)
+{
+	struct clear_badblocks_context ctx = {
+		.phys = phys,
+		.cleared = cleared,
+	};
+
+	device_for_each_child(&nvdimm_bus->dev, &ctx,
+			nvdimm_clear_badblocks_region);
+}
+
+static void nvdimm_account_cleared_poison(struct nvdimm_bus *nvdimm_bus,
+		phys_addr_t phys, u64 cleared)
+{
+	if (cleared > 0)
+		nvdimm_forget_poison(nvdimm_bus, phys, cleared);
+
+	if (cleared > 0 && cleared / 512)
+		nvdimm_clear_badblocks_regions(nvdimm_bus, phys, cleared);
+}
+
 long nvdimm_clear_poison(struct device *dev, phys_addr_t phys,
 		unsigned int len)
 {
@@ -219,22 +270,12 @@  long nvdimm_clear_poison(struct device *dev, phys_addr_t phys,
 	if (cmd_rc < 0)
 		return cmd_rc;
 
-	if (clear_err.cleared > 0)
-		nvdimm_forget_poison(nvdimm_bus, phys, clear_err.cleared);
+	nvdimm_account_cleared_poison(nvdimm_bus, phys, clear_err.cleared);
 
 	return clear_err.cleared;
 }
 EXPORT_SYMBOL_GPL(nvdimm_clear_poison);
 
-void __nvdimm_bus_badblocks_clear(struct nvdimm_bus *nvdimm_bus,
-		struct resource *res)
-{
-	lockdep_assert_held(&nvdimm_bus->reconfig_mutex);
-	device_for_each_child(&nvdimm_bus->dev, (void *)res,
-			nvdimm_region_badblocks_clear);
-}
-EXPORT_SYMBOL_GPL(__nvdimm_bus_badblocks_clear);
-
 static int nvdimm_bus_match(struct device *dev, struct device_driver *drv);
 
 static struct bus_type nvdimm_bus_type = {
@@ -989,18 +1030,9 @@  static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
 
 	if (!nvdimm && cmd == ND_CMD_CLEAR_ERROR && cmd_rc >= 0) {
 		struct nd_cmd_clear_error *clear_err = buf;
-		struct resource res;
-
-		if (clear_err->cleared) {
-			/* clearing the poison list we keep track of */
-			nvdimm_forget_poison(nvdimm_bus, clear_err->address,
-					clear_err->cleared);
 
-			/* now sync the badblocks lists */
-			res.start = clear_err->address;
-			res.end = clear_err->address + clear_err->cleared - 1;
-			__nvdimm_bus_badblocks_clear(nvdimm_bus, &res);
-		}
+		nvdimm_account_cleared_poison(nvdimm_bus, clear_err->address,
+				clear_err->cleared);
 	}
 	nvdimm_bus_unlock(&nvdimm_bus->dev);
 
diff --git a/drivers/nvdimm/region.c b/drivers/nvdimm/region.c
index 23c4307d254c..869a886c292e 100644
--- a/drivers/nvdimm/region.c
+++ b/drivers/nvdimm/region.c
@@ -131,31 +131,6 @@  static void nd_region_notify(struct device *dev, enum nvdimm_event event)
 	device_for_each_child(dev, &event, child_notify);
 }
 
-int nvdimm_region_badblocks_clear(struct device *dev, void *data)
-{
-	struct resource *res = (struct resource *)data;
-	struct nd_region *nd_region;
-	resource_size_t ndr_end;
-	sector_t sector;
-
-	/* make sure device is a region */
-	if (!is_nd_pmem(dev))
-		return 0;
-
-	nd_region = to_nd_region(dev);
-	ndr_end = nd_region->ndr_start + nd_region->ndr_size - 1;
-
-	/* make sure we are in the region */
-	if (res->start < nd_region->ndr_start || res->end > ndr_end)
-		return 0;
-
-	sector = (res->start - nd_region->ndr_start) >> 9;
-	badblocks_clear(&nd_region->bb, sector, resource_size(res) >> 9);
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(nvdimm_region_badblocks_clear);
-
 static struct nd_device_driver nd_region_driver = {
 	.probe = nd_region_probe,
 	.remove = nd_region_remove,
diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index 53d1ba4e6d99..07756b2e1cd5 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -477,20 +477,15 @@  static ssize_t read_only_store(struct device *dev,
 }
 static DEVICE_ATTR_RW(read_only);
 
-static ssize_t nd_badblocks_show(struct device *dev,
+static ssize_t region_badblocks_show(struct device *dev,
 		struct device_attribute *attr, char *buf)
 {
 	struct nd_region *nd_region = to_nd_region(dev);
 
 	return badblocks_show(&nd_region->bb, buf, 0);
 }
-static struct device_attribute dev_attr_nd_badblocks = {
-	.attr = {
-		.name = "badblocks",
-		.mode = S_IRUGO
-	},
-	.show = nd_badblocks_show,
-};
+
+static DEVICE_ATTR(badblocks, 0444, region_badblocks_show, NULL);
 
 static ssize_t resource_show(struct device *dev,
 		struct device_attribute *attr, char *buf)
@@ -514,7 +509,7 @@  static struct attribute *nd_region_attributes[] = {
 	&dev_attr_available_size.attr,
 	&dev_attr_namespace_seed.attr,
 	&dev_attr_init_namespaces.attr,
-	&dev_attr_nd_badblocks.attr,
+	&dev_attr_badblocks.attr,
 	&dev_attr_resource.attr,
 	NULL,
 };
@@ -532,7 +527,7 @@  static umode_t region_visible(struct kobject *kobj, struct attribute *a, int n)
 	if (!is_nd_pmem(dev) && a == &dev_attr_dax_seed.attr)
 		return 0;
 
-	if (!is_nd_pmem(dev) && a == &dev_attr_nd_badblocks.attr)
+	if (!is_nd_pmem(dev) && a == &dev_attr_badblocks.attr)
 		return 0;
 
 	if (!is_nd_pmem(dev) && a == &dev_attr_resource.attr)
diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
index 98b207611b06..f07b1b14159a 100644
--- a/include/linux/libnvdimm.h
+++ b/include/linux/libnvdimm.h
@@ -162,7 +162,4 @@  void nd_region_release_lane(struct nd_region *nd_region, unsigned int lane);
 u64 nd_fletcher64(void *addr, size_t len, bool le);
 void nvdimm_flush(struct nd_region *nd_region);
 int nvdimm_has_flush(struct nd_region *nd_region);
-int nvdimm_region_badblocks_clear(struct device *dev, void *data);
-void __nvdimm_bus_badblocks_clear(struct nvdimm_bus *nvdimm_bus,
-		struct resource *res);
 #endif /* __LIBNVDIMM_H__ */
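
For reference on the DEVICE_ATTR cleanup above: DEVICE_ATTR(badblocks, 0444, region_badblocks_show, NULL) expands, via __ATTR() in include/linux/device.h, to essentially the open-coded structure the patch deletes (0444 == S_IRUGO):

/* Approximate expansion of the DEVICE_ATTR() line in the patch: */
struct device_attribute dev_attr_badblocks = {
	.attr = {
		.name = "badblocks",
		.mode = 0444,	/* S_IRUGO */
	},
	.show = region_badblocks_show,
	.store = NULL,
};

The sysfs name is unchanged by the conversion, so userspace continues to read the error list at /sys/bus/nd/devices/regionX/badblocks.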