diff mbox

[2/2] tracing/events: block: dev_t via driver core for plug and unplug events

Message ID 20180413130719.22921-3-maier@linux.ibm.com (mailing list archive)
State New, archived
Headers show

Commit Message

Steffen Maier April 13, 2018, 1:07 p.m. UTC
Complements v2.6.31 commit 55782138e47d ("tracing/events: convert block
trace points to TRACE_EVENT()") to be equivalent to traditional blktrace
output. Also this allows event filtering to not always get all (un)plug
events.

NB: The NULL pointer check for q->kobj.parent is certainly racy and
I don't have enough experience if it's good enough for a trace event.
The change did work for my cases (block device read/write I/O on
zfcp-attached SCSI disks and dm-mpath on top).

While I haven't seen any prior art using driver core (parent) relations
for trace events, there are other cases using this when no direct pointer
exists between objects, such as:
 #define to_scsi_target(d)	container_of(d, struct scsi_target, dev)
 static inline struct scsi_target *scsi_target(struct scsi_device *sdev)
 {
	return to_scsi_target(sdev->sdev_gendev.parent);
 }

This is the object model we make use of here:

struct gendisk {
        struct hd_struct {
                struct device {      /*container_of*/
                        struct kobject kobj; <--+
                        dev_t  devt; /*deref*/  |
                } __dev;                        |
        } part0;                                |
        struct request_queue *queue; ..+        |
}                                      :        |
                                       :        |
struct request_queue {  <..............+        |
        /* queue kobject */                     |
        struct kobject {                        |
                struct kobject *parent; --------+
        } kobj;
}

The parent pointer comes from:
 #define disk_to_dev(disk)	(&(disk)->part0.__dev)
int blk_register_queue(struct gendisk *disk)
	struct device *dev = disk_to_dev(disk);
	struct request_queue *q = disk->queue;
	ret = kobject_add(&q->kobj, kobject_get(&dev->kobj), "%s", "queue");
	                            ^^^parent

$ ls -d /sys/block/sdf/queue
/sys/block/sda/queue
$ cat /sys/block/sdf/dev
80:0

A partition does not have its own request queue:

$ cat /sys/block/sdf/sdf1/dev
8:81
$ ls -d /sys/block/sdf/sdf1/queue
ls: cannot access '/sys/block/sdf/sdf1/queue': No such file or directory

The difference to blktrace parsed output is that block events don't use the
partition's minor number but the containing block device's minor number:

$ dd if=/dev/sdf1 count=1

$ cat /sys/kernel/debug/tracing/trace
block_bio_remap: 8,80 R 2048 + 32 <- (8,81) 0
block_bio_queue: 8,80 R 2048 + 32 [dd]
block_getrq: 8,80 R 2048 + 32 [dd]
block_plug: 8,80 [dd]
            ^^^^
block_rq_insert: 8,80 R 16384 () 2048 + 32 [dd]
block_unplug: 8,80 [dd] 1 explicit
              ^^^^
block_rq_issue: 8,80 R 16384 () 2048 + 32 [dd]
block_rq_complete: 8,80 R () 2048 + 32 [0]

$ btrace /dev/sdf1
  8,80   1        1     0.000000000 240240  A   R 2048 + 32 <- (8,81) 0
  8,81   1        2     0.000220890 240240  Q   R 2048 + 32 [dd]
  8,81   1        3     0.000229639 240240  G   R 2048 + 32 [dd]
  8,81   1        4     0.000231805 240240  P   N [dd]
    ^^
  8,81   1        5     0.000234671 240240  I   R 2048 + 32 [dd]
  8,81   1        6     0.000236365 240240  U   N [dd] 1
    ^^
  8,81   1        7     0.000238527 240240  D   R 2048 + 32 [dd]
  8,81   2        2     0.000613741     0  C   R 2048 + 32 [0]

Signed-off-by: Steffen Maier <maier@linux.ibm.com>
---
 include/trace/events/block.h | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

Comments

Greg Kroah-Hartman April 15, 2018, 8:31 a.m. UTC | #1
On Fri, Apr 13, 2018 at 03:07:18PM +0200, Steffen Maier wrote:
> Complements v2.6.31 commit 55782138e47d ("tracing/events: convert block
> trace points to TRACE_EVENT()") to be equivalent to traditional blktrace
> output. Also this allows event filtering to not always get all (un)plug
> events.
> 
> NB: The NULL pointer check for q->kobj.parent is certainly racy and
> I don't have enough experience if it's good enough for a trace event.
> The change did work for my cases (block device read/write I/O on
> zfcp-attached SCSI disks and dm-mpath on top).
> 
> While I haven't seen any prior art using driver core (parent) relations
> for trace events, there are other cases using this when no direct pointer
> exists between objects, such as:
>  #define to_scsi_target(d)	container_of(d, struct scsi_target, dev)
>  static inline struct scsi_target *scsi_target(struct scsi_device *sdev)
>  {
> 	return to_scsi_target(sdev->sdev_gendev.parent);
>  }

That is because you "know" the parent of a target device is a
scsi_target.

> This is the object model we make use of here:
> 
> struct gendisk {
>         struct hd_struct {
>                 struct device {      /*container_of*/
>                         struct kobject kobj; <--+
>                         dev_t  devt; /*deref*/  |
>                 } __dev;                        |
>         } part0;                                |
>         struct request_queue *queue; ..+        |
> }                                      :        |
>                                        :        |
> struct request_queue {  <..............+        |
>         /* queue kobject */                     |
>         struct kobject {                        |
>                 struct kobject *parent; --------+

Are you sure about this?

>         } kobj;
> }
> 
> The parent pointer comes from:
>  #define disk_to_dev(disk)	(&(disk)->part0.__dev)
> int blk_register_queue(struct gendisk *disk)
> 	struct device *dev = disk_to_dev(disk);
> 	struct request_queue *q = disk->queue;
> 	ret = kobject_add(&q->kobj, kobject_get(&dev->kobj), "%s", "queue");
> 	                            ^^^parent
> 
> $ ls -d /sys/block/sdf/queue
> /sys/block/sda/queue
> $ cat /sys/block/sdf/dev
> 80:0
> 
> A partition does not have its own request queue:
> 
> $ cat /sys/block/sdf/sdf1/dev
> 8:81
> $ ls -d /sys/block/sdf/sdf1/queue
> ls: cannot access '/sys/block/sdf/sdf1/queue': No such file or directory
> 
> The difference to blktrace parsed output is that block events don't use the
> partition's minor number but the containing block device's minor number:

Why do you want the block device's minor number here?  What is wrong
with the partition's minor number?  I would think you want that instead.

> 
> $ dd if=/dev/sdf1 count=1
> 
> $ cat /sys/kernel/debug/tracing/trace
> block_bio_remap: 8,80 R 2048 + 32 <- (8,81) 0
> block_bio_queue: 8,80 R 2048 + 32 [dd]
> block_getrq: 8,80 R 2048 + 32 [dd]
> block_plug: 8,80 [dd]
>             ^^^^
> block_rq_insert: 8,80 R 16384 () 2048 + 32 [dd]
> block_unplug: 8,80 [dd] 1 explicit
>               ^^^^
> block_rq_issue: 8,80 R 16384 () 2048 + 32 [dd]
> block_rq_complete: 8,80 R () 2048 + 32 [0]
> 
> $ btrace /dev/sdf1
>   8,80   1        1     0.000000000 240240  A   R 2048 + 32 <- (8,81) 0
>   8,81   1        2     0.000220890 240240  Q   R 2048 + 32 [dd]
>   8,81   1        3     0.000229639 240240  G   R 2048 + 32 [dd]
>   8,81   1        4     0.000231805 240240  P   N [dd]
>     ^^
>   8,81   1        5     0.000234671 240240  I   R 2048 + 32 [dd]
>   8,81   1        6     0.000236365 240240  U   N [dd] 1
>     ^^
>   8,81   1        7     0.000238527 240240  D   R 2048 + 32 [dd]
>   8,81   2        2     0.000613741     0  C   R 2048 + 32 [0]
> 
> Signed-off-by: Steffen Maier <maier@linux.ibm.com>
> ---
>  include/trace/events/block.h | 13 +++++++++++--
>  1 file changed, 11 insertions(+), 2 deletions(-)
> 
> diff --git a/include/trace/events/block.h b/include/trace/events/block.h
> index a13613d27cee..cffedc26e8a3 100644
> --- a/include/trace/events/block.h
> +++ b/include/trace/events/block.h
> @@ -460,14 +460,18 @@ TRACE_EVENT(block_plug,
>  	TP_ARGS(q),
>  
>  	TP_STRUCT__entry(
> +		__field( dev_t,		dev			)
>  		__array( char,		comm,	TASK_COMM_LEN	)
>  	),
>  
>  	TP_fast_assign(
> +		__entry->dev = q->kobj.parent ?
> +		container_of(q->kobj.parent, struct device, kobj)->devt : 0;

That really really really scares me.  It feels very fragile and messing
with parent pointers is ripe for things breaking in the future in odd
and unexplainable ways.

And how can the parent be NULL?

>  		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
>  	),
>  
> -	TP_printk("[%s]", __entry->comm)
> +	TP_printk("%d,%d [%s]",
> +		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->comm)
>  );
>  
>  #define show_block_unplug_explicit(val)		\
> @@ -482,18 +486,23 @@ DECLARE_EVENT_CLASS(block_unplug,
>  	TP_ARGS(q, depth, explicit),
>  
>  	TP_STRUCT__entry(
> +		__field( dev_t,		dev			)
>  		__field( int,		nr_rq			)
>  		__field( bool,		explicit		)
>  		__array( char,		comm,	TASK_COMM_LEN	)
>  	),
>  
>  	TP_fast_assign(
> +		__entry->dev   = q->kobj.parent ?
> +		container_of(q->kobj.parent, struct device, kobj)->devt : 0;

Again, how can the parent be null?

>  		__entry->nr_rq = depth;
>  		__entry->explicit = explicit;
>  		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
>  	),
>  
> -	TP_printk("[%s] %d %s", __entry->comm, __entry->nr_rq,
> +	TP_printk("%d,%d [%s] %d %s",
> +		  MAJOR(__entry->dev), MINOR(__entry->dev),

I don't know the block layer but this feels very wrong to me.  Are you
sure there isn't some other way to get this info?

thanks,

greg k-h
Steffen Maier April 16, 2018, 4:33 p.m. UTC | #2
Hi Greg,

On 04/15/2018 10:31 AM, Greg Kroah-Hartman wrote:
> On Fri, Apr 13, 2018 at 03:07:18PM +0200, Steffen Maier wrote:
>> Complements v2.6.31 commit 55782138e47d ("tracing/events: convert block
>> trace points to TRACE_EVENT()") to be equivalent to traditional blktrace
>> output. Also this allows event filtering to not always get all (un)plug
>> events.
>>
>> NB: The NULL pointer check for q->kobj.parent is certainly racy and
>> I don't have enough experience if it's good enough for a trace event.
>> The change did work for my cases (block device read/write I/O on
>> zfcp-attached SCSI disks and dm-mpath on top).
>>
>> While I haven't seen any prior art using driver core (parent) relations
>> for trace events, there are other cases using this when no direct pointer
>> exists between objects, such as:
>>   #define to_scsi_target(d)	container_of(d, struct scsi_target, dev)
>>   static inline struct scsi_target *scsi_target(struct scsi_device *sdev)
>>   {
>> 	return to_scsi_target(sdev->sdev_gendev.parent);
>>   }
> 
> That is because you "know" the parent of a target device is a
> scsi_target.

true

>> This is the object model we make use of here:
>>
>> struct gendisk {
>>          struct hd_struct {
>>                  struct device {      /*container_of*/
>>                          struct kobject kobj; <--+
>>                          dev_t  devt; /*deref*/  |
>>                  } __dev;                        |
>>          } part0;                                |
>>          struct request_queue *queue; ..+        |
>> }                                      :        |
>>                                         :        |
>> struct request_queue {  <..............+        |
>>          /* queue kobject */                     |
>>          struct kobject {                        |
>>                  struct kobject *parent; --------+
> 
> Are you sure about this?

I double checked it with crash on a running system chasing pointers and 
looking at structure debug symbols.
But of course I cannot guarantee it's always been like this and will be.

>>          } kobj;
>> }
>>

>> The difference to blktrace parsed output is that block events don't use the
>> partition's minor number but the containing block device's minor number:
> 
> Why do you want the block device's minor number here?  What is wrong
> with the partition's minor number?  I would think you want that instead.

No change introduced with my patch. I just describe state of the art 
since the mentioned 
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=55782138e47d.
It (or even its predecessor) used request_queue as trace function 
argument (plus mostly either request or bio). So that's the currently 
available context for these events. My change is consistent with that.
But then again, it's not much of a problem as we do have the remap event 
which shows the mapping from partition to blockdev.

blktrace, hooking with callbacks on the block trace events, has its own 
context information [struct blk_trace] and can get to e.g. the dev_t 
with its own real pointers without using driver core relations. But I 
had the impression that's only wired if one uses the blktrace IOCTL or 
the blk tracer [do_blk_trace_setup()], not for "pure" block events.
> static void blk_add_trace_plug(void *ignore, struct request_queue *q)
> {
> 	struct blk_trace *bt = q->blk_trace;
                                ^^^^^^^^^^^^
> 
> 	if (bt)
> 		__blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL, NULL);
> }
> 
> static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
> 				    unsigned int depth, bool explicit)
> {
> 	struct blk_trace *bt = q->blk_trace;
                                ^^^^^^^^^^^^
> 
> 	if (bt) {
> 		__be64 rpdu = cpu_to_be64(depth);
> 		u32 what;
> 
> 		if (explicit)
> 			what = BLK_TA_UNPLUG_IO;
> 		else
> 			what = BLK_TA_UNPLUG_TIMER;
> 
> 		__blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu, NULL);
> 	}
> }

> struct blk_trace {
> 	int trace_state;
> 	struct rchan *rchan;
> 	unsigned long __percpu *sequence;
> 	unsigned char __percpu *msg_data;
> 	u16 act_mask;
> 	u64 start_lba;
> 	u64 end_lba;
> 	u32 pid;
> 	u32 dev;
             ^^^
> 	struct dentry *dir;
> 	struct dentry *dropped_file;
> 	struct dentry *msg_file;
> 	struct list_head running_list;
> 	atomic_t dropped;
> };



>> $ dd if=/dev/sdf1 count=1
>>
>> $ cat /sys/kernel/debug/tracing/trace
>> block_bio_remap: 8,80 R 2048 + 32 <- (8,81) 0
>> block_bio_queue: 8,80 R 2048 + 32 [dd]
>> block_getrq: 8,80 R 2048 + 32 [dd]
>> block_plug: 8,80 [dd]
>>              ^^^^
>> block_rq_insert: 8,80 R 16384 () 2048 + 32 [dd]
>> block_unplug: 8,80 [dd] 1 explicit
>>                ^^^^
>> block_rq_issue: 8,80 R 16384 () 2048 + 32 [dd]
>> block_rq_complete: 8,80 R () 2048 + 32 [0]

>> diff --git a/include/trace/events/block.h b/include/trace/events/block.h
>> index a13613d27cee..cffedc26e8a3 100644
>> --- a/include/trace/events/block.h
>> +++ b/include/trace/events/block.h
>> @@ -460,14 +460,18 @@ TRACE_EVENT(block_plug,
>>   	TP_ARGS(q),
>>   
>>   	TP_STRUCT__entry(
>> +		__field( dev_t,		dev			)
>>   		__array( char,		comm,	TASK_COMM_LEN	)
>>   	),
>>   
>>   	TP_fast_assign(
>> +		__entry->dev = q->kobj.parent ?
>> +		container_of(q->kobj.parent, struct device, kobj)->devt : 0;
> 
> That really really really scares me.  It feels very fragile and messing
> with parent pointers is ripe for things breaking in the future in odd
> and unexplainable ways.
> 
> And how can the parent be NULL?

I'm hoping for help by block layer experts.

I suppose block device unplug/removal could be a case. But I don't know 
the details how this works and if object release is protected while I/O 
is pending and new I/O is rejected beforehand. That might make my 
approach safe as it would not call the trace functions while the parent 
pointer changes.

> I don't know the block layer but this feels very wrong to me.  Are you
> sure there isn't some other way to get this info?

No, I'm not sure at all. But I'm no block layer expert either. This is 
just an idea I had which did work for my cases and I'm looking for 
confirmation or denial by the experts. So if it's not safe from a block 
layer point of view either, then I have to ditch it.
Omar Sandoval April 19, 2018, 7:24 p.m. UTC | #3
On Mon, Apr 16, 2018 at 06:33:27PM +0200, Steffen Maier wrote:
> Hi Greg,
> 
> On 04/15/2018 10:31 AM, Greg Kroah-Hartman wrote:
> > On Fri, Apr 13, 2018 at 03:07:18PM +0200, Steffen Maier wrote:
> > > diff --git a/include/trace/events/block.h b/include/trace/events/block.h
> > > index a13613d27cee..cffedc26e8a3 100644
> > > --- a/include/trace/events/block.h
> > > +++ b/include/trace/events/block.h
> > > @@ -460,14 +460,18 @@ TRACE_EVENT(block_plug,
> > >   	TP_ARGS(q),
> > >   	TP_STRUCT__entry(
> > > +		__field( dev_t,		dev			)
> > >   		__array( char,		comm,	TASK_COMM_LEN	)
> > >   	),
> > >   	TP_fast_assign(
> > > +		__entry->dev = q->kobj.parent ?
> > > +		container_of(q->kobj.parent, struct device, kobj)->devt : 0;
> > 
> > That really really really scares me.  It feels very fragile and messing
> > with parent pointers is ripe for things breaking in the future in odd
> > and unexplainable ways.
> > 
> > And how can the parent be NULL?
> 
> I'm hoping for help by block layer experts.
> 
> I suppose block device unplug/removal could be a case. But I don't know the
> details how this works and if object release is protected while I/O is
> pending and new I/O is rejected beforehand. That might make my approach safe
> as it would not call the trace functions while the parent pointer changes.

We quiesce and freeze the queue before tearing it down, so it won't be
NULL while we're still doing I/O. Cc'ing Bart in case I'm lying to you,
though ;)

> > I don't know the block layer but this feels very wrong to me.  Are you
> > sure there isn't some other way to get this info?
> 
> No, I'm not sure at all. But I'm no block layer expert either. This is just
> an idea I had which did work for my cases and I'm looking for confirmation
> or denial by the experts. So if it's not safe from a block layer point of
> view either, then I have to ditch it.

There's not a pretty way to do this, but using the proper helpers would
be preferred:

disk_devt(dev_to_disk(kobj_to_dev(q->kobj.parent)))

The parent of a request_queue is always a gendisk, so this should always
work.

> -- 
> Mit freundlichen Grüßen / Kind regards
> Steffen Maier
> 
> Linux on z Systems Development
> 
> IBM Deutschland Research & Development GmbH
> Vorsitzende des Aufsichtsrats: Martina Koederitz
> Geschaeftsfuehrung: Dirk Wittkopp
> Sitz der Gesellschaft: Boeblingen
> Registergericht: Amtsgericht Stuttgart, HRB 243294
>
Bart Van Assche April 19, 2018, 8:56 p.m. UTC | #4
On Thu, 2018-04-19 at 12:24 -0700, Omar Sandoval wrote:
> We quiesce and freeze the queue before tearing it down, so it won't be

> NULL while we're still doing I/O. Cc'ing Bart in case I'm lying to you,

> though ;)


blk_cleanup_queue() waits until all I/O requests have finished. Since the
block layer tracing code is triggered from inside the code that processes a
request it is safe to access the request pointer from inside the tracing code.
But I think the question was about the parent of the request queue kobj
instead of about the request queue pointer, the device structure that is
embedded in struct gendisk? I think that parent can disappear at any time.
Most block drivers call del_gendisk() before they call blk_cleanup_queue().
Unless I'm overlooking something I think the request queue will need to
obtain a reference to the gendisk device from inside blk_register_queue()
and drop that reference from inside blk_cleanup_queue() to make it safe for
the tracing code to access struct gendisk.

Bart.
Steffen Maier April 24, 2018, 2:49 p.m. UTC | #5
On 04/19/2018 10:56 PM, Bart Van Assche wrote:
> On Thu, 2018-04-19 at 12:24 -0700, Omar Sandoval wrote:
>> We quiesce and freeze the queue before tearing it down, so it won't be
>> NULL while we're still doing I/O. Cc'ing Bart in case I'm lying to you,
>> though ;)
> 
> blk_cleanup_queue() waits until all I/O requests have finished. Since the
> block layer tracing code is triggered from inside the code that processes a
> request it is safe to access the request pointer from inside the tracing code.
> But I think the question was about the parent of the request queue kobj
> instead of about the request queue pointer, the device structure that is
> embedded in struct gendisk? I think that parent can disappear at any time.
> Most block drivers call del_gendisk() before they call blk_cleanup_queue().
> Unless I'm overlooking something I think the request queue will need to
> obtain a reference to the gendisk device from inside blk_register_queue()
> and drop that reference from inside blk_cleanup_queue() to make it safefor
> the tracing code to access struct gendisk.

Got it, thanks, makes sense.

I did some research, tried the refcount approach, and failed.

Experiment Preparation:
Stopped any udevd and lvm services to avoid additional I/O.
Used the following debug/tracing setup:
modprobe sd_mod
echo -n 'module core +pt; module kobject +pt' > /sys/kernel/debug/dynamic_debug/control
trace-cmd start -e block -f "(dev >= 0x800000 && dev <= 0x80000f) || dev == 0" -e scsi -p function -l "*_gendisk" -l blk_alloc_queue -l blk_register_queue -l blk_unregister_queue -l blk_cleanup_queue
echo "blk_alloc_queue_node:stacktrace" >> /sys/kernel/debug/tracing/set_ftrace_filter
echo "scsi_execute:stacktrace" >> /sys/kernel/debug/tracing/set_ftrace_filter
echo "blk_register_queue:stacktrace" >> /sys/kernel/debug/tracing/set_ftrace_filter
echo "blk_unregister_queue:stacktrace" >> /sys/kernel/debug/tracing/set_ftrace_filter
echo "blk_cleanup_queue:stacktrace" >> /sys/kernel/debug/tracing/set_ftrace_filter

Experiment:
modprobe scsi_debug num_parts=1
sleep 3
echo "HOTUNPLUG" > /sys/kernel/debug/tracing/trace_marker
echo 1 > /sys/class/scsi_device/0:0:0:0/device/delete
echo 0 > /sys/kernel/debug/tracing/tracing_on

Also tried a similar sequence but hotplugged & hotunplugged
a DASD disk (non-SCSI) incl. mq for comparison.

The object life cycle seems to be:

(1) blk_alloc_queue() also creates gendisk kobj
(1a) SCSI does LUN probing I/O
     most of that is blktrace RWBS field value 'N' i.e. non-R/W with dev_t==0
      since q->kobj.parent is still NULL we cannot get gendisk and thus dev_t
     near the end is the first regular read block I/O with dev_t!=0
      as bio!=NULL and bi_disk or rq_disk are non-NULL
(2) blk_register_queue() also creates queue kobj with gendisk parent
    now we can follow q->kobj.parent to gendisk to get dev_t,
    e.g. for RWBS 'N' such as implicit SCSI test unit ready on blk dev close
(3) optionally "regular" I/O
(4) blk_unregister_queue() called by del_gendisk()
    does kobject_del(&q->kobj) which NULLifies q->kobj.parent again
    the last put of gendisk reference releases gendisk kobj
(5) blk_cleanup_queue()
    the last put of queue (/mq) reference releases queue (/mq) kobj

The patch set would have made dev_t meaningful between and incl. (2) and (3)
for (un)plug and even for other block events with RWBS 'N'.
I noticed that I missed one more spot in block_rq_complete [PATCH 3/3],
which explains why I had missed some rq_completions while filtering
block trace events for specific (non-zero) dev_t.
It still wouldn't have been perfect as (1a) events still are dev_t==0
even though the gendisk exists since (1).

I tried the same but increased scsi_debug/delay to a huge value
and made one I/O request pending with "dd" before the hotunplug
to see how flushing would work. For SCSI, at least,
all flushing including a full (and unsuccessful due to eh TUR tmo)
scsi_eh escalation for several minutes happened between
'block' kobj release during (4) and gendisk release during (5),
i.e. before the gendisk was gone but q->kobj.parent was already NULL.
The 'queue' kobj was released last finally.

Since I cannot prove it's always like this, I followed Bart's suggestion
and added another refcount get at (2) blk_register_queue().
However, when I want to put that additional refcount at (5) blk_cleanup_queue(),
we only get a queue pointer argument as context
and q->kobj.parent==NULL since (4),
so I don't know how to get to the gendisk object for the refcount put.
Probably the additional refcount would not help either as (4) has the
unconditional kobject_del(&q->kobj) NULLifying q->kobj.parent anyway...
I guess I came full circle.



Side notes:
The existing code holds a reference on gendisk, but between (2) and (4).

Apparently block trace events can easily observe parts such as (1a)
which are difficult for traditional blktrace to be started via blk dev ioctl
(or for ftrace 'blk' tracer which can be enabled between (2) and (4)).



Independently, I started to wonder if there is a refcount leak in the early error out if kobject_add() failed:

> int blk_register_queue(struct gendisk *disk)
>         struct device *dev = disk_to_dev(disk);
>         struct request_queue *q = disk->queue;
>         ret = kobject_add(&q->kobj, kobject_get(&dev->kobj), "%s", "queue");
>                                     ^^^ref+1
> 	if (ret < 0) {
> 		blk_trace_remove_sysfs(dev);

Above kobject_get already did its side-effect to provide the second argument when calling kobject_add. Even if kobject_add failed, we still have the refcount on the gendisk but don't put it?

> 		goto unlock;
> 	}

> 	if (q->request_fn || (q->mq_ops && q->elevator)) {
> 		ret = elv_register_queue(q);
> 		if (ret) {
> 			mutex_unlock(&q->sysfs_lock);
> 			kobject_uevent(&q->kobj, KOBJ_REMOVE);
> 			kobject_del(&q->kobj);
> 			blk_trace_remove_sysfs(dev);
> 			kobject_put(&dev->kobj);

In contrast, this early return seems to do the pairwise put.

> 			return ret;
> 		}
> 	}

> unlock:
> 	mutex_unlock(&q->sysfs_lock);
> 	return ret;
> }
Bart Van Assche April 27, 2018, 4:38 p.m. UTC | #6
On Tue, 2018-04-24 at 16:49 +0200, Steffen Maier wrote:
> The object life cycle seems to be:

> 

> (1) blk_alloc_queue() also creates gendisk kobj


I think this is a misinterpretation. blk_alloc_queue_node() initializes the
request queue kobj as follows:

	kobject_init(&q->kobj, &blk_queue_ktype);

register_disk() creates the /sys/class/block/<disk> node and /sys/block/<disk>
link as follows:

	if (device_add(ddev))
		return;
	if (!sysfs_deprecated) {
		err = sysfs_create_link(block_depr, &ddev->kobj,
					kobject_name(&ddev->kobj));
		if (err) {
			device_del(ddev);
			return;
		}
	}

> (1a) SCSI does LUN probing I/O

>      most of that is blktrace RWBS field value 'N' i.e. non-R/W with dev_t==0

>       since q->kobj.parent is still NULL we cannot get gendisk and thus dev_t

>      near the end is the first regular read block I/O with dev_t!=0

>       as bio!=NULL and bi_disk or rq_disk are non-NULL

> (2) blk_register_queue() also creates queue kobj with gendisk parent

>     now we can follow q->kobj.parent to gendisk to get dev_t,

>     e.g. for RWBS 'N' such as implicit SCSI test unit ready on blk dev close


Please have a look at the device_add_disk() call in sd_probe_async(). I think
that function triggers a call of blk_register_queue(). That last function
associates the request queue with the gendisk as follows:

	ret = kobject_add(&q->kobj, kobject_get(&dev->kobj), "%s", "queue");

> (3) optionally "regular" I/O

> (4) blk_unregister_queue() called by del_gendisk()

>     does kobject_del(&q->kobj) which NULLifies q->kobj.parent again

>     the last put of gendisk reference releases gendisk kobj

> (5) blk_cleanup_queue()

>     the last put of queue (/mq) reference releases queue (/mq) kobj


The /mq directory corresponds to q->mq_kobj. The block layer core drops its
reference to the queue kobject (q->kobj) by calling blk_put_queue() at the
end of blk_cleanup_queue().

> Since I cannot prove it's always like this, I followed Bart's suggestion

> and added another refcount get at (2) blk_register_queue().

> However, when I want to put that additional refcount at (5) blk_cleanup_queue(),

> we only get a queue pointer argument as context

> and q->kobj.parent==NULL since (4),

> so I don't know how to get to the gendisk object for the refcount put.


Have you tried to modify blk_register_queue() such that it stores the disk
pointer in a new struct request_queue member?

Thanks,

Bart.
diff mbox

Patch

diff --git a/include/trace/events/block.h b/include/trace/events/block.h
index a13613d27cee..cffedc26e8a3 100644
--- a/include/trace/events/block.h
+++ b/include/trace/events/block.h
@@ -460,14 +460,18 @@  TRACE_EVENT(block_plug,
 	TP_ARGS(q),
 
 	TP_STRUCT__entry(
+		__field( dev_t,		dev			)
 		__array( char,		comm,	TASK_COMM_LEN	)
 	),
 
 	TP_fast_assign(
+		__entry->dev = q->kobj.parent ?
+		container_of(q->kobj.parent, struct device, kobj)->devt : 0;
 		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
 	),
 
-	TP_printk("[%s]", __entry->comm)
+	TP_printk("%d,%d [%s]",
+		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->comm)
 );
 
 #define show_block_unplug_explicit(val)		\
@@ -482,18 +486,23 @@  DECLARE_EVENT_CLASS(block_unplug,
 	TP_ARGS(q, depth, explicit),
 
 	TP_STRUCT__entry(
+		__field( dev_t,		dev			)
 		__field( int,		nr_rq			)
 		__field( bool,		explicit		)
 		__array( char,		comm,	TASK_COMM_LEN	)
 	),
 
 	TP_fast_assign(
+		__entry->dev   = q->kobj.parent ?
+		container_of(q->kobj.parent, struct device, kobj)->devt : 0;
 		__entry->nr_rq = depth;
 		__entry->explicit = explicit;
 		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
 	),
 
-	TP_printk("[%s] %d %s", __entry->comm, __entry->nr_rq,
+	TP_printk("%d,%d [%s] %d %s",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->comm, __entry->nr_rq,
 		  show_block_unplug_explicit(__entry->explicit))
 );