diff mbox series

[blktests,v1,2/3] nvme/rc: Avoid triggering host nvme-cli autoconnect

Message ID 20230620132703.20648-3-dwagner@suse.de (mailing list archive)
State New, archived
Headers show
Series More fixes for FC enabling | expand

Commit Message

Daniel Wagner June 20, 2023, 1:27 p.m. UTC
When the host has enabled the udev/systemd autoconnect services for the
fc transport it interacts with blktests and make tests break.

nvme-cli learned to ignore connects attemps when using the
--context command line option paired with a volatile configuration. Thus
we can mark all the resources created by blktests and avoid any
interaction with the systemd autoconnect scripts.

Signed-off-by: Daniel Wagner <dwagner@suse.de>
---
 tests/nvme/rc | 73 ++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 63 insertions(+), 10 deletions(-)

Comments

Sagi Grimberg June 20, 2023, 2:07 p.m. UTC | #1
> When the host has enabled the udev/systemd autoconnect services for the
> fc transport it interacts with blktests and make tests break.
> 
> nvme-cli learned to ignore connects attemps when using the
> --context command line option paired with a volatile configuration. Thus
> we can mark all the resources created by blktests and avoid any
> interaction with the systemd autoconnect scripts.

Hmm... is this hapenning with non-fc as well?

> 
> Signed-off-by: Daniel Wagner <dwagner@suse.de>
> ---
>   tests/nvme/rc | 73 ++++++++++++++++++++++++++++++++++++++++++++-------
>   1 file changed, 63 insertions(+), 10 deletions(-)
> 
> diff --git a/tests/nvme/rc b/tests/nvme/rc
> index 191f3e2e0c43..00117d314a38 100644
> --- a/tests/nvme/rc
> +++ b/tests/nvme/rc
> @@ -14,8 +14,8 @@ def_remote_wwnn="0x10001100aa000001"
>   def_remote_wwpn="0x20001100aa000001"
>   def_local_wwnn="0x10001100aa000002"
>   def_local_wwpn="0x20001100aa000002"
> -def_hostnqn="$(cat /etc/nvme/hostnqn 2> /dev/null)"
> -def_hostid="$(cat /etc/nvme/hostid 2> /dev/null)"
> +def_hostnqn="nqn.2014-08.org.nvmexpress:uuid:242d4a24-2484-4a80-8234-d0169409c5e8"
> +def_hostid="242d4a24-2484-4a80-8234-d0169409c5e8"
>   nvme_trtype=${nvme_trtype:-"loop"}
>   nvme_img_size=${nvme_img_size:-"1G"}
>   nvme_num_iter=${nvme_num_iter:-"1000"}
> @@ -161,6 +161,50 @@ _nvme_calc_rand_io_size() {
>   	echo "${io_size_kb}k"
>   }
>   
> +_nvme_cli_supports_context() {
> +	if ! nvme connect --help 2>&1 | grep -q context > /dev/null; then
> +		    return 1
> +	fi
> +	return 0
> +}

Not a great way to check support.

> +
> +_setup_nvme_cli() {
> +	if ! _nvme_cli_supports_context; then
> +		return
> +	fi
> +
> +	mkdir -p /run/nvme
> +	cat >> /run/nvme/blktests.json <<-EOF
> +	[
> +	  {
> +	    "hostnqn": "${def_hostnqn}",
> +	    "hostid": "${def_hostid}",
> +	    "subsystems": [
> +	      {
> +	        "application": "blktests",
> +	        "nqn": "blktests-subsystem-1",
> +	        "ports": [
> +	          {
> +	            "transport": "fc",
> +	            "traddr": "nn-${def_remote_wwnn}:pn-${def_remote_wwpn}",
> +	            "host_traddr": "nn-${def_local_wwnn}:pn-${def_local_wwpn}"
> +	          }
> +	        ]
> +	      }
> +	    ]
> +	  }
> +	]
> +	EOF
> +}
> +
> +_cleanup_nvme_cli() {
> +	if ! _nvme_cli_supports_context; then
> +		return
> +	fi
> +
> +	rm -f /run/nvme/blktests.json
> +}
> +
>   _nvme_fcloop_add_rport() {
>   	local local_wwnn="$1"
>   	local local_wwpn="$2"
> @@ -235,6 +279,8 @@ _cleanup_fcloop() {
>   	_nvme_fcloop_del_lport "${local_wwnn}" "${local_wwpn}"
>   	_nvme_fcloop_del_rport "${local_wwnn}" "${local_wwpn}" \
>   			       "${remote_wwnn}" "${remote_wwpn}"
> +
> +	_cleanup_nvme_cli
>   }
>   
>   _cleanup_nvmet() {
> @@ -337,6 +383,8 @@ _setup_nvmet() {
>   		def_host_traddr=$(printf "nn-%s:pn-%s" \
>   					 "${def_local_wwnn}" \
>   					 "${def_local_wwpn}")
> +
> +		_setup_nvme_cli
>   	fi
>   }
>   
> @@ -436,18 +484,18 @@ _nvme_connect_subsys() {
>   	trtype="$1"
>   	subsysnqn="$2"
>   
> -	ARGS=(-t "${trtype}" -n "${subsysnqn}")
> +	ARGS=(-t "${trtype}"
> +	      -n "${subsysnqn}"
> +	      --hostnqn="${hostnqn}"
> +	      --hostid="${hostid}")
> +	if _nvme_cli_supports_context; then
> +		ARGS+=(--context="blktests")
> +	fi
>   	if [[ "${trtype}" == "fc" ]] ; then
>   		ARGS+=(-a "${traddr}" -w "${host_traddr}")
>   	elif [[ "${trtype}" != "loop" ]]; then
>   		ARGS+=(-a "${traddr}" -s "${trsvcid}")
>   	fi
> -	if [[ "${hostnqn}" != "$def_hostnqn" ]]; then
> -		ARGS+=(--hostnqn="${hostnqn}")
> -	fi
> -	if [[ "${hostid}" != "$def_hostid" ]]; then
> -		ARGS+=(--hostid="${hostid}")
> -	fi
>   	if [[ -n "${hostkey}" ]]; then
>   		ARGS+=(--dhchap-secret="${hostkey}")
>   	fi
> @@ -482,7 +530,12 @@ _nvme_discover() {
>   	local host_traddr="${3:-$def_host_traddr}"
>   	local trsvcid="${3:-$def_trsvcid}"
>   
> -	ARGS=(-t "${trtype}")
> +	ARGS=(-t "${trtype}"
> +	      --hostnqn="${def_hostnqn}"
> +	      --hostid="${def_hostid}")
> +	if _nvme_cli_supports_context; then
> +		ARGS+=(--context="blktests")
> +	fi
>   	if [[ "${trtype}" = "fc" ]]; then
>   		ARGS+=(-a "${traddr}" -w "${host_traddr}")
>   	elif [[ "${trtype}" != "loop" ]]; then
Daniel Wagner June 20, 2023, 5:43 p.m. UTC | #2
On Tue, Jun 20, 2023 at 05:07:43PM +0300, Sagi Grimberg wrote:
> 
> > When the host has enabled the udev/systemd autoconnect services for the
> > fc transport it interacts with blktests and make tests break.
> > 
> > nvme-cli learned to ignore connects attemps when using the
> > --context command line option paired with a volatile configuration. Thus
> > we can mark all the resources created by blktests and avoid any
> > interaction with the systemd autoconnect scripts.
> 
> Hmm... is this hapenning with non-fc as well?

I haven't seen a problem for TCP or RDMA yet but in principle it should also
exists. I can do some tracing to see if we have also problem thern. Two of the
udev rule match on the subsystem and the event type.

> > +_nvme_cli_supports_context() {
> > +	if ! nvme connect --help 2>&1 | grep -q context > /dev/null; then
> > +		    return 1
> > +	fi
> > +	return 0
> > +}
> 
> Not a great way to check support.

Yeah, agree, it's a bit dodgy. I'll try to figure out a different way. Any
suggestions?
Daniel Wagner June 21, 2023, 9:02 a.m. UTC | #3
On Tue, Jun 20, 2023 at 07:43:35PM +0200, Daniel Wagner wrote:
> On Tue, Jun 20, 2023 at 05:07:43PM +0300, Sagi Grimberg wrote:
> > Hmm... is this hapenning with non-fc as well?
> 
> I haven't seen a problem for TCP or RDMA yet but in principle it should also
> exists. I can do some tracing to see if we have also problem thern. Two of the
> udev rule match on the subsystem and the event type.

The only udev rule which gets triggered during blktests execution is this one:

# nvme-fc transport generated events (old-style for compatibility)
ACTION=="change", SUBSYSTEM=="fc", ENV{FC_EVENT}=="nvmediscovery", \
  ENV{NVMEFC_HOST_TRADDR}=="*",  ENV{NVMEFC_TRADDR}=="*", \
  RUN+="@SYSTEMCTL@ --no-block restart nvmf-connect@--device=none\t--transport=fc\t--traddr=$env{NVMEFC_TRADDR}\t--trsvcid=none\t--host-traddr=$env{NVMEFC_HOST_TRADDR}.service"

That explain why blktests didn't get disturbed by the autoconnect rule as this
rule has a match on the fc subsystem.
Shin'ichiro Kawasaki June 27, 2023, 10:22 a.m. UTC | #4
On Jun 20, 2023 / 15:27, Daniel Wagner wrote:
> When the host has enabled the udev/systemd autoconnect services for the
> fc transport it interacts with blktests and make tests break.
> 
> nvme-cli learned to ignore connects attemps when using the
> --context command line option paired with a volatile configuration. Thus
> we can mark all the resources created by blktests and avoid any
> interaction with the systemd autoconnect scripts.

This sounds a good idea. Question, is "--context" option of the nvme command
mandatory to run nvme test with nvme_trtype=fc? Or is it nice-to-have feature
depending on the test system OS? If it is mandatory, it's the better to check
in _nvme_requires.

> 
> Signed-off-by: Daniel Wagner <dwagner@suse.de>
> ---
>  tests/nvme/rc | 73 ++++++++++++++++++++++++++++++++++++++++++++-------
>  1 file changed, 63 insertions(+), 10 deletions(-)
> 
> diff --git a/tests/nvme/rc b/tests/nvme/rc
> index 191f3e2e0c43..00117d314a38 100644
> --- a/tests/nvme/rc
> +++ b/tests/nvme/rc
> @@ -14,8 +14,8 @@ def_remote_wwnn="0x10001100aa000001"
>  def_remote_wwpn="0x20001100aa000001"
>  def_local_wwnn="0x10001100aa000002"
>  def_local_wwpn="0x20001100aa000002"
> -def_hostnqn="$(cat /etc/nvme/hostnqn 2> /dev/null)"
> -def_hostid="$(cat /etc/nvme/hostid 2> /dev/null)"
> +def_hostnqn="nqn.2014-08.org.nvmexpress:uuid:242d4a24-2484-4a80-8234-d0169409c5e8"
> +def_hostid="242d4a24-2484-4a80-8234-d0169409c5e8"
>  nvme_trtype=${nvme_trtype:-"loop"}
>  nvme_img_size=${nvme_img_size:-"1G"}
>  nvme_num_iter=${nvme_num_iter:-"1000"}
> @@ -161,6 +161,50 @@ _nvme_calc_rand_io_size() {
>  	echo "${io_size_kb}k"
>  }
>  
> +_nvme_cli_supports_context() {
> +	if ! nvme connect --help 2>&1 | grep -q context > /dev/null; then
> +		    return 1
> +	fi
> +	return 0
> +}
> +
> +_setup_nvme_cli() {
> +	if ! _nvme_cli_supports_context; then
> +		return
> +	fi
> +
> +	mkdir -p /run/nvme
> +	cat >> /run/nvme/blktests.json <<-EOF
> +	[
> +	  {
> +	    "hostnqn": "${def_hostnqn}",
> +	    "hostid": "${def_hostid}",
> +	    "subsystems": [
> +	      {
> +	        "application": "blktests",
> +	        "nqn": "blktests-subsystem-1",
> +	        "ports": [
> +	          {
> +	            "transport": "fc",
> +	            "traddr": "nn-${def_remote_wwnn}:pn-${def_remote_wwpn}",
> +	            "host_traddr": "nn-${def_local_wwnn}:pn-${def_local_wwpn}"
> +	          }
> +	        ]
> +	      }
> +	    ]
> +	  }
> +	]
> +	EOF
> +}
> +
> +_cleanup_nvme_cli() {
> +	if ! _nvme_cli_supports_context; then
> +		return
> +	fi
> +
> +	rm -f /run/nvme/blktests.json
> +}
> +
>  _nvme_fcloop_add_rport() {
>  	local local_wwnn="$1"
>  	local local_wwpn="$2"
> @@ -235,6 +279,8 @@ _cleanup_fcloop() {
>  	_nvme_fcloop_del_lport "${local_wwnn}" "${local_wwpn}"
>  	_nvme_fcloop_del_rport "${local_wwnn}" "${local_wwpn}" \
>  			       "${remote_wwnn}" "${remote_wwpn}"
> +
> +	_cleanup_nvme_cli
>  }
>  
>  _cleanup_nvmet() {
> @@ -337,6 +383,8 @@ _setup_nvmet() {
>  		def_host_traddr=$(printf "nn-%s:pn-%s" \
>  					 "${def_local_wwnn}" \
>  					 "${def_local_wwpn}")
> +
> +		_setup_nvme_cli

Can we move this _setup_nvme_cli call from _setup_nvmet to _setup_fcloop?
_cleanup_nvme_cli is called in _cleanup_fcloop. I think it is a bit cleaner
since those setup/cleanup functions are called at at same level.

>  	fi
>  }
>  
> @@ -436,18 +484,18 @@ _nvme_connect_subsys() {
>  	trtype="$1"
>  	subsysnqn="$2"
>  
> -	ARGS=(-t "${trtype}" -n "${subsysnqn}")
> +	ARGS=(-t "${trtype}"
> +	      -n "${subsysnqn}"
> +	      --hostnqn="${hostnqn}"
> +	      --hostid="${hostid}")
> +	if _nvme_cli_supports_context; then
> +		ARGS+=(--context="blktests")
> +	fi
>  	if [[ "${trtype}" == "fc" ]] ; then
>  		ARGS+=(-a "${traddr}" -w "${host_traddr}")
>  	elif [[ "${trtype}" != "loop" ]]; then
>  		ARGS+=(-a "${traddr}" -s "${trsvcid}")
>  	fi
> -	if [[ "${hostnqn}" != "$def_hostnqn" ]]; then
> -		ARGS+=(--hostnqn="${hostnqn}")
> -	fi
> -	if [[ "${hostid}" != "$def_hostid" ]]; then
> -		ARGS+=(--hostid="${hostid}")
> -	fi
>  	if [[ -n "${hostkey}" ]]; then
>  		ARGS+=(--dhchap-secret="${hostkey}")
>  	fi
> @@ -482,7 +530,12 @@ _nvme_discover() {
>  	local host_traddr="${3:-$def_host_traddr}"
>  	local trsvcid="${3:-$def_trsvcid}"
>  
> -	ARGS=(-t "${trtype}")
> +	ARGS=(-t "${trtype}"
> +	      --hostnqn="${def_hostnqn}"
> +	      --hostid="${def_hostid}")
> +	if _nvme_cli_supports_context; then
> +		ARGS+=(--context="blktests")
> +	fi
>  	if [[ "${trtype}" = "fc" ]]; then
>  		ARGS+=(-a "${traddr}" -w "${host_traddr}")
>  	elif [[ "${trtype}" != "loop" ]]; then
> -- 
> 2.41.0
>
Daniel Wagner June 28, 2023, 6:09 a.m. UTC | #5
On Tue, Jun 27, 2023 at 10:22:53AM +0000, Shinichiro Kawasaki wrote:
> On Jun 20, 2023 / 15:27, Daniel Wagner wrote:
> > When the host has enabled the udev/systemd autoconnect services for the
> > fc transport it interacts with blktests and make tests break.
> > 
> > nvme-cli learned to ignore connects attemps when using the
> > --context command line option paired with a volatile configuration. Thus
> > we can mark all the resources created by blktests and avoid any
> > interaction with the systemd autoconnect scripts.
> 
> This sounds a good idea. Question, is "--context" option of the nvme command
> mandatory to run nvme test with nvme_trtype=fc?

If nvme-cli is called without the '--context' option, the command will be
executed. Though if '--context' is provided as option and there is a
configuration which matches the connect parameters but doesn't match the context
it will ignore the operation.

The blktests tests expects that nothing behind it's back is fiddling on the
setup while it is running. So far udev didn't trigger for rdma/tcp but with fc
it will.

Thus, it's mandatory to use either the '--context' parameter or alternatively
disable the rule with

  ln -s /etc/udev/rules.d/70-nvmf-autoconnect.rules /dev/null

BTW, when the udev rule is active I observed crashes when running blktests. So
there is more to fix, though one thing at the time.

> Or is it nice-to-have feature
> depending on the test system OS? If it is mandatory, it's the better to check
> in _nvme_requires.

Well, I didn't want to make this a hard requirement for all tests. I guess we
could make it for fc only if this is what you had in mind. The question should
it only test for nvme-cli supporting --context or should it be really clever and
test if the udev rule is also active (no idea how but I assume it is possible)?

> >  _cleanup_nvmet() {
> > @@ -337,6 +383,8 @@ _setup_nvmet() {
> >  		def_host_traddr=$(printf "nn-%s:pn-%s" \
> >  					 "${def_local_wwnn}" \
> >  					 "${def_local_wwpn}")
> > +
> > +		_setup_nvme_cli
> 
> Can we move this _setup_nvme_cli call from _setup_nvmet to _setup_fcloop?
> _cleanup_nvme_cli is called in _cleanup_fcloop. I think it is a bit cleaner
> since those setup/cleanup functions are called at at same level.

Sure, no problem.
Shin'ichiro Kawasaki June 28, 2023, 10:04 a.m. UTC | #6
On Jun 28, 2023 / 08:09, Daniel Wagner wrote:
> On Tue, Jun 27, 2023 at 10:22:53AM +0000, Shinichiro Kawasaki wrote:
> > On Jun 20, 2023 / 15:27, Daniel Wagner wrote:
> > > When the host has enabled the udev/systemd autoconnect services for the
> > > fc transport it interacts with blktests and make tests break.
> > > 
> > > nvme-cli learned to ignore connects attemps when using the
> > > --context command line option paired with a volatile configuration. Thus
> > > we can mark all the resources created by blktests and avoid any
> > > interaction with the systemd autoconnect scripts.
> > 
> > This sounds a good idea. Question, is "--context" option of the nvme command
> > mandatory to run nvme test with nvme_trtype=fc?
> 
> If nvme-cli is called without the '--context' option, the command will be
> executed. Though if '--context' is provided as option and there is a
> configuration which matches the connect parameters but doesn't match the context
> it will ignore the operation.
> 
> The blktests tests expects that nothing behind it's back is fiddling on the
> setup while it is running. So far udev didn't trigger for rdma/tcp but with fc
> it will.
> 
> Thus, it's mandatory to use either the '--context' parameter or alternatively
> disable the rule with
> 
>   ln -s /etc/udev/rules.d/70-nvmf-autoconnect.rules /dev/null
> 
> BTW, when the udev rule is active I observed crashes when running blktests. So
> there is more to fix, though one thing at the time.
> 
> > Or is it nice-to-have feature
> > depending on the test system OS? If it is mandatory, it's the better to check
> > in _nvme_requires.
> 
> Well, I didn't want to make this a hard requirement for all tests. I guess we
> could make it for fc only if this is what you had in mind. The question should
> it only test for nvme-cli supporting --context or should it be really clever and
> test if the udev rule is also active (no idea how but I assume it is possible)?

Thanks for the explanations. It looks that the requirement check I suggested i
_nvme_requires will be will be too much. And I don't have good idea for the udev
rule check either, so let's settle this change without the checks.
Daniel Wagner June 28, 2023, 3 p.m. UTC | #7
On Wed, Jun 28, 2023 at 10:04:01AM +0000, Shinichiro Kawasaki wrote:
> > BTW, when the udev rule is active I observed crashes when running blktests. So
> > there is more to fix, though one thing at the time.

FTR, this is typical hanger/crash I see when the udev rule is active:

 run blktests nvme/005 at 2023-06-28 16:30:07
 loop0: detected capacity change from 0 to 32768
 nvmet: adding nsid 1 to subsystem blktests-subsystem-1
 nvme nvme2: NVME-FC{0}: create association : host wwpn 0x20001100aa000002  rport wwpn 0x20001100aa000001: NQN "blktests-subsystem-1"
 (NULL device *): {0:0} Association created
 [43] nvmet: ctrl 1 start keep-alive timer for 5 secs
 nvmet: creating nvm controller 1 for subsystem blktests-subsystem-1 for NQN nqn.2014-08.org.nvmexpress:uuid:242d4a24-2484-4a80-8234-d0169409c5e8.
 [7] nvmet: adding queue 1 to ctrl 1.
 [363] nvmet: adding queue 2 to ctrl 1.
 [43] nvmet: adding queue 3 to ctrl 1.
 [45] nvmet: adding queue 4 to ctrl 1.
 nvme nvme2: NVME-FC{0}: new ctrl: NQN "blktests-subsystem-1"
 nvme nvme3: NVME-FC{1}: create association : host wwpn 0x20001100aa000002  rport wwpn 0x20001100aa000001: NQN "nqn.2014-08.org.nvmexpress.discovery"
 (NULL device *): {0:1} Association created
 [43] nvmet: ctrl 2 start keep-alive timer for 120 secs
 nvmet: creating discovery controller 2 for subsystem nqn.2014-08.org.nvmexpress.discovery for NQN nqn.2014-08.org.nvmexpress:uuid:242d4a24-2484-4a80-8234-d0169409c5e8.
 nvme nvme3: NVME-FC{1}: new ctrl: NQN "nqn.2014-08.org.nvmexpress.discovery"
 nvme nvme2: NVME-FC{0}: create association : host wwpn 0x20001100aa000002  rport wwpn 0x20001100aa000001: NQN "blktests-subsystem-1"
 (NULL device *): {0:2} Association created
 [24] nvmet: ctrl 3 start keep-alive timer for 5 secs
 nvmet: creating nvm controller 3 for subsystem blktests-subsystem-1 for NQN nqn.2014-08.org.nvmexpress:uuid:242d4a24-2484-4a80-8234-d0169409c5e8.
 [7] nvmet: adding queue 1 to ctrl 3.
 [24] nvmet: adding queue 2 to ctrl 3.
 [43] nvmet: adding queue 3 to ctrl 3.
 [45] nvmet: adding queue 4 to ctrl 3.
 nvme nvme2: NVME-FC{0}: controller connect complete
 nvme nvme2: Removing ctrl: NQN "blktests-subsystem-1"
 block nvme2n1: no available path - failing I/O
 block nvme2n1: no available path - failing I/O
 Buffer I/O error on dev nvme2n1, logical block 0, async page read
 [363] nvmet: ctrl 1 stop keep-alive
 (NULL device *): {0:0} Association deleted
 (NULL device *): {0:0} Association freed
 (NULL device *): Disconnect LS failed: No Association
 general protection fault, probably for non-canonical address 0xdffffc00000000a4: 0000 [#1] PREEMPT SMP KASAN NOPTI
 KASAN: null-ptr-deref in range [0x0000000000000520-0x0000000000000527]
 CPU: 1 PID: 363 Comm: kworker/1:4 Tainted: G    B   W          6.4.0-rc2+ #4 451dee9b3635bb59af0c91bc19227b1b3bbbbf3f
 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS unknown unknown
 Workqueue: nvmet-wq fcloop_fcp_recv_work [nvme_fcloop]
 RIP: 0010:nvmet_execute_disc_get_log_page+0x23f/0x8c0 [nvmet]
 Code: e8 c6 12 63 e3 4c 89 6c 24 40 48 89 5c 24 08 4c 8b 3b 49 8d 9f 20 05 00 00 48 89 d8 48 c1 e8 03 48 b9 00 00 00 00 00 fc ff df <80> 3c 08 00 74 08 48 89 df e8 93 12 63 e3 4c 89 74 24 30 4c 8b 2b
 RSP: 0018:ffff88811ab378a0 EFLAGS: 00010202
 RAX: 00000000000000a4 RBX: 0000000000000520 RCX: dffffc0000000000
 RDX: 0000000000000000 RSI: 0000000000000008 RDI: ffffffffa95d2d70
 RBP: ffff88811ab37ab0 R08: dffffc0000000000 R09: fffffbfff52ba5af
 R10: 0000000000000000 R11: dffffc0000000001 R12: 1ffff11023566f20
 R13: ffff888107db3260 R14: ffff888107db3270 R15: 0000000000000000
 FS:  0000000000000000(0000) GS:ffff88815a600000(0000) knlGS:0000000000000000
 CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 CR2: 0000000002394000 CR3: 000000011e5e8006 CR4: 0000000000370ee0
 Call Trace:
  <TASK>
  ? prepare_alloc_pages+0x1c5/0x580
  ? __cfi_nvmet_execute_disc_get_log_page+0x10/0x10 [nvmet f0c9e080f6cf521f454ee104d12dafd11a5a7613]
  ? __alloc_pages+0x30e/0x650
  ? slab_post_alloc_hook+0x67/0x350
  ? __cfi___alloc_pages+0x10/0x10
  ? alloc_pages+0x30e/0x530
  ? sgl_alloc_order+0x118/0x320
  nvmet_fc_queue_fcp_req+0xa19/0xda0 [nvmet_fc f192e753efa6aca39b2b3b72c07e648aac01348b]
  ? nvmet_fc_rcv_fcp_req+0x9c0/0x9c0 [nvmet_fc f192e753efa6aca39b2b3b72c07e648aac01348b]
  ? do_raw_spin_unlock+0x116/0x8a0
  ? nvmet_fc_rcv_fcp_req+0x4de/0x9c0 [nvmet_fc f192e753efa6aca39b2b3b72c07e648aac01348b]
  nvmet_fc_rcv_fcp_req+0x4f0/0x9c0 [nvmet_fc f192e753efa6aca39b2b3b72c07e648aac01348b]
  fcloop_fcp_recv_work+0x173/0x440 [nvme_fcloop dcecfb508887e13f93cf8f37ef42c0fab90cbd00]
  process_one_work+0x80f/0xfb0
  ? rescuer_thread+0x1130/0x1130
  ? do_raw_spin_trylock+0xc9/0x1f0
  ? lock_acquired+0x4a/0x9a0
  ? wq_worker_sleeping+0x1e/0x200
  worker_thread+0x91e/0x1260
  ? do_raw_spin_unlock+0x116/0x8a0
  kthread+0x25d/0x2f0
  ? __cfi_worker_thread+0x10/0x10
  ? __cfi_kthread+0x10/0x10
  ret_from_fork+0x29/0x50
  </TASK>
Max Gurtovoy July 6, 2023, 4:11 p.m. UTC | #8
On 20/06/2023 16:27, Daniel Wagner wrote:
> When the host has enabled the udev/systemd autoconnect services for the
> fc transport it interacts with blktests and make tests break.
> 
> nvme-cli learned to ignore connects attemps when using the
> --context command line option paired with a volatile configuration. Thus
> we can mark all the resources created by blktests and avoid any
> interaction with the systemd autoconnect scripts.

why would we like to ignore connect attempts during blktests ?
We define unique nqn/ids/etc. So we never should disturb other running 
services, unless it uses the same parameters, which shouldn't happen.

Agree on setting the hard coded values for hostnqn and hostid instead of 
reading the /etc/nvme/* files. This should do the work IMO, isn't it ?

> 
> Signed-off-by: Daniel Wagner <dwagner@suse.de>
> ---
>   tests/nvme/rc | 73 ++++++++++++++++++++++++++++++++++++++++++++-------
>   1 file changed, 63 insertions(+), 10 deletions(-)
> 
> diff --git a/tests/nvme/rc b/tests/nvme/rc
> index 191f3e2e0c43..00117d314a38 100644
> --- a/tests/nvme/rc
> +++ b/tests/nvme/rc
> @@ -14,8 +14,8 @@ def_remote_wwnn="0x10001100aa000001"
>   def_remote_wwpn="0x20001100aa000001"
>   def_local_wwnn="0x10001100aa000002"
>   def_local_wwpn="0x20001100aa000002"
> -def_hostnqn="$(cat /etc/nvme/hostnqn 2> /dev/null)"
> -def_hostid="$(cat /etc/nvme/hostid 2> /dev/null)"
> +def_hostnqn="nqn.2014-08.org.nvmexpress:uuid:242d4a24-2484-4a80-8234-d0169409c5e8"
> +def_hostid="242d4a24-2484-4a80-8234-d0169409c5e8"
>   nvme_trtype=${nvme_trtype:-"loop"}
>   nvme_img_size=${nvme_img_size:-"1G"}
>   nvme_num_iter=${nvme_num_iter:-"1000"}
> @@ -161,6 +161,50 @@ _nvme_calc_rand_io_size() {
>   	echo "${io_size_kb}k"
>   }
>   
> +_nvme_cli_supports_context() {
> +	if ! nvme connect --help 2>&1 | grep -q context > /dev/null; then
> +		    return 1
> +	fi
> +	return 0
> +}
> +
> +_setup_nvme_cli() {
> +	if ! _nvme_cli_supports_context; then
> +		return
> +	fi
> +
> +	mkdir -p /run/nvme
> +	cat >> /run/nvme/blktests.json <<-EOF
> +	[
> +	  {
> +	    "hostnqn": "${def_hostnqn}",
> +	    "hostid": "${def_hostid}",
> +	    "subsystems": [
> +	      {
> +	        "application": "blktests",
> +	        "nqn": "blktests-subsystem-1",
> +	        "ports": [
> +	          {
> +	            "transport": "fc",
> +	            "traddr": "nn-${def_remote_wwnn}:pn-${def_remote_wwpn}",
> +	            "host_traddr": "nn-${def_local_wwnn}:pn-${def_local_wwpn}"
> +	          }
> +	        ]
> +	      }
> +	    ]
> +	  }
> +	]
> +	EOF
> +}
> +
> +_cleanup_nvme_cli() {
> +	if ! _nvme_cli_supports_context; then
> +		return
> +	fi
> +
> +	rm -f /run/nvme/blktests.json
> +}
> +
>   _nvme_fcloop_add_rport() {
>   	local local_wwnn="$1"
>   	local local_wwpn="$2"
> @@ -235,6 +279,8 @@ _cleanup_fcloop() {
>   	_nvme_fcloop_del_lport "${local_wwnn}" "${local_wwpn}"
>   	_nvme_fcloop_del_rport "${local_wwnn}" "${local_wwpn}" \
>   			       "${remote_wwnn}" "${remote_wwpn}"
> +
> +	_cleanup_nvme_cli
>   }
>   
>   _cleanup_nvmet() {
> @@ -337,6 +383,8 @@ _setup_nvmet() {
>   		def_host_traddr=$(printf "nn-%s:pn-%s" \
>   					 "${def_local_wwnn}" \
>   					 "${def_local_wwpn}")
> +
> +		_setup_nvme_cli
>   	fi
>   }
>   
> @@ -436,18 +484,18 @@ _nvme_connect_subsys() {
>   	trtype="$1"
>   	subsysnqn="$2"
>   
> -	ARGS=(-t "${trtype}" -n "${subsysnqn}")
> +	ARGS=(-t "${trtype}"
> +	      -n "${subsysnqn}"
> +	      --hostnqn="${hostnqn}"
> +	      --hostid="${hostid}")
> +	if _nvme_cli_supports_context; then
> +		ARGS+=(--context="blktests")
> +	fi
>   	if [[ "${trtype}" == "fc" ]] ; then
>   		ARGS+=(-a "${traddr}" -w "${host_traddr}")
>   	elif [[ "${trtype}" != "loop" ]]; then
>   		ARGS+=(-a "${traddr}" -s "${trsvcid}")
>   	fi
> -	if [[ "${hostnqn}" != "$def_hostnqn" ]]; then
> -		ARGS+=(--hostnqn="${hostnqn}")
> -	fi
> -	if [[ "${hostid}" != "$def_hostid" ]]; then
> -		ARGS+=(--hostid="${hostid}")
> -	fi
>   	if [[ -n "${hostkey}" ]]; then
>   		ARGS+=(--dhchap-secret="${hostkey}")
>   	fi
> @@ -482,7 +530,12 @@ _nvme_discover() {
>   	local host_traddr="${3:-$def_host_traddr}"
>   	local trsvcid="${3:-$def_trsvcid}"
>   
> -	ARGS=(-t "${trtype}")
> +	ARGS=(-t "${trtype}"
> +	      --hostnqn="${def_hostnqn}"
> +	      --hostid="${def_hostid}")
> +	if _nvme_cli_supports_context; then
> +		ARGS+=(--context="blktests")
> +	fi
>   	if [[ "${trtype}" = "fc" ]]; then
>   		ARGS+=(-a "${traddr}" -w "${host_traddr}")
>   	elif [[ "${trtype}" != "loop" ]]; then
Daniel Wagner July 10, 2023, 8:27 a.m. UTC | #9
On Thu, Jul 06, 2023 at 07:11:58PM +0300, Max Gurtovoy wrote:
> 
> 
> On 20/06/2023 16:27, Daniel Wagner wrote:
> > When the host has enabled the udev/systemd autoconnect services for the
> > fc transport it interacts with blktests and make tests break.
> > 
> > nvme-cli learned to ignore connects attemps when using the
> > --context command line option paired with a volatile configuration. Thus
> > we can mark all the resources created by blktests and avoid any
> > interaction with the systemd autoconnect scripts.
> 
> why would we like to ignore connect attempts during blktests ?

The problem I observed is that there were two connect attempts (one triggered
via udev and one via the test itself) which disturbed the test. The interference
resulted into a complete hang of the test case:

https://lore.kernel.org/linux-nvme/6g53yj5afhhonwf2psf7ft2gkakkwewy7klkix3y3u6uclpa5w@tt2yfzigyxgg/

> We define unique nqn/ids/etc. So we never should disturb other running
> services, unless it uses the same parameters, which shouldn't happen.

Yes, that should do the decoupling between udev and blktests.

> Agree on setting the hard coded values for hostnqn and hostid instead of
> reading the /etc/nvme/* files. This should do the work IMO, isn't it ?

Do you mean that nvme-cli will ignore the udev trigger when a different hostnqn
is invovled? At least for the well-known discover controller the hostqnq
doesn't work. Though not sure if I understood you correctly here.
Max Gurtovoy July 10, 2023, 9:53 a.m. UTC | #10
On 10/07/2023 11:27, Daniel Wagner wrote:
> On Thu, Jul 06, 2023 at 07:11:58PM +0300, Max Gurtovoy wrote:
>>
>>
>> On 20/06/2023 16:27, Daniel Wagner wrote:
>>> When the host has enabled the udev/systemd autoconnect services for the
>>> fc transport it interacts with blktests and make tests break.
>>>
>>> nvme-cli learned to ignore connects attemps when using the
>>> --context command line option paired with a volatile configuration. Thus
>>> we can mark all the resources created by blktests and avoid any
>>> interaction with the systemd autoconnect scripts.
>>
>> why would we like to ignore connect attempts during blktests ?
> 
> The problem I observed is that there were two connect attempts (one triggered
> via udev and one via the test itself) which disturbed the test. The interference
> resulted into a complete hang of the test case:
> 
> https://lore.kernel.org/linux-nvme/6g53yj5afhhonwf2psf7ft2gkakkwewy7klkix3y3u6uclpa5w@tt2yfzigyxgg/
> 
>> We define unique nqn/ids/etc. So we never should disturb other running
>> services, unless it uses the same parameters, which shouldn't happen.
> 
> Yes, that should do the decoupling between udev and blktests.
> 
>> Agree on setting the hard coded values for hostnqn and hostid instead of
>> reading the /etc/nvme/* files. This should do the work IMO, isn't it ?
> 
> Do you mean that nvme-cli will ignore the udev trigger when a different hostnqn
> is invovled? At least for the well-known discover controller the hostqnq
> doesn't work. Though not sure if I understood you correctly here.

All I was saying is that your issue would have been fixed easily by 2-3 
LOC by removing the usage of /etc/nvme/* files that are usually 
configured by system administrator and use a default values for blktests 
hostnqn and hostid.
The usage of --context in blktests is redundant IMO.
Daniel Wagner July 10, 2023, 10:19 a.m. UTC | #11
On Mon, Jul 10, 2023 at 12:53:17PM +0300, Max Gurtovoy wrote:
 > > Agree on setting the hard coded values for hostnqn and hostid instead of
> > > reading the /etc/nvme/* files. This should do the work IMO, isn't it ?
> > 
> > Do you mean that nvme-cli will ignore the udev trigger when a different hostnqn
> > is invovled? At least for the well-known discover controller the hostqnq
> > doesn't work. Though not sure if I understood you correctly here.
> 
> All I was saying is that your issue would have been fixed easily by 2-3 LOC
> by removing the usage of /etc/nvme/* files that are usually configured by
> system administrator and use a default values for blktests hostnqn and
> hostid.

Okay.

> The usage of --context in blktests is redundant IMO.

Right, I made a bit of a mess with the commit message. Sorry about that.
Max Gurtovoy July 10, 2023, 12:31 p.m. UTC | #12
On 10/07/2023 13:19, Daniel Wagner wrote:
> On Mon, Jul 10, 2023 at 12:53:17PM +0300, Max Gurtovoy wrote:
>   > > Agree on setting the hard coded values for hostnqn and hostid instead of
>>>> reading the /etc/nvme/* files. This should do the work IMO, isn't it ?
>>>
>>> Do you mean that nvme-cli will ignore the udev trigger when a different hostnqn
>>> is invovled? At least for the well-known discover controller the hostqnq
>>> doesn't work. Though not sure if I understood you correctly here.
>>
>> All I was saying is that your issue would have been fixed easily by 2-3 LOC
>> by removing the usage of /etc/nvme/* files that are usually configured by
>> system administrator and use a default values for blktests hostnqn and
>> hostid.
> 
> Okay.
> 
>> The usage of --context in blktests is redundant IMO.
> 
> Right, I made a bit of a mess with the commit message. Sorry about that.

I think it is more than just commit message.
A lot of code that we can avoid was added regarding the --context 
cmdline argument.
Maybe it's worth cleaning it..
Daniel Wagner July 10, 2023, 3:03 p.m. UTC | #13
On Mon, Jul 10, 2023 at 03:31:23PM +0300, Max Gurtovoy wrote:
> I think it is more than just commit message.

Okay, starting to understand what's the problem.

> A lot of code that we can avoid was added regarding the --context cmdline
> argument.

Correct and it's not optional to get the tests passing for the fc transport.

> Maybe it's worth cleaning it..

It really solves the problem that the autoconnect setup of nvme-cli is
distrubing the tests (*). The only other way I found to stop the autoconnect is
by disabling the udev rule completely. If autoconnect isn't enabled the context
isn't necessary. Though changing system configuration from blktests seems at bit
excessive.

Another option is to detect if autoconect is enabled and report this when
starting the tests. In this case we could remove the context part completely.
Obviously, I would prefer to keep it but I am certaintaly not against dropping
it and make blktests a bit simpler if this is the preference. I just need to
remember to disable the autoconnect stuff when using blktests.

(*) Sure we can fix this but at this point. Though I think it's a bit strange
for a test suite to depend/interact with external components in this way.
Max Gurtovoy July 10, 2023, 4:30 p.m. UTC | #14
On 10/07/2023 18:03, Daniel Wagner wrote:
> On Mon, Jul 10, 2023 at 03:31:23PM +0300, Max Gurtovoy wrote:
>> I think it is more than just commit message.
> 
> Okay, starting to understand what's the problem.
> 
>> A lot of code that we can avoid was added regarding the --context cmdline
>> argument.
> 
> Correct and it's not optional to get the tests passing for the fc transport.

why the fc needs the --context to pass tests ?

> 
>> Maybe it's worth cleaning it..
> 
> It really solves the problem that the autoconnect setup of nvme-cli is
> distrubing the tests (*). The only other way I found to stop the autoconnect is
> by disabling the udev rule completely. If autoconnect isn't enabled the context
> isn't necessary. Though changing system configuration from blktests seems at bit
> excessive.

we should not stop any autoconnect during blktests. The autoconnect and 
all the system admin services should run normally.
The blktests should not interfere with regular system configuration and 
should not use any sysadmin file/services/devices.

It should create its own subsystems, nqn's, id's, namespaces, etc..

> 
> Another option is to detect if autoconect is enabled and report this when
> starting the tests. In this case we could remove the context part completely.

the --context might be used in the some sysadmin scripts or so. I don't 
see a point to do so in the blktests since we create a new association 
between initiator and target on each test (or on each group of tests).

> Obviously, I would prefer to keep it but I am certaintaly not against dropping
> it and make blktests a bit simpler if this is the preference. I just need to
> remember to disable the autoconnect stuff when using blktests.
> 
> (*) Sure we can fix this but at this point. Though I think it's a bit strange
> for a test suite to depend/interact with external components in this way.
Daniel Wagner July 12, 2023, 12:04 p.m. UTC | #15
On Mon, Jul 10, 2023 at 07:30:20PM +0300, Max Gurtovoy wrote:
> 
> 
> On 10/07/2023 18:03, Daniel Wagner wrote:
> > On Mon, Jul 10, 2023 at 03:31:23PM +0300, Max Gurtovoy wrote:
> > > I think it is more than just commit message.
> > 
> > Okay, starting to understand what's the problem.
> > 
> > > A lot of code that we can avoid was added regarding the --context cmdline
> > > argument.
> > 
> > Correct and it's not optional to get the tests passing for the fc transport.
> 
> why the fc needs the --context to pass tests ?

A typical nvme test consists out of following steps (nvme/004):

// nvme target setup (1)
	_create_nvmet_subsystem "blktests-subsystem-1" "${loop_dev}" \
		"91fdba0d-f87b-4c25-b80f-db7be1418b9e"
	_add_nvmet_subsys_to_port "${port}" "blktests-subsystem-1"

// nvme host setup (2)
	_nvme_connect_subsys "${nvme_trtype}" blktests-subsystem-1

	local nvmedev
	nvmedev=$(_find_nvme_dev "blktests-subsystem-1")
	cat "/sys/block/${nvmedev}n1/uuid"
	cat "/sys/block/${nvmedev}n1/wwid"

// nvme host teardown (3)
	_nvme_disconnect_subsys blktests-subsystem-1

// nvme target teardown (4)
	_remove_nvmet_subsystem_from_port "${port}" "blktests-subsystem-1"
	_remove_nvmet_subsystem "blktests-subsystem-1"


The corresponding output with --context

 run blktests nvme/004 at 2023-07-12 13:49:50
// (1)
 loop0: detected capacity change from 0 to 32768
 nvmet: adding nsid 1 to subsystem blktests-subsystem-1
 nvme nvme2: NVME-FC{0}: create association : host wwpn 0x20001100aa000002  rport wwpn 0x20001100aa000001: NQN "blktests-subsystem-1"
 (NULL device *): {0:0} Association created
 [174] nvmet: ctrl 1 start keep-alive timer for 5 secs
// (2)
 nvmet: creating nvm controller 1 for subsystem blktests-subsystem-1 for NQN nqn.2014-08.org.nvmexpress:uuid:0f01fb42-9f7f-4856-b0b3-51e60b8de349.
 [374] nvmet: adding queue 1 to ctrl 1.
 [1138] nvmet: adding queue 2 to ctrl 1.
 [73] nvmet: adding queue 3 to ctrl 1.
 [174] nvmet: adding queue 4 to ctrl 1.
 nvme nvme2: NVME-FC{0}: controller connect complete
 nvme nvme2: NVME-FC{0}: new ctrl: NQN "blktests-subsystem-1"
// (3)
 nvme nvme2: Removing ctrl: NQN "blktests-subsystem-1"
// (4)
 [1138] nvmet: ctrl 1 stop keep-alive
 (NULL device *): {0:0} Association deleted
 (NULL device *): {0:0} Association freed
 (NULL device *): Disconnect LS failed: No Association


and without --context

 run blktests nvme/004 at 2023-07-12 13:50:33
// (1)
 loop1: detected capacity change from 0 to 32768
 nvmet: adding nsid 1 to subsystem blktests-subsystem-1
 nvme nvme2: NVME-FC{0}: create association : host wwpn 0x20001100aa000002  rport wwpn 0x20001100aa000001: NQN "nqn.2014-08.org.nvmexpress.discovery"
 (NULL device *): {0:0} Association created
 [1235] nvmet: ctrl 1 start keep-alive timer for 120 secs
// XXX udev auto connect
 nvmet: creating discovery controller 1 for subsystem nqn.2014-08.org.nvmexpress.discovery for NQN nqn.2014-08.org.nvmexpress:uuid:242d4a24-2484-4a80-8234-d0169409c5e8.
 nvme nvme2: NVME-FC{0}: controller connect complete
 nvme nvme2: NVME-FC{0}: new ctrl: NQN "nqn.2014-08.org.nvmexpress.discovery"
 nvme nvme3: NVME-FC{1}: create association : host wwpn 0x20001100aa000002  rport wwpn 0x20001100aa000001: NQN "blktests-subsystem-1"
 (NULL device *): {0:1} Association created
 [73] nvmet: ctrl 2 start keep-alive timer for 5 secs
// (2)
 nvmet: creating nvm controller 2 for subsystem blktests-subsystem-1 for NQN nqn.2014-08.org.nvmexpress:uuid:0f01fb42-9f7f-4856-b0b3-51e60b8de349.
 [374] nvmet: adding queue 1 to ctrl 2.
 [233] nvmet: adding queue 2 to ctrl 2.
 [73] nvmet: adding queue 3 to ctrl 2.
 [1235] nvmet: adding queue 4 to ctrl 2.
 nvme nvme3: NVME-FC{1}: controller connect complete
 nvme nvme3: NVME-FC{1}: new ctrl: NQN "blktests-subsystem-1"
// (3)
 nvme nvme3: Removing ctrl: NQN "blktests-subsystem-1"
 general protection fault, probably for non-canonical address 0xdffffc00000000a4: 0000 [#1] PREEMPT SMP KASAN NOPTI
 KASAN: null-ptr-deref in range [0x0000000000000520-0x0000000000000527]
 CPU: 1 PID: 2076 Comm: kworker/1:1 Tainted: G        W          6.4.0-rc2+ #7 f2a41a58e59b44ee1bb7bc68087ccbe6d76392dd
 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS unknown unknown
 Workqueue: nvmet-wq fcloop_fcp_recv_work [nvme_fcloop]
 RIP: 0010:nvmet_execute_disc_get_log_page+0x23f/0x8c0 [nvmet]
 Code: e8 c6 12 c7 e0 4c 89 6c 24 40 48 89 5c 24 08 4c 8b 3b 49 8d 9f 20 05 00 00 48 89 d8 48 c1 e8 03 48 b9 00 00 00 00 00 fc ff df <80> 3c 08 00 74 08 48 89 df e8 93 12 c7 e0 4c 89 74 24 30 4c 8b 2b
 RSP: 0018:ffff888139a778a0 EFLAGS: 00010202
 RAX: 00000000000000a4 RBX: 0000000000000520 RCX: dffffc0000000000
 RDX: 0000000000000000 RSI: 0000000000000008 RDI: ffffffffa8af3a88
 RBP: ffff888139a77ab0 R08: dffffc0000000000 R09: fffffbfff515e752
 R10: 0000000000000000 R11: dffffc0000000001 R12: 1ffff1102734ef20
 R13: ffff888105563260 R14: ffff888105563270 R15: 0000000000000000
 FS:  0000000000000000(0000) GS:ffff88815a600000(0000) knlGS:0000000000000000
 CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 CR2: 0000000000824220 CR3: 0000000106512005 CR4: 0000000000370ee0
 Call Trace:
  <TASK>
  ? prepare_alloc_pages+0x1c5/0x580
  ? __cfi_nvmet_execute_disc_get_log_page+0x10/0x10 [nvmet 1da13efcd047161c3381cb240a95399f951fd98f]
  ? __alloc_pages+0x30e/0x650
  ? slab_post_alloc_hook+0x67/0x350
  ? __cfi___alloc_pages+0x10/0x10
  ? alloc_pages+0x30e/0x530
  ? sgl_alloc_order+0x118/0x320
  nvmet_fc_queue_fcp_req+0xa19/0xda0 [nvmet_fc 11628cdb09a094fd591bfaf88be45b97e3b18e3a]
  ? nvmet_fc_rcv_fcp_req+0x9c0/0x9c0 [nvmet_fc 11628cdb09a094fd591bfaf88be45b97e3b18e3a]
  ? lockdep_hardirqs_on_prepare+0x2aa/0x5e0
  ? nvmet_fc_rcv_fcp_req+0x4de/0x9c0 [nvmet_fc 11628cdb09a094fd591bfaf88be45b97e3b18e3a]
  nvmet_fc_rcv_fcp_req+0x4f0/0x9c0 [nvmet_fc 11628cdb09a094fd591bfaf88be45b97e3b18e3a]
  fcloop_fcp_recv_work+0x173/0x440 [nvme_fcloop 05cf1144b564c4e1626f9f15422ccf61f2af41de]
  process_one_work+0x80f/0xfb0
  ? rescuer_thread+0x1130/0x1130
  ? do_raw_spin_trylock+0xc9/0x1f0
  ? lock_acquired+0x310/0x9a0
  ? worker_thread+0xd5e/0x1260
  worker_thread+0x91e/0x1260
  ? __cfi_lock_release+0x10/0x10
  ? do_raw_spin_unlock+0x116/0x8a0
  kthread+0x25d/0x2f0
  ? __cfi_worker_thread+0x10/0x10
  ? __cfi_kthread+0x10/0x10
  ret_from_fork+0x29/0x50
  </TASK>

> > > Maybe it's worth cleaning it..
> > 
> > It really solves the problem that the autoconnect setup of nvme-cli is
> > distrubing the tests (*). The only other way I found to stop the autoconnect is
> > by disabling the udev rule completely. If autoconnect isn't enabled the context
> > isn't necessary. Though changing system configuration from blktests seems at bit
> > excessive.
> 
> we should not stop any autoconnect during blktests. The autoconnect and all
> the system admin services should run normally.

I do not agree here. The current blktests are not designed for run as
intergration tests. Sure we should also tests this but currently blktests is
just not there and tcp/rdma are not actually covered anyway.

Thanks,
Daniel
Max Gurtovoy July 13, 2023, 12:12 a.m. UTC | #16
On 12/07/2023 15:04, Daniel Wagner wrote:
> On Mon, Jul 10, 2023 at 07:30:20PM +0300, Max Gurtovoy wrote:
>>
>>
>> On 10/07/2023 18:03, Daniel Wagner wrote:
>>> On Mon, Jul 10, 2023 at 03:31:23PM +0300, Max Gurtovoy wrote:
>>>> I think it is more than just commit message.
>>>
>>> Okay, starting to understand what's the problem.
>>>
>>>> A lot of code that we can avoid was added regarding the --context cmdline
>>>> argument.
>>>
>>> Correct and it's not optional to get the tests passing for the fc transport.
>>
>> why the fc needs the --context to pass tests ?
> 
> A typical nvme test consists out of following steps (nvme/004):
> 
> // nvme target setup (1)
> 	_create_nvmet_subsystem "blktests-subsystem-1" "${loop_dev}" \
> 		"91fdba0d-f87b-4c25-b80f-db7be1418b9e"
> 	_add_nvmet_subsys_to_port "${port}" "blktests-subsystem-1"
> 
> // nvme host setup (2)
> 	_nvme_connect_subsys "${nvme_trtype}" blktests-subsystem-1
> 
> 	local nvmedev
> 	nvmedev=$(_find_nvme_dev "blktests-subsystem-1")
> 	cat "/sys/block/${nvmedev}n1/uuid"
> 	cat "/sys/block/${nvmedev}n1/wwid"
> 
> // nvme host teardown (3)
> 	_nvme_disconnect_subsys blktests-subsystem-1
> 
> // nvme target teardown (4)
> 	_remove_nvmet_subsystem_from_port "${port}" "blktests-subsystem-1"
> 	_remove_nvmet_subsystem "blktests-subsystem-1"
> 
> 
> The corresponding output with --context
> 
>   run blktests nvme/004 at 2023-07-12 13:49:50
> // (1)
>   loop0: detected capacity change from 0 to 32768
>   nvmet: adding nsid 1 to subsystem blktests-subsystem-1
>   nvme nvme2: NVME-FC{0}: create association : host wwpn 0x20001100aa000002  rport wwpn 0x20001100aa000001: NQN "blktests-subsystem-1"
>   (NULL device *): {0:0} Association created
>   [174] nvmet: ctrl 1 start keep-alive timer for 5 secs
> // (2)
>   nvmet: creating nvm controller 1 for subsystem blktests-subsystem-1 for NQN nqn.2014-08.org.nvmexpress:uuid:0f01fb42-9f7f-4856-b0b3-51e60b8de349.
>   [374] nvmet: adding queue 1 to ctrl 1.
>   [1138] nvmet: adding queue 2 to ctrl 1.
>   [73] nvmet: adding queue 3 to ctrl 1.
>   [174] nvmet: adding queue 4 to ctrl 1.
>   nvme nvme2: NVME-FC{0}: controller connect complete
>   nvme nvme2: NVME-FC{0}: new ctrl: NQN "blktests-subsystem-1"
> // (3)
>   nvme nvme2: Removing ctrl: NQN "blktests-subsystem-1"
> // (4)
>   [1138] nvmet: ctrl 1 stop keep-alive
>   (NULL device *): {0:0} Association deleted
>   (NULL device *): {0:0} Association freed
>   (NULL device *): Disconnect LS failed: No Association
> 
> 
> and without --context
> 
>   run blktests nvme/004 at 2023-07-12 13:50:33
> // (1)
>   loop1: detected capacity change from 0 to 32768
>   nvmet: adding nsid 1 to subsystem blktests-subsystem-1
>   nvme nvme2: NVME-FC{0}: create association : host wwpn 0x20001100aa000002  rport wwpn 0x20001100aa000001: NQN "nqn.2014-08.org.nvmexpress.discovery"

why does this association to discovery controller created ? because of 
some system service ?

can we configure the blktests subsystem not to be discovered or add some 
access list to it ?

>   (NULL device *): {0:0} Association created
>   [1235] nvmet: ctrl 1 start keep-alive timer for 120 secs
> // XXX udev auto connect
>   nvmet: creating discovery controller 1 for subsystem nqn.2014-08.org.nvmexpress.discovery for NQN nqn.2014-08.org.nvmexpress:uuid:242d4a24-2484-4a80-8234-d0169409c5e8.
>   nvme nvme2: NVME-FC{0}: controller connect complete
>   nvme nvme2: NVME-FC{0}: new ctrl: NQN "nqn.2014-08.org.nvmexpress.discovery"
>   nvme nvme3: NVME-FC{1}: create association : host wwpn 0x20001100aa000002  rport wwpn 0x20001100aa000001: NQN "blktests-subsystem-1"
>   (NULL device *): {0:1} Association created
>   [73] nvmet: ctrl 2 start keep-alive timer for 5 secs
> // (2)
>   nvmet: creating nvm controller 2 for subsystem blktests-subsystem-1 for NQN nqn.2014-08.org.nvmexpress:uuid:0f01fb42-9f7f-4856-b0b3-51e60b8de349.
>   [374] nvmet: adding queue 1 to ctrl 2.
>   [233] nvmet: adding queue 2 to ctrl 2.
>   [73] nvmet: adding queue 3 to ctrl 2.
>   [1235] nvmet: adding queue 4 to ctrl 2.
>   nvme nvme3: NVME-FC{1}: controller connect complete
>   nvme nvme3: NVME-FC{1}: new ctrl: NQN "blktests-subsystem-1"
> // (3)
>   nvme nvme3: Removing ctrl: NQN "blktests-subsystem-1"

bellow sounds like a bug we should fix :)

>   general protection fault, probably for non-canonical address 0xdffffc00000000a4: 0000 [#1] PREEMPT SMP KASAN NOPTI
>   KASAN: null-ptr-deref in range [0x0000000000000520-0x0000000000000527]
>   CPU: 1 PID: 2076 Comm: kworker/1:1 Tainted: G        W          6.4.0-rc2+ #7 f2a41a58e59b44ee1bb7bc68087ccbe6d76392dd
>   Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS unknown unknown
>   Workqueue: nvmet-wq fcloop_fcp_recv_work [nvme_fcloop]
>   RIP: 0010:nvmet_execute_disc_get_log_page+0x23f/0x8c0 [nvmet]
>   Code: e8 c6 12 c7 e0 4c 89 6c 24 40 48 89 5c 24 08 4c 8b 3b 49 8d 9f 20 05 00 00 48 89 d8 48 c1 e8 03 48 b9 00 00 00 00 00 fc ff df <80> 3c 08 00 74 08 48 89 df e8 93 12 c7 e0 4c 89 74 24 30 4c 8b 2b
>   RSP: 0018:ffff888139a778a0 EFLAGS: 00010202
>   RAX: 00000000000000a4 RBX: 0000000000000520 RCX: dffffc0000000000
>   RDX: 0000000000000000 RSI: 0000000000000008 RDI: ffffffffa8af3a88
>   RBP: ffff888139a77ab0 R08: dffffc0000000000 R09: fffffbfff515e752
>   R10: 0000000000000000 R11: dffffc0000000001 R12: 1ffff1102734ef20
>   R13: ffff888105563260 R14: ffff888105563270 R15: 0000000000000000
>   FS:  0000000000000000(0000) GS:ffff88815a600000(0000) knlGS:0000000000000000
>   CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>   CR2: 0000000000824220 CR3: 0000000106512005 CR4: 0000000000370ee0
>   Call Trace:
>    <TASK>
>    ? prepare_alloc_pages+0x1c5/0x580
>    ? __cfi_nvmet_execute_disc_get_log_page+0x10/0x10 [nvmet 1da13efcd047161c3381cb240a95399f951fd98f]
>    ? __alloc_pages+0x30e/0x650
>    ? slab_post_alloc_hook+0x67/0x350
>    ? __cfi___alloc_pages+0x10/0x10
>    ? alloc_pages+0x30e/0x530
>    ? sgl_alloc_order+0x118/0x320
>    nvmet_fc_queue_fcp_req+0xa19/0xda0 [nvmet_fc 11628cdb09a094fd591bfaf88be45b97e3b18e3a]
>    ? nvmet_fc_rcv_fcp_req+0x9c0/0x9c0 [nvmet_fc 11628cdb09a094fd591bfaf88be45b97e3b18e3a]
>    ? lockdep_hardirqs_on_prepare+0x2aa/0x5e0
>    ? nvmet_fc_rcv_fcp_req+0x4de/0x9c0 [nvmet_fc 11628cdb09a094fd591bfaf88be45b97e3b18e3a]
>    nvmet_fc_rcv_fcp_req+0x4f0/0x9c0 [nvmet_fc 11628cdb09a094fd591bfaf88be45b97e3b18e3a]
>    fcloop_fcp_recv_work+0x173/0x440 [nvme_fcloop 05cf1144b564c4e1626f9f15422ccf61f2af41de]
>    process_one_work+0x80f/0xfb0
>    ? rescuer_thread+0x1130/0x1130
>    ? do_raw_spin_trylock+0xc9/0x1f0
>    ? lock_acquired+0x310/0x9a0
>    ? worker_thread+0xd5e/0x1260
>    worker_thread+0x91e/0x1260
>    ? __cfi_lock_release+0x10/0x10
>    ? do_raw_spin_unlock+0x116/0x8a0
>    kthread+0x25d/0x2f0
>    ? __cfi_worker_thread+0x10/0x10
>    ? __cfi_kthread+0x10/0x10
>    ret_from_fork+0x29/0x50
>    </TASK>
> 
>>>> Maybe it's worth cleaning it..
>>>
>>> It really solves the problem that the autoconnect setup of nvme-cli is
>>> distrubing the tests (*). The only other way I found to stop the autoconnect is
>>> by disabling the udev rule completely. If autoconnect isn't enabled the context
>>> isn't necessary. Though changing system configuration from blktests seems at bit
>>> excessive.
>>
>> we should not stop any autoconnect during blktests. The autoconnect and all
>> the system admin services should run normally.
> 
> I do not agree here. The current blktests are not designed for run as
> intergration tests. Sure we should also tests this but currently blktests is
> just not there and tcp/rdma are not actually covered anyway.

what do you mean tcp/rdma not covered ?

And maybe we should make several changes in the blktests to make it 
standalone without interfering the existing configuration make by some 
system administrator.

> 
> Thanks,
> Daniel
Hannes Reinecke July 13, 2023, 6 a.m. UTC | #17
On 7/13/23 02:12, Max Gurtovoy wrote:
> 
> 
> On 12/07/2023 15:04, Daniel Wagner wrote:
>> On Mon, Jul 10, 2023 at 07:30:20PM +0300, Max Gurtovoy wrote:
>>>
>>>
>>> On 10/07/2023 18:03, Daniel Wagner wrote:
>>>> On Mon, Jul 10, 2023 at 03:31:23PM +0300, Max Gurtovoy wrote:
>>>>> I think it is more than just commit message.
>>>>
>>>> Okay, starting to understand what's the problem.
>>>>
>>>>> A lot of code that we can avoid was added regarding the --context 
>>>>> cmdline
>>>>> argument.
>>>>
>>>> Correct and it's not optional to get the tests passing for the fc 
>>>> transport.
>>>
>>> why the fc needs the --context to pass tests ?
>>
>> A typical nvme test consists out of following steps (nvme/004):
>>
>> // nvme target setup (1)
>>     _create_nvmet_subsystem "blktests-subsystem-1" "${loop_dev}" \
>>         "91fdba0d-f87b-4c25-b80f-db7be1418b9e"
>>     _add_nvmet_subsys_to_port "${port}" "blktests-subsystem-1"
>>
>> // nvme host setup (2)
>>     _nvme_connect_subsys "${nvme_trtype}" blktests-subsystem-1
>>
>>     local nvmedev
>>     nvmedev=$(_find_nvme_dev "blktests-subsystem-1")
>>     cat "/sys/block/${nvmedev}n1/uuid"
>>     cat "/sys/block/${nvmedev}n1/wwid"
>>
>> // nvme host teardown (3)
>>     _nvme_disconnect_subsys blktests-subsystem-1
>>
>> // nvme target teardown (4)
>>     _remove_nvmet_subsystem_from_port "${port}" "blktests-subsystem-1"
>>     _remove_nvmet_subsystem "blktests-subsystem-1"
>>
>>
>> The corresponding output with --context
>>
>>   run blktests nvme/004 at 2023-07-12 13:49:50
>> // (1)
>>   loop0: detected capacity change from 0 to 32768
>>   nvmet: adding nsid 1 to subsystem blktests-subsystem-1
>>   nvme nvme2: NVME-FC{0}: create association : host wwpn 
>> 0x20001100aa000002  rport wwpn 0x20001100aa000001: NQN 
>> "blktests-subsystem-1"
>>   (NULL device *): {0:0} Association created
>>   [174] nvmet: ctrl 1 start keep-alive timer for 5 secs
>> // (2)
>>   nvmet: creating nvm controller 1 for subsystem blktests-subsystem-1 
>> for NQN 
>> nqn.2014-08.org.nvmexpress:uuid:0f01fb42-9f7f-4856-b0b3-51e60b8de349.
>>   [374] nvmet: adding queue 1 to ctrl 1.
>>   [1138] nvmet: adding queue 2 to ctrl 1.
>>   [73] nvmet: adding queue 3 to ctrl 1.
>>   [174] nvmet: adding queue 4 to ctrl 1.
>>   nvme nvme2: NVME-FC{0}: controller connect complete
>>   nvme nvme2: NVME-FC{0}: new ctrl: NQN "blktests-subsystem-1"
>> // (3)
>>   nvme nvme2: Removing ctrl: NQN "blktests-subsystem-1"
>> // (4)
>>   [1138] nvmet: ctrl 1 stop keep-alive
>>   (NULL device *): {0:0} Association deleted
>>   (NULL device *): {0:0} Association freed
>>   (NULL device *): Disconnect LS failed: No Association
>>
>>
>> and without --context
>>
>>   run blktests nvme/004 at 2023-07-12 13:50:33
>> // (1)
>>   loop1: detected capacity change from 0 to 32768
>>   nvmet: adding nsid 1 to subsystem blktests-subsystem-1
>>   nvme nvme2: NVME-FC{0}: create association : host wwpn 
>> 0x20001100aa000002  rport wwpn 0x20001100aa000001: NQN 
>> "nqn.2014-08.org.nvmexpress.discovery"
> 
> why does this association to discovery controller created ? because of 
> some system service ?
> 
Yes. There are nvme-autoconnect udev rules and systemd services 
installed per default (in quite some systems now).
And it's really hard (if not impossible) to disable these services (as 
we cannot be sure how they are named, hence we wouldn't know which 
service to disable.

> can we configure the blktests subsystem not to be discovered or add some 
> access list to it ?
> 
But that's precisely what the '--context' thing is attempting to do ...

[ .. ]
>>>>
>>>> It really solves the problem that the autoconnect setup of nvme-cli is
>>>> distrubing the tests (*). The only other way I found to stop the 
>>>> autoconnect is by disabling the udev rule completely. If autoconnect 
>>>> isn't enabled the context isn't necessary.
>>>> Though changing system configuration from blktests seems at bit 
>>>> excessive.
>>>
>>> we should not stop any autoconnect during blktests. The autoconnect 
>>> and all the system admin services should run normally.
>>
>> I do not agree here. The current blktests are not designed for run as
>> intergration tests. Sure we should also tests this but currently 
>> blktests is just not there and tcp/rdma are not actually covered anyway.
> 
> what do you mean tcp/rdma not covered ?
> 
Because there is no autoconnect functionality for tcp/rdma.
For FC we have full topology information, and the driver can emit udev 
messages whenever a NVMe port appears in the fabrics (and the systemd 
machinery will then start autoconnect).
For TCP/RDMA we do not have this, so really there's nothing which could 
send udev events (discounting things like mDNS and nvme-stas for now).

> And maybe we should make several changes in the blktests to make it 
> standalone without interfering the existing configuration make by some 
> system administrator.
> 
??
But this is what we are trying with this patches.
The '--context' flag only needs to be set for the blktests, to inform 
the rest of the system that these subsystems/configuration is special 
and should be exempted from 'normal' system processing.

Cheers,

Hannes
Max Gurtovoy July 13, 2023, 8:49 a.m. UTC | #18
On 13/07/2023 9:00, Hannes Reinecke wrote:
> On 7/13/23 02:12, Max Gurtovoy wrote:
>>
>>
>> On 12/07/2023 15:04, Daniel Wagner wrote:
>>> On Mon, Jul 10, 2023 at 07:30:20PM +0300, Max Gurtovoy wrote:
>>>>
>>>>
>>>> On 10/07/2023 18:03, Daniel Wagner wrote:
>>>>> On Mon, Jul 10, 2023 at 03:31:23PM +0300, Max Gurtovoy wrote:
>>>>>> I think it is more than just commit message.
>>>>>
>>>>> Okay, starting to understand what's the problem.
>>>>>
>>>>>> A lot of code that we can avoid was added regarding the --context 
>>>>>> cmdline
>>>>>> argument.
>>>>>
>>>>> Correct and it's not optional to get the tests passing for the fc 
>>>>> transport.
>>>>
>>>> why the fc needs the --context to pass tests ?
>>>
>>> A typical nvme test consists out of following steps (nvme/004):
>>>
>>> // nvme target setup (1)
>>>     _create_nvmet_subsystem "blktests-subsystem-1" "${loop_dev}" \
>>>         "91fdba0d-f87b-4c25-b80f-db7be1418b9e"
>>>     _add_nvmet_subsys_to_port "${port}" "blktests-subsystem-1"
>>>
>>> // nvme host setup (2)
>>>     _nvme_connect_subsys "${nvme_trtype}" blktests-subsystem-1
>>>
>>>     local nvmedev
>>>     nvmedev=$(_find_nvme_dev "blktests-subsystem-1")
>>>     cat "/sys/block/${nvmedev}n1/uuid"
>>>     cat "/sys/block/${nvmedev}n1/wwid"
>>>
>>> // nvme host teardown (3)
>>>     _nvme_disconnect_subsys blktests-subsystem-1
>>>
>>> // nvme target teardown (4)
>>>     _remove_nvmet_subsystem_from_port "${port}" "blktests-subsystem-1"
>>>     _remove_nvmet_subsystem "blktests-subsystem-1"
>>>
>>>
>>> The corresponding output with --context
>>>
>>>   run blktests nvme/004 at 2023-07-12 13:49:50
>>> // (1)
>>>   loop0: detected capacity change from 0 to 32768
>>>   nvmet: adding nsid 1 to subsystem blktests-subsystem-1
>>>   nvme nvme2: NVME-FC{0}: create association : host wwpn 
>>> 0x20001100aa000002  rport wwpn 0x20001100aa000001: NQN 
>>> "blktests-subsystem-1"
>>>   (NULL device *): {0:0} Association created
>>>   [174] nvmet: ctrl 1 start keep-alive timer for 5 secs
>>> // (2)
>>>   nvmet: creating nvm controller 1 for subsystem blktests-subsystem-1 
>>> for NQN 
>>> nqn.2014-08.org.nvmexpress:uuid:0f01fb42-9f7f-4856-b0b3-51e60b8de349.
>>>   [374] nvmet: adding queue 1 to ctrl 1.
>>>   [1138] nvmet: adding queue 2 to ctrl 1.
>>>   [73] nvmet: adding queue 3 to ctrl 1.
>>>   [174] nvmet: adding queue 4 to ctrl 1.
>>>   nvme nvme2: NVME-FC{0}: controller connect complete
>>>   nvme nvme2: NVME-FC{0}: new ctrl: NQN "blktests-subsystem-1"
>>> // (3)
>>>   nvme nvme2: Removing ctrl: NQN "blktests-subsystem-1"
>>> // (4)
>>>   [1138] nvmet: ctrl 1 stop keep-alive
>>>   (NULL device *): {0:0} Association deleted
>>>   (NULL device *): {0:0} Association freed
>>>   (NULL device *): Disconnect LS failed: No Association
>>>
>>>
>>> and without --context
>>>
>>>   run blktests nvme/004 at 2023-07-12 13:50:33
>>> // (1)
>>>   loop1: detected capacity change from 0 to 32768
>>>   nvmet: adding nsid 1 to subsystem blktests-subsystem-1
>>>   nvme nvme2: NVME-FC{0}: create association : host wwpn 
>>> 0x20001100aa000002  rport wwpn 0x20001100aa000001: NQN 
>>> "nqn.2014-08.org.nvmexpress.discovery"
>>
>> why does this association to discovery controller created ? because of 
>> some system service ?
>>
> Yes. There are nvme-autoconnect udev rules and systemd services 
> installed per default (in quite some systems now).
> And it's really hard (if not impossible) to disable these services (as 
> we cannot be sure how they are named, hence we wouldn't know which 
> service to disable.

Right. We shouldn't disable them IMO.

> 
>> can we configure the blktests subsystem not to be discovered or add 
>> some access list to it ?
>>
> But that's precisely what the '--context' thing is attempting to do ...

I'm not sure it is.

Exposing the subsystem is from the target configuration side.
Additionally, the --context (which is in the initiator/host side), 
according to Daniel, is there to distinguish between different 
invocations. I proposed that blktests subsystem will not be part of 
discoverable fabric or protected somehow by access list. Therefore, no 
additional invocation will happen.


> 
> [ .. ]
>>>>>
>>>>> It really solves the problem that the autoconnect setup of nvme-cli is
>>>>> distrubing the tests (*). The only other way I found to stop the 
>>>>> autoconnect is by disabling the udev rule completely. If 
>>>>> autoconnect isn't enabled the context isn't necessary.
>>>>> Though changing system configuration from blktests seems at bit 
>>>>> excessive.
>>>>
>>>> we should not stop any autoconnect during blktests. The autoconnect 
>>>> and all the system admin services should run normally.
>>>
>>> I do not agree here. The current blktests are not designed for run as
>>> intergration tests. Sure we should also tests this but currently 
>>> blktests is just not there and tcp/rdma are not actually covered anyway.
>>
>> what do you mean tcp/rdma not covered ?
>>
> Because there is no autoconnect functionality for tcp/rdma.
> For FC we have full topology information, and the driver can emit udev 
> messages whenever a NVMe port appears in the fabrics (and the systemd 
> machinery will then start autoconnect).
> For TCP/RDMA we do not have this, so really there's nothing which could 
> send udev events (discounting things like mDNS and nvme-stas for now).
> 
>> And maybe we should make several changes in the blktests to make it 
>> standalone without interfering the existing configuration make by some 
>> system administrator.
>>
> ??
> But this is what we are trying with this patches.
> The '--context' flag only needs to be set for the blktests, to inform 
> the rest of the system that these subsystems/configuration is special 
> and should be exempted from 'normal' system processing.

The --context is initiator configuration. I'm referring to changes in 
the target configuration.
This will guarantee that things will work also in the environment where 
we have nvme-cli without the --context flag.

> 
> Cheers,
> 
> Hannes
Hannes Reinecke July 13, 2023, 10:14 a.m. UTC | #19
On 7/13/23 10:49, Max Gurtovoy wrote:
> 
> 
> On 13/07/2023 9:00, Hannes Reinecke wrote:
>> On 7/13/23 02:12, Max Gurtovoy wrote:
>>>
>>>
>>> On 12/07/2023 15:04, Daniel Wagner wrote:
>>>> On Mon, Jul 10, 2023 at 07:30:20PM +0300, Max Gurtovoy wrote:
>>>>>
>>>>>
>>>>> On 10/07/2023 18:03, Daniel Wagner wrote:
>>>>>> On Mon, Jul 10, 2023 at 03:31:23PM +0300, Max Gurtovoy wrote:
>>>>>>> I think it is more than just commit message.
>>>>>>
>>>>>> Okay, starting to understand what's the problem.
>>>>>>
>>>>>>> A lot of code that we can avoid was added regarding the --context 
>>>>>>> cmdline
>>>>>>> argument.
>>>>>>
>>>>>> Correct and it's not optional to get the tests passing for the fc 
>>>>>> transport.
>>>>>
>>>>> why the fc needs the --context to pass tests ?
>>>>
>>>> A typical nvme test consists out of following steps (nvme/004):
>>>>
>>>> // nvme target setup (1)
>>>>     _create_nvmet_subsystem "blktests-subsystem-1" "${loop_dev}" \
>>>>         "91fdba0d-f87b-4c25-b80f-db7be1418b9e"
>>>>     _add_nvmet_subsys_to_port "${port}" "blktests-subsystem-1"
>>>>
>>>> // nvme host setup (2)
>>>>     _nvme_connect_subsys "${nvme_trtype}" blktests-subsystem-1
>>>>
>>>>     local nvmedev
>>>>     nvmedev=$(_find_nvme_dev "blktests-subsystem-1")
>>>>     cat "/sys/block/${nvmedev}n1/uuid"
>>>>     cat "/sys/block/${nvmedev}n1/wwid"
>>>>
>>>> // nvme host teardown (3)
>>>>     _nvme_disconnect_subsys blktests-subsystem-1
>>>>
>>>> // nvme target teardown (4)
>>>>     _remove_nvmet_subsystem_from_port "${port}" "blktests-subsystem-1"
>>>>     _remove_nvmet_subsystem "blktests-subsystem-1"
>>>>
>>>>
>>>> The corresponding output with --context
>>>>
>>>>   run blktests nvme/004 at 2023-07-12 13:49:50
>>>> // (1)
>>>>   loop0: detected capacity change from 0 to 32768
>>>>   nvmet: adding nsid 1 to subsystem blktests-subsystem-1
>>>>   nvme nvme2: NVME-FC{0}: create association : host wwpn 
>>>> 0x20001100aa000002  rport wwpn 0x20001100aa000001: NQN 
>>>> "blktests-subsystem-1"
>>>>   (NULL device *): {0:0} Association created
>>>>   [174] nvmet: ctrl 1 start keep-alive timer for 5 secs
>>>> // (2)
>>>>   nvmet: creating nvm controller 1 for subsystem 
>>>> blktests-subsystem-1 for NQN 
>>>> nqn.2014-08.org.nvmexpress:uuid:0f01fb42-9f7f-4856-b0b3-51e60b8de349.
>>>>   [374] nvmet: adding queue 1 to ctrl 1.
>>>>   [1138] nvmet: adding queue 2 to ctrl 1.
>>>>   [73] nvmet: adding queue 3 to ctrl 1.
>>>>   [174] nvmet: adding queue 4 to ctrl 1.
>>>>   nvme nvme2: NVME-FC{0}: controller connect complete
>>>>   nvme nvme2: NVME-FC{0}: new ctrl: NQN "blktests-subsystem-1"
>>>> // (3)
>>>>   nvme nvme2: Removing ctrl: NQN "blktests-subsystem-1"
>>>> // (4)
>>>>   [1138] nvmet: ctrl 1 stop keep-alive
>>>>   (NULL device *): {0:0} Association deleted
>>>>   (NULL device *): {0:0} Association freed
>>>>   (NULL device *): Disconnect LS failed: No Association
>>>>
>>>>
>>>> and without --context
>>>>
>>>>   run blktests nvme/004 at 2023-07-12 13:50:33
>>>> // (1)
>>>>   loop1: detected capacity change from 0 to 32768
>>>>   nvmet: adding nsid 1 to subsystem blktests-subsystem-1
>>>>   nvme nvme2: NVME-FC{0}: create association : host wwpn 
>>>> 0x20001100aa000002  rport wwpn 0x20001100aa000001: NQN 
>>>> "nqn.2014-08.org.nvmexpress.discovery"
>>>
>>> why does this association to discovery controller created ? because 
>>> of some system service ?
>>>
>> Yes. There are nvme-autoconnect udev rules and systemd services 
>> installed per default (in quite some systems now).
>> And it's really hard (if not impossible) to disable these services (as 
>> we cannot be sure how they are named, hence we wouldn't know which 
>> service to disable.
> 
> Right. We shouldn't disable them IMO.
> 
>>
>>> can we configure the blktests subsystem not to be discovered or add 
>>> some access list to it ?
>>>
>> But that's precisely what the '--context' thing is attempting to do ...
> 
> I'm not sure it is.
> 
> Exposing the subsystem is from the target configuration side.
> Additionally, the --context (which is in the initiator/host side), 
> according to Daniel, is there to distinguish between different 
> invocations. I proposed that blktests subsystem will not be part of 
> discoverable fabric or protected somehow by access list. Therefore, no 
> additional invocation will happen.
> 
Hmm. Maybe we can tweak blktest to use it's own HostNQN, and always pass 
that for any nvme-cli call.
Daniel?

Cheers,

Hannes
Daniel Wagner July 13, 2023, 10:30 a.m. UTC | #20
On Thu, Jul 13, 2023 at 12:14:14PM +0200, Hannes Reinecke wrote:
> > Exposing the subsystem is from the target configuration side.
> > Additionally, the --context (which is in the initiator/host side),
> > according to Daniel, is there to distinguish between different
> > invocations. I proposed that blktests subsystem will not be part of
> > discoverable fabric or protected somehow by access list. Therefore, no
> > additional invocation will happen.

I am confused. This is exactly what the whole --context thing is.

> Hmm. Maybe we can tweak blktest to use it's own HostNQN, and always pass
> that for any nvme-cli call.

This is what the current code already does.
Daniel Wagner July 13, 2023, 10:44 a.m. UTC | #21
On Thu, Jul 13, 2023 at 12:30:45PM +0200, Daniel Wagner wrote:
> On Thu, Jul 13, 2023 at 12:14:14PM +0200, Hannes Reinecke wrote:
> > > Exposing the subsystem is from the target configuration side.
> > > Additionally, the --context (which is in the initiator/host side),
> > > according to Daniel, is there to distinguish between different
> > > invocations. I proposed that blktests subsystem will not be part of
> > > discoverable fabric or protected somehow by access list. Therefore, no
> > > additional invocation will happen.
> 
> I am confused. This is exactly what the whole --context thing is.

Ah I think I got it now. You want me to set allow_hosts on the target side too
:)
Max Gurtovoy July 13, 2023, 10:50 a.m. UTC | #22
On 13/07/2023 13:44, Daniel Wagner wrote:
> On Thu, Jul 13, 2023 at 12:30:45PM +0200, Daniel Wagner wrote:
>> On Thu, Jul 13, 2023 at 12:14:14PM +0200, Hannes Reinecke wrote:
>>>> Exposing the subsystem is from the target configuration side.
>>>> Additionally, the --context (which is in the initiator/host side),
>>>> according to Daniel, is there to distinguish between different
>>>> invocations. I proposed that blktests subsystem will not be part of
>>>> discoverable fabric or protected somehow by access list. Therefore, no
>>>> additional invocation will happen.
>>
>> I am confused. This is exactly what the whole --context thing is.
> 
> Ah I think I got it now. You want me to set allow_hosts on the target side too
> :)

Yes.
With the unique hostId/hostNqn for blktests this should work without the 
need for --context mechanism, that was probably added for system 
administrators and not for unit_testing/QA/Verification engineers that 
run blktests.
diff mbox series

Patch

diff --git a/tests/nvme/rc b/tests/nvme/rc
index 191f3e2e0c43..00117d314a38 100644
--- a/tests/nvme/rc
+++ b/tests/nvme/rc
@@ -14,8 +14,8 @@  def_remote_wwnn="0x10001100aa000001"
 def_remote_wwpn="0x20001100aa000001"
 def_local_wwnn="0x10001100aa000002"
 def_local_wwpn="0x20001100aa000002"
-def_hostnqn="$(cat /etc/nvme/hostnqn 2> /dev/null)"
-def_hostid="$(cat /etc/nvme/hostid 2> /dev/null)"
+def_hostnqn="nqn.2014-08.org.nvmexpress:uuid:242d4a24-2484-4a80-8234-d0169409c5e8"
+def_hostid="242d4a24-2484-4a80-8234-d0169409c5e8"
 nvme_trtype=${nvme_trtype:-"loop"}
 nvme_img_size=${nvme_img_size:-"1G"}
 nvme_num_iter=${nvme_num_iter:-"1000"}
@@ -161,6 +161,50 @@  _nvme_calc_rand_io_size() {
 	echo "${io_size_kb}k"
 }
 
+_nvme_cli_supports_context() {
+	if ! nvme connect --help 2>&1 | grep -q context > /dev/null; then
+		    return 1
+	fi
+	return 0
+}
+
+_setup_nvme_cli() {
+	if ! _nvme_cli_supports_context; then
+		return
+	fi
+
+	mkdir -p /run/nvme
+	cat >> /run/nvme/blktests.json <<-EOF
+	[
+	  {
+	    "hostnqn": "${def_hostnqn}",
+	    "hostid": "${def_hostid}",
+	    "subsystems": [
+	      {
+	        "application": "blktests",
+	        "nqn": "blktests-subsystem-1",
+	        "ports": [
+	          {
+	            "transport": "fc",
+	            "traddr": "nn-${def_remote_wwnn}:pn-${def_remote_wwpn}",
+	            "host_traddr": "nn-${def_local_wwnn}:pn-${def_local_wwpn}"
+	          }
+	        ]
+	      }
+	    ]
+	  }
+	]
+	EOF
+}
+
+_cleanup_nvme_cli() {
+	if ! _nvme_cli_supports_context; then
+		return
+	fi
+
+	rm -f /run/nvme/blktests.json
+}
+
 _nvme_fcloop_add_rport() {
 	local local_wwnn="$1"
 	local local_wwpn="$2"
@@ -235,6 +279,8 @@  _cleanup_fcloop() {
 	_nvme_fcloop_del_lport "${local_wwnn}" "${local_wwpn}"
 	_nvme_fcloop_del_rport "${local_wwnn}" "${local_wwpn}" \
 			       "${remote_wwnn}" "${remote_wwpn}"
+
+	_cleanup_nvme_cli
 }
 
 _cleanup_nvmet() {
@@ -337,6 +383,8 @@  _setup_nvmet() {
 		def_host_traddr=$(printf "nn-%s:pn-%s" \
 					 "${def_local_wwnn}" \
 					 "${def_local_wwpn}")
+
+		_setup_nvme_cli
 	fi
 }
 
@@ -436,18 +484,18 @@  _nvme_connect_subsys() {
 	trtype="$1"
 	subsysnqn="$2"
 
-	ARGS=(-t "${trtype}" -n "${subsysnqn}")
+	ARGS=(-t "${trtype}"
+	      -n "${subsysnqn}"
+	      --hostnqn="${hostnqn}"
+	      --hostid="${hostid}")
+	if _nvme_cli_supports_context; then
+		ARGS+=(--context="blktests")
+	fi
 	if [[ "${trtype}" == "fc" ]] ; then
 		ARGS+=(-a "${traddr}" -w "${host_traddr}")
 	elif [[ "${trtype}" != "loop" ]]; then
 		ARGS+=(-a "${traddr}" -s "${trsvcid}")
 	fi
-	if [[ "${hostnqn}" != "$def_hostnqn" ]]; then
-		ARGS+=(--hostnqn="${hostnqn}")
-	fi
-	if [[ "${hostid}" != "$def_hostid" ]]; then
-		ARGS+=(--hostid="${hostid}")
-	fi
 	if [[ -n "${hostkey}" ]]; then
 		ARGS+=(--dhchap-secret="${hostkey}")
 	fi
@@ -482,7 +530,12 @@  _nvme_discover() {
 	local host_traddr="${3:-$def_host_traddr}"
 	local trsvcid="${3:-$def_trsvcid}"
 
-	ARGS=(-t "${trtype}")
+	ARGS=(-t "${trtype}"
+	      --hostnqn="${def_hostnqn}"
+	      --hostid="${def_hostid}")
+	if _nvme_cli_supports_context; then
+		ARGS+=(--context="blktests")
+	fi
 	if [[ "${trtype}" = "fc" ]]; then
 		ARGS+=(-a "${traddr}" -w "${host_traddr}")
 	elif [[ "${trtype}" != "loop" ]]; then