Message ID | 20210603111901.9888-1-parav@nvidia.com (mailing list archive) |
---|---|
State | Superseded |
Delegated to: | David Ahern |
Headers | show |
Series | [RESEND,iproute2-next] devlink: Add optional controller user input | expand |
Context | Check | Description |
---|---|---|
netdev/tree_selection | success | Not a local patch |
On 2021/6/3 19:19, Parav Pandit wrote: > A user optionally provides the external controller number when user > wants to create devlink port for the external controller. Hi, Parav I was planing to use controller id to solve the devlink instance representing problem for multi-function which shares common resource in the same ASIC, see [1]. It seems the controller id used here is to differentiate the function used in different host? 1. https://lkml.org/lkml/2021/5/31/296 > > An example on eswitch system: > $ devlink dev eswitch set pci/0033:01:00.0 mode switchdev > > $ devlink port show > pci/0033:01:00.0/196607: type eth netdev enP51p1s0f0np0 flavour physical port 0 splittable false > > $ devlink port add pci/0033:01:00.0 flavour pcisf pfnum 0 sfnum 77 controller 1 > pci/0033:01:00.0/163840: type eth netdev eth0 flavour pcisf controller 1 pfnum 0 sfnum 77 external true splittable false > function: > hw_addr 00:00:00:00:00:00 state inactive opstate detached > > Signed-off-by: Parav Pandit <parav@nvidia.com> > Reviewed-by: Jiri Pirko <jiri@nvidia.com> > --- > devlink/devlink.c | 17 ++++++++++++++--- > man/man8/devlink-port.8 | 19 +++++++++++++++++++ > 2 files changed, 33 insertions(+), 3 deletions(-) > > diff --git a/devlink/devlink.c b/devlink/devlink.c > index 0b5548fb..170e8616 100644 > --- a/devlink/devlink.c > +++ b/devlink/devlink.c > @@ -286,6 +286,7 @@ static void ifname_map_free(struct ifname_map *ifname_map) > #define DL_OPT_PORT_PFNUMBER BIT(43) > #define DL_OPT_PORT_SFNUMBER BIT(44) > #define DL_OPT_PORT_FUNCTION_STATE BIT(45) > +#define DL_OPT_PORT_CONTROLLER BIT(46) > > struct dl_opts { > uint64_t present; /* flags of present items */ > @@ -336,6 +337,7 @@ struct dl_opts { > uint32_t overwrite_mask; > enum devlink_reload_action reload_action; > enum devlink_reload_limit reload_limit; > + uint32_t port_controller; > uint32_t port_sfnumber; > uint16_t port_flavour; > uint16_t port_pfnumber; > @@ -1886,6 +1888,12 @@ static int dl_argv_parse(struct dl *dl, uint64_t o_required, > if (err) > return err; > o_found |= DL_OPT_PORT_SFNUMBER; > + } else if (dl_argv_match(dl, "controller") && (o_all & DL_OPT_PORT_CONTROLLER)) { > + dl_arg_inc(dl); > + err = dl_argv_uint32_t(dl, &opts->port_controller); > + if (err) > + return err; > + o_found |= DL_OPT_PORT_CONTROLLER; > } else { > pr_err("Unknown option \"%s\"\n", dl_argv(dl)); > return -EINVAL; > @@ -2079,6 +2087,9 @@ static void dl_opts_put(struct nlmsghdr *nlh, struct dl *dl) > mnl_attr_put_u16(nlh, DEVLINK_ATTR_PORT_PCI_PF_NUMBER, opts->port_pfnumber); > if (opts->present & DL_OPT_PORT_SFNUMBER) > mnl_attr_put_u32(nlh, DEVLINK_ATTR_PORT_PCI_SF_NUMBER, opts->port_sfnumber); > + if (opts->present & DL_OPT_PORT_CONTROLLER) > + mnl_attr_put_u32(nlh, DEVLINK_ATTR_PORT_CONTROLLER_NUMBER, > + opts->port_controller); > } > > static int dl_argv_parse_put(struct nlmsghdr *nlh, struct dl *dl, > @@ -3795,7 +3806,7 @@ static void cmd_port_help(void) > pr_err(" devlink port param set DEV/PORT_INDEX name PARAMETER value VALUE cmode { permanent | driverinit | runtime }\n"); > pr_err(" devlink port param show [DEV/PORT_INDEX name PARAMETER]\n"); > pr_err(" devlink port health show [ DEV/PORT_INDEX reporter REPORTER_NAME ]\n"); > - pr_err(" devlink port add DEV/PORT_INDEX flavour FLAVOUR pfnum PFNUM [ sfnum SFNUM ]\n"); > + pr_err(" devlink port add DEV/PORT_INDEX flavour FLAVOUR pfnum PFNUM [ sfnum SFNUM ] [ controller CNUM ]\n"); > pr_err(" devlink port del DEV/PORT_INDEX\n"); > } > > @@ -4324,7 +4335,7 @@ static int __cmd_health_show(struct dl *dl, bool show_device, bool show_port); > > static void cmd_port_add_help(void) > { > - pr_err(" devlink port add { DEV | DEV/PORT_INDEX } flavour FLAVOUR pfnum PFNUM [ sfnum SFNUM ]\n"); > + pr_err(" devlink port add { DEV | DEV/PORT_INDEX } flavour FLAVOUR pfnum PFNUM [ sfnum SFNUM ] [ controller CNUM ]\n"); > } > > static int cmd_port_add(struct dl *dl) > @@ -4342,7 +4353,7 @@ static int cmd_port_add(struct dl *dl) > > err = dl_argv_parse_put(nlh, dl, DL_OPT_HANDLE | DL_OPT_HANDLEP | > DL_OPT_PORT_FLAVOUR | DL_OPT_PORT_PFNUMBER, > - DL_OPT_PORT_SFNUMBER); > + DL_OPT_PORT_SFNUMBER | DL_OPT_PORT_CONTROLLER); > if (err) > return err; > > diff --git a/man/man8/devlink-port.8 b/man/man8/devlink-port.8 > index 563c5833..a5142c4e 100644 > --- a/man/man8/devlink-port.8 > +++ b/man/man8/devlink-port.8 > @@ -54,6 +54,8 @@ devlink-port \- devlink port configuration > .IR PFNUMBER " ]" > .RB "{ " pcisf > .IR SFNUMBER " }" > +.RB "{ " controller > +.IR CNUM " }" > .br > > .ti -8 > @@ -174,6 +176,12 @@ Specifies sfnumber to assign to the device of the SF. > This field is optional for those devices which supports auto assignment of the > SF number. > > +.TP > +.BR controller " { " controller " } " > +Specifies controller number for which the SF port is created. > +This field is optional. It is used only when SF port is created for the > +external controller. > + > .ti -8 > .SS devlink port function set - Set the port function attribute(s). > > @@ -327,6 +335,17 @@ devlink dev param set pci/0000:01:00.0/1 name internal_error_reset value true cm > .RS 4 > Sets the parameter internal_error_reset of specified devlink port (#1) to true. > .RE > +.PP > +devlink port add pci/0000:06:00.0 flavour pcisf pfnum 0 sfnum 88 controller 1 > +.RS 4 > +Add a devlink port of flavour PCI SF on controller 1 which has PCI PF of number > +0 with SF number 88. To make use of the function an example sequence is to add > +a port, configure the function attribute and activate the function. Once > +the function usage is completed, deactivate the function and finally delete > +the port. When there is desire to reuse the port without deletion, it can be > +reconfigured and activated again when function is in inactive state and > +function's operational state is detached. > +.RE > > .SH SEE ALSO > .BR devlink (8), >
Hi Yunsheng, > From: Yunsheng Lin <linyunsheng@huawei.com> > Sent: Friday, June 4, 2021 7:05 AM > > On 2021/6/3 19:19, Parav Pandit wrote: > > A user optionally provides the external controller number when user > > wants to create devlink port for the external controller. > > Hi, Parav > I was planing to use controller id to solve the devlink instance representing > problem for multi-function which shares common resource in the same ASIC, > see [1]. > > It seems the controller id used here is to differentiate the function used in > different host? > That’s correct. Controller holds one or more PCI functions (PF,VF,SF). In your case if there is single devlink instance representing ASIC, it is better to have health reporters under this single instance. Devlink parameters do not span multiple devlink instance. So if you need to control devlink instance parameters of each function byitself, you likely need devlink instance for each. And still continue to have ASIC wide health reporters under single instance that represents whole ASIC. > 1. https://lkml.org/lkml/2021/5/31/296 > > > > > An example on eswitch system: > > $ devlink dev eswitch set pci/0033:01:00.0 mode switchdev > > > > $ devlink port show > > pci/0033:01:00.0/196607: type eth netdev enP51p1s0f0np0 flavour > > physical port 0 splittable false > > > > $ devlink port add pci/0033:01:00.0 flavour pcisf pfnum 0 sfnum 77 > > controller 1 > > pci/0033:01:00.0/163840: type eth netdev eth0 flavour pcisf controller 1 > pfnum 0 sfnum 77 external true splittable false > > function: > > hw_addr 00:00:00:00:00:00 state inactive opstate detached > > > > Signed-off-by: Parav Pandit <parav@nvidia.com> > > Reviewed-by: Jiri Pirko <jiri@nvidia.com> > > --- > > devlink/devlink.c | 17 ++++++++++++++--- > > man/man8/devlink-port.8 | 19 +++++++++++++++++++ > > 2 files changed, 33 insertions(+), 3 deletions(-) > > > > diff --git a/devlink/devlink.c b/devlink/devlink.c index > > 0b5548fb..170e8616 100644 > > --- a/devlink/devlink.c > > +++ b/devlink/devlink.c > > @@ -286,6 +286,7 @@ static void ifname_map_free(struct ifname_map > > *ifname_map) #define DL_OPT_PORT_PFNUMBER BIT(43) #define > > DL_OPT_PORT_SFNUMBER BIT(44) #define > DL_OPT_PORT_FUNCTION_STATE > > BIT(45) > > +#define DL_OPT_PORT_CONTROLLER BIT(46) > > > > struct dl_opts { > > uint64_t present; /* flags of present items */ @@ -336,6 +337,7 @@ > > struct dl_opts { > > uint32_t overwrite_mask; > > enum devlink_reload_action reload_action; > > enum devlink_reload_limit reload_limit; > > + uint32_t port_controller; > > uint32_t port_sfnumber; > > uint16_t port_flavour; > > uint16_t port_pfnumber; > > @@ -1886,6 +1888,12 @@ static int dl_argv_parse(struct dl *dl, uint64_t > o_required, > > if (err) > > return err; > > o_found |= DL_OPT_PORT_SFNUMBER; > > + } else if (dl_argv_match(dl, "controller") && (o_all & > DL_OPT_PORT_CONTROLLER)) { > > + dl_arg_inc(dl); > > + err = dl_argv_uint32_t(dl, &opts->port_controller); > > + if (err) > > + return err; > > + o_found |= DL_OPT_PORT_CONTROLLER; > > } else { > > pr_err("Unknown option \"%s\"\n", dl_argv(dl)); > > return -EINVAL; > > @@ -2079,6 +2087,9 @@ static void dl_opts_put(struct nlmsghdr *nlh, > struct dl *dl) > > mnl_attr_put_u16(nlh, > DEVLINK_ATTR_PORT_PCI_PF_NUMBER, opts->port_pfnumber); > > if (opts->present & DL_OPT_PORT_SFNUMBER) > > mnl_attr_put_u32(nlh, > DEVLINK_ATTR_PORT_PCI_SF_NUMBER, > > opts->port_sfnumber); > > + if (opts->present & DL_OPT_PORT_CONTROLLER) > > + mnl_attr_put_u32(nlh, > DEVLINK_ATTR_PORT_CONTROLLER_NUMBER, > > + opts->port_controller); > > } > > > > static int dl_argv_parse_put(struct nlmsghdr *nlh, struct dl *dl, @@ > > -3795,7 +3806,7 @@ static void cmd_port_help(void) > > pr_err(" devlink port param set DEV/PORT_INDEX name > PARAMETER value VALUE cmode { permanent | driverinit | runtime }\n"); > > pr_err(" devlink port param show [DEV/PORT_INDEX name > PARAMETER]\n"); > > pr_err(" devlink port health show [ DEV/PORT_INDEX reporter > REPORTER_NAME ]\n"); > > - pr_err(" devlink port add DEV/PORT_INDEX flavour FLAVOUR > pfnum PFNUM [ sfnum SFNUM ]\n"); > > + pr_err(" devlink port add DEV/PORT_INDEX flavour FLAVOUR > pfnum PFNUM [ sfnum SFNUM ] [ controller CNUM ]\n"); > > pr_err(" devlink port del DEV/PORT_INDEX\n"); > > } > > > > @@ -4324,7 +4335,7 @@ static int __cmd_health_show(struct dl *dl, bool > > show_device, bool show_port); > > > > static void cmd_port_add_help(void) > > { > > - pr_err(" devlink port add { DEV | DEV/PORT_INDEX } flavour > FLAVOUR pfnum PFNUM [ sfnum SFNUM ]\n"); > > + pr_err(" devlink port add { DEV | DEV/PORT_INDEX } flavour > FLAVOUR pfnum PFNUM [ sfnum SFNUM ] [ controller CNUM ]\n"); > > } > > > > static int cmd_port_add(struct dl *dl) @@ -4342,7 +4353,7 @@ static > > int cmd_port_add(struct dl *dl) > > > > err = dl_argv_parse_put(nlh, dl, DL_OPT_HANDLE | > DL_OPT_HANDLEP | > > DL_OPT_PORT_FLAVOUR | > DL_OPT_PORT_PFNUMBER, > > - DL_OPT_PORT_SFNUMBER); > > + DL_OPT_PORT_SFNUMBER | > DL_OPT_PORT_CONTROLLER); > > if (err) > > return err; > > > > diff --git a/man/man8/devlink-port.8 b/man/man8/devlink-port.8 index > > 563c5833..a5142c4e 100644 > > --- a/man/man8/devlink-port.8 > > +++ b/man/man8/devlink-port.8 > > @@ -54,6 +54,8 @@ devlink-port \- devlink port configuration .IR > > PFNUMBER " ]" > > .RB "{ " pcisf > > .IR SFNUMBER " }" > > +.RB "{ " controller > > +.IR CNUM " }" > > .br > > > > .ti -8 > > @@ -174,6 +176,12 @@ Specifies sfnumber to assign to the device of the > SF. > > This field is optional for those devices which supports auto > > assignment of the SF number. > > > > +.TP > > +.BR controller " { " controller " } " > > +Specifies controller number for which the SF port is created. > > +This field is optional. It is used only when SF port is created for > > +the external controller. > > + > > .ti -8 > > .SS devlink port function set - Set the port function attribute(s). > > > > @@ -327,6 +335,17 @@ devlink dev param set pci/0000:01:00.0/1 name > > internal_error_reset value true cm .RS 4 Sets the parameter > > internal_error_reset of specified devlink port (#1) to true. > > .RE > > +.PP > > +devlink port add pci/0000:06:00.0 flavour pcisf pfnum 0 sfnum 88 > > +controller 1 .RS 4 Add a devlink port of flavour PCI SF on controller > > +1 which has PCI PF of number > > +0 with SF number 88. To make use of the function an example sequence > > +is to add a port, configure the function attribute and activate the > > +function. Once the function usage is completed, deactivate the > > +function and finally delete the port. When there is desire to reuse > > +the port without deletion, it can be reconfigured and activated again > > +when function is in inactive state and function's operational state is > detached. > > +.RE > > > > .SH SEE ALSO > > .BR devlink (8), > >
On 6/3/21 5:19 AM, Parav Pandit wrote: > @@ -3795,7 +3806,7 @@ static void cmd_port_help(void) > pr_err(" devlink port param set DEV/PORT_INDEX name PARAMETER value VALUE cmode { permanent | driverinit | runtime }\n"); > pr_err(" devlink port param show [DEV/PORT_INDEX name PARAMETER]\n"); > pr_err(" devlink port health show [ DEV/PORT_INDEX reporter REPORTER_NAME ]\n"); > - pr_err(" devlink port add DEV/PORT_INDEX flavour FLAVOUR pfnum PFNUM [ sfnum SFNUM ]\n"); > + pr_err(" devlink port add DEV/PORT_INDEX flavour FLAVOUR pfnum PFNUM [ sfnum SFNUM ] [ controller CNUM ]\n"); > pr_err(" devlink port del DEV/PORT_INDEX\n"); > } > > @@ -4324,7 +4335,7 @@ static int __cmd_health_show(struct dl *dl, bool show_device, bool show_port); > > static void cmd_port_add_help(void) > { > - pr_err(" devlink port add { DEV | DEV/PORT_INDEX } flavour FLAVOUR pfnum PFNUM [ sfnum SFNUM ]\n"); > + pr_err(" devlink port add { DEV | DEV/PORT_INDEX } flavour FLAVOUR pfnum PFNUM [ sfnum SFNUM ] [ controller CNUM ]\n"); This line and the one above need to be wrapped. This addition puts it well into the 90s.
On 2021/6/6 15:10, Parav Pandit wrote: > Hi Yunsheng, > >> From: Yunsheng Lin <linyunsheng@huawei.com> >> Sent: Friday, June 4, 2021 7:05 AM >> >> On 2021/6/3 19:19, Parav Pandit wrote: >>> A user optionally provides the external controller number when user >>> wants to create devlink port for the external controller. >> >> Hi, Parav >> I was planing to use controller id to solve the devlink instance representing >> problem for multi-function which shares common resource in the same ASIC, >> see [1]. >> >> It seems the controller id used here is to differentiate the function used in >> different host? >> > That’s correct. Controller holds one or more PCI functions (PF,VF,SF). I am not sure I understand the exact usage of controller and why controller id is in "devlink_port_*_attrs". Let's consider a simplified case where there is two PF(supposing both have VF enabled), and each PF has different controller and each PF corresponds to a different physical port(Or it is about multi-host case multi PF may sharing the same physical port?): 1. I suppose each PF has it's devlink instance for mlx case(I suppose each VF can not have it's own devlink instance for VF shares the same physical port with PF, right?). 2. each PF's devlink instance has three types of port, which is FLAVOUR_PHYSICAL, FLAVOUR_PCI_PF and FLAVOUR_PCI_VF(supposing I understand port flavour correctly). If I understand above correctly, all ports in the same devlink instance should have the same controller id, right? If yes, why not put the controller id in the devlink instance? > In your case if there is single devlink instance representing ASIC, it is better to have health reporters under this single instance. > > Devlink parameters do not span multiple devlink instance. Yes, that is what I try to do: shared status/parameters in devlink instance, physical port specific status/parameters in devlink port instance. > So if you need to control devlink instance parameters of each function byitself, you likely need devlink instance for each. > And still continue to have ASIC wide health reporters under single instance that represents whole ASIC. I do not think each function need a devlink instance if there is a devlink instance representing a whole ASIC, using the devlink port instance to represent the function seems enough? > >> 1. https://lkml.org/lkml/2021/5/31/296
> From: Yunsheng Lin <linyunsheng@huawei.com> > Sent: Monday, June 7, 2021 9:01 AM > > On 2021/6/6 15:10, Parav Pandit wrote: > > Hi Yunsheng, > > > >> From: Yunsheng Lin <linyunsheng@huawei.com> > >> Sent: Friday, June 4, 2021 7:05 AM > >> > >> On 2021/6/3 19:19, Parav Pandit wrote: > >>> A user optionally provides the external controller number when user > >>> wants to create devlink port for the external controller. > >> > >> Hi, Parav > >> I was planing to use controller id to solve the devlink instance > >> representing problem for multi-function which shares common resource > >> in the same ASIC, see [1]. > >> > >> It seems the controller id used here is to differentiate the function > >> used in different host? > >> > > That’s correct. Controller holds one or more PCI functions (PF,VF,SF). > > I am not sure I understand the exact usage of controller and why controller id > is in "devlink_port_*_attrs". > > Let's consider a simplified case where there is two PF(supposing both have > VF enabled), and each PF has different controller and each PF corresponds to > a different physical port(Or it is about multi-host case multi PF may sharing > the same physical port?): Typically single host with two PFs have their own physical ports. Multi-host with two PFs, on each host they share respective physical ports. Single host: Pf0.physical_port = p0 Pf1.physical_port = p1. Multi-host (two) host setup H1.pf0.phyical_port = p0. H1.pf1.phyical_port = p1. H2.pf0.phyical_port = p0. H2.pf1.phyical_port = p1. > 1. I suppose each PF has it's devlink instance for mlx case(I suppose each > VF can not have it's own devlink instance for VF shares the same physical > port with PF, right?). VF and SF ports are of flavour VIRTUAL. > 2. each PF's devlink instance has three types of port, which is > FLAVOUR_PHYSICAL, FLAVOUR_PCI_PF and FLAVOUR_PCI_VF(supposing I > understand > port flavour correctly). > FLAVOUR_PCI_{PF,VF,SF} belongs to eswitch (representor) side on switchdev device. > If I understand above correctly, all ports in the same devlink instance should > have the same controller id, right? If yes, why not put the controller id in the > devlink instance? Need not be. All PCI_{PF,VF,SF} can have controller id different for different controllers. Usually each multi-host is a different controller. Refer to this diagram [1] and detailed description. > > > In your case if there is single devlink instance representing ASIC, it is better > to have health reporters under this single instance. > > > > Devlink parameters do not span multiple devlink instance. > > Yes, that is what I try to do: shared status/parameters in devlink instance, > physical port specific status/parameters in devlink port instance. > > > So if you need to control devlink instance parameters of each function > byitself, you likely need devlink instance for each. > > And still continue to have ASIC wide health reporters under single instance > that represents whole ASIC. > > I do not think each function need a devlink instance if there is a devlink > instance representing a whole ASIC, using the devlink port instance to > represent the function seems enough? 'devlink port function's is equivalent of hypervisor/nicvisor entity controlled by the network/sysadmin. While devlink instance of a given PF,VF,SF is managed by the user of such function itself. For example when a VF is mapped to a VM, devlink instance of this VF resides in the VM managed by the guest VM. Before this VF is given to VM, usually hypervisor/nicvisor admin programs this function (such as mac address) explained in [3]. So that a given VM always gets the same mac address regardless of which VF {a or b). [1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/networking/devlink/devlink-port.rst?h=v5.13-rc5#n72 [2] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/networking/devlink/devlink-port.rst?h=v5.13-rc5#n60 [3] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/networking/devlink/devlink-port.rst?h=v5.13-rc5#n110 > > > > >> 1. https://lkml.org/lkml/2021/5/31/296
On 2021/6/7 14:10, Parav Pandit wrote: >> From: Yunsheng Lin <linyunsheng@huawei.com> >> Sent: Monday, June 7, 2021 9:01 AM >> >> On 2021/6/6 15:10, Parav Pandit wrote: >>> Hi Yunsheng, >>> >>>> From: Yunsheng Lin <linyunsheng@huawei.com> >>>> Sent: Friday, June 4, 2021 7:05 AM >>>> >>>> On 2021/6/3 19:19, Parav Pandit wrote: >>>>> A user optionally provides the external controller number when user >>>>> wants to create devlink port for the external controller. >>>> >>>> Hi, Parav >>>> I was planing to use controller id to solve the devlink instance >>>> representing problem for multi-function which shares common resource >>>> in the same ASIC, see [1]. >>>> >>>> It seems the controller id used here is to differentiate the function >>>> used in different host? >>>> >>> That’s correct. Controller holds one or more PCI functions (PF,VF,SF). >> >> I am not sure I understand the exact usage of controller and why controller id >> is in "devlink_port_*_attrs". >> >> Let's consider a simplified case where there is two PF(supposing both have >> VF enabled), and each PF has different controller and each PF corresponds to >> a different physical port(Or it is about multi-host case multi PF may sharing >> the same physical port?): > Typically single host with two PFs have their own physical ports. > Multi-host with two PFs, on each host they share respective physical ports. > > Single host: > Pf0.physical_port = p0 > Pf1.physical_port = p1. > > Multi-host (two) host setup > H1.pf0.phyical_port = p0. > H1.pf1.phyical_port = p1. > H2.pf0.phyical_port = p0. > H2.pf1.phyical_port = p1. Multi-host (two) host setup with separate physical port for each host: H1.pf0.phyical_port = p0 H2.pf0.phyical_port = p1 Does above use case make sense for mlx, it seems a common case for our internal use. > >> 1. I suppose each PF has it's devlink instance for mlx case(I suppose each >> VF can not have it's own devlink instance for VF shares the same physical >> port with PF, right?). > VF and SF ports are of flavour VIRTUAL. Which devlink instance does the flavour VIRTUAL port instance for VF and SF is registered to? Does it mean VF has it's own devlink instance in VM when it is passed a VM, and flavour VIRTUAL port instance for that VF is registered to that devlink instance in the VM too? Even in the same host as PF, the VF also has it's own devlink instance? > >> 2. each PF's devlink instance has three types of port, which is >> FLAVOUR_PHYSICAL, FLAVOUR_PCI_PF and FLAVOUR_PCI_VF(supposing I >> understand >> port flavour correctly). >> > FLAVOUR_PCI_{PF,VF,SF} belongs to eswitch (representor) side on switchdev device. If devlink instance or eswitch is in DEVLINK_ESWITCH_MODE_LEGACY mode, the FLAVOUR_PCI_{PF,VF,SF} port instance does not need to created? > >> If I understand above correctly, all ports in the same devlink instance should >> have the same controller id, right? If yes, why not put the controller id in the >> devlink instance? > Need not be. All PCI_{PF,VF,SF} can have controller id different for different controllers. The point is that two VF from different PF may be in the different host, all VF of a specific PF need to be in the same host, right? otherwise it may break PCI enumeration process? If yes, as PCI_{PF,VF,SF} belongs to eswitch (representor) side on switchdev device(which means PCI_{PF,VF,SF} port instance is in the same host, as the host corresponding to "controller_num=0" in diagram [1]), so it seems all the PCI_{PF,VF,SF} of a specific PF should have the same controller id, and using a controller id of the devlink instance in "controller_num=0" in diagram [1] seems enough? > Usually each multi-host is a different controller. > Refer to this diagram [1] and detailed description. devlink instance does not exist in the host corresponding to "controller_num=1" in diagram [1]? Or devlink instance does exist in the host corresponding to "controller_num=1", but the mode of that devlink instance is DEVLINK_ESWITCH_MODE_LEGACY in diagram [1]? Also, eswitch mode can only be set on the devlink instance corresponding to PF, but not for VF/SF(supposing that VF/SF could have it's own devlink instance too), right? > >> >>> In your case if there is single devlink instance representing ASIC, it is better >> to have health reporters under this single instance. >>> >>> Devlink parameters do not span multiple devlink instance. >> >> Yes, that is what I try to do: shared status/parameters in devlink instance, >> physical port specific status/parameters in devlink port instance. >> >>> So if you need to control devlink instance parameters of each function >> byitself, you likely need devlink instance for each. >>> And still continue to have ASIC wide health reporters under single instance >> that represents whole ASIC. >> >> I do not think each function need a devlink instance if there is a devlink >> instance representing a whole ASIC, using the devlink port instance to >> represent the function seems enough? > 'devlink port function's is equivalent of hypervisor/nicvisor entity controlled by the network/sysadmin. > While devlink instance of a given PF,VF,SF is managed by the user of such function itself. 'devlink port function' means "struct devlink_port", right? It seems 'devlink port function' in the host is representing a VF when devlink instance of that VF is in the VM? > For example when a VF is mapped to a VM, devlink instance of this VF resides in the VM managed by the guest VM. Does the user in VM really care about devlink info or configuration when network/sysadmin has configured the VF through 'devlink port function' in the host? which devlink info or configuration does user need to query or configure in a VM? > > Before this VF is given to VM, usually hypervisor/nicvisor admin programs this function (such as mac address) explained in [3]. > So that a given VM always gets the same mac address regardless of which VF {a or b). > > [1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/networking/devlink/devlink-port.rst?h=v5.13-rc5#n72 > [2] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/networking/devlink/devlink-port.rst?h=v5.13-rc5#n60 > [3] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/networking/devlink/devlink-port.rst?h=v5.13-rc5#n110 > >> >>> >>>> 1. https://lkml.org/lkml/2021/5/31/296 >
> From: Yunsheng Lin <linyunsheng@huawei.com> > Sent: Monday, June 7, 2021 4:27 PM > > On 2021/6/7 14:10, Parav Pandit wrote: > >> From: Yunsheng Lin <linyunsheng@huawei.com> > >> Sent: Monday, June 7, 2021 9:01 AM > >> > >> On 2021/6/6 15:10, Parav Pandit wrote: > >>> Hi Yunsheng, > >>> > >>>> From: Yunsheng Lin <linyunsheng@huawei.com> > >>>> Sent: Friday, June 4, 2021 7:05 AM > >>>> > >>>> On 2021/6/3 19:19, Parav Pandit wrote: > >>>>> A user optionally provides the external controller number when > >>>>> user wants to create devlink port for the external controller. > >>>> > >>>> Hi, Parav > >>>> I was planing to use controller id to solve the devlink instance > >>>> representing problem for multi-function which shares common > >>>> resource in the same ASIC, see [1]. > >>>> > >>>> It seems the controller id used here is to differentiate the > >>>> function used in different host? > >>>> > >>> That’s correct. Controller holds one or more PCI functions (PF,VF,SF). > >> > >> I am not sure I understand the exact usage of controller and why > >> controller id is in "devlink_port_*_attrs". > >> > >> Let's consider a simplified case where there is two PF(supposing both > >> have VF enabled), and each PF has different controller and each PF > >> corresponds to a different physical port(Or it is about multi-host > >> case multi PF may sharing the same physical port?): > > Typically single host with two PFs have their own physical ports. > > Multi-host with two PFs, on each host they share respective physical ports. > > > > Single host: > > Pf0.physical_port = p0 > > Pf1.physical_port = p1. > > > > Multi-host (two) host setup > > H1.pf0.phyical_port = p0. > > H1.pf1.phyical_port = p1. > > H2.pf0.phyical_port = p0. > > H2.pf1.phyical_port = p1. > > Multi-host (two) host setup with separate physical port for each host: > H1.pf0.phyical_port = p0 > H2.pf0.phyical_port = p1 > > Does above use case make sense for mlx, it seems a common case for our > internal use. It is mlx5 use case too. > > > > >> 1. I suppose each PF has it's devlink instance for mlx case(I suppose each > >> VF can not have it's own devlink instance for VF shares the same physical > >> port with PF, right?). > > VF and SF ports are of flavour VIRTUAL. > > Which devlink instance does the flavour VIRTUAL port instance for VF and SF > is registered to? To the devlink instance of the VF/SF. > Does it mean VF has it's own devlink instance in VM when it is passed a VM, > and flavour VIRTUAL port instance for that VF is registered to that devlink > instance in the VM too? Yes. > > Even in the same host as PF, the VF also has it's own devlink instance? Yes. > > > > >> 2. each PF's devlink instance has three types of port, which is > >> FLAVOUR_PHYSICAL, FLAVOUR_PCI_PF and > FLAVOUR_PCI_VF(supposing I > >> understand > >> port flavour correctly). > >> > > FLAVOUR_PCI_{PF,VF,SF} belongs to eswitch (representor) side on > switchdev device. > > If devlink instance or eswitch is in DEVLINK_ESWITCH_MODE_LEGACY mode, > the FLAVOUR_PCI_{PF,VF,SF} port instance does not need to created? No. in eswitch legacy, there are no representor netdevice or devlink ports. > > > > >> If I understand above correctly, all ports in the same devlink > >> instance should have the same controller id, right? If yes, why not > >> put the controller id in the devlink instance? > > Need not be. All PCI_{PF,VF,SF} can have controller id different for > different controllers. > > The point is that two VF from different PF may be in the different host, all VF > of a specific PF need to be in the same host, right? > otherwise it may break PCI enumeration process? > Sure. VFs belong to PF, PF belong to controller, controller is plugged into a host root complex. > If yes, as PCI_{PF,VF,SF} belongs to eswitch (representor) side on switchdev > device(which means PCI_{PF,VF,SF} port instance is in the same host, as the > host corresponding to "controller_num=0" in diagram [1]), so it seems all the > PCI_{PF,VF,SF} of a specific PF should have the same controller id, Yes. > and using > a controller id of the devlink instance in "controller_num=0" in diagram [1] > seems enough? Yes. > > > Usually each multi-host is a different controller. > > Refer to this diagram [1] and detailed description. > > devlink instance does not exist in the host corresponding to > "controller_num=1" in diagram [1]? Devlink instance do exist for controller=1 related PCI PF,VF,SF devices when those functions are plugged in the host. > Or devlink instance does exist in the host corresponding to > "controller_num=1", but the mode of that devlink instance is > DEVLINK_ESWITCH_MODE_LEGACY in diagram [1]? As you can see that eswitch is located only on controller=0. This eswitch is serving PF, VF, SFs of controller=1 + controlloler=0 as well. > > Also, eswitch mode can only be set on the devlink instance corresponding to > PF, but not for VF/SF(supposing that VF/SF could have it's own devlink > instance too), right? Yes. Eswitch can be located on the VF too. Mlx5 driver doesn't have it yet on VF. This may be some nested eswitch in future. I do not know when. > by the network/sysadmin. > > While devlink instance of a given PF,VF,SF is managed by the user of such > function itself. > > 'devlink port function' means "struct devlink_port", right? 'function' is the object managing the function connected on the otherside of this port. This includes its hw_addr, rate, state, operational state. > It seems 'devlink port function' in the host is representing a VF when devlink > instance of that VF is in the VM? Right. > > > For example when a VF is mapped to a VM, devlink instance of this VF > resides in the VM managed by the guest VM. > > Does the user in VM really care about devlink info or configuration when > network/sysadmin has configured the VF through 'devlink port function' > in the host? Yes. devlink instance offers many knobs in uniform way on PF, VF, SF. They are in use in mlx5 for devlink params, reload, net ns. > which devlink info or configuration does user need to query or configure in a > VM? Usually not much. Few examples that mlx5 users do with devlink instance of VF in a VM are, devlink params, devlink reload, board info. Health reporters, health recovery to name few.
Hi David, > From: David Ahern <dsahern@gmail.com> > Sent: Monday, June 7, 2021 8:31 AM > > On 6/3/21 5:19 AM, Parav Pandit wrote: > > @@ -3795,7 +3806,7 @@ static void cmd_port_help(void) > > pr_err(" devlink port param set DEV/PORT_INDEX name > PARAMETER value VALUE cmode { permanent | driverinit | runtime }\n"); > > pr_err(" devlink port param show [DEV/PORT_INDEX name > PARAMETER]\n"); > > pr_err(" devlink port health show [ DEV/PORT_INDEX reporter > REPORTER_NAME ]\n"); > > - pr_err(" devlink port add DEV/PORT_INDEX flavour FLAVOUR > pfnum PFNUM [ sfnum SFNUM ]\n"); > > + pr_err(" devlink port add DEV/PORT_INDEX flavour FLAVOUR > pfnum PFNUM [ sfnum SFNUM ] [ controller CNUM ]\n"); > > pr_err(" devlink port del DEV/PORT_INDEX\n"); > > } > > > > @@ -4324,7 +4335,7 @@ static int __cmd_health_show(struct dl *dl, bool > > show_device, bool show_port); > > > > static void cmd_port_add_help(void) > > { > > - pr_err(" devlink port add { DEV | DEV/PORT_INDEX } flavour > FLAVOUR pfnum PFNUM [ sfnum SFNUM ]\n"); > > + pr_err(" devlink port add { DEV | DEV/PORT_INDEX } flavour > FLAVOUR pfnum PFNUM [ sfnum SFNUM ] [ controller CNUM ]\n"); > > This line and the one above need to be wrapped. This addition puts it well > into the 90s. > It’s a print message. I was following coding style of [1] that says "However, never break user-visible strings such as printk messages because that breaks the ability to grep for them.". Recent code of dcb_ets.c has similar long string in print. So I didn't wrap it. Should we warp it? [1] https://www.kernel.org/doc/html/latest/process/coding-style.html#breaking-long-lines-and-strings
On 6/7/21 5:43 AM, Parav Pandit wrote: > Hi David, > >> From: David Ahern <dsahern@gmail.com> >> Sent: Monday, June 7, 2021 8:31 AM >> >> On 6/3/21 5:19 AM, Parav Pandit wrote: >>> @@ -3795,7 +3806,7 @@ static void cmd_port_help(void) >>> pr_err(" devlink port param set DEV/PORT_INDEX name >> PARAMETER value VALUE cmode { permanent | driverinit | runtime }\n"); >>> pr_err(" devlink port param show [DEV/PORT_INDEX name >> PARAMETER]\n"); >>> pr_err(" devlink port health show [ DEV/PORT_INDEX reporter >> REPORTER_NAME ]\n"); >>> - pr_err(" devlink port add DEV/PORT_INDEX flavour FLAVOUR >> pfnum PFNUM [ sfnum SFNUM ]\n"); >>> + pr_err(" devlink port add DEV/PORT_INDEX flavour FLAVOUR >> pfnum PFNUM [ sfnum SFNUM ] [ controller CNUM ]\n"); >>> pr_err(" devlink port del DEV/PORT_INDEX\n"); >>> } >>> >>> @@ -4324,7 +4335,7 @@ static int __cmd_health_show(struct dl *dl, bool >>> show_device, bool show_port); >>> >>> static void cmd_port_add_help(void) >>> { >>> - pr_err(" devlink port add { DEV | DEV/PORT_INDEX } flavour >> FLAVOUR pfnum PFNUM [ sfnum SFNUM ]\n"); >>> + pr_err(" devlink port add { DEV | DEV/PORT_INDEX } flavour >> FLAVOUR pfnum PFNUM [ sfnum SFNUM ] [ controller CNUM ]\n"); >> >> This line and the one above need to be wrapped. This addition puts it well >> into the 90s. >> > It’s a print message. > I was following coding style of [1] that says "However, never break user-visible strings such as printk messages because that breaks the ability to grep for them.". > Recent code of dcb_ets.c has similar long string in print. So I didn't wrap it. I missed that when reviewing the dcb command then. > Should we warp it? > > [1] https://www.kernel.org/doc/html/latest/process/coding-style.html#breaking-long-lines-and-strings > [1] is referring to messages from kernel code, and I agree with that style. This is help message from iproute2. I tend to keep my terminal widths between 80 and 90 columns, so the long help lines from commands are not very friendly causing me to resize the terminal.
> From: David Ahern <dsahern@gmail.com> > Sent: Monday, June 7, 2021 8:11 PM > > On 6/7/21 5:43 AM, Parav Pandit wrote: > > Hi David, > > > >> From: David Ahern <dsahern@gmail.com> > >> Sent: Monday, June 7, 2021 8:31 AM > >> > >> On 6/3/21 5:19 AM, Parav Pandit wrote: > >>> @@ -3795,7 +3806,7 @@ static void cmd_port_help(void) > >>> pr_err(" devlink port param set DEV/PORT_INDEX name > >> PARAMETER value VALUE cmode { permanent | driverinit | runtime }\n"); > >>> pr_err(" devlink port param show [DEV/PORT_INDEX name > >> PARAMETER]\n"); > >>> pr_err(" devlink port health show [ DEV/PORT_INDEX reporter > >> REPORTER_NAME ]\n"); > >>> - pr_err(" devlink port add DEV/PORT_INDEX flavour FLAVOUR > >> pfnum PFNUM [ sfnum SFNUM ]\n"); > >>> + pr_err(" devlink port add DEV/PORT_INDEX flavour FLAVOUR > >> pfnum PFNUM [ sfnum SFNUM ] [ controller CNUM ]\n"); > >>> pr_err(" devlink port del DEV/PORT_INDEX\n"); > >>> } > >>> > >>> @@ -4324,7 +4335,7 @@ static int __cmd_health_show(struct dl *dl, > >>> bool show_device, bool show_port); > >>> > >>> static void cmd_port_add_help(void) { > >>> - pr_err(" devlink port add { DEV | DEV/PORT_INDEX } flavour > >> FLAVOUR pfnum PFNUM [ sfnum SFNUM ]\n"); > >>> + pr_err(" devlink port add { DEV | DEV/PORT_INDEX } flavour > >> FLAVOUR pfnum PFNUM [ sfnum SFNUM ] [ controller CNUM ]\n"); > >> > >> This line and the one above need to be wrapped. This addition puts it > >> well into the 90s. > >> > > It’s a print message. > > I was following coding style of [1] that says "However, never break user- > visible strings such as printk messages because that breaks the ability to grep > for them.". > > Recent code of dcb_ets.c has similar long string in print. So I didn't wrap it. > > I missed that when reviewing the dcb command then. > > > Should we warp it? > > > > [1] > > https://www.kernel.org/doc/html/latest/process/coding-style.html#break > > ing-long-lines-and-strings > > > > [1] is referring to messages from kernel code, and I agree with that style. This > is help message from iproute2. I tend to keep my terminal widths between > 80 and 90 columns, so the long help lines from commands are not very > friendly causing me to resize the terminal. I see. So do you recommend splitting the print message? I personally feel easier to follow kernel coding standard as much possible in spirit of "grep them".
On 6/7/21 9:12 AM, Parav Pandit wrote: > I see. So do you recommend splitting the print message? > I personally feel easier to follow kernel coding standard as much possible in spirit of "grep them".
On 6/7/21 9:12 AM, Parav Pandit wrote: > > >> From: David Ahern <dsahern@gmail.com> >> Sent: Monday, June 7, 2021 8:11 PM >> >> On 6/7/21 5:43 AM, Parav Pandit wrote: >>> Hi David, >>> >>>> From: David Ahern <dsahern@gmail.com> >>>> Sent: Monday, June 7, 2021 8:31 AM >>>> >>>> On 6/3/21 5:19 AM, Parav Pandit wrote: >>>>> @@ -3795,7 +3806,7 @@ static void cmd_port_help(void) >>>>> pr_err(" devlink port param set DEV/PORT_INDEX name >>>> PARAMETER value VALUE cmode { permanent | driverinit | runtime }\n"); >>>>> pr_err(" devlink port param show [DEV/PORT_INDEX name >>>> PARAMETER]\n"); >>>>> pr_err(" devlink port health show [ DEV/PORT_INDEX reporter >>>> REPORTER_NAME ]\n"); >>>>> - pr_err(" devlink port add DEV/PORT_INDEX flavour FLAVOUR >>>> pfnum PFNUM [ sfnum SFNUM ]\n"); >>>>> + pr_err(" devlink port add DEV/PORT_INDEX flavour FLAVOUR >>>> pfnum PFNUM [ sfnum SFNUM ] [ controller CNUM ]\n"); >>>>> pr_err(" devlink port del DEV/PORT_INDEX\n"); >>>>> } >>>>> >>>>> @@ -4324,7 +4335,7 @@ static int __cmd_health_show(struct dl *dl, >>>>> bool show_device, bool show_port); >>>>> >>>>> static void cmd_port_add_help(void) { >>>>> - pr_err(" devlink port add { DEV | DEV/PORT_INDEX } flavour >>>> FLAVOUR pfnum PFNUM [ sfnum SFNUM ]\n"); >>>>> + pr_err(" devlink port add { DEV | DEV/PORT_INDEX } flavour >>>> FLAVOUR pfnum PFNUM [ sfnum SFNUM ] [ controller CNUM ]\n"); >>>> >>>> This line and the one above need to be wrapped. This addition puts it >>>> well into the 90s. >>>> >>> It’s a print message. >>> I was following coding style of [1] that says "However, never break user- >> visible strings such as printk messages because that breaks the ability to grep >> for them.". >>> Recent code of dcb_ets.c has similar long string in print. So I didn't wrap it. >> >> I missed that when reviewing the dcb command then. >> >>> Should we warp it? >>> >>> [1] >>> https://www.kernel.org/doc/html/latest/process/coding-style.html#break >>> ing-long-lines-and-strings >>> >> >> [1] is referring to messages from kernel code, and I agree with that style. This >> is help message from iproute2. I tend to keep my terminal widths between >> 80 and 90 columns, so the long help lines from commands are not very >> friendly causing me to resize the terminal. > I see. So do you recommend splitting the print message? > I personally feel easier to follow kernel coding standard as much possible in spirit of "grep them".
> From: David Ahern <dsahern@gmail.com> > Sent: Monday, June 7, 2021 9:44 PM > > On 6/7/21 9:12 AM, Parav Pandit wrote: > > > > > >> From: David Ahern <dsahern@gmail.com> > >> Sent: Monday, June 7, 2021 8:11 PM > >> > >> On 6/7/21 5:43 AM, Parav Pandit wrote: > >>> Hi David, > >>> > >>>> From: David Ahern <dsahern@gmail.com> > >>>> Sent: Monday, June 7, 2021 8:31 AM > >>>> > >>>> On 6/3/21 5:19 AM, Parav Pandit wrote: > >>>>> @@ -3795,7 +3806,7 @@ static void cmd_port_help(void) > >>>>> pr_err(" devlink port param set DEV/PORT_INDEX name > >>>> PARAMETER value VALUE cmode { permanent | driverinit | runtime > >>>> }\n"); > >>>>> pr_err(" devlink port param show [DEV/PORT_INDEX name > >>>> PARAMETER]\n"); > >>>>> pr_err(" devlink port health show [ DEV/PORT_INDEX reporter > >>>> REPORTER_NAME ]\n"); > >>>>> - pr_err(" devlink port add DEV/PORT_INDEX flavour FLAVOUR > >>>> pfnum PFNUM [ sfnum SFNUM ]\n"); > >>>>> + pr_err(" devlink port add DEV/PORT_INDEX flavour > FLAVOUR > >>>> pfnum PFNUM [ sfnum SFNUM ] [ controller CNUM ]\n"); > >>>>> pr_err(" devlink port del DEV/PORT_INDEX\n"); > >>>>> } > >>>>> > >>>>> @@ -4324,7 +4335,7 @@ static int __cmd_health_show(struct dl *dl, > >>>>> bool show_device, bool show_port); > >>>>> > >>>>> static void cmd_port_add_help(void) { > >>>>> - pr_err(" devlink port add { DEV | DEV/PORT_INDEX } flavour > >>>> FLAVOUR pfnum PFNUM [ sfnum SFNUM ]\n"); > >>>>> + pr_err(" devlink port add { DEV | DEV/PORT_INDEX } > flavour > >>>> FLAVOUR pfnum PFNUM [ sfnum SFNUM ] [ controller CNUM ]\n"); > >>>> > >>>> This line and the one above need to be wrapped. This addition puts > >>>> it well into the 90s. > >>>> > >>> It’s a print message. > >>> I was following coding style of [1] that says "However, never break > >>> user- > >> visible strings such as printk messages because that breaks the > >> ability to grep for them.". > >>> Recent code of dcb_ets.c has similar long string in print. So I didn't wrap > it. > >> > >> I missed that when reviewing the dcb command then. > >> > >>> Should we warp it? > >>> > >>> [1] > >>> https://www.kernel.org/doc/html/latest/process/coding-style.html#bre > >>> ak > >>> ing-long-lines-and-strings > >>> > >> > >> [1] is referring to messages from kernel code, and I agree with that > >> style. This is help message from iproute2. I tend to keep my terminal > >> widths between > >> 80 and 90 columns, so the long help lines from commands are not very > >> friendly causing me to resize the terminal. > > I see. So do you recommend splitting the print message? > > I personally feel easier to follow kernel coding standard as much > > possible in spirit of "grep them".
On 2021/6/7 19:12, Parav Pandit wrote: >> From: Yunsheng Lin <linyunsheng@huawei.com> >> Sent: Monday, June 7, 2021 4:27 PM >> [..] >>> >>>> 2. each PF's devlink instance has three types of port, which is >>>> FLAVOUR_PHYSICAL, FLAVOUR_PCI_PF and >> FLAVOUR_PCI_VF(supposing I >>>> understand >>>> port flavour correctly). >>>> >>> FLAVOUR_PCI_{PF,VF,SF} belongs to eswitch (representor) side on >> switchdev device. >> >> If devlink instance or eswitch is in DEVLINK_ESWITCH_MODE_LEGACY mode, >> the FLAVOUR_PCI_{PF,VF,SF} port instance does not need to created? > No. in eswitch legacy, there are no representor netdevice or devlink ports. It seems each devlink port instance corresponds to a netdevice. More specificly, the devlink instance is created in the struct pci_driver' probe function of a pci function, a devlink port instance is created and registered to that devlink instance when a netdev of that pci function is created? As in diagram [1], the devlink port instance(flavour FLAVOUR_PHYSICAL) for ctrl-0-pf0 is created when the netdev of ctrl-0-pf0 is created in the host of smartNIC, the devlink port instance(flavour FLAVOUR_VIRTUAL) for ctrl-0-pf0vfN is created when the netdev of ctrl-0-pf0vfN is created in the host of smartNIC, right? When eswitch mode is set to DEVLINK_ESWITCH_MODE_SWITCHDEV, the representor netdev for PF/VF in "controller_num=1" is created in the host of smartNIC, so is the devlink port instance(FLAVOUR_PCI_{PF,VF,SF}) corresponding to that representor netdev just created in the host of smartNIC? More specificly, devlink port instance(flavour FLAVOUR_PCI_PF) for ctrl-1-pf0 and devlink port instance (flavour FLAVOUR_PCI_VF)for ctrl-1-pf0vfN? When "controller_num=1" is plugged to a server, the server host creates devlink instance and devlink port instance in the host of server as similar as the ctrl-0-pf0 and ctrl-0-pf0vfN in the host of smartNIC? > >> >>> >>>> If I understand above correctly, all ports in the same devlink >>>> instance should have the same controller id, right? If yes, why not >>>> put the controller id in the devlink instance? >>> Need not be. All PCI_{PF,VF,SF} can have controller id different for >> different controllers. >> >> The point is that two VF from different PF may be in the different host, all VF >> of a specific PF need to be in the same host, right? >> otherwise it may break PCI enumeration process? >> > Sure. VFs belong to PF, PF belong to controller, controller is plugged into a host root complex. > >> If yes, as PCI_{PF,VF,SF} belongs to eswitch (representor) side on switchdev >> device(which means PCI_{PF,VF,SF} port instance is in the same host, as the >> host corresponding to "controller_num=0" in diagram [1]), so it seems all the >> PCI_{PF,VF,SF} of a specific PF should have the same controller id, > Yes. > >> and using >> a controller id of the devlink instance in "controller_num=0" in diagram [1] >> seems enough? > Yes. > >> >>> Usually each multi-host is a different controller. >>> Refer to this diagram [1] and detailed description. >> >> devlink instance does not exist in the host corresponding to >> "controller_num=1" in diagram [1]? > Devlink instance do exist for controller=1 related PCI PF,VF,SF devices when those functions are plugged in the host. > >> Or devlink instance does exist in the host corresponding to >> "controller_num=1", but the mode of that devlink instance is >> DEVLINK_ESWITCH_MODE_LEGACY in diagram [1]? > As you can see that eswitch is located only on controller=0. > This eswitch is serving PF, VF, SFs of controller=1 + controlloler=0 as well. How do we decide where eswitch is located? through some fw/hw configuration? It seems if the eswitch is enabled on "controller=1", that is a nested eswitch too, which you mentioned below? >> >> Also, eswitch mode can only be set on the devlink instance corresponding to >> PF, but not for VF/SF(supposing that VF/SF could have it's own devlink >> instance too), right? > Yes. Eswitch can be located on the VF too. Mlx5 driver doesn't have it yet on VF. > This may be some nested eswitch in future. I do not know when. > >> by the network/sysadmin. >>> While devlink instance of a given PF,VF,SF is managed by the user of such >> function itself. >> >> 'devlink port function' means "struct devlink_port", right? > 'function' is the object managing the function connected on the otherside of this port. > This includes its hw_addr, rate, state, operational state. Does "other side of this port" means the pci function that is most likely have been passed through to a VM? "devlink port" without the "function" represents the representor netdev on the host where eswitch is located? > >> It seems 'devlink port function' in the host is representing a VF when devlink >> instance of that VF is in the VM? > Right. >> >>> For example when a VF is mapped to a VM, devlink instance of this VF >> resides in the VM managed by the guest VM. >> >> Does the user in VM really care about devlink info or configuration when >> network/sysadmin has configured the VF through 'devlink port function' >> in the host? > Yes. devlink instance offers many knobs in uniform way on PF, VF, SF. > They are in use in mlx5 for devlink params, reload, net ns. "net ns" refer to "net namespace", right? I am not sure how devlink instance related to net namespace yet. I thought devlink is not limited to networking, it can be used in other pcie device other than ethernet device? > >> which devlink info or configuration does user need to query or configure in a >> VM? > Usually not much. > Few examples that mlx5 users do with devlink instance of VF in a VM are, devlink params, devlink reload, board info. Health reporters, health recovery to name few. >
> From: Yunsheng Lin <linyunsheng@huawei.com> > Sent: Tuesday, June 8, 2021 8:58 AM > > On 2021/6/7 19:12, Parav Pandit wrote: > >> From: Yunsheng Lin <linyunsheng@huawei.com> > >> Sent: Monday, June 7, 2021 4:27 PM > >> > > [..] > > >>> > >>>> 2. each PF's devlink instance has three types of port, which is > >>>> FLAVOUR_PHYSICAL, FLAVOUR_PCI_PF and > >> FLAVOUR_PCI_VF(supposing I > >>>> understand > >>>> port flavour correctly). > >>>> > >>> FLAVOUR_PCI_{PF,VF,SF} belongs to eswitch (representor) side on > >> switchdev device. > >> > >> If devlink instance or eswitch is in DEVLINK_ESWITCH_MODE_LEGACY > >> mode, the FLAVOUR_PCI_{PF,VF,SF} port instance does not need to > created? > > No. in eswitch legacy, there are no representor netdevice or devlink ports. > > It seems each devlink port instance corresponds to a netdevice. > More specificly, the devlink instance is created in the struct pci_driver' probe > function of a pci function, a devlink port instance is created and registered to > that devlink instance when a netdev of that pci function is created? > Yes. > As in diagram [1], the devlink port instance(flavour FLAVOUR_PHYSICAL) for > ctrl-0-pf0 is created when the netdev of ctrl-0-pf0 is created in the host of > smartNIC, the devlink port instance(flavour FLAVOUR_VIRTUAL) for ctrl-0- > pf0vfN is created when the netdev of ctrl-0-pf0vfN is created in the host of > smartNIC, right? > Ctrl-0-pf0vfN, ctrl-0-pf0 ports are eswitch ports. They are created where there is eswitch. Usually in smartnic where eswitch is located. > When eswitch mode is set to DEVLINK_ESWITCH_MODE_SWITCHDEV, the > representor netdev for PF/VF in "controller_num=1" is created in the host of > smartNIC, so is the devlink port instance(FLAVOUR_PCI_{PF,VF,SF}) > corresponding to that representor netdev just created in the host of > smartNIC? More specificly, devlink port instance(flavour FLAVOUR_PCI_PF) > for ctrl-1-pf0 and devlink port instance (flavour FLAVOUR_PCI_VF)for ctrl-1- > pf0vfN? > I do not know what you mean by "host of smartnic". But as clarified in diagram and above, They are created on the eswitch device. > When "controller_num=1" is plugged to a server, the server host creates > devlink instance and devlink port instance in the host of server as similar as > the ctrl-0-pf0 and ctrl-0-pf0vfN in the host of smartNIC? Since eswitch is not in the controller=1, controller 1 is just hosting PCI PF, VF, SF functions. Either it has PHYSICAL or VIRTUAL ports along with its netdevice and devlink instance. > >> Or devlink instance does exist in the host corresponding to > >> "controller_num=1", but the mode of that devlink instance is > >> DEVLINK_ESWITCH_MODE_LEGACY in diagram [1]? > > As you can see that eswitch is located only on controller=0. > > This eswitch is serving PF, VF, SFs of controller=1 + controlloler=0 as well. > > How do we decide where eswitch is located? through some fw/hw > configuration? This is usually decided by the hardware. > > It seems if the eswitch is enabled on "controller=1", that is a nested eswitch > too, which you mentioned below? Yes. > > >> > >> Also, eswitch mode can only be set on the devlink instance > >> corresponding to PF, but not for VF/SF(supposing that VF/SF could > >> have it's own devlink instance too), right? > > Yes. Eswitch can be located on the VF too. Mlx5 driver doesn't have it yet > on VF. > > This may be some nested eswitch in future. I do not know when. > > > >> by the network/sysadmin. > >>> While devlink instance of a given PF,VF,SF is managed by the user of > >>> such > >> function itself. > >> > >> 'devlink port function' means "struct devlink_port", right? > > 'function' is the object managing the function connected on the otherside > of this port. > > This includes its hw_addr, rate, state, operational state. > > Does "other side of this port" means the pci function that is most likely have > been passed through to a VM? > Yes. > "devlink port" without the "function" represents the representor netdev on > the host where eswitch is located? > Yes. > > > >> It seems 'devlink port function' in the host is representing a VF > >> when devlink instance of that VF is in the VM? > > Right. > >> > >>> For example when a VF is mapped to a VM, devlink instance of this VF > >> resides in the VM managed by the guest VM. > >> > >> Does the user in VM really care about devlink info or configuration > >> when network/sysadmin has configured the VF through 'devlink port > function' > >> in the host? > > Yes. devlink instance offers many knobs in uniform way on PF, VF, SF. > > They are in use in mlx5 for devlink params, reload, net ns. > > "net ns" refer to "net namespace", right? Yes. > I am not sure how devlink instance related to net namespace yet. > I thought devlink is not limited to networking, it can be used in other pcie > device other than ethernet device? It can be. And it can be used for networking devices too and hence it is attached to single net ns.
On 2021/6/8 13:26, Parav Pandit wrote: >> From: Yunsheng Lin <linyunsheng@huawei.com> >> Sent: Tuesday, June 8, 2021 8:58 AM >> >> On 2021/6/7 19:12, Parav Pandit wrote: >>>> From: Yunsheng Lin <linyunsheng@huawei.com> >>>> Sent: Monday, June 7, 2021 4:27 PM >>>> >> >> [..] >> >>>>> >>>>>> 2. each PF's devlink instance has three types of port, which is >>>>>> FLAVOUR_PHYSICAL, FLAVOUR_PCI_PF and >>>> FLAVOUR_PCI_VF(supposing I >>>>>> understand >>>>>> port flavour correctly). >>>>>> >>>>> FLAVOUR_PCI_{PF,VF,SF} belongs to eswitch (representor) side on >>>> switchdev device. >>>> >>>> If devlink instance or eswitch is in DEVLINK_ESWITCH_MODE_LEGACY >>>> mode, the FLAVOUR_PCI_{PF,VF,SF} port instance does not need to >> created? >>> No. in eswitch legacy, there are no representor netdevice or devlink ports. >> >> It seems each devlink port instance corresponds to a netdevice. >> More specificly, the devlink instance is created in the struct pci_driver' probe >> function of a pci function, a devlink port instance is created and registered to >> that devlink instance when a netdev of that pci function is created? >> > Yes. > >> As in diagram [1], the devlink port instance(flavour FLAVOUR_PHYSICAL) for >> ctrl-0-pf0 is created when the netdev of ctrl-0-pf0 is created in the host of >> smartNIC, the devlink port instance(flavour FLAVOUR_VIRTUAL) for ctrl-0- >> pf0vfN is created when the netdev of ctrl-0-pf0vfN is created in the host of >> smartNIC, right? >> > Ctrl-0-pf0vfN, ctrl-0-pf0 ports are eswitch ports. They are created where there is eswitch. > Usually in smartnic where eswitch is located. Does diagram in [1] corresponds to the multi-host (two) host setup as memtioned previously? H1.pf0.phyical_port = p0. H1.pf1.phyical_port = p1. H2.pf0.phyical_port = p0. H2.pf1.phyical_port = p1. Let's say H1 = server and H2 = smartNIC as the pci rc connected to below: --------------------------------------------------------- | | | --------- --------- ------- ------- | ----------- | | vf(s) | | sf(s) | |vf(s)| |sf(s)| | | server | | ------- ----/---- ---/----- ------- ---/--- ---/--- | | pci rc |=== | pf0 |______/________/ | pf1 |___/_______/ | | connect | | ------- ------- | ----------- | | controller_num=1 (no eswitch) | ------|-------------------------------------------------- (internal wire) | --------------------------------------------------------- | devlink eswitch ports and reps | | ----------------------------------------------------- | | |ctrl-0 | ctrl-0 | ctrl-0 | ctrl-0 | ctrl-0 |ctrl-0 | | | |pf0 | pf0vfN | pf0sfN | pf1 | pf1vfN |pf1sfN | | | ----------------------------------------------------- | | |ctrl-1 | ctrl-1 | ctrl-1 | ctrl-1 | ctrl-1 |ctrl-1 | | | |pf0 | pf0vfN | pf0sfN | pf1 | pf1vfN |pf1sfN | | | ----------------------------------------------------- | | | | | ----------- | --------- --------- ------- ------- | | smartNIC| | | vf(s) | | sf(s) | |vf(s)| |sf(s)| | | pci rc |==| ------- ----/---- ---/----- ------- ---/--- ---/--- | | connect | | | pf0 |______/________/ | pf1 |___/_______/ | ----------- | ------- ------- | | | | local controller_num=0 (eswitch) | --------------------------------------------------------- A vanilla kernel can run on the smartNIC host, right? what the smartNIC host see is two PF corresponding to ctrl-0-pf0 and ctrl-0-pf1 When the kernel is boot up first and mlx driver is not loaded yet, right? I am not sure it is ok to leave out the VF and SF, but let's leave them out for simplicity now. When mlx driver is loaded, two devlink instances are created, which corresponds to ctrl-0-pf0 and ctrl-0-pf1, and two devlink port instances (flavour FLAVOUR_PHYSICAL) is created and registered to corresponding devlink instances just created, right? As the eswitch mode is based on devlink instance, Let's only set the mode of ctrl-0-pf0' devlink instance to DEVLINK_ESWITCH_MODE_SWITCHDEV, the representor netdev of ctrl-1-pf0 is created and devlink port instance of that representor netdev is created and registered to devlink instances corresponding to ctrl-0-pf0? I think I miss something here, the above does not seems right, because: 1. For single host case:the PF is not passed through to the VM, devlink port instance of VF's representor netdev can be registered to the devlink instance corresponding to it's PF, right? 2. But for two-host case as above, do we need to create a devlink instances for the PF corresponding to ctrl-1-pf0 in smartNIC host? >
> From: Yunsheng Lin <linyunsheng@huawei.com> > Sent: Tuesday, June 8, 2021 1:06 PM > > On 2021/6/8 13:26, Parav Pandit wrote: > >> From: Yunsheng Lin <linyunsheng@huawei.com> > >> Sent: Tuesday, June 8, 2021 8:58 AM > >> > >> On 2021/6/7 19:12, Parav Pandit wrote: > >>>> From: Yunsheng Lin <linyunsheng@huawei.com> > >>>> Sent: Monday, June 7, 2021 4:27 PM > >>>> > >> > >> [..] > >> > >>>>> > >>>>>> 2. each PF's devlink instance has three types of port, which is > >>>>>> FLAVOUR_PHYSICAL, FLAVOUR_PCI_PF and > >>>> FLAVOUR_PCI_VF(supposing I > >>>>>> understand > >>>>>> port flavour correctly). > >>>>>> > >>>>> FLAVOUR_PCI_{PF,VF,SF} belongs to eswitch (representor) side on > >>>> switchdev device. > >>>> > >>>> If devlink instance or eswitch is in DEVLINK_ESWITCH_MODE_LEGACY > >>>> mode, the FLAVOUR_PCI_{PF,VF,SF} port instance does not need to > >> created? > >>> No. in eswitch legacy, there are no representor netdevice or devlink > ports. > >> > >> It seems each devlink port instance corresponds to a netdevice. > >> More specificly, the devlink instance is created in the struct > >> pci_driver' probe function of a pci function, a devlink port instance > >> is created and registered to that devlink instance when a netdev of that > pci function is created? > >> > > Yes. > > > >> As in diagram [1], the devlink port instance(flavour > >> FLAVOUR_PHYSICAL) for > >> ctrl-0-pf0 is created when the netdev of ctrl-0-pf0 is created in the > >> host of smartNIC, the devlink port instance(flavour FLAVOUR_VIRTUAL) > >> for ctrl-0- pf0vfN is created when the netdev of ctrl-0-pf0vfN is > >> created in the host of smartNIC, right? > >> > > Ctrl-0-pf0vfN, ctrl-0-pf0 ports are eswitch ports. They are created where > there is eswitch. > > Usually in smartnic where eswitch is located. > > Does diagram in [1] corresponds to the multi-host (two) host setup as > memtioned previously? > H1.pf0.phyical_port = p0. > H1.pf1.phyical_port = p1. > H2.pf0.phyical_port = p0. > H2.pf1.phyical_port = p1. > Yes. > Let's say H1 = server and H2 = smartNIC as the pci rc connected to below: > --------------------------------------------------------- > | | > | --------- --------- ------- ------- | > ----------- | | vf(s) | | sf(s) | |vf(s)| |sf(s)| | > | server | | ------- ----/---- ---/----- ------- ---/--- ---/--- | > | pci rc |=== | pf0 |______/________/ | pf1 |___/_______/ | > | connect | | ------- ------- | > ----------- | | controller_num=1 (no eswitch) | > ------|-------------------------------------------------- > (internal wire) > | > --------------------------------------------------------- > | devlink eswitch ports and reps | > | ----------------------------------------------------- | > | |ctrl-0 | ctrl-0 | ctrl-0 | ctrl-0 | ctrl-0 |ctrl-0 | | > | |pf0 | pf0vfN | pf0sfN | pf1 | pf1vfN |pf1sfN | | > | ----------------------------------------------------- | > | |ctrl-1 | ctrl-1 | ctrl-1 | ctrl-1 | ctrl-1 |ctrl-1 | | > | |pf0 | pf0vfN | pf0sfN | pf1 | pf1vfN |pf1sfN | | > | ----------------------------------------------------- | > | | > | | > ----------- | --------- --------- ------- ------- | > | smartNIC| | | vf(s) | | sf(s) | |vf(s)| |sf(s)| | > | pci rc |==| ------- ----/---- ---/----- ------- ---/--- ---/--- | > | connect | | | pf0 |______/________/ | pf1 |___/_______/ | > ----------- | ------- ------- | > | | > | local controller_num=0 (eswitch) | > --------------------------------------------------------- > > A vanilla kernel can run on the smartNIC host, right? Right. > what the smartNIC host see is two PF corresponding to ctrl-0-pf0 and > ctrl-0-pf1 When the kernel is boot up first and mlx driver is not loaded yet, > right? > > I am not sure it is ok to leave out the VF and SF, but let's leave them out for > simplicity now. > When mlx driver is loaded, two devlink instances are created, which > corresponds to ctrl-0-pf0 and ctrl-0-pf1, and two devlink port instances > (flavour FLAVOUR_PHYSICAL) is created and registered to corresponding > devlink instances just created, right? > > As the eswitch mode is based on devlink instance, Let's only set the mode of > ctrl-0-pf0' devlink instance to DEVLINK_ESWITCH_MODE_SWITCHDEV, the > representor netdev of ctrl-1-pf0 is created and devlink port instance of that > representor netdev is created and registered to devlink instances > corresponding to ctrl-0-pf0? > > I think I miss something here, the above does not seems right, because: > 1. For single host case:the PF is not passed through to the VM, devlink port > instance of VF's representor netdev can be registered to the devlink > instance > corresponding to it's PF, right? Yes, if I understand your question right. > 2. But for two-host case as above, do we need to create a devlink instances > for the PF corresponding to ctrl-1-pf0 in smartNIC host? You can choose not to create a devlink instance in external controller PF. It may not be even a Linux OS running there. I read questions few more times, but I find it hard to understand what you really want to ask. Not sure I understood you. Trying again, The model is really very straight forward as visible in the diagram. There is one PF that has the eswitch. Eswitch contains representor ports. Each representor port represent either PF, VF or SF. This PF, VF or SF can be of local controller residing on the eswitch device or it can be of an external controller(s). Here external controller = 1. Every single PF, VF, SF has devlink instance including the eswitch PF and PF of external controller (often called as external host). Why such devlink instance exists? -> I explained you before in [1]. [1] https://lore.kernel.org/netdev/PH0PR12MB5481FB8528A90E34FA3578C1DC389@PH0PR12MB5481.namprd12.prod.outlook.com/
On 2021/6/8 16:47, Parav Pandit wrote: >> From: Yunsheng Lin <linyunsheng@huawei.com> >> Sent: Tuesday, June 8, 2021 1:06 PM >> >> On 2021/6/8 13:26, Parav Pandit wrote: >>>> From: Yunsheng Lin <linyunsheng@huawei.com> >>>> Sent: Tuesday, June 8, 2021 8:58 AM >>>> >>>> On 2021/6/7 19:12, Parav Pandit wrote: >>>>>> From: Yunsheng Lin <linyunsheng@huawei.com> >>>>>> Sent: Monday, June 7, 2021 4:27 PM >>>>>> >>>> >>>> [..] >>>> >>>>>>> >>>>>>>> 2. each PF's devlink instance has three types of port, which is >>>>>>>> FLAVOUR_PHYSICAL, FLAVOUR_PCI_PF and >>>>>> FLAVOUR_PCI_VF(supposing I >>>>>>>> understand >>>>>>>> port flavour correctly). >>>>>>>> >>>>>>> FLAVOUR_PCI_{PF,VF,SF} belongs to eswitch (representor) side on >>>>>> switchdev device. >>>>>> >>>>>> If devlink instance or eswitch is in DEVLINK_ESWITCH_MODE_LEGACY >>>>>> mode, the FLAVOUR_PCI_{PF,VF,SF} port instance does not need to >>>> created? >>>>> No. in eswitch legacy, there are no representor netdevice or devlink >> ports. >>>> >>>> It seems each devlink port instance corresponds to a netdevice. >>>> More specificly, the devlink instance is created in the struct >>>> pci_driver' probe function of a pci function, a devlink port instance >>>> is created and registered to that devlink instance when a netdev of that >> pci function is created? >>>> >>> Yes. >>> >>>> As in diagram [1], the devlink port instance(flavour >>>> FLAVOUR_PHYSICAL) for >>>> ctrl-0-pf0 is created when the netdev of ctrl-0-pf0 is created in the >>>> host of smartNIC, the devlink port instance(flavour FLAVOUR_VIRTUAL) >>>> for ctrl-0- pf0vfN is created when the netdev of ctrl-0-pf0vfN is >>>> created in the host of smartNIC, right? >>>> >>> Ctrl-0-pf0vfN, ctrl-0-pf0 ports are eswitch ports. They are created where >> there is eswitch. >>> Usually in smartnic where eswitch is located. >> >> Does diagram in [1] corresponds to the multi-host (two) host setup as >> memtioned previously? >> H1.pf0.phyical_port = p0. >> H1.pf1.phyical_port = p1. >> H2.pf0.phyical_port = p0. >> H2.pf1.phyical_port = p1. >> > Yes. > >> Let's say H1 = server and H2 = smartNIC as the pci rc connected to below: >> --------------------------------------------------------- >> | | >> | --------- --------- ------- ------- | >> ----------- | | vf(s) | | sf(s) | |vf(s)| |sf(s)| | >> | server | | ------- ----/---- ---/----- ------- ---/--- ---/--- | >> | pci rc |=== | pf0 |______/________/ | pf1 |___/_______/ | >> | connect | | ------- ------- | >> ----------- | | controller_num=1 (no eswitch) | >> ------|-------------------------------------------------- >> (internal wire) >> | >> --------------------------------------------------------- >> | devlink eswitch ports and reps | >> | ----------------------------------------------------- | >> | |ctrl-0 | ctrl-0 | ctrl-0 | ctrl-0 | ctrl-0 |ctrl-0 | | >> | |pf0 | pf0vfN | pf0sfN | pf1 | pf1vfN |pf1sfN | | >> | ----------------------------------------------------- | >> | |ctrl-1 | ctrl-1 | ctrl-1 | ctrl-1 | ctrl-1 |ctrl-1 | | >> | |pf0 | pf0vfN | pf0sfN | pf1 | pf1vfN |pf1sfN | | >> | ----------------------------------------------------- | >> | | >> | | >> ----------- | --------- --------- ------- ------- | >> | smartNIC| | | vf(s) | | sf(s) | |vf(s)| |sf(s)| | >> | pci rc |==| ------- ----/---- ---/----- ------- ---/--- ---/--- | >> | connect | | | pf0 |______/________/ | pf1 |___/_______/ | >> ----------- | ------- ------- | >> | | >> | local controller_num=0 (eswitch) | >> --------------------------------------------------------- >> >> A vanilla kernel can run on the smartNIC host, right? > Right. > >> what the smartNIC host see is two PF corresponding to ctrl-0-pf0 and >> ctrl-0-pf1 When the kernel is boot up first and mlx driver is not loaded yet, >> right? >> >> I am not sure it is ok to leave out the VF and SF, but let's leave them out for >> simplicity now. >> When mlx driver is loaded, two devlink instances are created, which >> corresponds to ctrl-0-pf0 and ctrl-0-pf1, and two devlink port instances >> (flavour FLAVOUR_PHYSICAL) is created and registered to corresponding >> devlink instances just created, right? >> >> As the eswitch mode is based on devlink instance, Let's only set the mode of >> ctrl-0-pf0' devlink instance to DEVLINK_ESWITCH_MODE_SWITCHDEV, the >> representor netdev of ctrl-1-pf0 is created and devlink port instance of that >> representor netdev is created and registered to devlink instances >> corresponding to ctrl-0-pf0? >> >> I think I miss something here, the above does not seems right, because: >> 1. For single host case:the PF is not passed through to the VM, devlink port >> instance of VF's representor netdev can be registered to the devlink >> instance >> corresponding to it's PF, right? > Yes, if I understand your question right. > >> 2. But for two-host case as above, do we need to create a devlink instances >> for the PF corresponding to ctrl-1-pf0 in smartNIC host? > You can choose not to create a devlink instance in external controller PF. It may not be even a Linux OS running there. > > I read questions few more times, but I find it hard to understand what you really want to ask. > Not sure I understood you. > > Trying again, > > The model is really very straight forward as visible in the diagram. > > There is one PF that has the eswitch. Eswitch contains representor ports. I thought the representor ports of a PF'eswitch is decided by the function under a specific PF(For example, the PF itself and the VF under this PF)? > Each representor port represent either PF, VF or SF. > This PF, VF or SF can be of local controller residing on the eswitch device or it can be of an external controller(s). > Here external controller = 1. If I understood above correctly: The fw/hw decide which PF has the eswitch, and how many devlink/representor port does this eswitch has? Suppose PF0 of controller_num=0 in have the eswitch, and the eswitch may has devlink/representor port representing other PF, like PF1 in controller_num=0, and even PF0/PF1 in controller_num=1? > > Every single PF, VF, SF has devlink instance including the eswitch PF and PF of external controller (often called as external host). > Why such devlink instance exists? -> I explained you before in [1]. > > [1] https://lore.kernel.org/netdev/PH0PR12MB5481FB8528A90E34FA3578C1DC389@PH0PR12MB5481.namprd12.prod.outlook.com/
> From: Yunsheng Lin <linyunsheng@huawei.com> > Sent: Tuesday, June 8, 2021 3:02 PM > > On 2021/6/8 16:47, Parav Pandit wrote: > >> From: Yunsheng Lin <linyunsheng@huawei.com> > >> Sent: Tuesday, June 8, 2021 1:06 PM > >> > >> On 2021/6/8 13:26, Parav Pandit wrote: > >>>> From: Yunsheng Lin <linyunsheng@huawei.com> > >>>> Sent: Tuesday, June 8, 2021 8:58 AM > >>>> > >>>> On 2021/6/7 19:12, Parav Pandit wrote: > >>>>>> From: Yunsheng Lin <linyunsheng@huawei.com> > >>>>>> Sent: Monday, June 7, 2021 4:27 PM > >>>>>> > >>>> > >>>> [..] > >>>> > >>>>>>> > >>>>>>>> 2. each PF's devlink instance has three types of port, which is > >>>>>>>> FLAVOUR_PHYSICAL, FLAVOUR_PCI_PF and > >>>>>> FLAVOUR_PCI_VF(supposing I > >>>>>>>> understand > >>>>>>>> port flavour correctly). > >>>>>>>> > >>>>>>> FLAVOUR_PCI_{PF,VF,SF} belongs to eswitch (representor) side on > >>>>>> switchdev device. > >>>>>> > >>>>>> If devlink instance or eswitch is in > DEVLINK_ESWITCH_MODE_LEGACY > >>>>>> mode, the FLAVOUR_PCI_{PF,VF,SF} port instance does not need to > >>>> created? > >>>>> No. in eswitch legacy, there are no representor netdevice or > >>>>> devlink > >> ports. > >>>> > >>>> It seems each devlink port instance corresponds to a netdevice. > >>>> More specificly, the devlink instance is created in the struct > >>>> pci_driver' probe function of a pci function, a devlink port > >>>> instance is created and registered to that devlink instance when a > >>>> netdev of that > >> pci function is created? > >>>> > >>> Yes. > >>> > >>>> As in diagram [1], the devlink port instance(flavour > >>>> FLAVOUR_PHYSICAL) for > >>>> ctrl-0-pf0 is created when the netdev of ctrl-0-pf0 is created in > >>>> the host of smartNIC, the devlink port instance(flavour > >>>> FLAVOUR_VIRTUAL) for ctrl-0- pf0vfN is created when the netdev of > >>>> ctrl-0-pf0vfN is created in the host of smartNIC, right? > >>>> > >>> Ctrl-0-pf0vfN, ctrl-0-pf0 ports are eswitch ports. They are created > >>> where > >> there is eswitch. > >>> Usually in smartnic where eswitch is located. > >> > >> Does diagram in [1] corresponds to the multi-host (two) host setup as > >> memtioned previously? > >> H1.pf0.phyical_port = p0. > >> H1.pf1.phyical_port = p1. > >> H2.pf0.phyical_port = p0. > >> H2.pf1.phyical_port = p1. > >> > > Yes. > > > >> Let's say H1 = server and H2 = smartNIC as the pci rc connected to below: > >> --------------------------------------------------------- > >> | | > >> | --------- --------- ------- ------- | > >> ----------- | | vf(s) | | sf(s) | |vf(s)| |sf(s)| | > >> | server | | ------- ----/---- ---/----- ------- ---/--- ---/--- | > >> | pci rc |=== | pf0 |______/________/ | pf1 |___/_______/ | > >> | connect | | ------- ------- | > >> ----------- | | controller_num=1 (no eswitch) | > >> ------|-------------------------------------------------- > >> (internal wire) > >> | > >> --------------------------------------------------------- > >> | devlink eswitch ports and reps | > >> | ----------------------------------------------------- | > >> | |ctrl-0 | ctrl-0 | ctrl-0 | ctrl-0 | ctrl-0 |ctrl-0 | | > >> | |pf0 | pf0vfN | pf0sfN | pf1 | pf1vfN |pf1sfN | | > >> | ----------------------------------------------------- | > >> | |ctrl-1 | ctrl-1 | ctrl-1 | ctrl-1 | ctrl-1 |ctrl-1 | | > >> | |pf0 | pf0vfN | pf0sfN | pf1 | pf1vfN |pf1sfN | | > >> | ----------------------------------------------------- | > >> | | > >> | | > >> ----------- | --------- --------- ------- ------- | > >> | smartNIC| | | vf(s) | | sf(s) | |vf(s)| |sf(s)| | > >> | pci rc |==| ------- ----/---- ---/----- ------- ---/--- ---/--- | > >> | connect | | | pf0 |______/________/ | pf1 |___/_______/ | > >> ----------- | ------- ------- | > >> | | > >> | local controller_num=0 (eswitch) | > >> > >> --------------------------------------------------------- > >> > >> A vanilla kernel can run on the smartNIC host, right? > > Right. > > > >> what the smartNIC host see is two PF corresponding to ctrl-0-pf0 and > >> ctrl-0-pf1 When the kernel is boot up first and mlx driver is not > >> loaded yet, right? > >> > >> I am not sure it is ok to leave out the VF and SF, but let's leave > >> them out for simplicity now. > >> When mlx driver is loaded, two devlink instances are created, which > >> corresponds to ctrl-0-pf0 and ctrl-0-pf1, and two devlink port > >> instances (flavour FLAVOUR_PHYSICAL) is created and registered to > >> corresponding devlink instances just created, right? > >> > >> As the eswitch mode is based on devlink instance, Let's only set the > >> mode of ctrl-0-pf0' devlink instance to > >> DEVLINK_ESWITCH_MODE_SWITCHDEV, the representor netdev of ctrl-1- > pf0 > >> is created and devlink port instance of that representor netdev is > >> created and registered to devlink instances corresponding to ctrl-0-pf0? > >> > >> I think I miss something here, the above does not seems right, > >> because: > >> 1. For single host case:the PF is not passed through to the VM, devlink > port > >> instance of VF's representor netdev can be registered to the > >> devlink instance > >> corresponding to it's PF, right? > > Yes, if I understand your question right. > > > >> 2. But for two-host case as above, do we need to create a devlink > instances > >> for the PF corresponding to ctrl-1-pf0 in smartNIC host? > > You can choose not to create a devlink instance in external controller PF. It > may not be even a Linux OS running there. > > > > I read questions few more times, but I find it hard to understand what you > really want to ask. > > Not sure I understood you. > > > > Trying again, > > > > The model is really very straight forward as visible in the diagram. > > > > There is one PF that has the eswitch. Eswitch contains representor ports. > > I thought the representor ports of a PF'eswitch is decided by the function > under a specific PF(For example, the PF itself and the VF under this PF)? Eswitch is not per PF in context of smartnic/multi-host. PF _has_ eswitch that contains the representor ports for PF, VF, SF. > > > Each representor port represent either PF, VF or SF. > > This PF, VF or SF can be of local controller residing on the eswitch device or > it can be of an external controller(s). > > Here external controller = 1. > > If I understood above correctly: > The fw/hw decide which PF has the eswitch, and how many > devlink/representor port does this eswitch has? Number of ports are dynamic. When new SFs/VFs are created, ports get added to the switch. > Suppose PF0 of controller_num=0 in have the eswitch, and the eswitch may > has devlink/representor port representing other PF, like PF1 in > controller_num=0, and even PF0/PF1 in controller_num=1? Yes. Correct.
On 2021/6/9 17:24, Parav Pandit wrote: >> From: Yunsheng Lin <linyunsheng@huawei.com> >> >> I thought the representor ports of a PF'eswitch is decided by the function >> under a specific PF(For example, the PF itself and the VF under this PF)? > > Eswitch is not per PF in context of smartnic/multi-host. So the Eswitch may be per PF in context of *non*-"smartnic/multi-host", right? It seems that it makes more sense to set the eswitch mode based on devlink port instance instead of devlink instance if devlink instance represents a multi-function ASIC? > PF _has_ eswitch that contains the representor ports for PF, VF, SF. > >> >>> Each representor port represent either PF, VF or SF. >>> This PF, VF or SF can be of local controller residing on the eswitch device or >> it can be of an external controller(s). >>> Here external controller = 1. >> >> If I understood above correctly: >> The fw/hw decide which PF has the eswitch, and how many >> devlink/representor port does this eswitch has? > Number of ports are dynamic. When new SFs/VFs are created, ports get added to the switch. > >> Suppose PF0 of controller_num=0 in have the eswitch, and the eswitch may >> has devlink/representor port representing other PF, like PF1 in >> controller_num=0, and even PF0/PF1 in controller_num=1? > Yes. Correct. Thanks for clarifying, I think I can see the big picture now. >
> From: Yunsheng Lin <linyunsheng@huawei.com> > Sent: Wednesday, June 9, 2021 5:05 PM > > On 2021/6/9 17:24, Parav Pandit wrote: > >> From: Yunsheng Lin <linyunsheng@huawei.com> > >> > >> I thought the representor ports of a PF'eswitch is decided by the > >> function under a specific PF(For example, the PF itself and the VF under > this PF)? > > > > Eswitch is not per PF in context of smartnic/multi-host. > > So the Eswitch may be per PF in context of *non*-"smartnic/multi-host", > right? Right. > It seems that it makes more sense to set the eswitch mode based on devlink > port instance instead of devlink instance if devlink instance represents a > multi-function ASIC? Devlink ports are the children/sub objects of devlink instance. Eswitch mode is per devlink instance that drives how its sub objects to be handled. Shouldn't be other way around. If you mean to say, that in multi-function ASIC, ASIC capabilities decide which devlink instance to support eswitch (and hence its ports), it make sense to me. > > > PF _has_ eswitch that contains the representor ports for PF, VF, SF. > > > >> > >>> Each representor port represent either PF, VF or SF. > >>> This PF, VF or SF can be of local controller residing on the eswitch > >>> device or > >> it can be of an external controller(s). > >>> Here external controller = 1. > >> > >> If I understood above correctly: > >> The fw/hw decide which PF has the eswitch, and how many > >> devlink/representor port does this eswitch has? > > Number of ports are dynamic. When new SFs/VFs are created, ports get > added to the switch. > > > >> Suppose PF0 of controller_num=0 in have the eswitch, and the eswitch > >> may has devlink/representor port representing other PF, like PF1 in > >> controller_num=0, and even PF0/PF1 in controller_num=1? > > Yes. Correct. > > Thanks for clarifying, I think I can see the big picture now. > > >
diff --git a/devlink/devlink.c b/devlink/devlink.c index 0b5548fb..170e8616 100644 --- a/devlink/devlink.c +++ b/devlink/devlink.c @@ -286,6 +286,7 @@ static void ifname_map_free(struct ifname_map *ifname_map) #define DL_OPT_PORT_PFNUMBER BIT(43) #define DL_OPT_PORT_SFNUMBER BIT(44) #define DL_OPT_PORT_FUNCTION_STATE BIT(45) +#define DL_OPT_PORT_CONTROLLER BIT(46) struct dl_opts { uint64_t present; /* flags of present items */ @@ -336,6 +337,7 @@ struct dl_opts { uint32_t overwrite_mask; enum devlink_reload_action reload_action; enum devlink_reload_limit reload_limit; + uint32_t port_controller; uint32_t port_sfnumber; uint16_t port_flavour; uint16_t port_pfnumber; @@ -1886,6 +1888,12 @@ static int dl_argv_parse(struct dl *dl, uint64_t o_required, if (err) return err; o_found |= DL_OPT_PORT_SFNUMBER; + } else if (dl_argv_match(dl, "controller") && (o_all & DL_OPT_PORT_CONTROLLER)) { + dl_arg_inc(dl); + err = dl_argv_uint32_t(dl, &opts->port_controller); + if (err) + return err; + o_found |= DL_OPT_PORT_CONTROLLER; } else { pr_err("Unknown option \"%s\"\n", dl_argv(dl)); return -EINVAL; @@ -2079,6 +2087,9 @@ static void dl_opts_put(struct nlmsghdr *nlh, struct dl *dl) mnl_attr_put_u16(nlh, DEVLINK_ATTR_PORT_PCI_PF_NUMBER, opts->port_pfnumber); if (opts->present & DL_OPT_PORT_SFNUMBER) mnl_attr_put_u32(nlh, DEVLINK_ATTR_PORT_PCI_SF_NUMBER, opts->port_sfnumber); + if (opts->present & DL_OPT_PORT_CONTROLLER) + mnl_attr_put_u32(nlh, DEVLINK_ATTR_PORT_CONTROLLER_NUMBER, + opts->port_controller); } static int dl_argv_parse_put(struct nlmsghdr *nlh, struct dl *dl, @@ -3795,7 +3806,7 @@ static void cmd_port_help(void) pr_err(" devlink port param set DEV/PORT_INDEX name PARAMETER value VALUE cmode { permanent | driverinit | runtime }\n"); pr_err(" devlink port param show [DEV/PORT_INDEX name PARAMETER]\n"); pr_err(" devlink port health show [ DEV/PORT_INDEX reporter REPORTER_NAME ]\n"); - pr_err(" devlink port add DEV/PORT_INDEX flavour FLAVOUR pfnum PFNUM [ sfnum SFNUM ]\n"); + pr_err(" devlink port add DEV/PORT_INDEX flavour FLAVOUR pfnum PFNUM [ sfnum SFNUM ] [ controller CNUM ]\n"); pr_err(" devlink port del DEV/PORT_INDEX\n"); } @@ -4324,7 +4335,7 @@ static int __cmd_health_show(struct dl *dl, bool show_device, bool show_port); static void cmd_port_add_help(void) { - pr_err(" devlink port add { DEV | DEV/PORT_INDEX } flavour FLAVOUR pfnum PFNUM [ sfnum SFNUM ]\n"); + pr_err(" devlink port add { DEV | DEV/PORT_INDEX } flavour FLAVOUR pfnum PFNUM [ sfnum SFNUM ] [ controller CNUM ]\n"); } static int cmd_port_add(struct dl *dl) @@ -4342,7 +4353,7 @@ static int cmd_port_add(struct dl *dl) err = dl_argv_parse_put(nlh, dl, DL_OPT_HANDLE | DL_OPT_HANDLEP | DL_OPT_PORT_FLAVOUR | DL_OPT_PORT_PFNUMBER, - DL_OPT_PORT_SFNUMBER); + DL_OPT_PORT_SFNUMBER | DL_OPT_PORT_CONTROLLER); if (err) return err; diff --git a/man/man8/devlink-port.8 b/man/man8/devlink-port.8 index 563c5833..a5142c4e 100644 --- a/man/man8/devlink-port.8 +++ b/man/man8/devlink-port.8 @@ -54,6 +54,8 @@ devlink-port \- devlink port configuration .IR PFNUMBER " ]" .RB "{ " pcisf .IR SFNUMBER " }" +.RB "{ " controller +.IR CNUM " }" .br .ti -8 @@ -174,6 +176,12 @@ Specifies sfnumber to assign to the device of the SF. This field is optional for those devices which supports auto assignment of the SF number. +.TP +.BR controller " { " controller " } " +Specifies controller number for which the SF port is created. +This field is optional. It is used only when SF port is created for the +external controller. + .ti -8 .SS devlink port function set - Set the port function attribute(s). @@ -327,6 +335,17 @@ devlink dev param set pci/0000:01:00.0/1 name internal_error_reset value true cm .RS 4 Sets the parameter internal_error_reset of specified devlink port (#1) to true. .RE +.PP +devlink port add pci/0000:06:00.0 flavour pcisf pfnum 0 sfnum 88 controller 1 +.RS 4 +Add a devlink port of flavour PCI SF on controller 1 which has PCI PF of number +0 with SF number 88. To make use of the function an example sequence is to add +a port, configure the function attribute and activate the function. Once +the function usage is completed, deactivate the function and finally delete +the port. When there is desire to reuse the port without deletion, it can be +reconfigured and activated again when function is in inactive state and +function's operational state is detached. +.RE .SH SEE ALSO .BR devlink (8),