Message ID | 20241114220937.719507-4-tariqt@nvidia.com (mailing list archive) |
---|---|
State | Superseded |
Delegated to: | Netdev Maintainers |
Headers | show |
Series | net/mlx5: ConnectX-8 SW Steering + Rate management on traffic classes | expand |
Thu, Nov 14, 2024 at 11:09:32PM CET, tariqt@nvidia.com wrote: >From: Carolina Jubran <cjubran@nvidia.com> > >Introduce support for specifying bandwidth proportions between traffic >classes (TC) in the devlink-rate API. This new option allows users to >allocate bandwidth across multiple traffic classes in a single command. > >This feature provides a more granular control over traffic management, >especially for scenarios requiring Enhanced Transmission Selection. > >Users can now define a specific bandwidth share for each traffic class, >such as allocating 20% for TC0 (TCP/UDP) and 80% for TC5 (RoCE). > >Example: >DEV=pci/0000:08:00.0 > >$ devlink port function rate add $DEV/vfs_group tx_share 10Gbit \ > tx_max 50Gbit tc-bw 0:20 1:0 2:0 3:0 4:0 5:80 6:0 7:0 > >$ devlink port function rate set $DEV/vfs_group \ > tc-bw 0:20 1:0 2:0 3:0 4:0 5:10 6:60 7:0 > >Signed-off-by: Carolina Jubran <cjubran@nvidia.com> >Reviewed-by: Cosmin Ratiu <cratiu@nvidia.com> >Signed-off-by: Tariq Toukan <tariqt@nvidia.com> >--- > Documentation/netlink/specs/devlink.yaml | 22 ++++++++ > include/net/devlink.h | 7 +++ > include/uapi/linux/devlink.h | 4 ++ > net/devlink/netlink_gen.c | 15 +++-- > net/devlink/netlink_gen.h | 1 + > net/devlink/rate.c | 71 +++++++++++++++++++++++- > 6 files changed, 113 insertions(+), 7 deletions(-) > >diff --git a/Documentation/netlink/specs/devlink.yaml b/Documentation/netlink/specs/devlink.yaml >index 09fbb4c03fc8..68211b8218fd 100644 >--- a/Documentation/netlink/specs/devlink.yaml >+++ b/Documentation/netlink/specs/devlink.yaml >@@ -817,6 +817,16 @@ attribute-sets: > - > name: rate-tx-weight > type: u32 >+ - >+ name: rate-tc-index >+ type: u8 >+ >+ - name: rate-bw This is bandwidth of tc. The name therefore should be "rate-tc-bw". Could you also document units of this attr value? >+ type: u32 >+ - >+ name: rate-tc-bw >+ type: nest >+ nested-attributes: dl-rate-tc-bw > - > name: region-direct > type: flag >@@ -1225,6 +1235,16 @@ attribute-sets: > - > name: flash > type: flag >+ - >+ name: dl-rate-tc-bw >+ subset-of: devlink >+ attributes: >+ - >+ name: rate-tc-index >+ type: u8 >+ - >+ name: rate-bw >+ type: u32 > > operations: > enum-model: directional >@@ -2148,6 +2168,7 @@ operations: > - rate-tx-max > - rate-tx-priority > - rate-tx-weight >+ - rate-tc-bw > - rate-parent-node-name > > - >@@ -2168,6 +2189,7 @@ operations: > - rate-tx-max > - rate-tx-priority > - rate-tx-weight >+ - rate-tc-bw > - rate-parent-node-name > > - >diff --git a/include/net/devlink.h b/include/net/devlink.h >index fbb9a2668e24..277b826cdd60 100644 >--- a/include/net/devlink.h >+++ b/include/net/devlink.h >@@ -20,6 +20,7 @@ > #include <uapi/linux/devlink.h> > #include <linux/xarray.h> > #include <linux/firmware.h> >+#include <linux/dcbnl.h> > > struct devlink; > struct devlink_linecard; >@@ -117,6 +118,8 @@ struct devlink_rate { > > u32 tx_priority; > u32 tx_weight; >+ >+ u32 tc_bw[IEEE_8021QAZ_MAX_TCS]; > }; > > struct devlink_port { >@@ -1469,6 +1472,8 @@ struct devlink_ops { > u32 tx_priority, struct netlink_ext_ack *extack); > int (*rate_leaf_tx_weight_set)(struct devlink_rate *devlink_rate, void *priv, > u32 tx_weight, struct netlink_ext_ack *extack); >+ int (*rate_leaf_tc_bw_set)(struct devlink_rate *devlink_rate, void *priv, >+ u32 *tc_bw, struct netlink_ext_ack *extack); > int (*rate_node_tx_share_set)(struct devlink_rate *devlink_rate, void *priv, > u64 tx_share, struct netlink_ext_ack *extack); > int (*rate_node_tx_max_set)(struct devlink_rate *devlink_rate, void *priv, >@@ -1477,6 +1482,8 @@ struct devlink_ops { > u32 tx_priority, struct netlink_ext_ack *extack); > int (*rate_node_tx_weight_set)(struct devlink_rate *devlink_rate, void *priv, > u32 tx_weight, struct netlink_ext_ack *extack); >+ int (*rate_node_tc_bw_set)(struct devlink_rate *devlink_rate, void *priv, >+ u32 *tc_bw, struct netlink_ext_ack *extack); > int (*rate_node_new)(struct devlink_rate *rate_node, void **priv, > struct netlink_ext_ack *extack); > int (*rate_node_del)(struct devlink_rate *rate_node, void *priv, >diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h >index 9401aa343673..a66217808dd9 100644 >--- a/include/uapi/linux/devlink.h >+++ b/include/uapi/linux/devlink.h >@@ -614,6 +614,10 @@ enum devlink_attr { > > DEVLINK_ATTR_REGION_DIRECT, /* flag */ > >+ DEVLINK_ATTR_RATE_TC_INDEX, /* u8 */ >+ DEVLINK_ATTR_RATE_BW, /* u32 */ >+ DEVLINK_ATTR_RATE_TC_BW, /* nested */ >+ > /* Add new attributes above here, update the spec in > * Documentation/netlink/specs/devlink.yaml and re-generate > * net/devlink/netlink_gen.c. >diff --git a/net/devlink/netlink_gen.c b/net/devlink/netlink_gen.c >index f9786d51f68f..fac062ede7a4 100644 >--- a/net/devlink/netlink_gen.c >+++ b/net/devlink/netlink_gen.c >@@ -18,6 +18,11 @@ const struct nla_policy devlink_dl_port_function_nl_policy[DEVLINK_PORT_FN_ATTR_ > [DEVLINK_PORT_FN_ATTR_CAPS] = NLA_POLICY_BITFIELD32(15), > }; > >+const struct nla_policy devlink_dl_rate_tc_bw_nl_policy[DEVLINK_ATTR_RATE_BW + 1] = { >+ [DEVLINK_ATTR_RATE_TC_INDEX] = { .type = NLA_U8, }, >+ [DEVLINK_ATTR_RATE_BW] = { .type = NLA_U32, }, >+}; >+ > const struct nla_policy devlink_dl_selftest_id_nl_policy[DEVLINK_ATTR_SELFTEST_ID_FLASH + 1] = { > [DEVLINK_ATTR_SELFTEST_ID_FLASH] = { .type = NLA_FLAG, }, > }; >@@ -496,7 +501,7 @@ static const struct nla_policy devlink_rate_get_dump_nl_policy[DEVLINK_ATTR_DEV_ > }; > > /* DEVLINK_CMD_RATE_SET - do */ >-static const struct nla_policy devlink_rate_set_nl_policy[DEVLINK_ATTR_RATE_TX_WEIGHT + 1] = { >+static const struct nla_policy devlink_rate_set_nl_policy[DEVLINK_ATTR_RATE_TC_BW + 1] = { > [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, > [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, > [DEVLINK_ATTR_RATE_NODE_NAME] = { .type = NLA_NUL_STRING, }, >@@ -504,11 +509,12 @@ static const struct nla_policy devlink_rate_set_nl_policy[DEVLINK_ATTR_RATE_TX_W > [DEVLINK_ATTR_RATE_TX_MAX] = { .type = NLA_U64, }, > [DEVLINK_ATTR_RATE_TX_PRIORITY] = { .type = NLA_U32, }, > [DEVLINK_ATTR_RATE_TX_WEIGHT] = { .type = NLA_U32, }, >+ [DEVLINK_ATTR_RATE_TC_BW] = NLA_POLICY_NESTED(devlink_dl_rate_tc_bw_nl_policy), > [DEVLINK_ATTR_RATE_PARENT_NODE_NAME] = { .type = NLA_NUL_STRING, }, > }; > > /* DEVLINK_CMD_RATE_NEW - do */ >-static const struct nla_policy devlink_rate_new_nl_policy[DEVLINK_ATTR_RATE_TX_WEIGHT + 1] = { >+static const struct nla_policy devlink_rate_new_nl_policy[DEVLINK_ATTR_RATE_TC_BW + 1] = { > [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, > [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, > [DEVLINK_ATTR_RATE_NODE_NAME] = { .type = NLA_NUL_STRING, }, >@@ -516,6 +522,7 @@ static const struct nla_policy devlink_rate_new_nl_policy[DEVLINK_ATTR_RATE_TX_W > [DEVLINK_ATTR_RATE_TX_MAX] = { .type = NLA_U64, }, > [DEVLINK_ATTR_RATE_TX_PRIORITY] = { .type = NLA_U32, }, > [DEVLINK_ATTR_RATE_TX_WEIGHT] = { .type = NLA_U32, }, >+ [DEVLINK_ATTR_RATE_TC_BW] = NLA_POLICY_NESTED(devlink_dl_rate_tc_bw_nl_policy), > [DEVLINK_ATTR_RATE_PARENT_NODE_NAME] = { .type = NLA_NUL_STRING, }, > }; > >@@ -1164,7 +1171,7 @@ const struct genl_split_ops devlink_nl_ops[74] = { > .doit = devlink_nl_rate_set_doit, > .post_doit = devlink_nl_post_doit, > .policy = devlink_rate_set_nl_policy, >- .maxattr = DEVLINK_ATTR_RATE_TX_WEIGHT, >+ .maxattr = DEVLINK_ATTR_RATE_TC_BW, > .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, > }, > { >@@ -1174,7 +1181,7 @@ const struct genl_split_ops devlink_nl_ops[74] = { > .doit = devlink_nl_rate_new_doit, > .post_doit = devlink_nl_post_doit, > .policy = devlink_rate_new_nl_policy, >- .maxattr = DEVLINK_ATTR_RATE_TX_WEIGHT, >+ .maxattr = DEVLINK_ATTR_RATE_TC_BW, > .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, > }, > { >diff --git a/net/devlink/netlink_gen.h b/net/devlink/netlink_gen.h >index 8f2bd50ddf5e..df37c3ef3113 100644 >--- a/net/devlink/netlink_gen.h >+++ b/net/devlink/netlink_gen.h >@@ -13,6 +13,7 @@ > > /* Common nested types */ > extern const struct nla_policy devlink_dl_port_function_nl_policy[DEVLINK_PORT_FN_ATTR_CAPS + 1]; >+extern const struct nla_policy devlink_dl_rate_tc_bw_nl_policy[DEVLINK_ATTR_RATE_BW + 1]; > extern const struct nla_policy devlink_dl_selftest_id_nl_policy[DEVLINK_ATTR_SELFTEST_ID_FLASH + 1]; > > /* Ops table for devlink */ >diff --git a/net/devlink/rate.c b/net/devlink/rate.c >index 8828ffaf6cbc..dbf1d552fae2 100644 >--- a/net/devlink/rate.c >+++ b/net/devlink/rate.c >@@ -86,7 +86,9 @@ static int devlink_nl_rate_fill(struct sk_buff *msg, > int flags, struct netlink_ext_ack *extack) > { > struct devlink *devlink = devlink_rate->devlink; >+ struct nlattr *nla_tc_bw; > void *hdr; >+ int i; > > hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); > if (!hdr) >@@ -124,10 +126,29 @@ static int devlink_nl_rate_fill(struct sk_buff *msg, > devlink_rate->tx_weight)) > goto nla_put_failure; > >- if (devlink_rate->parent) >- if (nla_put_string(msg, DEVLINK_ATTR_RATE_PARENT_NODE_NAME, >- devlink_rate->parent->name)) >+ nla_tc_bw = nla_nest_start(msg, DEVLINK_ATTR_RATE_TC_BW); >+ if (!nla_tc_bw) >+ goto nla_put_failure; >+ >+ for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) { Wait, so you fill this up unconditionally? I mean, for set you check if the driver supports it. For get you always fill this up? That looks wrong. Any reason to do so? >+ struct nlattr *nla_tc_entry = nla_nest_start(msg, i); >+ >+ if (!nla_tc_entry) { >+ nla_nest_cancel(msg, nla_tc_bw); >+ goto nla_put_failure; >+ } >+ >+ if (nla_put_u8(msg, DEVLINK_ATTR_RATE_TC_INDEX, i) || >+ nla_put_u32(msg, DEVLINK_ATTR_RATE_BW, devlink_rate->tc_bw[i])) { >+ nla_nest_cancel(msg, nla_tc_entry); >+ nla_nest_cancel(msg, nla_tc_bw); > goto nla_put_failure; >+ } >+ >+ nla_nest_end(msg, nla_tc_entry); >+ } >+ >+ nla_nest_end(msg, nla_tc_bw); > > genlmsg_end(msg, hdr); > return 0; >@@ -380,6 +401,38 @@ static int devlink_nl_rate_set(struct devlink_rate *devlink_rate, > devlink_rate->tx_weight = weight; > } > >+ if (attrs[DEVLINK_ATTR_RATE_TC_BW]) { >+ struct nlattr *nla_tc_bw = attrs[DEVLINK_ATTR_RATE_TC_BW]; >+ struct nlattr *tb[DEVLINK_ATTR_RATE_BW + 1]; >+ u32 tc_bw[IEEE_8021QAZ_MAX_TCS] = {0}; >+ struct nlattr *nla_tc_entry; >+ int rem, tc_index; >+ >+ nla_for_each_nested(nla_tc_entry, nla_tc_bw, rem) { >+ err = nla_parse_nested(tb, DEVLINK_ATTR_RATE_BW, nla_tc_entry, >+ devlink_dl_rate_tc_bw_nl_policy, info->extack); >+ if (err) >+ return err; >+ >+ if (tb[DEVLINK_ATTR_RATE_TC_INDEX] && tb[DEVLINK_ATTR_RATE_BW]) { >+ tc_index = nla_get_u8(tb[DEVLINK_ATTR_RATE_TC_INDEX]); Ough, you trust user to provide you index to array. Recipe for disaster. NLA_POLICY_RANGE() for tc-index would sanitize this. Btw, you use nested array to carry rate-bw and tc-index attrs, isn't the array index good enough? Then you can avoid tc-index attr (which in most cases holds redundant index value). Or do you expect userspace to pass only partial set of tcs? >+ tc_bw[tc_index] = nla_get_u32(tb[DEVLINK_ATTR_RATE_BW]); >+ } >+ } >+ >+ if (devlink_rate_is_leaf(devlink_rate)) >+ err = ops->rate_leaf_tc_bw_set(devlink_rate, devlink_rate->priv, >+ tc_bw, info->extack); >+ else if (devlink_rate_is_node(devlink_rate)) >+ err = ops->rate_node_tc_bw_set(devlink_rate, devlink_rate->priv, >+ tc_bw, info->extack); >+ >+ if (err) >+ return err; >+ >+ memcpy(devlink_rate->tc_bw, tc_bw, sizeof(tc_bw)); >+ } >+ > nla_parent = attrs[DEVLINK_ATTR_RATE_PARENT_NODE_NAME]; > if (nla_parent) { > err = devlink_nl_rate_parent_node_set(devlink_rate, info, >@@ -423,6 +476,12 @@ static bool devlink_rate_set_ops_supported(const struct devlink_ops *ops, > "TX weight set isn't supported for the leafs"); > return false; > } >+ if (attrs[DEVLINK_ATTR_RATE_TC_BW] && !ops->rate_leaf_tc_bw_set) { >+ NL_SET_ERR_MSG_ATTR(info->extack, >+ attrs[DEVLINK_ATTR_RATE_TC_BW], >+ "TC bandwidth set isn't supported for the leafs"); >+ return false; >+ } > } else if (type == DEVLINK_RATE_TYPE_NODE) { > if (attrs[DEVLINK_ATTR_RATE_TX_SHARE] && !ops->rate_node_tx_share_set) { > NL_SET_ERR_MSG(info->extack, "TX share set isn't supported for the nodes"); >@@ -449,6 +508,12 @@ static bool devlink_rate_set_ops_supported(const struct devlink_ops *ops, > "TX weight set isn't supported for the nodes"); > return false; > } >+ if (attrs[DEVLINK_ATTR_RATE_TC_BW] && !ops->rate_node_tc_bw_set) { >+ NL_SET_ERR_MSG_ATTR(info->extack, >+ attrs[DEVLINK_ATTR_RATE_TC_BW], >+ "TC bandwidth set isn't supported for the nodes"); >+ return false; >+ } > } else { > WARN(1, "Unknown type of rate object"); > return false; >-- >2.44.0 >
On 11/14/24 23:09, Tariq Toukan wrote: > From: Carolina Jubran <cjubran@nvidia.com> > > Introduce support for specifying bandwidth proportions between traffic > classes (TC) in the devlink-rate API. This new option allows users to > allocate bandwidth across multiple traffic classes in a single command. > > This feature provides a more granular control over traffic management, > especially for scenarios requiring Enhanced Transmission Selection. > > Users can now define a specific bandwidth share for each traffic class, > such as allocating 20% for TC0 (TCP/UDP) and 80% for TC5 (RoCE). > > Example: > DEV=pci/0000:08:00.0 > > $ devlink port function rate add $DEV/vfs_group tx_share 10Gbit \ > tx_max 50Gbit tc-bw 0:20 1:0 2:0 3:0 4:0 5:80 6:0 7:0 > > $ devlink port function rate set $DEV/vfs_group \ > tc-bw 0:20 1:0 2:0 3:0 4:0 5:10 6:60 7:0 > > Signed-off-by: Carolina Jubran <cjubran@nvidia.com> > Reviewed-by: Cosmin Ratiu <cratiu@nvidia.com> > Signed-off-by: Tariq Toukan <tariqt@nvidia.com> I haven't dug into it, but this patch is apparently causing netdevsim self-tests failures: https://netdev-3.bots.linux.dev/vmksft-netdevsim/results/860662/4-devlink-sh/stdout Could you please have a look? Thanks! Paolo
On 15/11/2024 9:51, Jiri Pirko wrote: > Thu, Nov 14, 2024 at 11:09:32PM CET, tariqt@nvidia.com wrote: >> From: Carolina Jubran <cjubran@nvidia.com> >> >> Introduce support for specifying bandwidth proportions between traffic >> classes (TC) in the devlink-rate API. This new option allows users to >> allocate bandwidth across multiple traffic classes in a single command. >> >> This feature provides a more granular control over traffic management, >> especially for scenarios requiring Enhanced Transmission Selection. >> >> Users can now define a specific bandwidth share for each traffic class, >> such as allocating 20% for TC0 (TCP/UDP) and 80% for TC5 (RoCE). >> >> Example: >> DEV=pci/0000:08:00.0 >> >> $ devlink port function rate add $DEV/vfs_group tx_share 10Gbit \ >> tx_max 50Gbit tc-bw 0:20 1:0 2:0 3:0 4:0 5:80 6:0 7:0 >> >> $ devlink port function rate set $DEV/vfs_group \ >> tc-bw 0:20 1:0 2:0 3:0 4:0 5:10 6:60 7:0 >> >> Signed-off-by: Carolina Jubran <cjubran@nvidia.com> >> Reviewed-by: Cosmin Ratiu <cratiu@nvidia.com> >> Signed-off-by: Tariq Toukan <tariqt@nvidia.com> >> --- >> Documentation/netlink/specs/devlink.yaml | 22 ++++++++ >> include/net/devlink.h | 7 +++ >> include/uapi/linux/devlink.h | 4 ++ >> net/devlink/netlink_gen.c | 15 +++-- >> net/devlink/netlink_gen.h | 1 + >> net/devlink/rate.c | 71 +++++++++++++++++++++++- >> 6 files changed, 113 insertions(+), 7 deletions(-) >> >> diff --git a/Documentation/netlink/specs/devlink.yaml b/Documentation/netlink/specs/devlink.yaml >> index 09fbb4c03fc8..68211b8218fd 100644 >> --- a/Documentation/netlink/specs/devlink.yaml >> +++ b/Documentation/netlink/specs/devlink.yaml >> @@ -817,6 +817,16 @@ attribute-sets: >> - >> name: rate-tx-weight >> type: u32 >> + - >> + name: rate-tc-index >> + type: u8 >> + >> + - name: rate-bw > > This is bandwidth of tc. The name therefore should be "rate-tc-bw". > Could you also document units of this attr value? > > I will rename it to rate-tc-bw and update the documentation to include the valid range (0-100). >> + type: u32 >> + - >> + name: rate-tc-bw >> + type: nest >> + nested-attributes: dl-rate-tc-bw >> - >> name: region-direct >> type: flag >> @@ -1225,6 +1235,16 @@ attribute-sets: >> - >> name: flash >> type: flag >> + - >> + name: dl-rate-tc-bw >> + subset-of: devlink >> + attributes: >> + - >> + name: rate-tc-index >> + type: u8 >> + - >> + name: rate-bw >> + type: u32 >> >> operations: >> enum-model: directional >> @@ -2148,6 +2168,7 @@ operations: >> - rate-tx-max >> - rate-tx-priority >> - rate-tx-weight >> + - rate-tc-bw >> - rate-parent-node-name >> >> - >> @@ -2168,6 +2189,7 @@ operations: >> - rate-tx-max >> - rate-tx-priority >> - rate-tx-weight >> + - rate-tc-bw >> - rate-parent-node-name >> >> - >> diff --git a/include/net/devlink.h b/include/net/devlink.h >> index fbb9a2668e24..277b826cdd60 100644 >> --- a/include/net/devlink.h >> +++ b/include/net/devlink.h >> @@ -20,6 +20,7 @@ >> #include <uapi/linux/devlink.h> >> #include <linux/xarray.h> >> #include <linux/firmware.h> >> +#include <linux/dcbnl.h> >> >> struct devlink; >> struct devlink_linecard; >> @@ -117,6 +118,8 @@ struct devlink_rate { >> >> u32 tx_priority; >> u32 tx_weight; >> + >> + u32 tc_bw[IEEE_8021QAZ_MAX_TCS]; >> }; >> >> struct devlink_port { >> @@ -1469,6 +1472,8 @@ struct devlink_ops { >> u32 tx_priority, struct netlink_ext_ack *extack); >> int (*rate_leaf_tx_weight_set)(struct devlink_rate *devlink_rate, void *priv, >> u32 tx_weight, struct netlink_ext_ack *extack); >> + int (*rate_leaf_tc_bw_set)(struct devlink_rate *devlink_rate, void *priv, >> + u32 *tc_bw, struct netlink_ext_ack *extack); >> int (*rate_node_tx_share_set)(struct devlink_rate *devlink_rate, void *priv, >> u64 tx_share, struct netlink_ext_ack *extack); >> int (*rate_node_tx_max_set)(struct devlink_rate *devlink_rate, void *priv, >> @@ -1477,6 +1482,8 @@ struct devlink_ops { >> u32 tx_priority, struct netlink_ext_ack *extack); >> int (*rate_node_tx_weight_set)(struct devlink_rate *devlink_rate, void *priv, >> u32 tx_weight, struct netlink_ext_ack *extack); >> + int (*rate_node_tc_bw_set)(struct devlink_rate *devlink_rate, void *priv, >> + u32 *tc_bw, struct netlink_ext_ack *extack); >> int (*rate_node_new)(struct devlink_rate *rate_node, void **priv, >> struct netlink_ext_ack *extack); >> int (*rate_node_del)(struct devlink_rate *rate_node, void *priv, >> diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h >> index 9401aa343673..a66217808dd9 100644 >> --- a/include/uapi/linux/devlink.h >> +++ b/include/uapi/linux/devlink.h >> @@ -614,6 +614,10 @@ enum devlink_attr { >> >> DEVLINK_ATTR_REGION_DIRECT, /* flag */ >> >> + DEVLINK_ATTR_RATE_TC_INDEX, /* u8 */ >> + DEVLINK_ATTR_RATE_BW, /* u32 */ >> + DEVLINK_ATTR_RATE_TC_BW, /* nested */ >> + >> /* Add new attributes above here, update the spec in >> * Documentation/netlink/specs/devlink.yaml and re-generate >> * net/devlink/netlink_gen.c. >> diff --git a/net/devlink/netlink_gen.c b/net/devlink/netlink_gen.c >> index f9786d51f68f..fac062ede7a4 100644 >> --- a/net/devlink/netlink_gen.c >> +++ b/net/devlink/netlink_gen.c >> @@ -18,6 +18,11 @@ const struct nla_policy devlink_dl_port_function_nl_policy[DEVLINK_PORT_FN_ATTR_ >> [DEVLINK_PORT_FN_ATTR_CAPS] = NLA_POLICY_BITFIELD32(15), >> }; >> >> +const struct nla_policy devlink_dl_rate_tc_bw_nl_policy[DEVLINK_ATTR_RATE_BW + 1] = { >> + [DEVLINK_ATTR_RATE_TC_INDEX] = { .type = NLA_U8, }, >> + [DEVLINK_ATTR_RATE_BW] = { .type = NLA_U32, }, >> +}; >> + >> const struct nla_policy devlink_dl_selftest_id_nl_policy[DEVLINK_ATTR_SELFTEST_ID_FLASH + 1] = { >> [DEVLINK_ATTR_SELFTEST_ID_FLASH] = { .type = NLA_FLAG, }, >> }; >> @@ -496,7 +501,7 @@ static const struct nla_policy devlink_rate_get_dump_nl_policy[DEVLINK_ATTR_DEV_ >> }; >> >> /* DEVLINK_CMD_RATE_SET - do */ >> -static const struct nla_policy devlink_rate_set_nl_policy[DEVLINK_ATTR_RATE_TX_WEIGHT + 1] = { >> +static const struct nla_policy devlink_rate_set_nl_policy[DEVLINK_ATTR_RATE_TC_BW + 1] = { >> [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, >> [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, >> [DEVLINK_ATTR_RATE_NODE_NAME] = { .type = NLA_NUL_STRING, }, >> @@ -504,11 +509,12 @@ static const struct nla_policy devlink_rate_set_nl_policy[DEVLINK_ATTR_RATE_TX_W >> [DEVLINK_ATTR_RATE_TX_MAX] = { .type = NLA_U64, }, >> [DEVLINK_ATTR_RATE_TX_PRIORITY] = { .type = NLA_U32, }, >> [DEVLINK_ATTR_RATE_TX_WEIGHT] = { .type = NLA_U32, }, >> + [DEVLINK_ATTR_RATE_TC_BW] = NLA_POLICY_NESTED(devlink_dl_rate_tc_bw_nl_policy), >> [DEVLINK_ATTR_RATE_PARENT_NODE_NAME] = { .type = NLA_NUL_STRING, }, >> }; >> >> /* DEVLINK_CMD_RATE_NEW - do */ >> -static const struct nla_policy devlink_rate_new_nl_policy[DEVLINK_ATTR_RATE_TX_WEIGHT + 1] = { >> +static const struct nla_policy devlink_rate_new_nl_policy[DEVLINK_ATTR_RATE_TC_BW + 1] = { >> [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, >> [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, >> [DEVLINK_ATTR_RATE_NODE_NAME] = { .type = NLA_NUL_STRING, }, >> @@ -516,6 +522,7 @@ static const struct nla_policy devlink_rate_new_nl_policy[DEVLINK_ATTR_RATE_TX_W >> [DEVLINK_ATTR_RATE_TX_MAX] = { .type = NLA_U64, }, >> [DEVLINK_ATTR_RATE_TX_PRIORITY] = { .type = NLA_U32, }, >> [DEVLINK_ATTR_RATE_TX_WEIGHT] = { .type = NLA_U32, }, >> + [DEVLINK_ATTR_RATE_TC_BW] = NLA_POLICY_NESTED(devlink_dl_rate_tc_bw_nl_policy), >> [DEVLINK_ATTR_RATE_PARENT_NODE_NAME] = { .type = NLA_NUL_STRING, }, >> }; >> >> @@ -1164,7 +1171,7 @@ const struct genl_split_ops devlink_nl_ops[74] = { >> .doit = devlink_nl_rate_set_doit, >> .post_doit = devlink_nl_post_doit, >> .policy = devlink_rate_set_nl_policy, >> - .maxattr = DEVLINK_ATTR_RATE_TX_WEIGHT, >> + .maxattr = DEVLINK_ATTR_RATE_TC_BW, >> .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, >> }, >> { >> @@ -1174,7 +1181,7 @@ const struct genl_split_ops devlink_nl_ops[74] = { >> .doit = devlink_nl_rate_new_doit, >> .post_doit = devlink_nl_post_doit, >> .policy = devlink_rate_new_nl_policy, >> - .maxattr = DEVLINK_ATTR_RATE_TX_WEIGHT, >> + .maxattr = DEVLINK_ATTR_RATE_TC_BW, >> .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, >> }, >> { >> diff --git a/net/devlink/netlink_gen.h b/net/devlink/netlink_gen.h >> index 8f2bd50ddf5e..df37c3ef3113 100644 >> --- a/net/devlink/netlink_gen.h >> +++ b/net/devlink/netlink_gen.h >> @@ -13,6 +13,7 @@ >> >> /* Common nested types */ >> extern const struct nla_policy devlink_dl_port_function_nl_policy[DEVLINK_PORT_FN_ATTR_CAPS + 1]; >> +extern const struct nla_policy devlink_dl_rate_tc_bw_nl_policy[DEVLINK_ATTR_RATE_BW + 1]; >> extern const struct nla_policy devlink_dl_selftest_id_nl_policy[DEVLINK_ATTR_SELFTEST_ID_FLASH + 1]; >> >> /* Ops table for devlink */ >> diff --git a/net/devlink/rate.c b/net/devlink/rate.c >> index 8828ffaf6cbc..dbf1d552fae2 100644 >> --- a/net/devlink/rate.c >> +++ b/net/devlink/rate.c >> @@ -86,7 +86,9 @@ static int devlink_nl_rate_fill(struct sk_buff *msg, >> int flags, struct netlink_ext_ack *extack) >> { >> struct devlink *devlink = devlink_rate->devlink; >> + struct nlattr *nla_tc_bw; >> void *hdr; >> + int i; >> >> hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); >> if (!hdr) >> @@ -124,10 +126,29 @@ static int devlink_nl_rate_fill(struct sk_buff *msg, >> devlink_rate->tx_weight)) >> goto nla_put_failure; >> >> - if (devlink_rate->parent) >> - if (nla_put_string(msg, DEVLINK_ATTR_RATE_PARENT_NODE_NAME, >> - devlink_rate->parent->name)) >> + nla_tc_bw = nla_nest_start(msg, DEVLINK_ATTR_RATE_TC_BW); >> + if (!nla_tc_bw) >> + goto nla_put_failure; >> + >> + for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) { > > Wait, so you fill this up unconditionally? I mean, for set you check if > the driver supports it. For get you always fill this up? That looks > wrong. Any reason to do so? > > I followed the existing pattern in devlink, where attributes like tx_share and tx_weight are filled unconditionally in the get operation. Should I still add a check to see if tc_bw_set is supported? >> + struct nlattr *nla_tc_entry = nla_nest_start(msg, i); >> + >> + if (!nla_tc_entry) { >> + nla_nest_cancel(msg, nla_tc_bw); >> + goto nla_put_failure; >> + } >> + >> + if (nla_put_u8(msg, DEVLINK_ATTR_RATE_TC_INDEX, i) || >> + nla_put_u32(msg, DEVLINK_ATTR_RATE_BW, devlink_rate->tc_bw[i])) { >> + nla_nest_cancel(msg, nla_tc_entry); >> + nla_nest_cancel(msg, nla_tc_bw); >> goto nla_put_failure; >> + } >> + >> + nla_nest_end(msg, nla_tc_entry); >> + } >> + >> + nla_nest_end(msg, nla_tc_bw); >> >> genlmsg_end(msg, hdr); >> return 0; >> @@ -380,6 +401,38 @@ static int devlink_nl_rate_set(struct devlink_rate *devlink_rate, >> devlink_rate->tx_weight = weight; >> } >> >> + if (attrs[DEVLINK_ATTR_RATE_TC_BW]) { >> + struct nlattr *nla_tc_bw = attrs[DEVLINK_ATTR_RATE_TC_BW]; >> + struct nlattr *tb[DEVLINK_ATTR_RATE_BW + 1]; >> + u32 tc_bw[IEEE_8021QAZ_MAX_TCS] = {0}; >> + struct nlattr *nla_tc_entry; >> + int rem, tc_index; >> + >> + nla_for_each_nested(nla_tc_entry, nla_tc_bw, rem) { >> + err = nla_parse_nested(tb, DEVLINK_ATTR_RATE_BW, nla_tc_entry, >> + devlink_dl_rate_tc_bw_nl_policy, info->extack); >> + if (err) >> + return err; >> + >> + if (tb[DEVLINK_ATTR_RATE_TC_INDEX] && tb[DEVLINK_ATTR_RATE_BW]) { >> + tc_index = nla_get_u8(tb[DEVLINK_ATTR_RATE_TC_INDEX]); > > Ough, you trust user to provide you index to array. Recipe for disaster. > NLA_POLICY_RANGE() for tc-index would sanitize this. Btw, you use nested > array to carry rate-bw and tc-index attrs, isn't the array index good > enough? Then you can avoid tc-index attr (which in most cases holds > redundant index value). Or do you expect userspace to pass only partial > set of tcs? > > I will drop the tc-index attribute as it is redundant. To address your question: I expect userspace to always provide a complete set of tcs. >> + tc_bw[tc_index] = nla_get_u32(tb[DEVLINK_ATTR_RATE_BW]); >> + } >> + } >> + >> + if (devlink_rate_is_leaf(devlink_rate)) >> + err = ops->rate_leaf_tc_bw_set(devlink_rate, devlink_rate->priv, >> + tc_bw, info->extack); >> + else if (devlink_rate_is_node(devlink_rate)) >> + err = ops->rate_node_tc_bw_set(devlink_rate, devlink_rate->priv, >> + tc_bw, info->extack); >> + >> + if (err) >> + return err; >> + >> + memcpy(devlink_rate->tc_bw, tc_bw, sizeof(tc_bw)); >> + } >> + >> nla_parent = attrs[DEVLINK_ATTR_RATE_PARENT_NODE_NAME]; >> if (nla_parent) { >> err = devlink_nl_rate_parent_node_set(devlink_rate, info, >> @@ -423,6 +476,12 @@ static bool devlink_rate_set_ops_supported(const struct devlink_ops *ops, >> "TX weight set isn't supported for the leafs"); >> return false; >> } >> + if (attrs[DEVLINK_ATTR_RATE_TC_BW] && !ops->rate_leaf_tc_bw_set) { >> + NL_SET_ERR_MSG_ATTR(info->extack, >> + attrs[DEVLINK_ATTR_RATE_TC_BW], >> + "TC bandwidth set isn't supported for the leafs"); >> + return false; >> + } >> } else if (type == DEVLINK_RATE_TYPE_NODE) { >> if (attrs[DEVLINK_ATTR_RATE_TX_SHARE] && !ops->rate_node_tx_share_set) { >> NL_SET_ERR_MSG(info->extack, "TX share set isn't supported for the nodes"); >> @@ -449,6 +508,12 @@ static bool devlink_rate_set_ops_supported(const struct devlink_ops *ops, >> "TX weight set isn't supported for the nodes"); >> return false; >> } >> + if (attrs[DEVLINK_ATTR_RATE_TC_BW] && !ops->rate_node_tc_bw_set) { >> + NL_SET_ERR_MSG_ATTR(info->extack, >> + attrs[DEVLINK_ATTR_RATE_TC_BW], >> + "TC bandwidth set isn't supported for the nodes"); >> + return false; >> + } >> } else { >> WARN(1, "Unknown type of rate object"); >> return false; >> -- >> 2.44.0 >>
On 15/11/2024 12:15, Paolo Abeni wrote: > On 11/14/24 23:09, Tariq Toukan wrote: >> From: Carolina Jubran <cjubran@nvidia.com> >> >> Introduce support for specifying bandwidth proportions between traffic >> classes (TC) in the devlink-rate API. This new option allows users to >> allocate bandwidth across multiple traffic classes in a single command. >> >> This feature provides a more granular control over traffic management, >> especially for scenarios requiring Enhanced Transmission Selection. >> >> Users can now define a specific bandwidth share for each traffic class, >> such as allocating 20% for TC0 (TCP/UDP) and 80% for TC5 (RoCE). >> >> Example: >> DEV=pci/0000:08:00.0 >> >> $ devlink port function rate add $DEV/vfs_group tx_share 10Gbit \ >> tx_max 50Gbit tc-bw 0:20 1:0 2:0 3:0 4:0 5:80 6:0 7:0 >> >> $ devlink port function rate set $DEV/vfs_group \ >> tc-bw 0:20 1:0 2:0 3:0 4:0 5:10 6:60 7:0 >> >> Signed-off-by: Carolina Jubran <cjubran@nvidia.com> >> Reviewed-by: Cosmin Ratiu <cratiu@nvidia.com> >> Signed-off-by: Tariq Toukan <tariqt@nvidia.com> > > I haven't dug into it, but this patch is apparently causing netdevsim > self-tests failures: > > https://netdev-3.bots.linux.dev/vmksft-netdevsim/results/860662/4-devlink-sh/stdout > > Could you please have a look? > > Thanks! > > Paolo > Thanks for pointing this out. I’ve identified the issue and will address it in v3.
Sun, Nov 17, 2024 at 03:33:06PM CET, cjubran@nvidia.com wrote: > > >On 15/11/2024 9:51, Jiri Pirko wrote: >> Thu, Nov 14, 2024 at 11:09:32PM CET, tariqt@nvidia.com wrote: >> > From: Carolina Jubran <cjubran@nvidia.com> >> > >> > Introduce support for specifying bandwidth proportions between traffic >> > classes (TC) in the devlink-rate API. This new option allows users to >> > allocate bandwidth across multiple traffic classes in a single command. >> > >> > This feature provides a more granular control over traffic management, >> > especially for scenarios requiring Enhanced Transmission Selection. >> > >> > Users can now define a specific bandwidth share for each traffic class, >> > such as allocating 20% for TC0 (TCP/UDP) and 80% for TC5 (RoCE). >> > >> > Example: >> > DEV=pci/0000:08:00.0 >> > >> > $ devlink port function rate add $DEV/vfs_group tx_share 10Gbit \ >> > tx_max 50Gbit tc-bw 0:20 1:0 2:0 3:0 4:0 5:80 6:0 7:0 >> > >> > $ devlink port function rate set $DEV/vfs_group \ >> > tc-bw 0:20 1:0 2:0 3:0 4:0 5:10 6:60 7:0 >> > >> > Signed-off-by: Carolina Jubran <cjubran@nvidia.com> >> > Reviewed-by: Cosmin Ratiu <cratiu@nvidia.com> >> > Signed-off-by: Tariq Toukan <tariqt@nvidia.com> >> > --- >> > Documentation/netlink/specs/devlink.yaml | 22 ++++++++ >> > include/net/devlink.h | 7 +++ >> > include/uapi/linux/devlink.h | 4 ++ >> > net/devlink/netlink_gen.c | 15 +++-- >> > net/devlink/netlink_gen.h | 1 + >> > net/devlink/rate.c | 71 +++++++++++++++++++++++- >> > 6 files changed, 113 insertions(+), 7 deletions(-) >> > >> > diff --git a/Documentation/netlink/specs/devlink.yaml b/Documentation/netlink/specs/devlink.yaml >> > index 09fbb4c03fc8..68211b8218fd 100644 >> > --- a/Documentation/netlink/specs/devlink.yaml >> > +++ b/Documentation/netlink/specs/devlink.yaml >> > @@ -817,6 +817,16 @@ attribute-sets: >> > - >> > name: rate-tx-weight >> > type: u32 >> > + - >> > + name: rate-tc-index >> > + type: u8 >> > + >> > + - name: rate-bw >> >> This is bandwidth of tc. The name therefore should be "rate-tc-bw". >> Could you also document units of this attr value? >> >> > >I will rename it to rate-tc-bw and update the documentation to include the >valid range (0-100). So is it percent? I asked about the units. > >> > + type: u32 >> > + - >> > + name: rate-tc-bw >> > + type: nest >> > + nested-attributes: dl-rate-tc-bw >> > - >> > name: region-direct >> > type: flag >> > @@ -1225,6 +1235,16 @@ attribute-sets: >> > - >> > name: flash >> > type: flag >> > + - >> > + name: dl-rate-tc-bw >> > + subset-of: devlink >> > + attributes: >> > + - >> > + name: rate-tc-index >> > + type: u8 >> > + - >> > + name: rate-bw >> > + type: u32 >> > >> > operations: >> > enum-model: directional >> > @@ -2148,6 +2168,7 @@ operations: >> > - rate-tx-max >> > - rate-tx-priority >> > - rate-tx-weight >> > + - rate-tc-bw >> > - rate-parent-node-name >> > >> > - >> > @@ -2168,6 +2189,7 @@ operations: >> > - rate-tx-max >> > - rate-tx-priority >> > - rate-tx-weight >> > + - rate-tc-bw >> > - rate-parent-node-name >> > >> > - >> > diff --git a/include/net/devlink.h b/include/net/devlink.h >> > index fbb9a2668e24..277b826cdd60 100644 >> > --- a/include/net/devlink.h >> > +++ b/include/net/devlink.h >> > @@ -20,6 +20,7 @@ >> > #include <uapi/linux/devlink.h> >> > #include <linux/xarray.h> >> > #include <linux/firmware.h> >> > +#include <linux/dcbnl.h> >> > >> > struct devlink; >> > struct devlink_linecard; >> > @@ -117,6 +118,8 @@ struct devlink_rate { >> > >> > u32 tx_priority; >> > u32 tx_weight; >> > + >> > + u32 tc_bw[IEEE_8021QAZ_MAX_TCS]; >> > }; >> > >> > struct devlink_port { >> > @@ -1469,6 +1472,8 @@ struct devlink_ops { >> > u32 tx_priority, struct netlink_ext_ack *extack); >> > int (*rate_leaf_tx_weight_set)(struct devlink_rate *devlink_rate, void *priv, >> > u32 tx_weight, struct netlink_ext_ack *extack); >> > + int (*rate_leaf_tc_bw_set)(struct devlink_rate *devlink_rate, void *priv, >> > + u32 *tc_bw, struct netlink_ext_ack *extack); >> > int (*rate_node_tx_share_set)(struct devlink_rate *devlink_rate, void *priv, >> > u64 tx_share, struct netlink_ext_ack *extack); >> > int (*rate_node_tx_max_set)(struct devlink_rate *devlink_rate, void *priv, >> > @@ -1477,6 +1482,8 @@ struct devlink_ops { >> > u32 tx_priority, struct netlink_ext_ack *extack); >> > int (*rate_node_tx_weight_set)(struct devlink_rate *devlink_rate, void *priv, >> > u32 tx_weight, struct netlink_ext_ack *extack); >> > + int (*rate_node_tc_bw_set)(struct devlink_rate *devlink_rate, void *priv, >> > + u32 *tc_bw, struct netlink_ext_ack *extack); >> > int (*rate_node_new)(struct devlink_rate *rate_node, void **priv, >> > struct netlink_ext_ack *extack); >> > int (*rate_node_del)(struct devlink_rate *rate_node, void *priv, >> > diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h >> > index 9401aa343673..a66217808dd9 100644 >> > --- a/include/uapi/linux/devlink.h >> > +++ b/include/uapi/linux/devlink.h >> > @@ -614,6 +614,10 @@ enum devlink_attr { >> > >> > DEVLINK_ATTR_REGION_DIRECT, /* flag */ >> > >> > + DEVLINK_ATTR_RATE_TC_INDEX, /* u8 */ >> > + DEVLINK_ATTR_RATE_BW, /* u32 */ >> > + DEVLINK_ATTR_RATE_TC_BW, /* nested */ >> > + >> > /* Add new attributes above here, update the spec in >> > * Documentation/netlink/specs/devlink.yaml and re-generate >> > * net/devlink/netlink_gen.c. >> > diff --git a/net/devlink/netlink_gen.c b/net/devlink/netlink_gen.c >> > index f9786d51f68f..fac062ede7a4 100644 >> > --- a/net/devlink/netlink_gen.c >> > +++ b/net/devlink/netlink_gen.c >> > @@ -18,6 +18,11 @@ const struct nla_policy devlink_dl_port_function_nl_policy[DEVLINK_PORT_FN_ATTR_ >> > [DEVLINK_PORT_FN_ATTR_CAPS] = NLA_POLICY_BITFIELD32(15), >> > }; >> > >> > +const struct nla_policy devlink_dl_rate_tc_bw_nl_policy[DEVLINK_ATTR_RATE_BW + 1] = { >> > + [DEVLINK_ATTR_RATE_TC_INDEX] = { .type = NLA_U8, }, >> > + [DEVLINK_ATTR_RATE_BW] = { .type = NLA_U32, }, >> > +}; >> > + >> > const struct nla_policy devlink_dl_selftest_id_nl_policy[DEVLINK_ATTR_SELFTEST_ID_FLASH + 1] = { >> > [DEVLINK_ATTR_SELFTEST_ID_FLASH] = { .type = NLA_FLAG, }, >> > }; >> > @@ -496,7 +501,7 @@ static const struct nla_policy devlink_rate_get_dump_nl_policy[DEVLINK_ATTR_DEV_ >> > }; >> > >> > /* DEVLINK_CMD_RATE_SET - do */ >> > -static const struct nla_policy devlink_rate_set_nl_policy[DEVLINK_ATTR_RATE_TX_WEIGHT + 1] = { >> > +static const struct nla_policy devlink_rate_set_nl_policy[DEVLINK_ATTR_RATE_TC_BW + 1] = { >> > [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, >> > [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, >> > [DEVLINK_ATTR_RATE_NODE_NAME] = { .type = NLA_NUL_STRING, }, >> > @@ -504,11 +509,12 @@ static const struct nla_policy devlink_rate_set_nl_policy[DEVLINK_ATTR_RATE_TX_W >> > [DEVLINK_ATTR_RATE_TX_MAX] = { .type = NLA_U64, }, >> > [DEVLINK_ATTR_RATE_TX_PRIORITY] = { .type = NLA_U32, }, >> > [DEVLINK_ATTR_RATE_TX_WEIGHT] = { .type = NLA_U32, }, >> > + [DEVLINK_ATTR_RATE_TC_BW] = NLA_POLICY_NESTED(devlink_dl_rate_tc_bw_nl_policy), >> > [DEVLINK_ATTR_RATE_PARENT_NODE_NAME] = { .type = NLA_NUL_STRING, }, >> > }; >> > >> > /* DEVLINK_CMD_RATE_NEW - do */ >> > -static const struct nla_policy devlink_rate_new_nl_policy[DEVLINK_ATTR_RATE_TX_WEIGHT + 1] = { >> > +static const struct nla_policy devlink_rate_new_nl_policy[DEVLINK_ATTR_RATE_TC_BW + 1] = { >> > [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, >> > [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, >> > [DEVLINK_ATTR_RATE_NODE_NAME] = { .type = NLA_NUL_STRING, }, >> > @@ -516,6 +522,7 @@ static const struct nla_policy devlink_rate_new_nl_policy[DEVLINK_ATTR_RATE_TX_W >> > [DEVLINK_ATTR_RATE_TX_MAX] = { .type = NLA_U64, }, >> > [DEVLINK_ATTR_RATE_TX_PRIORITY] = { .type = NLA_U32, }, >> > [DEVLINK_ATTR_RATE_TX_WEIGHT] = { .type = NLA_U32, }, >> > + [DEVLINK_ATTR_RATE_TC_BW] = NLA_POLICY_NESTED(devlink_dl_rate_tc_bw_nl_policy), >> > [DEVLINK_ATTR_RATE_PARENT_NODE_NAME] = { .type = NLA_NUL_STRING, }, >> > }; >> > >> > @@ -1164,7 +1171,7 @@ const struct genl_split_ops devlink_nl_ops[74] = { >> > .doit = devlink_nl_rate_set_doit, >> > .post_doit = devlink_nl_post_doit, >> > .policy = devlink_rate_set_nl_policy, >> > - .maxattr = DEVLINK_ATTR_RATE_TX_WEIGHT, >> > + .maxattr = DEVLINK_ATTR_RATE_TC_BW, >> > .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, >> > }, >> > { >> > @@ -1174,7 +1181,7 @@ const struct genl_split_ops devlink_nl_ops[74] = { >> > .doit = devlink_nl_rate_new_doit, >> > .post_doit = devlink_nl_post_doit, >> > .policy = devlink_rate_new_nl_policy, >> > - .maxattr = DEVLINK_ATTR_RATE_TX_WEIGHT, >> > + .maxattr = DEVLINK_ATTR_RATE_TC_BW, >> > .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, >> > }, >> > { >> > diff --git a/net/devlink/netlink_gen.h b/net/devlink/netlink_gen.h >> > index 8f2bd50ddf5e..df37c3ef3113 100644 >> > --- a/net/devlink/netlink_gen.h >> > +++ b/net/devlink/netlink_gen.h >> > @@ -13,6 +13,7 @@ >> > >> > /* Common nested types */ >> > extern const struct nla_policy devlink_dl_port_function_nl_policy[DEVLINK_PORT_FN_ATTR_CAPS + 1]; >> > +extern const struct nla_policy devlink_dl_rate_tc_bw_nl_policy[DEVLINK_ATTR_RATE_BW + 1]; >> > extern const struct nla_policy devlink_dl_selftest_id_nl_policy[DEVLINK_ATTR_SELFTEST_ID_FLASH + 1]; >> > >> > /* Ops table for devlink */ >> > diff --git a/net/devlink/rate.c b/net/devlink/rate.c >> > index 8828ffaf6cbc..dbf1d552fae2 100644 >> > --- a/net/devlink/rate.c >> > +++ b/net/devlink/rate.c >> > @@ -86,7 +86,9 @@ static int devlink_nl_rate_fill(struct sk_buff *msg, >> > int flags, struct netlink_ext_ack *extack) >> > { >> > struct devlink *devlink = devlink_rate->devlink; >> > + struct nlattr *nla_tc_bw; >> > void *hdr; >> > + int i; >> > >> > hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); >> > if (!hdr) >> > @@ -124,10 +126,29 @@ static int devlink_nl_rate_fill(struct sk_buff *msg, >> > devlink_rate->tx_weight)) >> > goto nla_put_failure; >> > >> > - if (devlink_rate->parent) >> > - if (nla_put_string(msg, DEVLINK_ATTR_RATE_PARENT_NODE_NAME, >> > - devlink_rate->parent->name)) >> > + nla_tc_bw = nla_nest_start(msg, DEVLINK_ATTR_RATE_TC_BW); >> > + if (!nla_tc_bw) >> > + goto nla_put_failure; >> > + >> > + for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) { >> >> Wait, so you fill this up unconditionally? I mean, for set you check if >> the driver supports it. For get you always fill this up? That looks >> wrong. Any reason to do so? >> >> > >I followed the existing pattern in devlink, where attributes like tx_share >and tx_weight are filled unconditionally in the get operation. >Should I still add a check to see if tc_bw_set is supported? So is are zeroes correct values passed to user when set is not supported? I guess 0 is default right? Same question goes to tx_weight and others. If the answer is yes, I'm okay to pass it unconditionally. > >> > + struct nlattr *nla_tc_entry = nla_nest_start(msg, i); >> > + >> > + if (!nla_tc_entry) { >> > + nla_nest_cancel(msg, nla_tc_bw); >> > + goto nla_put_failure; >> > + } >> > + >> > + if (nla_put_u8(msg, DEVLINK_ATTR_RATE_TC_INDEX, i) || >> > + nla_put_u32(msg, DEVLINK_ATTR_RATE_BW, devlink_rate->tc_bw[i])) { >> > + nla_nest_cancel(msg, nla_tc_entry); >> > + nla_nest_cancel(msg, nla_tc_bw); >> > goto nla_put_failure; >> > + } >> > + >> > + nla_nest_end(msg, nla_tc_entry); >> > + } >> > + >> > + nla_nest_end(msg, nla_tc_bw); >> > >> > genlmsg_end(msg, hdr); >> > return 0; >> > @@ -380,6 +401,38 @@ static int devlink_nl_rate_set(struct devlink_rate *devlink_rate, >> > devlink_rate->tx_weight = weight; >> > } >> > >> > + if (attrs[DEVLINK_ATTR_RATE_TC_BW]) { >> > + struct nlattr *nla_tc_bw = attrs[DEVLINK_ATTR_RATE_TC_BW]; >> > + struct nlattr *tb[DEVLINK_ATTR_RATE_BW + 1]; >> > + u32 tc_bw[IEEE_8021QAZ_MAX_TCS] = {0}; >> > + struct nlattr *nla_tc_entry; >> > + int rem, tc_index; >> > + >> > + nla_for_each_nested(nla_tc_entry, nla_tc_bw, rem) { >> > + err = nla_parse_nested(tb, DEVLINK_ATTR_RATE_BW, nla_tc_entry, >> > + devlink_dl_rate_tc_bw_nl_policy, info->extack); >> > + if (err) >> > + return err; >> > + >> > + if (tb[DEVLINK_ATTR_RATE_TC_INDEX] && tb[DEVLINK_ATTR_RATE_BW]) { >> > + tc_index = nla_get_u8(tb[DEVLINK_ATTR_RATE_TC_INDEX]); >> >> Ough, you trust user to provide you index to array. Recipe for disaster. >> NLA_POLICY_RANGE() for tc-index would sanitize this. Btw, you use nested >> array to carry rate-bw and tc-index attrs, isn't the array index good >> enough? Then you can avoid tc-index attr (which in most cases holds >> redundant index value). Or do you expect userspace to pass only partial >> set of tcs? >> >> > >I will drop the tc-index attribute as it is redundant. To address your >question: I expect userspace to always provide a complete set of tcs. Okay. Makes sense. > >> > + tc_bw[tc_index] = nla_get_u32(tb[DEVLINK_ATTR_RATE_BW]); >> > + } >> > + } >> > + >> > + if (devlink_rate_is_leaf(devlink_rate)) >> > + err = ops->rate_leaf_tc_bw_set(devlink_rate, devlink_rate->priv, >> > + tc_bw, info->extack); >> > + else if (devlink_rate_is_node(devlink_rate)) >> > + err = ops->rate_node_tc_bw_set(devlink_rate, devlink_rate->priv, >> > + tc_bw, info->extack); >> > + >> > + if (err) >> > + return err; >> > + >> > + memcpy(devlink_rate->tc_bw, tc_bw, sizeof(tc_bw)); >> > + } >> > + >> > nla_parent = attrs[DEVLINK_ATTR_RATE_PARENT_NODE_NAME]; >> > if (nla_parent) { >> > err = devlink_nl_rate_parent_node_set(devlink_rate, info, >> > @@ -423,6 +476,12 @@ static bool devlink_rate_set_ops_supported(const struct devlink_ops *ops, >> > "TX weight set isn't supported for the leafs"); >> > return false; >> > } >> > + if (attrs[DEVLINK_ATTR_RATE_TC_BW] && !ops->rate_leaf_tc_bw_set) { >> > + NL_SET_ERR_MSG_ATTR(info->extack, >> > + attrs[DEVLINK_ATTR_RATE_TC_BW], >> > + "TC bandwidth set isn't supported for the leafs"); >> > + return false; >> > + } >> > } else if (type == DEVLINK_RATE_TYPE_NODE) { >> > if (attrs[DEVLINK_ATTR_RATE_TX_SHARE] && !ops->rate_node_tx_share_set) { >> > NL_SET_ERR_MSG(info->extack, "TX share set isn't supported for the nodes"); >> > @@ -449,6 +508,12 @@ static bool devlink_rate_set_ops_supported(const struct devlink_ops *ops, >> > "TX weight set isn't supported for the nodes"); >> > return false; >> > } >> > + if (attrs[DEVLINK_ATTR_RATE_TC_BW] && !ops->rate_node_tc_bw_set) { >> > + NL_SET_ERR_MSG_ATTR(info->extack, >> > + attrs[DEVLINK_ATTR_RATE_TC_BW], >> > + "TC bandwidth set isn't supported for the nodes"); >> > + return false; >> > + } >> > } else { >> > WARN(1, "Unknown type of rate object"); >> > return false; >> > -- >> > 2.44.0 >> > >
diff --git a/Documentation/netlink/specs/devlink.yaml b/Documentation/netlink/specs/devlink.yaml index 09fbb4c03fc8..68211b8218fd 100644 --- a/Documentation/netlink/specs/devlink.yaml +++ b/Documentation/netlink/specs/devlink.yaml @@ -817,6 +817,16 @@ attribute-sets: - name: rate-tx-weight type: u32 + - + name: rate-tc-index + type: u8 + + - name: rate-bw + type: u32 + - + name: rate-tc-bw + type: nest + nested-attributes: dl-rate-tc-bw - name: region-direct type: flag @@ -1225,6 +1235,16 @@ attribute-sets: - name: flash type: flag + - + name: dl-rate-tc-bw + subset-of: devlink + attributes: + - + name: rate-tc-index + type: u8 + - + name: rate-bw + type: u32 operations: enum-model: directional @@ -2148,6 +2168,7 @@ operations: - rate-tx-max - rate-tx-priority - rate-tx-weight + - rate-tc-bw - rate-parent-node-name - @@ -2168,6 +2189,7 @@ operations: - rate-tx-max - rate-tx-priority - rate-tx-weight + - rate-tc-bw - rate-parent-node-name - diff --git a/include/net/devlink.h b/include/net/devlink.h index fbb9a2668e24..277b826cdd60 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -20,6 +20,7 @@ #include <uapi/linux/devlink.h> #include <linux/xarray.h> #include <linux/firmware.h> +#include <linux/dcbnl.h> struct devlink; struct devlink_linecard; @@ -117,6 +118,8 @@ struct devlink_rate { u32 tx_priority; u32 tx_weight; + + u32 tc_bw[IEEE_8021QAZ_MAX_TCS]; }; struct devlink_port { @@ -1469,6 +1472,8 @@ struct devlink_ops { u32 tx_priority, struct netlink_ext_ack *extack); int (*rate_leaf_tx_weight_set)(struct devlink_rate *devlink_rate, void *priv, u32 tx_weight, struct netlink_ext_ack *extack); + int (*rate_leaf_tc_bw_set)(struct devlink_rate *devlink_rate, void *priv, + u32 *tc_bw, struct netlink_ext_ack *extack); int (*rate_node_tx_share_set)(struct devlink_rate *devlink_rate, void *priv, u64 tx_share, struct netlink_ext_ack *extack); int (*rate_node_tx_max_set)(struct devlink_rate *devlink_rate, void *priv, @@ -1477,6 +1482,8 @@ struct devlink_ops { u32 tx_priority, struct netlink_ext_ack *extack); int (*rate_node_tx_weight_set)(struct devlink_rate *devlink_rate, void *priv, u32 tx_weight, struct netlink_ext_ack *extack); + int (*rate_node_tc_bw_set)(struct devlink_rate *devlink_rate, void *priv, + u32 *tc_bw, struct netlink_ext_ack *extack); int (*rate_node_new)(struct devlink_rate *rate_node, void **priv, struct netlink_ext_ack *extack); int (*rate_node_del)(struct devlink_rate *rate_node, void *priv, diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 9401aa343673..a66217808dd9 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -614,6 +614,10 @@ enum devlink_attr { DEVLINK_ATTR_REGION_DIRECT, /* flag */ + DEVLINK_ATTR_RATE_TC_INDEX, /* u8 */ + DEVLINK_ATTR_RATE_BW, /* u32 */ + DEVLINK_ATTR_RATE_TC_BW, /* nested */ + /* Add new attributes above here, update the spec in * Documentation/netlink/specs/devlink.yaml and re-generate * net/devlink/netlink_gen.c. diff --git a/net/devlink/netlink_gen.c b/net/devlink/netlink_gen.c index f9786d51f68f..fac062ede7a4 100644 --- a/net/devlink/netlink_gen.c +++ b/net/devlink/netlink_gen.c @@ -18,6 +18,11 @@ const struct nla_policy devlink_dl_port_function_nl_policy[DEVLINK_PORT_FN_ATTR_ [DEVLINK_PORT_FN_ATTR_CAPS] = NLA_POLICY_BITFIELD32(15), }; +const struct nla_policy devlink_dl_rate_tc_bw_nl_policy[DEVLINK_ATTR_RATE_BW + 1] = { + [DEVLINK_ATTR_RATE_TC_INDEX] = { .type = NLA_U8, }, + [DEVLINK_ATTR_RATE_BW] = { .type = NLA_U32, }, +}; + const struct nla_policy devlink_dl_selftest_id_nl_policy[DEVLINK_ATTR_SELFTEST_ID_FLASH + 1] = { [DEVLINK_ATTR_SELFTEST_ID_FLASH] = { .type = NLA_FLAG, }, }; @@ -496,7 +501,7 @@ static const struct nla_policy devlink_rate_get_dump_nl_policy[DEVLINK_ATTR_DEV_ }; /* DEVLINK_CMD_RATE_SET - do */ -static const struct nla_policy devlink_rate_set_nl_policy[DEVLINK_ATTR_RATE_TX_WEIGHT + 1] = { +static const struct nla_policy devlink_rate_set_nl_policy[DEVLINK_ATTR_RATE_TC_BW + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, [DEVLINK_ATTR_RATE_NODE_NAME] = { .type = NLA_NUL_STRING, }, @@ -504,11 +509,12 @@ static const struct nla_policy devlink_rate_set_nl_policy[DEVLINK_ATTR_RATE_TX_W [DEVLINK_ATTR_RATE_TX_MAX] = { .type = NLA_U64, }, [DEVLINK_ATTR_RATE_TX_PRIORITY] = { .type = NLA_U32, }, [DEVLINK_ATTR_RATE_TX_WEIGHT] = { .type = NLA_U32, }, + [DEVLINK_ATTR_RATE_TC_BW] = NLA_POLICY_NESTED(devlink_dl_rate_tc_bw_nl_policy), [DEVLINK_ATTR_RATE_PARENT_NODE_NAME] = { .type = NLA_NUL_STRING, }, }; /* DEVLINK_CMD_RATE_NEW - do */ -static const struct nla_policy devlink_rate_new_nl_policy[DEVLINK_ATTR_RATE_TX_WEIGHT + 1] = { +static const struct nla_policy devlink_rate_new_nl_policy[DEVLINK_ATTR_RATE_TC_BW + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, [DEVLINK_ATTR_RATE_NODE_NAME] = { .type = NLA_NUL_STRING, }, @@ -516,6 +522,7 @@ static const struct nla_policy devlink_rate_new_nl_policy[DEVLINK_ATTR_RATE_TX_W [DEVLINK_ATTR_RATE_TX_MAX] = { .type = NLA_U64, }, [DEVLINK_ATTR_RATE_TX_PRIORITY] = { .type = NLA_U32, }, [DEVLINK_ATTR_RATE_TX_WEIGHT] = { .type = NLA_U32, }, + [DEVLINK_ATTR_RATE_TC_BW] = NLA_POLICY_NESTED(devlink_dl_rate_tc_bw_nl_policy), [DEVLINK_ATTR_RATE_PARENT_NODE_NAME] = { .type = NLA_NUL_STRING, }, }; @@ -1164,7 +1171,7 @@ const struct genl_split_ops devlink_nl_ops[74] = { .doit = devlink_nl_rate_set_doit, .post_doit = devlink_nl_post_doit, .policy = devlink_rate_set_nl_policy, - .maxattr = DEVLINK_ATTR_RATE_TX_WEIGHT, + .maxattr = DEVLINK_ATTR_RATE_TC_BW, .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, }, { @@ -1174,7 +1181,7 @@ const struct genl_split_ops devlink_nl_ops[74] = { .doit = devlink_nl_rate_new_doit, .post_doit = devlink_nl_post_doit, .policy = devlink_rate_new_nl_policy, - .maxattr = DEVLINK_ATTR_RATE_TX_WEIGHT, + .maxattr = DEVLINK_ATTR_RATE_TC_BW, .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, }, { diff --git a/net/devlink/netlink_gen.h b/net/devlink/netlink_gen.h index 8f2bd50ddf5e..df37c3ef3113 100644 --- a/net/devlink/netlink_gen.h +++ b/net/devlink/netlink_gen.h @@ -13,6 +13,7 @@ /* Common nested types */ extern const struct nla_policy devlink_dl_port_function_nl_policy[DEVLINK_PORT_FN_ATTR_CAPS + 1]; +extern const struct nla_policy devlink_dl_rate_tc_bw_nl_policy[DEVLINK_ATTR_RATE_BW + 1]; extern const struct nla_policy devlink_dl_selftest_id_nl_policy[DEVLINK_ATTR_SELFTEST_ID_FLASH + 1]; /* Ops table for devlink */ diff --git a/net/devlink/rate.c b/net/devlink/rate.c index 8828ffaf6cbc..dbf1d552fae2 100644 --- a/net/devlink/rate.c +++ b/net/devlink/rate.c @@ -86,7 +86,9 @@ static int devlink_nl_rate_fill(struct sk_buff *msg, int flags, struct netlink_ext_ack *extack) { struct devlink *devlink = devlink_rate->devlink; + struct nlattr *nla_tc_bw; void *hdr; + int i; hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); if (!hdr) @@ -124,10 +126,29 @@ static int devlink_nl_rate_fill(struct sk_buff *msg, devlink_rate->tx_weight)) goto nla_put_failure; - if (devlink_rate->parent) - if (nla_put_string(msg, DEVLINK_ATTR_RATE_PARENT_NODE_NAME, - devlink_rate->parent->name)) + nla_tc_bw = nla_nest_start(msg, DEVLINK_ATTR_RATE_TC_BW); + if (!nla_tc_bw) + goto nla_put_failure; + + for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) { + struct nlattr *nla_tc_entry = nla_nest_start(msg, i); + + if (!nla_tc_entry) { + nla_nest_cancel(msg, nla_tc_bw); + goto nla_put_failure; + } + + if (nla_put_u8(msg, DEVLINK_ATTR_RATE_TC_INDEX, i) || + nla_put_u32(msg, DEVLINK_ATTR_RATE_BW, devlink_rate->tc_bw[i])) { + nla_nest_cancel(msg, nla_tc_entry); + nla_nest_cancel(msg, nla_tc_bw); goto nla_put_failure; + } + + nla_nest_end(msg, nla_tc_entry); + } + + nla_nest_end(msg, nla_tc_bw); genlmsg_end(msg, hdr); return 0; @@ -380,6 +401,38 @@ static int devlink_nl_rate_set(struct devlink_rate *devlink_rate, devlink_rate->tx_weight = weight; } + if (attrs[DEVLINK_ATTR_RATE_TC_BW]) { + struct nlattr *nla_tc_bw = attrs[DEVLINK_ATTR_RATE_TC_BW]; + struct nlattr *tb[DEVLINK_ATTR_RATE_BW + 1]; + u32 tc_bw[IEEE_8021QAZ_MAX_TCS] = {0}; + struct nlattr *nla_tc_entry; + int rem, tc_index; + + nla_for_each_nested(nla_tc_entry, nla_tc_bw, rem) { + err = nla_parse_nested(tb, DEVLINK_ATTR_RATE_BW, nla_tc_entry, + devlink_dl_rate_tc_bw_nl_policy, info->extack); + if (err) + return err; + + if (tb[DEVLINK_ATTR_RATE_TC_INDEX] && tb[DEVLINK_ATTR_RATE_BW]) { + tc_index = nla_get_u8(tb[DEVLINK_ATTR_RATE_TC_INDEX]); + tc_bw[tc_index] = nla_get_u32(tb[DEVLINK_ATTR_RATE_BW]); + } + } + + if (devlink_rate_is_leaf(devlink_rate)) + err = ops->rate_leaf_tc_bw_set(devlink_rate, devlink_rate->priv, + tc_bw, info->extack); + else if (devlink_rate_is_node(devlink_rate)) + err = ops->rate_node_tc_bw_set(devlink_rate, devlink_rate->priv, + tc_bw, info->extack); + + if (err) + return err; + + memcpy(devlink_rate->tc_bw, tc_bw, sizeof(tc_bw)); + } + nla_parent = attrs[DEVLINK_ATTR_RATE_PARENT_NODE_NAME]; if (nla_parent) { err = devlink_nl_rate_parent_node_set(devlink_rate, info, @@ -423,6 +476,12 @@ static bool devlink_rate_set_ops_supported(const struct devlink_ops *ops, "TX weight set isn't supported for the leafs"); return false; } + if (attrs[DEVLINK_ATTR_RATE_TC_BW] && !ops->rate_leaf_tc_bw_set) { + NL_SET_ERR_MSG_ATTR(info->extack, + attrs[DEVLINK_ATTR_RATE_TC_BW], + "TC bandwidth set isn't supported for the leafs"); + return false; + } } else if (type == DEVLINK_RATE_TYPE_NODE) { if (attrs[DEVLINK_ATTR_RATE_TX_SHARE] && !ops->rate_node_tx_share_set) { NL_SET_ERR_MSG(info->extack, "TX share set isn't supported for the nodes"); @@ -449,6 +508,12 @@ static bool devlink_rate_set_ops_supported(const struct devlink_ops *ops, "TX weight set isn't supported for the nodes"); return false; } + if (attrs[DEVLINK_ATTR_RATE_TC_BW] && !ops->rate_node_tc_bw_set) { + NL_SET_ERR_MSG_ATTR(info->extack, + attrs[DEVLINK_ATTR_RATE_TC_BW], + "TC bandwidth set isn't supported for the nodes"); + return false; + } } else { WARN(1, "Unknown type of rate object"); return false;