Message ID   | 20221025135958.6242-2-aaptel@nvidia.com (mailing list archive)
State        | Changes Requested
Delegated to | Netdev Maintainers
Series       | nvme-tcp receive offloads
On Tue, 25 Oct 2022 16:59:36 +0300 Aurelien Aptel wrote:
> diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
> index 7c2d77d75a88..bf7391aa04c7 100644
> --- a/include/linux/netdev_features.h
> +++ b/include/linux/netdev_features.h
> @@ -14,7 +14,7 @@ typedef u64 netdev_features_t;
>  enum {
>  	NETIF_F_SG_BIT,			/* Scatter/gather IO. */
>  	NETIF_F_IP_CSUM_BIT,		/* Can checksum TCP/UDP over IPv4. */
> -	__UNUSED_NETIF_F_1,
> +	NETIF_F_HW_ULP_DDP_BIT,		/* ULP direct data placement offload */

Why do you need a feature bit if there is a whole caps / limit querying
mechanism?

>  	NETIF_F_HW_CSUM_BIT,		/* Can checksum all the packets. */
>  	NETIF_F_IPV6_CSUM_BIT,		/* Can checksum TCP/UDP over IPV6 */
>  	NETIF_F_HIGHDMA_BIT,		/* Can DMA to high memory. */
> @@ -168,6 +168,7 @@ enum {
>  #define NETIF_F_HW_HSR_TAG_RM	__NETIF_F(HW_HSR_TAG_RM)
>  #define NETIF_F_HW_HSR_FWD	__NETIF_F(HW_HSR_FWD)
>  #define NETIF_F_HW_HSR_DUP	__NETIF_F(HW_HSR_DUP)
> +#define NETIF_F_HW_ULP_DDP	__NETIF_F(HW_ULP_DDP)
>
>  /* Finds the next feature with the highest number of the range of start-1 till 0.
>   */
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index eddf8ee270e7..84554f26ad6b 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -1043,6 +1043,7 @@ struct dev_ifalias {
>
>  struct devlink;
>  struct tlsdev_ops;
> +struct ulp_ddp_dev_ops;

I thought forward declarations are not required for struct members,
are they?

>  struct netdev_net_notifier {
>  	struct list_head list;
> @@ -2096,6 +2097,10 @@ struct net_device {
>  	const struct tlsdev_ops *tlsdev_ops;
>  #endif
>
> +#if IS_ENABLED(CONFIG_ULP_DDP)
> +	const struct ulp_ddp_dev_ops *ulp_ddp_ops;
> +#endif

It's somewhat unclear to me why we add ops to struct net_device,
rather than to ops.. can you explain?

>  	const struct header_ops *header_ops;
>
>  	unsigned char operstate;

> +#include <linux/netdevice.h>
> +#include <net/inet_connection_sock.h>
> +#include <net/sock.h>
> +
> +enum ulp_ddp_type {
> +	ULP_DDP_NVME = 1,

I think the DDP and the NVME parts should have more separation.

Are you planning to implement pure TCP placement, without NIC trying
to also "add value" by processing whatever TCP is carrying.

Can you split the DDP and NVME harder in the API, somehow?

> +};
> +
> +enum ulp_ddp_offload_capabilities {
> +	ULP_DDP_C_NVME_TCP = 1,
> +	ULP_DDP_C_NVME_TCP_DDGST_RX = 2,
> +};
> +
> +/**
> + * struct ulp_ddp_limits - Generic ulp ddp limits: tcp ddp
> + * protocol limits.
> + * Protocol implementations must use this as the first member.
> + * Add new instances of ulp_ddp_limits below (nvme-tcp, etc.).
> + *
> + * @type:		type of this limits struct
> + * @offload_capabilities: bitmask of supported offload types
> + * @max_ddp_sgl_len:	maximum sgl size supported (zero means no limit)
> + * @io_threshold:	minimum payload size required to offload
> + * @buf:		protocol-specific limits struct (if any)
> + */
> +struct ulp_ddp_limits {

Why is this called limits not capabilities / caps?

> +	enum ulp_ddp_type	type;
> +	u64			offload_capabilities;
> +	int			max_ddp_sgl_len;
> +	int			io_threshold;
> +	unsigned char		buf[];

Just put a union of all the protos here.

> +};
> +
> +/**
> + * struct nvme_tcp_ddp_limits - nvme tcp driver limitations
> + *
> + * @lmt:		generic ULP limits struct
> + * @full_ccid_range:	true if the driver supports the full CID range
> + */
> +struct nvme_tcp_ddp_limits {
> +	struct ulp_ddp_limits	lmt;
> +
> +	bool			full_ccid_range;
> +};
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index 0640453fce54..df37db420110 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -5233,6 +5233,10 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root,
>  		memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
>  #ifdef CONFIG_TLS_DEVICE
>  		nskb->decrypted = skb->decrypted;
> +#endif
> +#ifdef CONFIG_ULP_DDP
> +		nskb->ulp_ddp = skb->ulp_ddp;
> +		nskb->ulp_crc = skb->ulp_crc;
>  #endif
>  		TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
>  		if (list)
> @@ -5266,6 +5270,10 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root,
>  #ifdef CONFIG_TLS_DEVICE
>  			if (skb->decrypted != nskb->decrypted)
>  				goto end;
> +#endif
> +#ifdef CONFIG_ULP_DDP

no ifdef needed

> +			if (skb_is_ulp_crc(skb) != skb_is_ulp_crc(nskb))
> +				goto end;
>  #endif
>  		}
>  	}
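For illustration, the per-protocol union suggested here could be folded
into the generic struct roughly as follows. This is only a sketch of the
reviewer's suggestion, not code from the series, and the nvmeotcp member
name is hypothetical:

	struct ulp_ddp_limits {
		enum ulp_ddp_type type;
		u64 offload_capabilities;
		int max_ddp_sgl_len;
		int io_threshold;
		union {
			/* valid when type == ULP_DDP_NVME */
			struct {
				bool full_ccid_range;
			} nvmeotcp;
		};
	};

With such a layout, a separate nvme_tcp_ddp_limits wrapper embedding the
generic struct as its first member would no longer be needed.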
On Tue, 25 Oct 2022 at 23:39, Jakub Kicinski <kuba@kernel.org> wrote:
> On Tue, 25 Oct 2022 16:59:36 +0300 Aurelien Aptel wrote:
> > diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
> > index 7c2d77d75a88..bf7391aa04c7 100644
> > --- a/include/linux/netdev_features.h
> > +++ b/include/linux/netdev_features.h
> > @@ -14,7 +14,7 @@ typedef u64 netdev_features_t;
> >  enum {
> >  	NETIF_F_SG_BIT,			/* Scatter/gather IO. */
> >  	NETIF_F_IP_CSUM_BIT,		/* Can checksum TCP/UDP over IPv4. */
> > -	__UNUSED_NETIF_F_1,
> > +	NETIF_F_HW_ULP_DDP_BIT,		/* ULP direct data placement offload */
>
> Why do you need a feature bit if there is a whole caps / limit querying
> mechanism?

The caps are used for the HW device to publish the supported
capabilities/limitations, while the feature bit is used for the DDP
enablement "per net-device".

Disabling will be required in case that another feature which is
mutually exclusive to the DDP is needed (as an example in the mlx case,
CQE compress which is controlled from ethtool).

> >  	NETIF_F_HW_CSUM_BIT,		/* Can checksum all the packets. */
> >  	NETIF_F_IPV6_CSUM_BIT,		/* Can checksum TCP/UDP over IPV6 */
> >  	NETIF_F_HIGHDMA_BIT,		/* Can DMA to high memory. */
> > @@ -168,6 +168,7 @@ enum {
> >  #define NETIF_F_HW_HSR_TAG_RM	__NETIF_F(HW_HSR_TAG_RM)
> >  #define NETIF_F_HW_HSR_FWD	__NETIF_F(HW_HSR_FWD)
> >  #define NETIF_F_HW_HSR_DUP	__NETIF_F(HW_HSR_DUP)
> > +#define NETIF_F_HW_ULP_DDP	__NETIF_F(HW_ULP_DDP)
> >
> >  /* Finds the next feature with the highest number of the range of start-1 till 0.
> >   */
> > diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> > index eddf8ee270e7..84554f26ad6b 100644
> > --- a/include/linux/netdevice.h
> > +++ b/include/linux/netdevice.h
> > @@ -1043,6 +1043,7 @@ struct dev_ifalias {
> >
> >  struct devlink;
> >  struct tlsdev_ops;
> > +struct ulp_ddp_dev_ops;
>
> I thought forward declarations are not required for struct members,
> are they?

Right, thanks, we will remove it.

> >  struct netdev_net_notifier {
> >  	struct list_head list;
> > @@ -2096,6 +2097,10 @@ struct net_device {
> >  	const struct tlsdev_ops *tlsdev_ops;
> >  #endif
> >
> > +#if IS_ENABLED(CONFIG_ULP_DDP)
> > +	const struct ulp_ddp_dev_ops *ulp_ddp_ops;
> > +#endif
>
> It's somewhat unclear to me why we add ops to struct net_device,
> rather than to ops.. can you explain?

We were trying to follow the TLS design which is similar.

Can you please clarify what you mean by "rather than to ops.."?

> >  	const struct header_ops *header_ops;
> >
> >  	unsigned char operstate;

> > +#include <linux/netdevice.h>
> > +#include <net/inet_connection_sock.h>
> > +#include <net/sock.h>
> > +
> > +enum ulp_ddp_type {
> > +	ULP_DDP_NVME = 1,
>
> I think the DDP and the NVME parts should have more separation.
>
> Are you planning to implement pure TCP placement, without NIC trying
> to also "add value" by processing whatever TCP is carrying.

We are not planning to implement pure TCP placement. As we will
present at netdev, this is doable only if the HW is L5 aware.

> Can you split the DDP and NVME harder in the API, somehow?

We can simplify the layering by using a union per ulp_ddp_type.
There are no nvme structures or definitions needed in ulp_ddp.

> > +};
> > +
> > +enum ulp_ddp_offload_capabilities {
> > +	ULP_DDP_C_NVME_TCP = 1,
> > +	ULP_DDP_C_NVME_TCP_DDGST_RX = 2,
> > +};
> > +
> > +/**
> > + * struct ulp_ddp_limits - Generic ulp ddp limits: tcp ddp
> > + * protocol limits.
> > + * Protocol implementations must use this as the first member.
> > + * Add new instances of ulp_ddp_limits below (nvme-tcp, etc.).
> > + *
> > + * @type:		type of this limits struct
> > + * @offload_capabilities: bitmask of supported offload types
> > + * @max_ddp_sgl_len:	maximum sgl size supported (zero means no limit)
> > + * @io_threshold:	minimum payload size required to offload
> > + * @buf:		protocol-specific limits struct (if any)
> > + */
> > +struct ulp_ddp_limits {
>
> Why is this called limits not capabilities / caps?

We will change it to caps.

> > +	enum ulp_ddp_type	type;
> > +	u64			offload_capabilities;
> > +	int			max_ddp_sgl_len;
> > +	int			io_threshold;
> > +	unsigned char		buf[];
>
> Just put a union of all the protos here.

Sure.

> > +};
> > +
> > +/**
> > + * struct nvme_tcp_ddp_limits - nvme tcp driver limitations
> > + *
> > + * @lmt:		generic ULP limits struct
> > + * @full_ccid_range:	true if the driver supports the full CID range
> > + */
> > +struct nvme_tcp_ddp_limits {
> > +	struct ulp_ddp_limits	lmt;
> > +
> > +	bool			full_ccid_range;
> > +};
>
> > diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> > index 0640453fce54..df37db420110 100644
> > --- a/net/ipv4/tcp_input.c
> > +++ b/net/ipv4/tcp_input.c
> > @@ -5233,6 +5233,10 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root,
> >  		memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
> >  #ifdef CONFIG_TLS_DEVICE
> >  		nskb->decrypted = skb->decrypted;
> > +#endif
> > +#ifdef CONFIG_ULP_DDP
> > +		nskb->ulp_ddp = skb->ulp_ddp;
> > +		nskb->ulp_crc = skb->ulp_crc;
> >  #endif
> >  		TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
> >  		if (list)
> > @@ -5266,6 +5270,10 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root,
> >  #ifdef CONFIG_TLS_DEVICE
> >  			if (skb->decrypted != nskb->decrypted)
> >  				goto end;
> > +#endif
> > +#ifdef CONFIG_ULP_DDP
>
> no ifdef needed

Thanks, we will remove it.

> > +			if (skb_is_ulp_crc(skb) != skb_is_ulp_crc(nskb))
> > +				goto end;
> >  #endif
> >  		}
> >  	}
On Wed, 26 Oct 2022 15:01:42 +0000 Shai Malin wrote:
> > > @@ -14,7 +14,7 @@ typedef u64 netdev_features_t;
> > >  enum {
> > >  	NETIF_F_SG_BIT,			/* Scatter/gather IO. */
> > >  	NETIF_F_IP_CSUM_BIT,		/* Can checksum TCP/UDP over IPv4. */
> > > -	__UNUSED_NETIF_F_1,
> > > +	NETIF_F_HW_ULP_DDP_BIT,		/* ULP direct data placement offload */
> >
> > Why do you need a feature bit if there is a whole caps / limit querying
> > mechanism?
>
> The caps are used for the HW device to publish the supported
> capabilities/limitations, while the feature bit is used for the DDP
> enablement "per net-device".
>
> Disabling will be required in case that another feature which is
> mutually exclusive to the DDP is needed (as an example in the mlx case,
> CQE compress which is controlled from ethtool).

It's a big enough feature to add a genetlink or at least an ethtool
command to control. If you add more L5 protos presumably you'll want
to control disable / enable separately for them. Also it'd be cleaner
to expose the full capabilities and report stats via a dedicated API.
Feature bits are not a good fix for complex control-pathy features.

> > It's somewhat unclear to me why we add ops to struct net_device,
> > rather than to ops.. can you explain?
>
> We were trying to follow the TLS design which is similar.

Ack, TLS should really move as well, IMHO, but that's a separate convo.

> Can you please clarify what you mean by "rather than to ops.."?

Add the ulp_ddp_ops pointer to struct net_device_ops rather than struct
net_device.
On Wed, 26 Oct 2022 at 17:24, Jakub Kicinski <kuba@kernel.org> wrote:
> On Wed, 26 Oct 2022 15:01:42 +0000 Shai Malin wrote:
> > > > @@ -14,7 +14,7 @@ typedef u64 netdev_features_t;
> > > >  enum {
> > > >  	NETIF_F_SG_BIT,			/* Scatter/gather IO. */
> > > >  	NETIF_F_IP_CSUM_BIT,		/* Can checksum TCP/UDP over IPv4. */
> > > > -	__UNUSED_NETIF_F_1,
> > > > +	NETIF_F_HW_ULP_DDP_BIT,		/* ULP direct data placement offload */
> > >
> > > Why do you need a feature bit if there is a whole caps / limit querying
> > > mechanism?
> >
> > The caps are used for the HW device to publish the supported
> > capabilities/limitations, while the feature bit is used for the DDP
> > enablement "per net-device".
> >
> > Disabling will be required in case that another feature which is
> > mutually exclusive to the DDP is needed (as an example in the mlx case,
> > CQE compress which is controlled from ethtool).
>
> It's a big enough feature to add a genetlink or at least an ethtool
> command to control. If you add more L5 protos presumably you'll want
> to control disable / enable separately for them. Also it'd be cleaner
> to expose the full capabilities and report stats via a dedicated API.
> Feature bits are not a good fix for complex control-pathy features.

With our existing design, we are supporting a bundle of DDP + CRC offload.
We don't see the value of letting the user control it individually.

The capabilities bits were added in order to allow future devices which
support only one of the capabilities to plug into the infrastructure,
or to allow additional capabilities/protocols.

We could expose the caps via ethtool as regular netdev features, it would
make everything simpler and cleaner, but the problem is that features have
run out of bits (all 64 are taken, and we understand the challenge with
changing that).

We could add a new ethtool command, but on the kernel side it would be
quite redundant as we would essentially re-implement feature flag processing
(comparing strings of feature names and enabling bits).

What do you think?

> > > It's somewhat unclear to me why we add ops to struct net_device,
> > > rather than to ops.. can you explain?
> >
> > We were trying to follow the TLS design which is similar.
>
> Ack, TLS should really move as well, IMHO, but that's a separate convo.
>
> > Can you please clarify what you mean by "rather than to ops.."?
>
> Add the ulp_ddp_ops pointer to struct net_device_ops rather than struct
> net_device.

Sure, will be fixed in v8.
On Fri, 28 Oct 2022 10:32:22 +0000 Shai Malin wrote:
> > It's a big enough feature to add a genetlink or at least an ethtool
> > command to control. If you add more L5 protos presumably you'll want
> > to control disable / enable separately for them. Also it'd be cleaner
> > to expose the full capabilities and report stats via a dedicated API.
> > Feature bits are not a good fix for complex control-pathy features.
>
> With our existing design, we are supporting a bundle of DDP + CRC offload.
> We don't see the value of letting the user control it individually.

I was talking about the L5 parsing you do. I presume it won't be a huge
challenge for you to implement support for framing different than NVMe,
and perhaps even NVMe may have new revisions or things you don't
support? At which point we're gonna have a bit for each protocol? :S

Then there are stats. We should have a more expressive API here from
the get go. TLS offload is clearly lacking in this area.

> The capabilities bits were added in order to allow future devices which
> support only one of the capabilities to plug into the infrastructure,
> or to allow additional capabilities/protocols.
>
> We could expose the caps via ethtool as regular netdev features, it would
> make everything simpler and cleaner, but the problem is that features have
> run out of bits (all 64 are taken, and we understand the challenge with
> changing that).

Feature bits should be exclusively for information which needs to be
accessed on the fast path, on per packet basis. If you have such a need
then I'm not really opposed to you allocating bits as well, but primary
feature discovery *for the user* should not be over the feature bits.

> We could add a new ethtool command, but on the kernel side it would be
> quite redundant as we would essentially re-implement feature flag processing
> (comparing strings of feature names and enabling bits).
>
> What do you think?
On Fri, 28 Oct 2022 at 18:40, Jakub Kicinski <kuba@kernel.org> wrote:
> On Fri, 28 Oct 2022 10:32:22 +0000 Shai Malin wrote:
> > > It's a big enough feature to add a genetlink or at least an ethtool
> > > command to control. If you add more L5 protos presumably you'll want
> > > to control disable / enable separately for them. Also it'd be cleaner
> > > to expose the full capabilities and report stats via a dedicated API.
> > > Feature bits are not a good fix for complex control-pathy features.
> >
> > With our existing design, we are supporting a bundle of DDP + CRC offload.
> > We don't see the value of letting the user control it individually.
>
> I was talking about the L5 parsing you do. I presume it won't be a huge
> challenge for you to implement support for framing different than NVMe,
> and perhaps even NVMe may have new revisions or things you don't
> support? At which point we're gonna have a bit for each protocol? :S

The existing HW L5 parsing is capable of supporting NVMeTCP offload for
both host and target. As part of this series, we introduce the host Rx,
and following that we plan to add support for host Tx and for the
target (both Rx and Tx).

Supporting a new protocol, or a new NVMe format, is not in our plans
at this point, but the overall ULP design should definitely allow it.

> Then there are stats.

In the patch "net/mlx5e: NVMEoTCP, statistics" we introduced
rx_nvmeotcp_* stats. We believe they should be collected by the device
driver and not by the ULP layer.

> We should have a more expressive API here from the get go. TLS offload
> is clearly lacking in this area.

Sure.

> > The capabilities bits were added in order to allow future devices which
> > support only one of the capabilities to plug into the infrastructure,
> > or to allow additional capabilities/protocols.
> >
> > We could expose the caps via ethtool as regular netdev features, it would
> > make everything simpler and cleaner, but the problem is that features have
> > run out of bits (all 64 are taken, and we understand the challenge with
> > changing that).
>
> Feature bits should be exclusively for information which needs to be
> accessed on the fast path, on per packet basis. If you have such a need
> then I'm not really opposed to you allocating bits as well, but primary
> feature discovery *for the user* should not be over the feature bits.

Our design does not require information that needs to be accessed on
the fast path. The user will only need to configure it as part of the
offload connection establishment. We will suggest a new approach.

> > We could add a new ethtool command, but on the kernel side it would be
> > quite redundant as we would essentially re-implement feature flag processing
> > (comparing strings of feature names and enabling bits).
> >
> > What do you think?
On Mon, 31 Oct 2022 18:13:19 +0000 Shai Malin wrote:
> > Then there are stats.
>
> In the patch "net/mlx5e: NVMEoTCP, statistics" we introduced
> rx_nvmeotcp_* stats. We believe they should be collected by the device
> driver and not by the ULP layer.

I'm not sure I agree, but that's not the key point.
The key point is that we want the stats to come via a protocol specific
interface, and not have to be fished out of the ethtool -S goop. You
can still collect them in the driver; if we have any ULP-level stats at
any point we can add them to the same interface.
On Tue, 1 Nov 2022 at 01:47, Jakub Kicinski <kuba@kernel.org> wrote:
> On Mon, 31 Oct 2022 18:13:19 +0000 Shai Malin wrote:
> > > Then there are stats.
> >
> > In the patch "net/mlx5e: NVMEoTCP, statistics" we introduced
> > rx_nvmeotcp_* stats. We believe they should be collected by the device
> > driver and not by the ULP layer.
>
> I'm not sure I agree, but that's not the key point.
> The key point is that we want the stats to come via a protocol specific
> interface, and not have to be fished out of the ethtool -S goop. You
> can still collect them in the driver; if we have any ULP-level stats at
> any point we can add them to the same interface.

Understood. We will suggest a new design for the stats, together with
the capabilities enablement.
Jakub,

We came up with 2 designs for controlling the ULP DDP capability bits
and getting the ULP DDP statistics.

Both designs share some concepts so I'm going to talk about the common
stuff first:

We drop the netdev->feature bit. To fully disable ULP DDP offload the
caps will have to be set to 0x0.

In both designs we replace the feature bit with a new field
netdev->ulp_ddp_caps

    struct ulp_ddp_cap {
            bitmap caps_hw;     // what the hw supports (filled by the
                                // driver, used as reference once initialized)
            bitmap caps_active; // what is currently set for the system,
                                // can be modified from userspace
    };

We add a new OP net_device_ops->ndo_set_ulp_caps() that drivers have
to provide to fill netdev->ulp_ddp_caps.caps_hw. We call it around
the same time as when we call ndo_set_features().

Interfacing with userspace is where the design differs.

Design A ("netlink"):
=====================

# Capabilities

We can expose to the users a new ethtool api using netlink.

For this we want to have a dynamic system where userspace doesn't have
to hardcode all the caps but instead can get a list. We implement
something similar to what is done for features bits.

We add a table to map caps to string names

    const char *ulp_ddp_cap_names[] = {
            [ULP_DDP_NVME_TCP_XXX] = "nvme-tcp-xxx",
            ...
    };

We add ETHTOOL messages to get and set ULP caps:

- ETHTOOL_MSG_ULP_CAPS_GET: get device ulp capabilities
- ETHTOOL_MSG_ULP_CAPS_SET: set device ulp capabilities

The GET reply code can use ethnl_put_bitset32() which does the job of
sending bits + their names as strings.

The SET code would apply the changes to netdev->ulp_ddp_caps.caps_active.

# Statistics

If the ETHTOOL_MSG_ULP_CAPS_GET message requests statistics (by
setting the header flag ETHTOOL_FLAG_STATS) the kernel will append all
the ULP statistics of the device at the end of the reply.

Those statistics will be dynamic in the sense that they will use a new
stringset for their names that userspace will have to fetch.

# Ethtool changes

We will add the -u|-U|--ulp-get|--ulp-set options to ethtool.

    # query list of caps supported and their value on device $dev
    ethtool -u|--ulp-get <dev>

    # query ULP stats of $dev
    ethtool -u|--ulp-get --include-statistics <dev>

    # set $cap to $val on device $dev
    -U|--ulp-set <dev> <cap> [on|off]

Design B ("procfs")
===================

In this design we add a new /proc/sys/net/ulp/* hierarchy, under which
we will add a directory per device (e.g. /proc/sys/net/ulp/eth0/) to
configure/query ULP DDP.

# Capabilities

    # set capabilities per device
    $ echo 0x1 > /proc/sys/net/ulp/<device>/caps

# Statistics

    # show per device stats (global and per queue)
    # space separated values, 1 stat per line
    $ cat /proc/sys/net/ulp/<device>/stats
    rx_nvmeotcp_drop 0
    rx_nvmeotcp_resync 403
    rx_nvmeotcp_offload_packets 75614185
    rx_nvmeotcp_offload_bytes 107016641528
    rx_nvmeotcp_sk_add 1
    rx_nvmeotcp_sk_add_fail 0
    rx_nvmeotcp_sk_del 0
    rx_nvmeotcp_ddp_setup 3327969
    rx_nvmeotcp_ddp_setup_fail 0
    rx_nvmeotcp_ddp_teardown 3327969

I can also suggest the existing paths:

- /sys/class/net/<device>/statistics/
- /proc/net/stat/

Or any other path you will prefer.

We would appreciate your feedback.
Thanks
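To make the common part of the two designs concrete, the shared caps
field could be sketched in kernel C roughly as below. This is an
illustrative sketch only, not the final code; ULP_DDP_CAP_COUNT is a
hypothetical bound on the number of defined capability bits:

	#include <linux/bitmap.h>

	struct ulp_ddp_cap {
		/* what the hw supports; filled once by the driver via
		 * ndo_set_ulp_caps() and used as a reference afterwards
		 */
		DECLARE_BITMAP(caps_hw, ULP_DDP_CAP_COUNT);
		/* what is currently enabled; modified from userspace and
		 * expected to remain a subset of caps_hw
		 */
		DECLARE_BITMAP(caps_active, ULP_DDP_CAP_COUNT);
	};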
On Thu, 03 Nov 2022 19:29:33 +0200 Aurelien Aptel wrote:
> Jakub,
>
> We came up with 2 designs for controlling the ULP DDP capability bits
> and getting the ULP DDP statistics.
>
> Both designs share some concepts so I'm going to talk about the common
> stuff first:
>
> We drop the netdev->feature bit. To fully disable ULP DDP offload the
> caps will have to be set to 0x0.
>
> In both designs we replace the feature bit with a new field
> netdev->ulp_ddp_caps
>
>     struct ulp_ddp_cap {
>             bitmap caps_hw;     // what the hw supports (filled by the
>                                 // driver, used as reference once initialized)
>             bitmap caps_active; // what is currently set for the system,
>                                 // can be modified from userspace
>     };
>
> We add a new OP net_device_ops->ndo_set_ulp_caps() that drivers have
> to provide to fill netdev->ulp_ddp_caps.caps_hw. We call it around
> the same time as when we call ndo_set_features().

Sounds good. Just to be clear - I was suggesting:

  net_device_ops->ddp_ulp_ops->set_ulp_caps()

so an extra indirection, but if you're worried about the overhead
an ndo is fine, too.

> Interfacing with userspace is where the design differs.
>
> Design A ("netlink"):
> =====================
>
> # Capabilities
>
> We can expose to the users a new ethtool api using netlink.
>
> For this we want to have a dynamic system where userspace doesn't have
> to hardcode all the caps but instead can get a list. We implement
> something similar to what is done for features bits.
>
> We add a table to map caps to string names
>
>     const char *ulp_ddp_cap_names[] = {
>             [ULP_DDP_NVME_TCP_XXX] = "nvme-tcp-xxx",
>             ...
>     };

Right, you should be able to define your own strset (grep for
stats_std_names for an example).

> We add ETHTOOL messages to get and set ULP caps:
>
> - ETHTOOL_MSG_ULP_CAPS_GET: get device ulp capabilities
> - ETHTOOL_MSG_ULP_CAPS_SET: set device ulp capabilities

ULP or DDP? Are you planning to plumb TLS thru the same ops?
Otherwise ULP on its own may be a little too generic of a name.

> The GET reply code can use ethnl_put_bitset32() which does the job of
> sending bits + their names as strings.
>
> The SET code would apply the changes to netdev->ulp_ddp_caps.caps_active.
>
> # Statistics
>
> If the ETHTOOL_MSG_ULP_CAPS_GET message requests statistics (by

Would it make sense to drop the _CAPS from the name, then?
Or replace by something more general, like INFO? We can call the bitset
inside the message CAPS but the message also carries stats and perhaps
other things in the future.

> setting the header flag ETHTOOL_FLAG_STATS) the kernel will append all
> the ULP statistics of the device at the end of the reply.
>
> Those statistics will be dynamic in the sense that they will use a new
> stringset for their names that userspace will have to fetch.
>
> # Ethtool changes
>
> We will add the -u|-U|--ulp-get|--ulp-set options to ethtool.
>
>     # query list of caps supported and their value on device $dev
>     ethtool -u|--ulp-get <dev>
>
>     # query ULP stats of $dev
>     ethtool -u|--ulp-get --include-statistics <dev>

-I|--include-statistics ?

>     # set $cap to $val on device $dev
>     -U|--ulp-set <dev> <cap> [on|off]

Sounds good!

> Design B ("procfs")
> ===================
>
> In this design we add a new /proc/sys/net/ulp/* hierarchy, under which
> we will add a directory per device (e.g. /proc/sys/net/ulp/eth0/) to
> configure/query ULP DDP.
>
> # Capabilities
>
>     # set capabilities per device
>     $ echo 0x1 > /proc/sys/net/ulp/<device>/caps
>
> # Statistics
>
>     # show per device stats (global and per queue)
>     # space separated values, 1 stat per line
>     $ cat /proc/sys/net/ulp/<device>/stats
>     rx_nvmeotcp_drop 0
>     rx_nvmeotcp_resync 403
>     rx_nvmeotcp_offload_packets 75614185
>     rx_nvmeotcp_offload_bytes 107016641528
>     rx_nvmeotcp_sk_add 1
>     rx_nvmeotcp_sk_add_fail 0
>     rx_nvmeotcp_sk_del 0
>     rx_nvmeotcp_ddp_setup 3327969
>     rx_nvmeotcp_ddp_setup_fail 0
>     rx_nvmeotcp_ddp_teardown 3327969
>
> I can also suggest the existing paths:
>
> - /sys/class/net/<device>/statistics/
> - /proc/net/stat/
>
> Or any other path you will prefer.

Thanks for describing the options! I definitely prefer ethtool/netlink.
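Following the strset pointer above, the cap-name table could be declared
like this sketch, modeled on the existing stats_std_names strset in
net/ethtool; the *_BIT enum names here are hypothetical, not from the
series:

	#include <linux/ethtool.h>

	/* hypothetical per-bit names for the ULP DDP caps bitset */
	static const char ulp_ddp_cap_names[][ETH_GSTRING_LEN] = {
		[ULP_DDP_C_NVME_TCP_BIT]          = "nvme-tcp-offload",
		[ULP_DDP_C_NVME_TCP_DDGST_RX_BIT] = "nvme-tcp-ddgst-rx-offload",
	};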
Jakub Kicinski <kuba@kernel.org> writes:
> Sounds good. Just to be clear - I was suggesting:
>
>   net_device_ops->ddp_ulp_ops->set_ulp_caps()
>
> so an extra indirection, but if you're worried about the overhead
> an ndo is fine, too.

Sure, thanks.

>> We add ETHTOOL messages to get and set ULP caps:
>>
>> - ETHTOOL_MSG_ULP_CAPS_GET: get device ulp capabilities
>> - ETHTOOL_MSG_ULP_CAPS_SET: set device ulp capabilities
>
> ULP or DDP? Are you planning to plumb TLS thru the same ops?
> Otherwise ULP on its own may be a little too generic of a name.

TLS is not in our scope. It was originally used as a reference.
We will use the term "ULP_DDP".

>> The GET reply code can use ethnl_put_bitset32() which does the job of
>> sending bits + their names as strings.
>>
>> The SET code would apply the changes to netdev->ulp_ddp_caps.caps_active.
>>
>> # Statistics
>>
>> If the ETHTOOL_MSG_ULP_CAPS_GET message requests statistics (by
>
> Would it make sense to drop the _CAPS from the name, then?
> Or replace by something more general, like INFO?

Ok, we will drop the _CAPS.

>> # query ULP stats of $dev
>> ethtool -u|--ulp-get --include-statistics <dev>
>
> -I|--include-statistics ?

Could you please elaborate on this comment?

>> # set $cap to $val on device $dev
>> -U|--ulp-set <dev> <cap> [on|off]
>
> Sounds good!

Since -u is taken we are going with -J/-j and --ulp-ddp to keep it
consistent with the netlink flags.

> Thanks for describing the options! I definitely prefer ethtool/netlink.

Great, we will add it in v8.
Thanks
On Fri, 04 Nov 2022 15:44:57 +0200 Aurelien Aptel wrote:
> >> # query ULP stats of $dev
> >> ethtool -u|--ulp-get --include-statistics <dev>
> >
> > -I|--include-statistics ?
>
> Could you please elaborate on this comment?

Since you were noting the short version for --ulp-get I thought I'd
mention that --include-statistics also has one and it's -I :)
diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
index 7c2d77d75a88..bf7391aa04c7 100644
--- a/include/linux/netdev_features.h
+++ b/include/linux/netdev_features.h
@@ -14,7 +14,7 @@ typedef u64 netdev_features_t;
 enum {
 	NETIF_F_SG_BIT,			/* Scatter/gather IO. */
 	NETIF_F_IP_CSUM_BIT,		/* Can checksum TCP/UDP over IPv4. */
-	__UNUSED_NETIF_F_1,
+	NETIF_F_HW_ULP_DDP_BIT,		/* ULP direct data placement offload */
 	NETIF_F_HW_CSUM_BIT,		/* Can checksum all the packets. */
 	NETIF_F_IPV6_CSUM_BIT,		/* Can checksum TCP/UDP over IPV6 */
 	NETIF_F_HIGHDMA_BIT,		/* Can DMA to high memory. */
@@ -168,6 +168,7 @@ enum {
 #define NETIF_F_HW_HSR_TAG_RM	__NETIF_F(HW_HSR_TAG_RM)
 #define NETIF_F_HW_HSR_FWD	__NETIF_F(HW_HSR_FWD)
 #define NETIF_F_HW_HSR_DUP	__NETIF_F(HW_HSR_DUP)
+#define NETIF_F_HW_ULP_DDP	__NETIF_F(HW_ULP_DDP)
 
 /* Finds the next feature with the highest number of the range of start-1 till 0.
  */
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index eddf8ee270e7..84554f26ad6b 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1043,6 +1043,7 @@ struct dev_ifalias {
 
 struct devlink;
 struct tlsdev_ops;
+struct ulp_ddp_dev_ops;
 
 struct netdev_net_notifier {
 	struct list_head list;
@@ -2096,6 +2097,10 @@ struct net_device {
 	const struct tlsdev_ops *tlsdev_ops;
 #endif
 
+#if IS_ENABLED(CONFIG_ULP_DDP)
+	const struct ulp_ddp_dev_ops *ulp_ddp_ops;
+#endif
+
 	const struct header_ops *header_ops;
 
 	unsigned char operstate;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 59c9fd55699d..2b97bf90f120 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -811,6 +811,8 @@ typedef unsigned char *sk_buff_data_t;
  *	delivery_time in mono clock base (i.e. EDT). Otherwise, the
  *	skb->tstamp has the (rcv) timestamp at ingress and
  *	delivery_time at egress.
+ * @ulp_ddp: DDP offloaded
+ * @ulp_crc: CRC offloaded
  * @napi_id: id of the NAPI struct this skb came from
  * @sender_cpu: (aka @napi_id) source CPU in XPS
  * @alloc_cpu: CPU which did the skb allocation.
@@ -984,6 +986,10 @@ struct sk_buff {
 	__u8			slow_gro:1;
 	__u8			csum_not_inet:1;
 	__u8			scm_io_uring:1;
+#ifdef CONFIG_ULP_DDP
+	__u8			ulp_ddp:1;
+	__u8			ulp_crc:1;
+#endif
 
 #ifdef CONFIG_NET_SCHED
 	__u16			tc_index;	/* traffic control index */
@@ -5050,5 +5056,23 @@ static inline void skb_mark_for_recycle(struct sk_buff *skb)
 }
 #endif
 
+static inline bool skb_is_ulp_ddp(struct sk_buff *skb)
+{
+#ifdef CONFIG_ULP_DDP
+	return skb->ulp_ddp;
+#else
+	return 0;
+#endif
+}
+
+static inline bool skb_is_ulp_crc(struct sk_buff *skb)
+{
+#ifdef CONFIG_ULP_DDP
+	return skb->ulp_crc;
+#else
+	return 0;
+#endif
+}
+
 #endif	/* __KERNEL__ */
 #endif	/* _LINUX_SKBUFF_H */
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index c2b15f7e5516..2ba73167b3bb 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -68,6 +68,8 @@ struct inet_connection_sock_af_ops {
  * @icsk_ulp_ops	   Pluggable ULP control hook
  * @icsk_ulp_data	   ULP private data
  * @icsk_clean_acked	   Clean acked data hook
+ * @icsk_ulp_ddp_ops	   Pluggable ULP direct data placement control hook
+ * @icsk_ulp_ddp_data	   ULP direct data placement private data
  * @icsk_ca_state:	   Congestion control state
  * @icsk_retransmits:	   Number of unrecovered [RTO] timeouts
  * @icsk_pending:	   Scheduled timer event
@@ -98,6 +100,8 @@ struct inet_connection_sock {
 	const struct tcp_ulp_ops  *icsk_ulp_ops;
 	void __rcu		  *icsk_ulp_data;
 	void (*icsk_clean_acked)(struct sock *sk, u32 acked_seq);
+	const struct ulp_ddp_ulp_ops  *icsk_ulp_ddp_ops;
+	void __rcu		  *icsk_ulp_ddp_data;
 	unsigned int		  (*icsk_sync_mss)(struct sock *sk, u32 pmtu);
 	__u8			  icsk_ca_state:5,
 				  icsk_ca_initialized:1,
diff --git a/include/net/ulp_ddp.h b/include/net/ulp_ddp.h
new file mode 100644
index 000000000000..b190db140367
--- /dev/null
+++ b/include/net/ulp_ddp.h
@@ -0,0 +1,182 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * ulp_ddp.h
+ *	Author:	Boris Pismenny <borisp@nvidia.com>
+ *	Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ */
+#ifndef _ULP_DDP_H
+#define _ULP_DDP_H
+
+#include <linux/netdevice.h>
+#include <net/inet_connection_sock.h>
+#include <net/sock.h>
+
+enum ulp_ddp_type {
+	ULP_DDP_NVME = 1,
+};
+
+enum ulp_ddp_offload_capabilities {
+	ULP_DDP_C_NVME_TCP = 1,
+	ULP_DDP_C_NVME_TCP_DDGST_RX = 2,
+};
+
+/**
+ * struct ulp_ddp_limits - Generic ulp ddp limits: tcp ddp
+ * protocol limits.
+ * Protocol implementations must use this as the first member.
+ * Add new instances of ulp_ddp_limits below (nvme-tcp, etc.).
+ *
+ * @type:		type of this limits struct
+ * @offload_capabilities: bitmask of supported offload types
+ * @max_ddp_sgl_len:	maximum sgl size supported (zero means no limit)
+ * @io_threshold:	minimum payload size required to offload
+ * @buf:		protocol-specific limits struct (if any)
+ */
+struct ulp_ddp_limits {
+	enum ulp_ddp_type	type;
+	u64			offload_capabilities;
+	int			max_ddp_sgl_len;
+	int			io_threshold;
+	unsigned char		buf[];
+};
+
+/**
+ * struct nvme_tcp_ddp_limits - nvme tcp driver limitations
+ *
+ * @lmt:		generic ULP limits struct
+ * @full_ccid_range:	true if the driver supports the full CID range
+ */
+struct nvme_tcp_ddp_limits {
+	struct ulp_ddp_limits	lmt;
+
+	bool			full_ccid_range;
+};
+
+/**
+ * struct ulp_ddp_config - Generic ulp ddp configuration: tcp ddp IO queue
+ * config implementations must use this as the first member.
+ * Add new instances of ulp_ddp_config below (nvme-tcp, etc.).
+ *
+ * @type:	type of this config struct
+ * @buf:	protocol-specific config struct
+ */
+struct ulp_ddp_config {
+	enum ulp_ddp_type	type;
+	unsigned char		buf[];
+};
+
+/**
+ * struct nvme_tcp_ddp_config - nvme tcp ddp configuration for an IO queue
+ *
+ * @cfg:	generic ULP config struct
+ * @pfv:	pdu version (e.g., NVME_TCP_PFV_1_0)
+ * @cpda:	controller pdu data alignment (dwords, 0's based)
+ * @dgst:	digest types enabled (header or data, see
+ *		enum nvme_tcp_digest_option).
+ *		The netdev will offload crc if it is supported.
+ * @queue_size: number of nvme-tcp IO queue elements
+ * @queue_id:	queue identifier
+ * @io_cpu:	cpu core running the IO thread for this queue
+ */
+struct nvme_tcp_ddp_config {
+	struct ulp_ddp_config	cfg;
+
+	u16			pfv;
+	u8			cpda;
+	u8			dgst;
+	int			queue_size;
+	int			queue_id;
+	int			io_cpu;
+};
+
+/**
+ * struct ulp_ddp_io - ulp ddp configuration for an IO request.
+ *
+ * @command_id:	identifier on the wire associated with these buffers
+ * @nents:	number of entries in the sg_table
+ * @sg_table:	describing the buffers for this IO request
+ * @first_sgl:	first SGL in sg_table
+ */
+struct ulp_ddp_io {
+	u32			command_id;
+	int			nents;
+	struct sg_table		sg_table;
+	struct scatterlist	first_sgl[SG_CHUNK_SIZE];
+};
+
+/* struct ulp_ddp_dev_ops - operations used by an upper layer protocol
+ * to configure ddp offload
+ *
+ * @ulp_ddp_limits:	query ulp driver limitations and quirks.
+ * @ulp_ddp_sk_add:	add offload for the queue represented by socket+config
+ *			pair. this function is used to configure either copy,
+ *			crc or both offloads.
+ * @ulp_ddp_sk_del:	remove offload from the socket, and release any device
+ *			related resources.
+ * @ulp_ddp_setup:	request copy offload for buffers associated with a
+ *			command_id in ulp_ddp_io.
+ * @ulp_ddp_teardown:	release offload resources association between buffers
+ *			and command_id in ulp_ddp_io.
+ * @ulp_ddp_resync:	respond to the driver's resync_request. Called only if
+ *			resync is successful.
+ */
+struct ulp_ddp_dev_ops {
+	int (*ulp_ddp_limits)(struct net_device *netdev,
+			      struct ulp_ddp_limits *limits);
+	int (*ulp_ddp_sk_add)(struct net_device *netdev,
+			      struct sock *sk,
+			      struct ulp_ddp_config *config);
+	void (*ulp_ddp_sk_del)(struct net_device *netdev,
+			       struct sock *sk);
+	int (*ulp_ddp_setup)(struct net_device *netdev,
+			     struct sock *sk,
+			     struct ulp_ddp_io *io);
+	int (*ulp_ddp_teardown)(struct net_device *netdev,
+				struct sock *sk,
+				struct ulp_ddp_io *io,
+				void *ddp_ctx);
+	void (*ulp_ddp_resync)(struct net_device *netdev,
+			       struct sock *sk, u32 seq);
+};
+
+#define ULP_DDP_RESYNC_PENDING BIT(0)
+
+/**
+ * struct ulp_ddp_ulp_ops - Interface to register upper layer
+ *			    Direct Data Placement (DDP) TCP offload.
+ * @resync_request:	NIC requests ulp to indicate if @seq is the start
+ *			of a message.
+ * @ddp_teardown_done:	NIC driver informs the ulp that teardown is done,
+ *			used for async completions.
+ */
+struct ulp_ddp_ulp_ops {
+	bool (*resync_request)(struct sock *sk, u32 seq, u32 flags);
+	void (*ddp_teardown_done)(void *ddp_ctx);
+};
+
+/**
+ * struct ulp_ddp_ctx - Generic ulp ddp context: device driver per queue
+ *			contexts must use this as the first member.
+ *
+ * @type:	type of this context struct
+ * @buf:	protocol-specific context struct
+ */
+struct ulp_ddp_ctx {
+	enum ulp_ddp_type	type;
+	unsigned char		buf[];
+};
+
+static inline struct ulp_ddp_ctx *ulp_ddp_get_ctx(const struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+
+	return (__force struct ulp_ddp_ctx *)icsk->icsk_ulp_ddp_data;
+}
+
+static inline void ulp_ddp_set_ctx(struct sock *sk, void *ctx)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+
+	rcu_assign_pointer(icsk->icsk_ulp_ddp_data, ctx);
+}
+
+#endif /* _ULP_DDP_H */
diff --git a/net/Kconfig b/net/Kconfig
index 48c33c222199..cd59be2d6c6e 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -471,4 +471,14 @@ config NETDEV_ADDR_LIST_TEST
 	  default KUNIT_ALL_TESTS
 	  depends on KUNIT
 
+config ULP_DDP
+	bool "ULP direct data placement offload"
+	default n
+	help
+	  Direct Data Placement (DDP) offload enables ULPs, such as
+	  NVMe-TCP, to request the NIC to place ULP payload data
+	  of a command response directly into kernel pages while
+	  calculating/verifying the data digest of ULP PDUs as they
+	  go through the NIC, thus avoiding the costly per-byte overhead.
+
 endif   # if NET
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 9b3b19816d2d..ff80667adb14 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -72,6 +72,7 @@
 #include <net/mptcp.h>
 #include <net/mctp.h>
 #include <net/page_pool.h>
+#include <net/ulp_ddp.h>
 
 #include <linux/uaccess.h>
 #include <trace/events/skb.h>
@@ -6416,7 +6417,7 @@ void skb_condense(struct sk_buff *skb)
 {
 	if (skb->data_len) {
 		if (skb->data_len > skb->end - skb->tail ||
-		    skb_cloned(skb))
+		    skb_cloned(skb) || skb_is_ulp_ddp(skb))
 			return;
 
 		/* Nice, we can free page frag(s) right now */
diff --git a/net/ethtool/common.c b/net/ethtool/common.c
index ee3e02da0013..5636ef148b4d 100644
--- a/net/ethtool/common.c
+++ b/net/ethtool/common.c
@@ -74,6 +74,7 @@ const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] = {
 	[NETIF_F_HW_HSR_TAG_RM_BIT] =	 "hsr-tag-rm-offload",
 	[NETIF_F_HW_HSR_FWD_BIT] =	 "hsr-fwd-offload",
 	[NETIF_F_HW_HSR_DUP_BIT] =	 "hsr-dup-offload",
+	[NETIF_F_HW_ULP_DDP_BIT] =	 "ulp-ddp-offload",
 };
 
 const char
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 0640453fce54..df37db420110 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5233,6 +5233,10 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root,
 		memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
 #ifdef CONFIG_TLS_DEVICE
 		nskb->decrypted = skb->decrypted;
+#endif
+#ifdef CONFIG_ULP_DDP
+		nskb->ulp_ddp = skb->ulp_ddp;
+		nskb->ulp_crc = skb->ulp_crc;
 #endif
 		TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
 		if (list)
@@ -5266,6 +5270,10 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root,
 #ifdef CONFIG_TLS_DEVICE
 			if (skb->decrypted != nskb->decrypted)
 				goto end;
+#endif
+#ifdef CONFIG_ULP_DDP
+			if (skb_is_ulp_crc(skb) != skb_is_ulp_crc(nskb))
+				goto end;
 #endif
 		}
 	}
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 87d440f47a70..e3d884b3bde7 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1821,6 +1821,9 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
 		    TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
 #ifdef CONFIG_TLS_DEVICE
 	    tail->decrypted != skb->decrypted ||
+#endif
+#ifdef CONFIG_ULP_DDP
+	    skb_is_ulp_crc(tail) != skb_is_ulp_crc(skb) ||
 #endif
 	    thtail->doff != th->doff ||
 	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index 45dda7889387..2e62f18e85c0 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -268,6 +268,9 @@ struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb)
 #ifdef CONFIG_TLS_DEVICE
 	flush |= p->decrypted ^ skb->decrypted;
 #endif
+#ifdef CONFIG_ULP_DDP
+	flush |= skb_is_ulp_crc(p) ^ skb_is_ulp_crc(skb);
+#endif
 
 	if (flush || skb_gro_receive(p, skb)) {
 		mss = 1;
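To summarize how the pieces above fit together, here is a minimal sketch
of the call flow a ULP such as nvme-tcp would follow against
ulp_ddp_dev_ops when bringing up an offloaded queue. The function name
is hypothetical, error paths and the CONFIG_ULP_DDP guard are trimmed,
and this is not code from the series:

	#include <net/ulp_ddp.h>

	static int ulp_ddp_offload_queue(struct net_device *netdev,
					 struct sock *sk,
					 struct nvme_tcp_ddp_config *config)
	{
		const struct ulp_ddp_dev_ops *ops = netdev->ulp_ddp_ops;
		struct nvme_tcp_ddp_limits limits = {
			.lmt.type = ULP_DDP_NVME,
		};
		int ret;

		/* 1) query the device limits/caps for this protocol */
		ret = ops->ulp_ddp_limits(netdev, &limits.lmt);
		if (ret)
			return ret;

		if (!(limits.lmt.offload_capabilities & ULP_DDP_C_NVME_TCP))
			return -EOPNOTSUPP;

		/* 2) install the offload on this socket; the driver
		 * configures copy and, if supported, CRC offload from
		 * config->cfg
		 */
		return ops->ulp_ddp_sk_add(netdev, sk, &config->cfg);
	}

Per-IO placement then goes through ulp_ddp_setup()/ulp_ddp_teardown()
with a ulp_ddp_io describing the destination buffers, and
ulp_ddp_sk_del() removes the offload when the queue is torn down.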