Message ID | 20230712161513.134860-2-aaptel@nvidia.com (mailing list archive) |
---|---|
State | RFC |
Delegated to: | Netdev Maintainers |
Headers | show |
Series | nvme-tcp receive offloads | expand |
Context | Check | Description |
---|---|---|
netdev/tree_selection | success | Guessing tree name failed - patch did not apply, async |
On 7/12/23 19:14, Aurelien Aptel wrote: > From: Boris Pismenny <borisp@nvidia.com> > > This commit introduces direct data placement (DDP) offload for TCP. > > The motivation is saving compute resources/cycles that are spent > to copy data from SKBs to the block layer buffers and CRC > calculation/verification for received PDUs (Protocol Data Units). > > The DDP capability is accompanied by new net_device operations that > configure hardware contexts. > > There is a context per socket, and a context per DDP operation. > Additionally, a resynchronization routine is used to assist > hardware handle TCP OOO, and continue the offload. Furthermore, > we let the offloading driver advertise what is the max hw > sectors/segments. > > The interface includes the following net-device ddp operations: > > 1. sk_add - add offload for the queue represented by socket+config pair > 2. sk_del - remove the offload for the socket/queue > 3. ddp_setup - request copy offload for buffers associated with an IO > 4. ddp_teardown - release offload resources for that IO > 5. limits - query NIC driver for quirks and limitations (e.g. > max number of scatter gather entries per IO) > 6. set_caps - request ULP DDP capabilities enablement > 7. get_stats - query NIC driver for ULP DDP stats > > Using this interface, the NIC hardware will scatter TCP payload > directly to the BIO pages according to the command_id. > > To maintain the correctness of the network stack, the driver is > expected to construct SKBs that point to the BIO pages. > > The SKB passed to the network stack from the driver represents > data as it is on the wire, while it is pointing directly to data > in destination buffers. > > As a result, data from page frags should not be copied out to > the linear part. To avoid needless copies, such as when using > skb_condense, we mark the skb->ulp_ddp bit. > In addition, the skb->ulp_crc will be used by the upper layers to > determine if CRC re-calculation is required. The two separated skb > indications are needed to avoid false positives GRO flushing events. > > Follow-up patches will use this interface for DDP in NVMe-TCP. > > Capability bits stored in net_device allow drivers to report which > ULP DDP capabilities a device supports. Control over these > capabilities will be exposed to userspace in later patches. > > Signed-off-by: Boris Pismenny <borisp@nvidia.com> > Signed-off-by: Ben Ben-Ishay <benishay@nvidia.com> > Signed-off-by: Or Gerlitz <ogerlitz@nvidia.com> > Signed-off-by: Yoray Zack <yorayz@nvidia.com> > Signed-off-by: Shai Malin <smalin@nvidia.com> > Signed-off-by: Aurelien Aptel <aaptel@nvidia.com> > --- > include/linux/netdevice.h | 15 +++ > include/linux/skbuff.h | 25 +++- > include/net/inet_connection_sock.h | 6 + > include/net/ulp_ddp.h | 191 +++++++++++++++++++++++++++++ > include/net/ulp_ddp_caps.h | 35 ++++++ > net/Kconfig | 20 +++ > net/core/skbuff.c | 3 +- > net/ipv4/tcp_input.c | 13 +- > net/ipv4/tcp_ipv4.c | 3 + > net/ipv4/tcp_offload.c | 3 + > 10 files changed, 311 insertions(+), 3 deletions(-) > create mode 100644 include/net/ulp_ddp.h > create mode 100644 include/net/ulp_ddp_caps.h > > diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h > index b828c7a75be2..26e25b2df6fa 100644 > --- a/include/linux/netdevice.h > +++ b/include/linux/netdevice.h > @@ -54,6 +54,10 @@ > #include <net/net_debug.h> > #include <net/dropreason-core.h> > > +#ifdef CONFIG_ULP_DDP > +#include <net/ulp_ddp_caps.h> > +#endif > + > struct netpoll_info; > struct device; > struct ethtool_ops; > @@ -1418,6 +1422,8 @@ struct netdev_net_notifier { > * Get hardware timestamp based on normal/adjustable time or free running > * cycle counter. This function is required if physical clock supports a > * free running cycle counter. > + * struct ulp_ddp_dev_ops *ulp_ddp_ops; > + * ULP DDP operations (see include/net/ulp_ddp.h) > */ > struct net_device_ops { > int (*ndo_init)(struct net_device *dev); > @@ -1652,6 +1658,9 @@ struct net_device_ops { > ktime_t (*ndo_get_tstamp)(struct net_device *dev, > const struct skb_shared_hwtstamps *hwtstamps, > bool cycles); > +#if IS_ENABLED(CONFIG_ULP_DDP) > + const struct ulp_ddp_dev_ops *ulp_ddp_ops; > +#endif > }; > > struct xdp_metadata_ops { > @@ -1825,6 +1834,9 @@ enum netdev_ml_priv_type { > * @mpls_features: Mask of features inheritable by MPLS > * @gso_partial_features: value(s) from NETIF_F_GSO\* > * > + * @ulp_ddp_caps: Bitflags keeping track of supported and enabled > + * ULP DDP capabilities. > + * > * @ifindex: interface index > * @group: The group the device belongs to > * > @@ -2121,6 +2133,9 @@ struct net_device { > netdev_features_t mpls_features; > netdev_features_t gso_partial_features; > > +#ifdef CONFIG_ULP_DDP > + struct ulp_ddp_netdev_caps ulp_ddp_caps; > +#endif > unsigned int min_mtu; > unsigned int max_mtu; > unsigned short type; > diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h > index 91ed66952580..cfa65945874d 100644 > --- a/include/linux/skbuff.h > +++ b/include/linux/skbuff.h > @@ -813,6 +813,8 @@ typedef unsigned char *sk_buff_data_t; > * delivery_time in mono clock base (i.e. EDT). Otherwise, the > * skb->tstamp has the (rcv) timestamp at ingress and > * delivery_time at egress. > + * @ulp_ddp: DDP offloaded > + * @ulp_crc: CRC offloaded > * @napi_id: id of the NAPI struct this skb came from > * @sender_cpu: (aka @napi_id) source CPU in XPS > * @alloc_cpu: CPU which did the skb allocation. > @@ -992,7 +994,10 @@ struct sk_buff { > #if IS_ENABLED(CONFIG_IP_SCTP) > __u8 csum_not_inet:1; > #endif > - > +#ifdef CONFIG_ULP_DDP > + __u8 ulp_ddp:1; > + __u8 ulp_crc:1; > +#endif > #ifdef CONFIG_NET_SCHED > __u16 tc_index; /* traffic control index */ > #endif > @@ -5040,5 +5045,23 @@ static inline void skb_mark_for_recycle(struct sk_buff *skb) > ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter, > ssize_t maxsize, gfp_t gfp); > > +static inline bool skb_is_ulp_ddp(struct sk_buff *skb) > +{ > +#ifdef CONFIG_ULP_DDP > + return skb->ulp_ddp; > +#else > + return 0; > +#endif > +} > + > +static inline bool skb_is_ulp_crc(struct sk_buff *skb) > +{ > +#ifdef CONFIG_ULP_DDP > + return skb->ulp_crc; > +#else > + return 0; > +#endif > +} > + > #endif /* __KERNEL__ */ > #endif /* _LINUX_SKBUFF_H */ > diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h > index c2b15f7e5516..b11fbbc95541 100644 > --- a/include/net/inet_connection_sock.h > +++ b/include/net/inet_connection_sock.h > @@ -68,6 +68,8 @@ struct inet_connection_sock_af_ops { > * @icsk_ulp_ops Pluggable ULP control hook > * @icsk_ulp_data ULP private data > * @icsk_clean_acked Clean acked data hook > + * @icsk_ulp_ddp_ops Pluggable ULP direct data placement control hook > + * @icsk_ulp_ddp_data ULP direct data placement private data > * @icsk_ca_state: Congestion control state > * @icsk_retransmits: Number of unrecovered [RTO] timeouts > * @icsk_pending: Scheduled timer event > @@ -98,6 +100,10 @@ struct inet_connection_sock { > const struct tcp_ulp_ops *icsk_ulp_ops; > void __rcu *icsk_ulp_data; > void (*icsk_clean_acked)(struct sock *sk, u32 acked_seq); > +#ifdef CONFIG_ULP_DDP > + const struct ulp_ddp_ulp_ops *icsk_ulp_ddp_ops; > + void __rcu *icsk_ulp_ddp_data; > +#endif > unsigned int (*icsk_sync_mss)(struct sock *sk, u32 pmtu); > __u8 icsk_ca_state:5, > icsk_ca_initialized:1, > diff --git a/include/net/ulp_ddp.h b/include/net/ulp_ddp.h > new file mode 100644 > index 000000000000..b85fda4450b4 > --- /dev/null > +++ b/include/net/ulp_ddp.h > @@ -0,0 +1,191 @@ > +/* SPDX-License-Identifier: GPL-2.0 > + * > + * ulp_ddp.h > + * Author: Boris Pismenny <borisp@nvidia.com> > + * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. > + */ > +#ifndef _ULP_DDP_H > +#define _ULP_DDP_H > + > +#include <linux/netdevice.h> > +#include <net/inet_connection_sock.h> > +#include <net/sock.h> > + > +#include "ulp_ddp_caps.h" > + > +enum ulp_ddp_type { > + ULP_DDP_NVME = 1, > +}; > + > +/** > + * struct nvme_tcp_ddp_limits - nvme tcp driver limitations > + * > + * @full_ccid_range: true if the driver supports the full CID range > + */ > +struct nvme_tcp_ddp_limits { > + bool full_ccid_range; > +}; > + > +/** > + * struct ulp_ddp_limits - Generic ulp ddp limits: tcp ddp > + * protocol limits. > + * Add new instances of ulp_ddp_limits in the union below (nvme-tcp, etc.). > + * > + * @type: type of this limits struct > + * @max_ddp_sgl_len: maximum sgl size supported (zero means no limit) > + * @io_threshold: minimum payload size required to offload > + * @tls: support for ULP over TLS > + * @nvmeotcp: NVMe-TCP specific limits > + */ > +struct ulp_ddp_limits { > + enum ulp_ddp_type type; > + int max_ddp_sgl_len; > + int io_threshold; > + bool tls:1; Is this a catch-all for tls 1.2/1.3 or future ones? > + union { > + struct nvme_tcp_ddp_limits nvmeotcp; > + }; > +}; > + > +/** > + * struct nvme_tcp_ddp_config - nvme tcp ddp configuration for an IO queue > + * > + * @pfv: pdu version (e.g., NVME_TCP_PFV_1_0) > + * @cpda: controller pdu data alignment (dwords, 0's based) > + * @dgst: digest types enabled (header or data, see enum nvme_tcp_digest_option). > + * The netdev will offload crc if it is supported. > + * @queue_size: number of nvme-tcp IO queue elements > + * @queue_id: queue identifier > + * @io_cpu: cpu core running the IO thread for this queue > + */ > +struct nvme_tcp_ddp_config { > + u16 pfv; > + u8 cpda; > + u8 dgst; > + int queue_size; > + int queue_id; > + int io_cpu; > +}; > + > +/** > + * struct ulp_ddp_config - Generic ulp ddp configuration > + * Add new instances of ulp_ddp_config in the union below (nvme-tcp, etc.). > + * > + * @type: type of this config struct > + * @nvmeotcp: NVMe-TCP specific config > + */ > +struct ulp_ddp_config { > + enum ulp_ddp_type type; > + union { > + struct nvme_tcp_ddp_config nvmeotcp; > + }; > +}; > + > +/** > + * struct ulp_ddp_io - ulp ddp configuration for an IO request. > + * > + * @command_id: identifier on the wire associated with these buffers > + * @nents: number of entries in the sg_table > + * @sg_table: describing the buffers for this IO request > + * @first_sgl: first SGL in sg_table > + */ > +struct ulp_ddp_io { > + u32 command_id; > + int nents; > + struct sg_table sg_table; > + struct scatterlist first_sgl[SG_CHUNK_SIZE]; > +}; > + > +struct ethtool_ulp_ddp_stats; > +struct netlink_ext_ack; > + > +/** > + * struct ulp_ddp_dev_ops - operations used by an upper layer protocol > + * to configure ddp offload > + * > + * @limits: query ulp driver limitations and quirks. > + * @sk_add: add offload for the queue represented by socket+config > + * pair. this function is used to configure either copy, crc > + * or both offloads. > + * @sk_del: remove offload from the socket, and release any device > + * related resources. > + * @setup: request copy offload for buffers associated with a > + * command_id in ulp_ddp_io. > + * @teardown: release offload resources association between buffers > + * and command_id in ulp_ddp_io. > + * @resync: respond to the driver's resync_request. Called only if > + * resync is successful. > + * @set_caps: set device ULP DDP capabilities. > + * returns a negative error code or zero. > + * @get_stats: query ULP DDP statistics. > + */ > +struct ulp_ddp_dev_ops { > + int (*limits)(struct net_device *netdev, > + struct ulp_ddp_limits *limits); > + int (*sk_add)(struct net_device *netdev, > + struct sock *sk, > + struct ulp_ddp_config *config); > + void (*sk_del)(struct net_device *netdev, > + struct sock *sk); > + int (*setup)(struct net_device *netdev, > + struct sock *sk, > + struct ulp_ddp_io *io); > + void (*teardown)(struct net_device *netdev, > + struct sock *sk, > + struct ulp_ddp_io *io, > + void *ddp_ctx); > + void (*resync)(struct net_device *netdev, > + struct sock *sk, u32 seq); > + int (*set_caps)(struct net_device *dev, unsigned long *bits, > + struct netlink_ext_ack *extack); > + int (*get_stats)(struct net_device *dev, > + struct ethtool_ulp_ddp_stats *stats); > +}; It would be beneficial to have proper wrappers to these that can also do some housekeeping work around the callbacks.
Sagi Grimberg <sagi@grimberg.me> writes: >> +struct ulp_ddp_limits { >> + enum ulp_ddp_type type; >> + int max_ddp_sgl_len; >> + int io_threshold; >> + bool tls:1; > > Is this a catch-all for tls 1.2/1.3 or future ones? This is catch-all. Once it will be supported together with the offload, we can add the TLS type incrementaly. >> +struct ulp_ddp_dev_ops { >> + int (*limits)(struct net_device *netdev, >> + struct ulp_ddp_limits *limits); >> + int (*sk_add)(struct net_device *netdev, >> + struct sock *sk, >> + struct ulp_ddp_config *config); >> + void (*sk_del)(struct net_device *netdev, >> + struct sock *sk); >> + int (*setup)(struct net_device *netdev, >> + struct sock *sk, >> + struct ulp_ddp_io *io); >> + void (*teardown)(struct net_device *netdev, >> + struct sock *sk, >> + struct ulp_ddp_io *io, >> + void *ddp_ctx); >> + void (*resync)(struct net_device *netdev, >> + struct sock *sk, u32 seq); >> + int (*set_caps)(struct net_device *dev, unsigned long *bits, >> + struct netlink_ext_ack *extack); >> + int (*get_stats)(struct net_device *dev, >> + struct ethtool_ulp_ddp_stats *stats); >> +}; > > It would be beneficial to have proper wrappers to these that > can also do some housekeeping work around the callbacks. We can add wrappers in net/core/ulp_ddp.c that can NULL-check the function pointers and do any common housekeeping if needed. Thanks
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b828c7a75be2..26e25b2df6fa 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -54,6 +54,10 @@ #include <net/net_debug.h> #include <net/dropreason-core.h> +#ifdef CONFIG_ULP_DDP +#include <net/ulp_ddp_caps.h> +#endif + struct netpoll_info; struct device; struct ethtool_ops; @@ -1418,6 +1422,8 @@ struct netdev_net_notifier { * Get hardware timestamp based on normal/adjustable time or free running * cycle counter. This function is required if physical clock supports a * free running cycle counter. + * struct ulp_ddp_dev_ops *ulp_ddp_ops; + * ULP DDP operations (see include/net/ulp_ddp.h) */ struct net_device_ops { int (*ndo_init)(struct net_device *dev); @@ -1652,6 +1658,9 @@ struct net_device_ops { ktime_t (*ndo_get_tstamp)(struct net_device *dev, const struct skb_shared_hwtstamps *hwtstamps, bool cycles); +#if IS_ENABLED(CONFIG_ULP_DDP) + const struct ulp_ddp_dev_ops *ulp_ddp_ops; +#endif }; struct xdp_metadata_ops { @@ -1825,6 +1834,9 @@ enum netdev_ml_priv_type { * @mpls_features: Mask of features inheritable by MPLS * @gso_partial_features: value(s) from NETIF_F_GSO\* * + * @ulp_ddp_caps: Bitflags keeping track of supported and enabled + * ULP DDP capabilities. + * * @ifindex: interface index * @group: The group the device belongs to * @@ -2121,6 +2133,9 @@ struct net_device { netdev_features_t mpls_features; netdev_features_t gso_partial_features; +#ifdef CONFIG_ULP_DDP + struct ulp_ddp_netdev_caps ulp_ddp_caps; +#endif unsigned int min_mtu; unsigned int max_mtu; unsigned short type; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 91ed66952580..cfa65945874d 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -813,6 +813,8 @@ typedef unsigned char *sk_buff_data_t; * delivery_time in mono clock base (i.e. EDT). Otherwise, the * skb->tstamp has the (rcv) timestamp at ingress and * delivery_time at egress. + * @ulp_ddp: DDP offloaded + * @ulp_crc: CRC offloaded * @napi_id: id of the NAPI struct this skb came from * @sender_cpu: (aka @napi_id) source CPU in XPS * @alloc_cpu: CPU which did the skb allocation. @@ -992,7 +994,10 @@ struct sk_buff { #if IS_ENABLED(CONFIG_IP_SCTP) __u8 csum_not_inet:1; #endif - +#ifdef CONFIG_ULP_DDP + __u8 ulp_ddp:1; + __u8 ulp_crc:1; +#endif #ifdef CONFIG_NET_SCHED __u16 tc_index; /* traffic control index */ #endif @@ -5040,5 +5045,23 @@ static inline void skb_mark_for_recycle(struct sk_buff *skb) ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter, ssize_t maxsize, gfp_t gfp); +static inline bool skb_is_ulp_ddp(struct sk_buff *skb) +{ +#ifdef CONFIG_ULP_DDP + return skb->ulp_ddp; +#else + return 0; +#endif +} + +static inline bool skb_is_ulp_crc(struct sk_buff *skb) +{ +#ifdef CONFIG_ULP_DDP + return skb->ulp_crc; +#else + return 0; +#endif +} + #endif /* __KERNEL__ */ #endif /* _LINUX_SKBUFF_H */ diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index c2b15f7e5516..b11fbbc95541 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -68,6 +68,8 @@ struct inet_connection_sock_af_ops { * @icsk_ulp_ops Pluggable ULP control hook * @icsk_ulp_data ULP private data * @icsk_clean_acked Clean acked data hook + * @icsk_ulp_ddp_ops Pluggable ULP direct data placement control hook + * @icsk_ulp_ddp_data ULP direct data placement private data * @icsk_ca_state: Congestion control state * @icsk_retransmits: Number of unrecovered [RTO] timeouts * @icsk_pending: Scheduled timer event @@ -98,6 +100,10 @@ struct inet_connection_sock { const struct tcp_ulp_ops *icsk_ulp_ops; void __rcu *icsk_ulp_data; void (*icsk_clean_acked)(struct sock *sk, u32 acked_seq); +#ifdef CONFIG_ULP_DDP + const struct ulp_ddp_ulp_ops *icsk_ulp_ddp_ops; + void __rcu *icsk_ulp_ddp_data; +#endif unsigned int (*icsk_sync_mss)(struct sock *sk, u32 pmtu); __u8 icsk_ca_state:5, icsk_ca_initialized:1, diff --git a/include/net/ulp_ddp.h b/include/net/ulp_ddp.h new file mode 100644 index 000000000000..b85fda4450b4 --- /dev/null +++ b/include/net/ulp_ddp.h @@ -0,0 +1,191 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * ulp_ddp.h + * Author: Boris Pismenny <borisp@nvidia.com> + * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + */ +#ifndef _ULP_DDP_H +#define _ULP_DDP_H + +#include <linux/netdevice.h> +#include <net/inet_connection_sock.h> +#include <net/sock.h> + +#include "ulp_ddp_caps.h" + +enum ulp_ddp_type { + ULP_DDP_NVME = 1, +}; + +/** + * struct nvme_tcp_ddp_limits - nvme tcp driver limitations + * + * @full_ccid_range: true if the driver supports the full CID range + */ +struct nvme_tcp_ddp_limits { + bool full_ccid_range; +}; + +/** + * struct ulp_ddp_limits - Generic ulp ddp limits: tcp ddp + * protocol limits. + * Add new instances of ulp_ddp_limits in the union below (nvme-tcp, etc.). + * + * @type: type of this limits struct + * @max_ddp_sgl_len: maximum sgl size supported (zero means no limit) + * @io_threshold: minimum payload size required to offload + * @tls: support for ULP over TLS + * @nvmeotcp: NVMe-TCP specific limits + */ +struct ulp_ddp_limits { + enum ulp_ddp_type type; + int max_ddp_sgl_len; + int io_threshold; + bool tls:1; + union { + struct nvme_tcp_ddp_limits nvmeotcp; + }; +}; + +/** + * struct nvme_tcp_ddp_config - nvme tcp ddp configuration for an IO queue + * + * @pfv: pdu version (e.g., NVME_TCP_PFV_1_0) + * @cpda: controller pdu data alignment (dwords, 0's based) + * @dgst: digest types enabled (header or data, see enum nvme_tcp_digest_option). + * The netdev will offload crc if it is supported. + * @queue_size: number of nvme-tcp IO queue elements + * @queue_id: queue identifier + * @io_cpu: cpu core running the IO thread for this queue + */ +struct nvme_tcp_ddp_config { + u16 pfv; + u8 cpda; + u8 dgst; + int queue_size; + int queue_id; + int io_cpu; +}; + +/** + * struct ulp_ddp_config - Generic ulp ddp configuration + * Add new instances of ulp_ddp_config in the union below (nvme-tcp, etc.). + * + * @type: type of this config struct + * @nvmeotcp: NVMe-TCP specific config + */ +struct ulp_ddp_config { + enum ulp_ddp_type type; + union { + struct nvme_tcp_ddp_config nvmeotcp; + }; +}; + +/** + * struct ulp_ddp_io - ulp ddp configuration for an IO request. + * + * @command_id: identifier on the wire associated with these buffers + * @nents: number of entries in the sg_table + * @sg_table: describing the buffers for this IO request + * @first_sgl: first SGL in sg_table + */ +struct ulp_ddp_io { + u32 command_id; + int nents; + struct sg_table sg_table; + struct scatterlist first_sgl[SG_CHUNK_SIZE]; +}; + +struct ethtool_ulp_ddp_stats; +struct netlink_ext_ack; + +/** + * struct ulp_ddp_dev_ops - operations used by an upper layer protocol + * to configure ddp offload + * + * @limits: query ulp driver limitations and quirks. + * @sk_add: add offload for the queue represented by socket+config + * pair. this function is used to configure either copy, crc + * or both offloads. + * @sk_del: remove offload from the socket, and release any device + * related resources. + * @setup: request copy offload for buffers associated with a + * command_id in ulp_ddp_io. + * @teardown: release offload resources association between buffers + * and command_id in ulp_ddp_io. + * @resync: respond to the driver's resync_request. Called only if + * resync is successful. + * @set_caps: set device ULP DDP capabilities. + * returns a negative error code or zero. + * @get_stats: query ULP DDP statistics. + */ +struct ulp_ddp_dev_ops { + int (*limits)(struct net_device *netdev, + struct ulp_ddp_limits *limits); + int (*sk_add)(struct net_device *netdev, + struct sock *sk, + struct ulp_ddp_config *config); + void (*sk_del)(struct net_device *netdev, + struct sock *sk); + int (*setup)(struct net_device *netdev, + struct sock *sk, + struct ulp_ddp_io *io); + void (*teardown)(struct net_device *netdev, + struct sock *sk, + struct ulp_ddp_io *io, + void *ddp_ctx); + void (*resync)(struct net_device *netdev, + struct sock *sk, u32 seq); + int (*set_caps)(struct net_device *dev, unsigned long *bits, + struct netlink_ext_ack *extack); + int (*get_stats)(struct net_device *dev, + struct ethtool_ulp_ddp_stats *stats); +}; + +#define ULP_DDP_RESYNC_PENDING BIT(0) + +/** + * struct ulp_ddp_ulp_ops - Interface to register upper layer + * Direct Data Placement (DDP) TCP offload. + * @resync_request: NIC requests ulp to indicate if @seq is the start + * of a message. + * @ddp_teardown_done: NIC driver informs the ulp that teardown is done, + * used for async completions. + */ +struct ulp_ddp_ulp_ops { + bool (*resync_request)(struct sock *sk, u32 seq, u32 flags); + void (*ddp_teardown_done)(void *ddp_ctx); +}; + +/** + * struct ulp_ddp_ctx - Generic ulp ddp context + * + * @type: type of this context struct + * @buf: protocol-specific context struct + */ +struct ulp_ddp_ctx { + enum ulp_ddp_type type; + unsigned char buf[]; +}; + +static inline struct ulp_ddp_ctx *ulp_ddp_get_ctx(const struct sock *sk) +{ +#ifdef CONFIG_ULP_DDP + struct inet_connection_sock *icsk = inet_csk(sk); + + return (__force struct ulp_ddp_ctx *)icsk->icsk_ulp_ddp_data; +#else + return NULL; +#endif +} + +static inline void ulp_ddp_set_ctx(struct sock *sk, void *ctx) +{ +#ifdef CONFIG_ULP_DDP + struct inet_connection_sock *icsk = inet_csk(sk); + + rcu_assign_pointer(icsk->icsk_ulp_ddp_data, ctx); +#endif +} + +#endif /* _ULP_DDP_H */ diff --git a/include/net/ulp_ddp_caps.h b/include/net/ulp_ddp_caps.h new file mode 100644 index 000000000000..8ee964a4df78 --- /dev/null +++ b/include/net/ulp_ddp_caps.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * ulp_ddp.h + * Author: Aurelien Aptel <aaptel@nvidia.com> + * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + */ +#ifndef _ULP_DDP_CAPS_H +#define _ULP_DDP_CAPS_H + +#include <linux/types.h> + +enum { + ULP_DDP_C_NVME_TCP_BIT, + ULP_DDP_C_NVME_TCP_DDGST_RX_BIT, + + /* add capabilities above */ + ULP_DDP_C_COUNT, +}; + +struct ulp_ddp_netdev_caps { + DECLARE_BITMAP(active, ULP_DDP_C_COUNT); + DECLARE_BITMAP(hw, ULP_DDP_C_COUNT); +}; + +static inline bool ulp_ddp_cap_turned_on(unsigned long *old, unsigned long *new, int bit_nr) +{ + return !test_bit(bit_nr, old) && test_bit(bit_nr, new); +} + +static inline bool ulp_ddp_cap_turned_off(unsigned long *old, unsigned long *new, int bit_nr) +{ + return test_bit(bit_nr, old) && !test_bit(bit_nr, new); +} + +#endif diff --git a/net/Kconfig b/net/Kconfig index 2fb25b534df5..a1d06d8b7a00 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -503,4 +503,24 @@ config NETDEV_ADDR_LIST_TEST default KUNIT_ALL_TESTS depends on KUNIT +config ULP_DDP + bool "ULP direct data placement offload" + help + This feature provides a generic infrastructure for Direct + Data Placement (DDP) offload for Upper Layer Protocols (ULP, + such as NVMe-TCP). + + If the ULP and NIC driver supports it, the ULP code can + request the NIC to place ULP response data directly + into application memory, avoiding a costly copy. + + This infrastructure also allows for offloading the ULP data + integrity checks (e.g. data digest) that would otherwise + require another costly pass on the data we managed to avoid + copying. + + For more information, see + <file:Documentation/networking/ulp-ddp-offload.rst>. + + endif # if NET diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 6c5915efbc17..eac2d0dc5122 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -75,6 +75,7 @@ #include <net/mctp.h> #include <net/page_pool.h> #include <net/dropreason.h> +#include <net/ulp_ddp.h> #include <linux/uaccess.h> #include <trace/events/skb.h> @@ -6505,7 +6506,7 @@ void skb_condense(struct sk_buff *skb) { if (skb->data_len) { if (skb->data_len > skb->end - skb->tail || - skb_cloned(skb)) + skb_cloned(skb) || skb_is_ulp_ddp(skb)) return; /* Nice, we can free page frag(s) right now */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 57c8af1859c1..aeae09290cdd 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4677,7 +4677,10 @@ static bool tcp_try_coalesce(struct sock *sk, if (from->decrypted != to->decrypted) return false; #endif - +#ifdef CONFIG_ULP_DDP + if (skb_is_ulp_crc(from) != skb_is_ulp_crc(to)) + return false; +#endif if (!skb_try_coalesce(to, from, fragstolen, &delta)) return false; @@ -5240,6 +5243,10 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root, memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); #ifdef CONFIG_TLS_DEVICE nskb->decrypted = skb->decrypted; +#endif +#ifdef CONFIG_ULP_DDP + nskb->ulp_ddp = skb->ulp_ddp; + nskb->ulp_crc = skb->ulp_crc; #endif TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start; if (list) @@ -5273,6 +5280,10 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root, #ifdef CONFIG_TLS_DEVICE if (skb->decrypted != nskb->decrypted) goto end; +#endif +#ifdef CONFIG_ULP_DDP + if (skb_is_ulp_crc(skb) != skb_is_ulp_crc(nskb)) + goto end; #endif } } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index fd365de4d5ff..8ca4af90f03a 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1866,6 +1866,9 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) || #ifdef CONFIG_TLS_DEVICE tail->decrypted != skb->decrypted || +#endif +#ifdef CONFIG_ULP_DDP + skb_is_ulp_crc(tail) != skb_is_ulp_crc(skb) || #endif thtail->doff != th->doff || memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th))) diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 8311c38267b5..56705fbe6ce4 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -268,6 +268,9 @@ struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb) #ifdef CONFIG_TLS_DEVICE flush |= p->decrypted ^ skb->decrypted; #endif +#ifdef CONFIG_ULP_DDP + flush |= skb_is_ulp_crc(p) ^ skb_is_ulp_crc(skb); +#endif if (flush || skb_gro_receive(p, skb)) { mss = 1;