diff mbox

[rdma-core,3/6] verbs: Add basic infrastructure for mixed write and ioctl cmds

Message ID 20180218204002.7408-4-jgg@ziepe.ca (mailing list archive)
State Accepted
Headers show

Commit Message

Jason Gunthorpe Feb. 18, 2018, 8:39 p.m. UTC
From: Jason Gunthorpe <jgg@mellanox.com>

This provides a series of helper macros designed to let us merge the ioctl and
write paths of each command into a single function following a standard
pattern.

The macros intend to support a universal cmd function that accepts a
'ibv_command_buffer link' from the driver, allowing the driver to provide
both the classic UHW structures as well as any additional driver specific
ids required.

To avoid changing every driver a wrapper is provided that generates the
ibv_command_buffer link from the standard cmd and resp pointers.

The helper macros are designed so the ioctl work can be largely elided by the
compiler if ioctl support is disabled, and that the legacy write work can be
elided if write compat support is disabled.

Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 CMakeLists.txt            |  15 +++
 buildlib/config.h.in      |  11 ++
 libibverbs/CMakeLists.txt |   2 +
 libibverbs/cmd_fallback.c | 263 +++++++++++++++++++++++++++++++++++++++
 libibverbs/cmd_ioctl.c    |  83 ++++++++++++-
 libibverbs/cmd_ioctl.h    |  51 ++++++++
 libibverbs/cmd_write.h    | 311 ++++++++++++++++++++++++++++++++++++++++++++++
 libibverbs/ibverbs.h      |   2 +
 8 files changed, 737 insertions(+), 1 deletion(-)
 create mode 100644 libibverbs/cmd_fallback.c
 create mode 100644 libibverbs/cmd_write.h
diff mbox

Patch

diff --git a/CMakeLists.txt b/CMakeLists.txt
index ce76858662b013..96c36c947a3652 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -26,6 +26,9 @@ 
 #      Do not generate backwards compatibility symbols in the shared
 #      libraries. This may is necessary if using a dynmic linker that does
 #      not support symbol versions, such as uclibc.
+#  -DIOCTL_MODE=both (default write)
+#      Enable new kABI ioctl() support and support for the legacy write
+#      path. May also be 'ioctl' to disable fallback to write.
 
 cmake_minimum_required(VERSION 2.8.11 FATAL_ERROR)
 project(rdma-core C)
@@ -435,6 +438,18 @@  if (NOT ${CMAKE_CURRENT_BINARY_DIR} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR})
   file(WRITE ${CMAKE_BINARY_DIR}/.gitignore "*")
 endif()
 
+if ("${IOCTL_MODE}" STREQUAL "both")
+  set(IOCTL_MODE_NUM 3)
+elseif ("${IOCTL_MODE}" STREQUAL "write")
+  set(IOCTL_MODE_NUM 2)
+elseif ("${IOCTL_MODE}" STREQUAL "ioctl")
+  set(IOCTL_MODE_NUM 1)
+elseif ("${IOCTL_MODE}" STREQUAL "")
+  set(IOCTL_MODE_NUM 2)
+else()
+  message(FATAL_ERROR "-DIOCTL_MODE=${IOCTL_MODE} is not a valid choice")
+endif()
+
 configure_file("${BUILDLIB}/config.h.in" "${BUILD_INCLUDE}/config.h" ESCAPE_QUOTES @ONLY)
 
 #-------------------------
diff --git a/buildlib/config.h.in b/buildlib/config.h.in
index 8ca0bc0f3f62db..316b712c0d0da9 100644
--- a/buildlib/config.h.in
+++ b/buildlib/config.h.in
@@ -48,4 +48,15 @@ 
 # define NRESOLVE_NEIGH 1
 #endif
 
+#if @IOCTL_MODE_NUM@ == 1
+# define VERBS_IOCTL_ONLY 1
+# define VERBS_WRITE_ONLY 0
+#elif  @IOCTL_MODE_NUM@ == 2
+# define VERBS_IOCTL_ONLY 0
+# define VERBS_WRITE_ONLY 1
+#elif  @IOCTL_MODE_NUM@ == 3
+# define VERBS_IOCTL_ONLY 0
+# define VERBS_WRITE_ONLY 0
+#endif
+
 #endif
diff --git a/libibverbs/CMakeLists.txt b/libibverbs/CMakeLists.txt
index 7c889bfe88663f..f8ef8c76441ad2 100644
--- a/libibverbs/CMakeLists.txt
+++ b/libibverbs/CMakeLists.txt
@@ -9,6 +9,7 @@  publish_headers(infiniband
 
 publish_internal_headers(infiniband
   cmd_ioctl.h
+  cmd_write.h
   driver.h
   kern-abi.h
   marshall.h
@@ -27,6 +28,7 @@  rdma_library(ibverbs "${CMAKE_CURRENT_BINARY_DIR}/libibverbs.map"
   # See Documentation/versioning.md
   1 1.1.${PACKAGE_VERSION}
   cmd.c
+  cmd_fallback.c
   cmd_ioctl.c
   compat-1_0.c
   device.c
diff --git a/libibverbs/cmd_fallback.c b/libibverbs/cmd_fallback.c
new file mode 100644
index 00000000000000..1c217dfb78d00d
--- /dev/null
+++ b/libibverbs/cmd_fallback.c
@@ -0,0 +1,263 @@ 
+/*
+ * Copyright (c) 2018 Mellanox Technologies, Ltd.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <infiniband/cmd_ioctl.h>
+#include <infiniband/cmd_write.h>
+#include "ibverbs.h"
+
+#include <util/compiler.h>
+#include <ccan/build_assert.h>
+
+#include <unistd.h>
+
+/*
+ * Decide how a command may fall back to the legacy write() interface.
+ * Returns TRY_WRITE or TRY_WRITE_EX when the linked buffers are compatible
+ * with it. Returns ERROR, with errno and *ret set to EOPNOTSUPP, if a link
+ * carries a mandatory non-UHW attribute or any buffer is marked ioctl only.
+ */
+enum write_fallback _check_legacy(struct ibv_command_buffer *cmdb, int *ret)
+{
+	struct ib_uverbs_attr *cur;
+	bool fallback_require_ex = cmdb->fallback_require_ex;
+	bool fallback_ioctl_only = cmdb->fallback_ioctl_only;
+
+	for (cmdb = cmdb->next; cmdb; cmdb = cmdb->next) {
+		for (cur = cmdb->hdr.attrs; cur != cmdb->next_attr; cur++) {
+			if (cur->attr_id != UVERBS_UHW_IN &&
+			    cur->attr_id != UVERBS_UHW_OUT &&
+			    cur->flags & UVERBS_ATTR_F_MANDATORY)
+				goto not_supp;
+		}
+		fallback_require_ex |= cmdb->fallback_require_ex;
+		fallback_ioctl_only |= cmdb->fallback_ioctl_only;
+	}
+
+	if (fallback_ioctl_only)
+		goto not_supp;
+
+	if (fallback_require_ex)
+		return TRY_WRITE_EX;
+	return TRY_WRITE;
+
+not_supp:
+	errno = EOPNOTSUPP;
+	*ret = EOPNOTSUPP;
+	return ERROR;
+}
+
+/*
+ * Used to support callers that have a fallback to the old write ABI
+ * interface.
+ */
+enum write_fallback _execute_ioctl_fallback(struct ibv_context *ctx,
+					    unsigned int cmd_bit,
+					    struct ibv_command_buffer *cmdb,
+					    int *ret)
+{
+	uint64_t cmd_val = 1ULL << cmd_bit;
+
+	BUILD_ASSERT(sizeof(struct verbs_context_ops) / sizeof(void *) < 64);
+
+	struct verbs_ex_private *priv =
+		container_of(ctx, struct verbs_context, context)->priv;
+
+	if (priv->unsupported_ioctls & cmd_val)
+		return _check_legacy(cmdb, ret);
+
+	*ret = execute_ioctl(ctx, cmdb);
+
+	if (likely(*ret == 0))
+		return SUCCESS;
+
+	if (*ret == ENOTTY) {
+		/* ENOTTY means the ioctl framework is entirely absent */
+		priv->unsupported_ioctls = UINT64_MAX;
+		return _check_legacy(cmdb, ret);
+	}
+
+	if (*ret == EPROTONOSUPPORT) {
+		/*
+		 * EPROTONOSUPPORT means we have the ioctl framework but this
+		 * specific method is not supported
+		 */
+		priv->unsupported_ioctls |= cmd_val;
+		return _check_legacy(cmdb, ret);
+	}
+
+	return ERROR;
+}
+
+/*
+ * Within the command implementation we get a pointer to the request and
+ * response buffers for the legacy interface. This pointer is either allocated
+ * on the stack (if the driver didn't provide a UHW) or arranged to be
+ * directly before the UHW memory (see _write_set_uhw)
+ */
+void *_write_get_req(struct ibv_command_buffer *link, void *onstack,
+		     size_t size)
+{
+	struct ib_uverbs_cmd_hdr *hdr;
+
+	size += sizeof(*hdr);
+
+	if (link->uhw_in_idx != _UHW_NO_INDEX) {
+		struct ib_uverbs_attr *uhw = &link->hdr.attrs[link->uhw_in_idx];
+
+		assert(uhw->attr_id == UVERBS_UHW_IN);
+		assert(link->uhw_in_headroom_dwords * 4 >= size);
+		hdr = (void *)((uintptr_t)uhw->data - size);
+		hdr->in_words = __check_divide(size + uhw->len, 4);
+	} else {
+		hdr = onstack;
+		hdr->in_words = __check_divide(size, 4);
+	}
+
+	return hdr + 1;
+}
+
+void *_write_get_req_ex(struct ibv_command_buffer *link, void *onstack,
+			size_t size)
+{
+	struct _ib_ex_hdr *hdr;
+	size_t full_size = size + sizeof(*hdr);
+
+	if (link->uhw_in_idx != _UHW_NO_INDEX) {
+		struct ib_uverbs_attr *uhw = &link->hdr.attrs[link->uhw_in_idx];
+
+		assert(uhw->attr_id == UVERBS_UHW_IN);
+		assert(link->uhw_in_headroom_dwords * 4 >= full_size);
+		hdr = (void *)((uintptr_t)uhw->data - full_size);
+		hdr->hdr.in_words = __check_divide(size, 8);
+		hdr->ex_hdr.provider_in_words = __check_divide(uhw->len, 8);
+	} else {
+		hdr = onstack;
+		hdr->hdr.in_words = __check_divide(size, 8);
+		hdr->ex_hdr.provider_in_words = 0;
+	}
+
+	return hdr + 1;
+}
+
+void *_write_get_resp(struct ibv_command_buffer *link,
+		      struct ib_uverbs_cmd_hdr *hdr, void *onstack,
+		      size_t resp_size)
+{
+	void *resp_start;
+
+	if (link->uhw_out_idx != _UHW_NO_INDEX) {
+		struct ib_uverbs_attr *uhw =
+			&link->hdr.attrs[link->uhw_out_idx];
+
+		assert(uhw->attr_id == UVERBS_UHW_OUT);
+		assert(link->uhw_out_headroom_dwords * 4 >= resp_size);
+		resp_start = (void *)((uintptr_t)uhw->data - resp_size);
+		hdr->out_words = __check_divide(resp_size + uhw->len, 4);
+	} else {
+		resp_start = onstack;
+		hdr->out_words = __check_divide(resp_size, 4);
+	}
+
+	return resp_start;
+}
+
+void *_write_get_resp_ex(struct ibv_command_buffer *link,
+			 struct _ib_ex_hdr *hdr, void *onstack,
+			 size_t resp_size)
+{
+	void *resp_start;
+
+	if (link->uhw_out_idx != _UHW_NO_INDEX) {
+		struct ib_uverbs_attr *uhw =
+			&link->hdr.attrs[link->uhw_out_idx];
+
+		assert(uhw->attr_id == UVERBS_UHW_OUT);
+		assert(link->uhw_out_headroom_dwords * 4 >= resp_size);
+		resp_start = (void *)((uintptr_t)uhw->data - resp_size);
+		hdr->hdr.out_words = __check_divide(resp_size, 8);
+		hdr->ex_hdr.provider_out_words = __check_divide(uhw->len, 8);
+	} else {
+		resp_start = onstack;
+		hdr->hdr.out_words = __check_divide(resp_size, 8);
+		hdr->ex_hdr.provider_out_words = 0;
+	}
+
+	return resp_start;
+}
+
+int _execute_write_raw(unsigned int cmdnum, struct ibv_context *ctx,
+		       struct ib_uverbs_cmd_hdr *hdr, void *resp)
+{
+	hdr->command = cmdnum;
+
+	/*
+	 * Users assume the stack buffer is zeroed before passing to the
+	 * kernel for writing.
+	 */
+	memset(resp, 0, hdr->out_words * 4);
+
+	if (write(ctx->cmd_fd, hdr, hdr->in_words * 4) != hdr->in_words * 4)
+		return errno;
+
+	VALGRIND_MAKE_MEM_DEFINED(resp, hdr->out_words * 4);
+
+	return 0;
+}
+
+int _execute_write_raw_ex(uint32_t cmdnum, struct ibv_context *ctx,
+			  struct _ib_ex_hdr *hdr, void *resp)
+{
+	size_t write_bytes =
+		sizeof(*hdr) +
+		(hdr->hdr.in_words + hdr->ex_hdr.provider_in_words) * 8;
+	size_t resp_bytes =
+		(hdr->hdr.out_words + hdr->ex_hdr.provider_out_words) * 8;
+
+	hdr->hdr.command = (IB_USER_VERBS_CMD_FLAG_EXTENDED
+			    << IB_USER_VERBS_CMD_FLAGS_SHIFT) |
+			   cmdnum;
+	hdr->ex_hdr.cmd_hdr_reserved = 0;
+	hdr->ex_hdr.response =  ioctl_ptr_to_u64(resp);
+
+	/*
+	 * Users assume the stack buffer is zeroed before passing to the
+	 * kernel for writing.
+	 */
+	memset(resp, 0, resp_bytes);
+
+	if (write(ctx->cmd_fd, hdr, write_bytes) != write_bytes)
+		return errno;
+
+	VALGRIND_MAKE_MEM_DEFINED(resp, resp_bytes);
+
+	return 0;
+}
diff --git a/libibverbs/cmd_ioctl.c b/libibverbs/cmd_ioctl.c
index 722dabb565069e..fbd97bae703fb9 100644
--- a/libibverbs/cmd_ioctl.c
+++ b/libibverbs/cmd_ioctl.c
@@ -31,9 +31,10 @@ 
  */
 
 #include <infiniband/cmd_ioctl.h>
+#include <infiniband/cmd_write.h>
+#include "ibverbs.h"
 
 #include <sys/ioctl.h>
-#include <valgrind/memcheck.h>
 
 /* Number of attrs in this and all the link'd buffers */
 unsigned int __ioctl_final_num_attrs(unsigned int num_attrs,
@@ -57,6 +58,16 @@  static void prepare_attrs(struct ibv_command_buffer *cmd)
 		assert(cmd->hdr.object_id == link->hdr.object_id);
 		assert(cmd->hdr.method_id == link->hdr.method_id);
 
+		/*
+		 * Keep track of where the uhw_in lands in the final array if
+		 * we copy it from a link
+		 */
+		if (!VERBS_IOCTL_ONLY && link->uhw_in_idx != _UHW_NO_INDEX) {
+			assert(cmd->uhw_in_idx == _UHW_NO_INDEX);
+			cmd->uhw_in_idx =
+				link->uhw_in_idx + (end - cmd->hdr.attrs);
+		}
+
 		for (cur = link->hdr.attrs; cur != link->next_attr; cur++)
 			*end++ = *cur;
 
@@ -64,6 +75,20 @@  static void prepare_attrs(struct ibv_command_buffer *cmd)
 	}
 
 	cmd->hdr.num_attrs = end - cmd->hdr.attrs;
+
+	/*
+	 * We keep the in UHW uninlined until directly before sending to
+	 * support the compat path. See _fill_attr_in_uhw
+	 */
+	if (!VERBS_IOCTL_ONLY && cmd->uhw_in_idx != _UHW_NO_INDEX) {
+		struct ib_uverbs_attr *uhw = &cmd->hdr.attrs[cmd->uhw_in_idx];
+
+		assert(uhw->attr_id == UVERBS_UHW_IN);
+
+		if (uhw->len <= sizeof(uhw->data))
+			memcpy(&uhw->data, (void *)(uintptr_t)uhw->data,
+			       uhw->len);
+	}
 }
 
 static void finalize_attr(struct ib_uverbs_attr *attr)
@@ -110,3 +135,59 @@  int execute_ioctl(struct ibv_context *context, struct ibv_command_buffer *cmd)
 
 	return 0;
 }
+
+/*
+ * The compat scheme for UHW IN requires a pointer in .data, however the
+ * kernel protocol requires data of 8 bytes or less to be inlined into .data.
+ * We defer that transformation until directly before the ioctl.
+ */
+static inline struct ib_uverbs_attr *
+_fill_attr_in_uhw(struct ibv_command_buffer *cmd, uint16_t attr_id,
+		 const void *data, size_t len)
+{
+	struct ib_uverbs_attr *attr = _ioctl_next_attr(cmd, attr_id);
+
+	assert(len <= UINT16_MAX);
+
+	attr->len = len;
+	attr->data = ioctl_ptr_to_u64(data);
+
+	return attr;
+}
+
+/*
+ * This helper is used in the driver compat wrappers to build the
+ * command buffer from the legacy input pointers format.
+ */
+void _write_set_uhw(struct ibv_command_buffer *cmdb, const void *req,
+		    size_t core_req_size, size_t req_size, void *resp,
+		    size_t core_resp_size, size_t resp_size)
+{
+	if (req && core_req_size < req_size) {
+		if (VERBS_IOCTL_ONLY)
+			cmdb->uhw_in_idx =
+				fill_attr_in(cmdb, UVERBS_UHW_IN,
+					     (uint8_t *)req + core_req_size,
+					     req_size - core_req_size) -
+				cmdb->hdr.attrs;
+		else
+			cmdb->uhw_in_idx =
+				_fill_attr_in_uhw(cmdb, UVERBS_UHW_IN,
+						  (uint8_t *)req +
+							  core_req_size,
+						  req_size - core_req_size) -
+				cmdb->hdr.attrs;
+		cmdb->uhw_in_headroom_dwords = __check_divide(core_req_size, 4);
+	}
+
+
+	if (resp && core_resp_size < resp_size) {
+		cmdb->uhw_out_idx =
+			fill_attr_out(cmdb, UVERBS_UHW_OUT,
+				      (uint8_t *)resp + core_resp_size,
+				      resp_size - core_resp_size) -
+			cmdb->hdr.attrs;
+		cmdb->uhw_out_headroom_dwords =
+			__check_divide(core_resp_size, 4);
+	}
+}
diff --git a/libibverbs/cmd_ioctl.h b/libibverbs/cmd_ioctl.h
index 13a1ca21c610d2..4bdd31221d468f 100644
--- a/libibverbs/cmd_ioctl.h
+++ b/libibverbs/cmd_ioctl.h
@@ -33,6 +33,8 @@ 
 #ifndef __INFINIBAND_VERBS_IOCTL_H
 #define __INFINIBAND_VERBS_IOCTL_H
 
+#include <config.h>
+
 #include <stdint.h>
 #include <assert.h>
 #include <rdma/rdma_user_ioctl.h>
@@ -69,9 +71,27 @@  struct ibv_command_buffer {
 	struct ibv_command_buffer *next;
 	struct ib_uverbs_attr *next_attr;
 	struct ib_uverbs_attr *last_attr;
+	/*
+	 * Used by the legacy write interface to keep track of where the UHW
+	 * buffer is located and the 'headroom' space that the common code
+	 * uses to construct the command header and common command struct
+	 * directly before the drivers' UHW.
+	 */
+	uint8_t uhw_in_idx;
+	uint8_t uhw_out_idx;
+	uint8_t uhw_in_headroom_dwords;
+	uint8_t uhw_out_headroom_dwords;
+	/*
+	 * These flags control what execute_ioctl_fallback does if the kernel
+	 * does not support ioctl
+	 */
+	uint8_t fallback_require_ex:1;
+	uint8_t fallback_ioctl_only:1;
 	struct ib_uverbs_ioctl_hdr hdr;
 };
 
+enum {_UHW_NO_INDEX = 0xFF};
+
 /*
  * Constructing an array of ibv_command_buffer is a reasonable way to expand
  * the VLA in hdr.attrs on the stack and also allocate some internal state in
@@ -102,6 +122,8 @@  unsigned int __ioctl_final_num_attrs(unsigned int num_attrs,
 				.method_id = (_method_id),                     \
 			},                                                     \
 		.next = _link,                                                 \
+		.uhw_in_idx = _UHW_NO_INDEX,                                   \
+		.uhw_out_idx = _UHW_NO_INDEX,                                  \
 		.next_attr = (_hdr).attrs,                                     \
 		.last_attr = (_hdr).attrs + _num_attrs})
 
@@ -171,6 +193,29 @@  _ioctl_next_attr(struct ibv_command_buffer *cmd, uint16_t attr_id)
 	return attr;
 }
 
+/*
+ * This construction is insane, an expression with a side effect that returns
+ * from the calling function, but it is a non-invasive way to get the compiler
+ * to elide the IOCTL support in the backwards compat command functions
+ * without disturbing native ioctl support.
+ *
+ * A command function will set last_attr on the stack to NULL, and if it is
+ * coded properly, the compiler will prove that last_attr is never changed and
+ * elide the function. Unfortunately this penalizes native ioctl uses with the
+ * extra if overhead.
+ *
+ * For this reason, _ioctl_next_attr must never be called outside a fill
+ * function.
+ */
+#if VERBS_WRITE_ONLY
+#define _ioctl_next_attr(cmd, attr_id)                                         \
+	({                                                                     \
+		if (!((cmd)->last_attr))                                       \
+			return NULL;                                           \
+		_ioctl_next_attr(cmd, attr_id);                                \
+	})
+#endif
+
 /* Make the attribute optional. */
 static inline struct ib_uverbs_attr *attr_optional(struct ib_uverbs_attr *attr)
 {
@@ -296,4 +341,10 @@  fill_attr_out(struct ibv_command_buffer *cmd, uint16_t attr_id, void *data,
 #define fill_attr_out_ptr(cmd, attr_id, ptr)                                 \
 	fill_attr_out(cmd, attr_id, ptr, sizeof(*(ptr)))
 
+static inline size_t __check_divide(size_t val, unsigned int div)
+{
+	assert(val % div == 0);
+	return val / div;
+}
+
 #endif
diff --git a/libibverbs/cmd_write.h b/libibverbs/cmd_write.h
new file mode 100644
index 00000000000000..25846bd43e7722
--- /dev/null
+++ b/libibverbs/cmd_write.h
@@ -0,0 +1,311 @@ 
+/*
+ * Copyright (c) 2018 Mellanox Technologies, Ltd.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __INFINIBAND_VERBS_WRITE_H
+#define __INFINIBAND_VERBS_WRITE_H
+
+#include <infiniband/cmd_ioctl.h>
+#include <infiniband/driver.h>
+#include <rdma/ib_user_verbs.h>
+
+#include <stdbool.h>
+
+static inline struct ib_uverbs_cmd_hdr *get_req_hdr(void *req)
+{
+	return ((struct ib_uverbs_cmd_hdr *)req) - 1;
+}
+
+struct _ib_ex_hdr {
+	struct ib_uverbs_cmd_hdr hdr;
+	struct ib_uverbs_ex_cmd_hdr ex_hdr;
+};
+
+static inline struct _ib_ex_hdr *get_req_hdr_ex(void *req)
+{
+	return ((struct _ib_ex_hdr *)req) - 1;
+}
+
+/*
+ * When using these new interfaces the kernel UAPI structs 'ib_uverbs_*' are
+ * used, not the structs from kern-abi.h. The only difference between the two
+ * is the inclusion of the header in the kern-abi.h struct. This macro creates
+ * memory on the stack that includes both the header and the struct.
+ */
+#define DECLARE_LEGACY_REQ_BUF_CORE(_name, _pattern)                           \
+	struct {                                                               \
+		struct ib_uverbs_cmd_hdr hdr;                                  \
+		struct ib_uverbs_##_pattern core_payload;                      \
+	} _name
+
+#define DECLARE_LEGACY_REQ_BUF_CORE_EX(_name, _pattern)                        \
+	struct {                                                               \
+		struct ib_uverbs_cmd_hdr hdr;                                  \
+		struct ib_uverbs_ex_cmd_hdr ex_hdr;                            \
+		struct ib_uverbs_ex_##_pattern core_payload;                   \
+	} _name
+
+void *_write_get_req(struct ibv_command_buffer *link, void *onstack,
+		     size_t size);
+void *_write_get_req_ex(struct ibv_command_buffer *link, void *onstack,
+			size_t size);
+void *_write_get_resp(struct ibv_command_buffer *link,
+		      struct ib_uverbs_cmd_hdr *hdr, void *onstack,
+		      size_t resp_size);
+void *_write_get_resp_ex(struct ibv_command_buffer *link,
+			 struct _ib_ex_hdr *hdr, void *onstack,
+			 size_t resp_size);
+
+#define DECLARE_LEGACY_REQ_BUF(_name, _link, _pattern)                         \
+	DECLARE_LEGACY_REQ_BUF_CORE(__##_name##_onstack, _pattern);            \
+	struct ib_uverbs_##_pattern *_name =                                   \
+		_write_get_req(_link, &__##_name##_onstack, sizeof(*_name))
+
+#define DECLARE_LEGACY_REQ_BUF_EX(_name, _link, _pattern)                      \
+	DECLARE_LEGACY_REQ_BUF_CORE_EX(__##_name##_onstack, _pattern);         \
+	struct ib_uverbs_ex_##_pattern *_name =                                \
+		_write_get_req_ex(_link, &__##_name##_onstack, sizeof(*_name))
+
+#define DECLARE_LEGACY_RESP_BUF(_name, _link, _req, _pattern)                  \
+	struct ib_uverbs_##_pattern##_resp __##_name##_onstack,                \
+		*_name = _write_get_resp(_link, get_req_hdr(_req),             \
+					 &__##_name##_onstack, sizeof(*_name))
+
+#define DECLARE_LEGACY_RESP_BUF_EX(_name, _link, _req, _pattern)               \
+	struct ib_uverbs_ex_##_pattern##_resp __##_name##_onstack,             \
+		*_name = _write_get_resp_ex(_link, get_req_hdr_ex(_req),       \
+					    &__##_name##_onstack,              \
+					    sizeof(*_name))
+
+/*
+ * This macro creates 'req' and 'resp' pointers in the local stack frame that
+ * point to the core code write command structures patterned off _pattern.
+ *
+ * This should be done before calling execute_write_bufs
+ */
+#define DECLARE_LEGACY_UHW_BUFS(_link, _pattern)                               \
+	DECLARE_LEGACY_REQ_BUF(req, _link, _pattern);                          \
+	DECLARE_LEGACY_RESP_BUF(resp, _link, req, _pattern)
+
+#define DECLARE_LEGACY_UHW_BUFS_EX(_link, _pattern)                            \
+	DECLARE_LEGACY_REQ_BUF_EX(req, _link, _pattern);                       \
+	DECLARE_LEGACY_RESP_BUF_EX(resp, _link, req, _pattern)
+
+/*
+ * This macro is used to implement the compatibility command call wrappers.
+ * Compatibility calls do not accept a command_buffer, and cannot use the new
+ * attribute id mechanism. They accept the legacy kern-abi.h structs that have
+ * the embedded header.
+ */
+void _write_set_uhw(struct ibv_command_buffer *cmdb, const void *req,
+		    size_t core_req_size, size_t req_size, void *resp,
+		    size_t core_resp_size, size_t resp_size);
+#define DECLARE_CMD_BUFFER_COMPAT(_name, _object_id, _method_id)               \
+	DECLARE_COMMAND_BUFFER(_name, _object_id, _method_id, 2);              \
+	_write_set_uhw(_name, cmd, sizeof(*cmd), cmd_size, resp,               \
+		       sizeof(*resp), resp_size)
+
+/*
+ * The fallback scheme keeps track of which ioctls succeed in a per-context
+ * bitmap. If ENOTTY or EPROTONOSUPPORT is seen then the ioctl is never
+ * retried.
+ *
+ * cmd_name should be the name of the function op from verbs_context_ops
+ * that is being implemented.
+ */
+#define _CMD_BIT(cmd_name)                                                     \
+	(offsetof(struct verbs_context_ops, cmd_name) / sizeof(void *))
+
+enum write_fallback { TRY_WRITE, TRY_WRITE_EX, ERROR, SUCCESS };
+
+/*
+ * This bitmask indicates the required behavior of execute_ioctl_fallback when
+ * the ioctl is not supported. It is a priority list where the highest set bit
+ * takes precedence. This approach simplifies the typical required control
+ * flow of the user.
+ */
+static inline void fallback_require_ex(struct ibv_command_buffer *cmdb)
+{
+	cmdb->fallback_require_ex = 1;
+}
+
+static inline void fallback_require_ioctl(struct ibv_command_buffer *cmdb)
+{
+	cmdb->fallback_ioctl_only = 1;
+}
+
+enum write_fallback _check_legacy(struct ibv_command_buffer *cmdb, int *ret);
+
+enum write_fallback _execute_ioctl_fallback(struct ibv_context *ctx,
+					    unsigned int cmd_bit,
+					    struct ibv_command_buffer *cmdb,
+					    int *ret);
+
+#define execute_ioctl_fallback(ctx, cmd_name, cmdb, ret)                       \
+	_execute_ioctl_fallback(ctx, _CMD_BIT(cmd_name), cmdb, ret)
+
+/* These helpers replace the raw write() and IBV_INIT_CMD macros */
+int _execute_write_raw(unsigned int cmdnum, struct ibv_context *ctx,
+		       struct ib_uverbs_cmd_hdr *req, void *resp);
+
+/* For users of DECLARE_LEGACY_UHW_BUFS */
+#define execute_write_bufs(cmdnum, ctx, req, resp)                             \
+	({                                                                     \
+		(req)->response = ioctl_ptr_to_u64(resp);                      \
+		_execute_write_raw(cmdnum, ctx, get_req_hdr(req), resp);       \
+	})
+
+int _execute_write_raw_ex(uint32_t cmdnum, struct ibv_context *ctx,
+			  struct _ib_ex_hdr *req, void *resp);
+
+/* For users of DECLARE_LEGACY_UHW_BUFS_EX */
+#define execute_write_bufs_ex(cmdnum, ctx, req, resp)                          \
+	_execute_write_raw_ex(cmdnum, ctx, get_req_hdr_ex(req), resp)
+
+static inline int _execute_write(uint32_t cmdnum, struct ibv_context *ctx,
+				 void *req, size_t req_len, void *resp,
+				 size_t resp_len)
+{
+	struct ib_uverbs_cmd_hdr *hdr = get_req_hdr(req);
+	/* The write starts at hdr, so in_words must count the header too */
+	hdr->in_words = (sizeof(*hdr) + req_len) / 4;
+	hdr->out_words = resp_len / 4;
+	return _execute_write_raw(cmdnum, ctx, hdr, resp);
+}
+
+/* For users with no possible UHW bufs. */
+#define DECLARE_LEGACY_CORE_BUFS(_pattern)                                     \
+	DECLARE_LEGACY_REQ_BUF_CORE(__req_onstack, _pattern);                  \
+	struct ib_uverbs_##_pattern *const req = &__req_onstack.core_payload;  \
+	struct ib_uverbs_##_pattern##_resp resp
+
+/*
+ * For users with no UHW bufs. To be used in conjunction with
+ * DECLARE_LEGACY_CORE_BUFS. req points to the core payload (with headroom for
+ * the header).
+ */
+#define execute_write(cmdnum, ctx, req, resp)                                  \
+	({                                                                     \
+		(req)->response = ioctl_ptr_to_u64(resp);                      \
+		_execute_write(cmdnum, ctx, req, sizeof(*req), resp,           \
+			       sizeof(*resp));                                 \
+	})
+
+/*
+ * These two macros are used only with execute_ioctl_fallback - they allow the
+ * IOCTL code to be elided by the compiler when disabled.
+ */
+#define DECLARE_FBCMD_BUFFER DECLARE_COMMAND_BUFFER_LINK
+
+/*
+ * Munge the macros above to remove certain paths during compilation based on
+ * the cmake flag.
+ */
+#if VERBS_IOCTL_ONLY
+static inline enum write_fallback
+_execute_ioctl_only(struct ibv_context *context, struct ibv_command_buffer *cmd,
+		    int *ret)
+{
+	*ret = execute_ioctl(context, cmd);
+	if (*ret)
+		return ERROR;
+
+	return SUCCESS;
+}
+
+#undef execute_ioctl_fallback
+#define execute_ioctl_fallback(ctx, cmd_name, cmdb, ret)                       \
+	_execute_ioctl_only(ctx, cmdb, ret)
+
+#undef execute_write_bufs
+static inline int execute_write_bufs(uint32_t cmdnum,
+				     struct ibv_context *ctx, void *req,
+				     void *resp)
+{
+	return ENOSYS;
+}
+
+#undef execute_write_bufs_ex
+static inline int execute_write_bufs_ex(uint32_t cmdnum,
+					struct ibv_context *ctx, void *req,
+					void *resp)
+{
+	return ENOSYS;
+}
+
+#undef execute_write
+static inline int execute_write(uint32_t cmdnum,
+				struct ibv_context *ctx, void *req,
+				void *resp)
+{
+	return ENOSYS;
+}
+
+#endif
+
+#if VERBS_WRITE_ONLY
+static inline enum write_fallback
+_execute_write_only(struct ibv_context *context, struct ibv_command_buffer *cmd,
+		    int *ret)
+{
+	/*
+	 * write only still has the command buffer, and the command buffer
+	 * carries the fallback guidance that we need to inspect. This is
+	 * written in this odd way so the compiler knows that SUCCESS is not a
+	 * possible return and optimizes accordingly.
+	 */
+	switch (_check_legacy(cmd, ret)) {
+	case TRY_WRITE:
+		return TRY_WRITE;
+	case TRY_WRITE_EX:
+		return TRY_WRITE_EX;
+	default:
+		return ERROR;
+	}
+}
+
+#undef execute_ioctl_fallback
+#define execute_ioctl_fallback(ctx, cmd_name, cmdb, ret)                       \
+	_execute_write_only(ctx, cmdb, ret)
+
+#undef DECLARE_FBCMD_BUFFER
+#define DECLARE_FBCMD_BUFFER(_name, _object_id, _method_id, _num_attrs, _link) \
+	struct ibv_command_buffer _name[1] = {                                 \
+		{                                                              \
+			.next = _link,                                         \
+			.uhw_in_idx = _UHW_NO_INDEX,                           \
+			.uhw_out_idx = _UHW_NO_INDEX,                          \
+		},                                                             \
+	}
+
+#endif
+
+#endif
diff --git a/libibverbs/ibverbs.h b/libibverbs/ibverbs.h
index 7238d79168733f..98000ff66e4ccc 100644
--- a/libibverbs/ibverbs.h
+++ b/libibverbs/ibverbs.h
@@ -60,6 +60,8 @@  void ibverbs_device_hold(struct ibv_device *dev);
 struct verbs_ex_private {
 	struct ibv_cq_ex *(*create_cq_ex)(struct ibv_context *context,
 					  struct ibv_cq_init_attr_ex *init_attr);
+
+	uint64_t unsupported_ioctls;
 };
 
 #define IBV_INIT_CMD(cmd, size, opcode)					\