@@ -33,6 +33,7 @@
#ifndef DOORBELL_H
#define DOORBELL_H
+#ifndef __s390x__
#if SIZEOF_LONG == 8
#if __BYTE_ORDER == __LITTLE_ENDIAN
@@ -59,5 +60,71 @@ static inline void mlx4_write64(uint32_t
}
#endif
+#else /* __s390x__ */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <errno.h>
+#include <infiniband/driver.h>
+#include <infiniband/arch.h>
+
+enum ib_uverbs_kwrite_mmio_location {	/* region targeted by a KWRITE_MMIO write */
+	IB_UVERBS_KWRITE_MMIO_UAR = 0,		/* UAR page */
+	IB_UVERBS_KWRITE_MMIO_BF_PAGE = 1,	/* BlueFlame page */
+};
+
+struct mlx4_kwrite_mmio {	/* KWRITE_MMIO command as written to the command fd */
+	__u32 command;		/* IB_USER_VERBS_CMD_KWRITE_MMIO */
+	__u16 in_words;		/* size of the whole command in 32-bit words */
+	__u16 out_words;	/* set to 0 by mlx4_kwrite_mmio() */
+	__u16 offset;		/* offset of the write within the selected region */
+	__u16 length;		/* number of payload bytes in value[] */
+	__u8 location;		/* enum ib_uverbs_kwrite_mmio_location */
+	__u8 reserved[3];	/* left zeroed */
+	__u8 value[];		/* payload; C99 flexible array member, not [0] */
+};
+
+/*
+ * Write `length` bytes of `value` at `offset` in the UAR or BlueFlame page
+ * (`location`) through a KWRITE_MMIO command on `fd`; 0 or negative errno.
+ */
+static inline int mlx4_kwrite_mmio(int fd, uint8_t location, uint32_t offset,
+				   uint32_t length, void *value)
+{
+	struct mlx4_kwrite_mmio *cmd;
+	size_t cmd_size = sizeof(*cmd) + length;
+	ssize_t written;
+	int ret;
+	/* offset/length are 16-bit fields; in_words counts whole 32-bit words. */
+	if (offset > UINT16_MAX || length > UINT16_MAX || length % 4)
+		return -EINVAL;
+	cmd = calloc(1, cmd_size);	/* zeroes out_words and reserved[] */
+	if (!cmd)
+		return -ENOMEM;
+	cmd->command = IB_USER_VERBS_CMD_KWRITE_MMIO;
+	cmd->in_words = cmd_size / 4;
+	cmd->offset = offset;
+	cmd->length = length;
+	cmd->location = location;
+	memcpy(cmd->value, value, length);
+	written = write(fd, cmd, cmd_size);
+	/* Read errno before free() can clobber it; a short write is an error. */
+	ret = written < 0 ? -errno : ((size_t)written == cmd_size ? 0 : -EIO);
+	free(cmd);
+	return ret;
+}
+
+/* s390x: send both 32-bit words of val to the UAR page at offset via the kernel. */
+static inline void mlx4_write64(uint32_t val[2], struct mlx4_context *ctx,
+				int offset)
+{
+	const int fd = ctx->ibv_ctx.cmd_fd;
+	const uint32_t nbytes = 2 * sizeof(val[0]);
+
+	/* The result of the command is not checked; this function returns void. */
+	mlx4_kwrite_mmio(fd, IB_UVERBS_KWRITE_MMIO_UAR, offset, nbytes, val);
+}
+
+#endif
#endif /* DOORBELL_H */
@@ -116,6 +116,59 @@ static struct ibv_context_ops mlx4_ctx_o
.detach_mcast = ibv_cmd_detach_mcast
};
+#ifdef __s390x__
+/* s390x variant: maps nothing, only sets up BlueFlame bookkeeping; returns 0. */
+static inline int mlx4_context_init_mapping(struct mlx4_context *context,
+					    const int cmd_fd, const __u16 bf_reg_size,
+					    const struct mlx4_device *dev)
+{
+	if (!bf_reg_size) {
+		context->bf_buf_size = 0;
+		return 0;
+	}
+	context->bf_buf_size = bf_reg_size / 2;
+	context->bf_offset = 0;
+	pthread_spin_init(&context->bf_lock, PTHREAD_PROCESS_PRIVATE);
+	return 0;
+}
+#else
+/*
+ * Map the UAR page and, when bf_reg_size is non-zero, the BlueFlame page
+ * from cmd_fd. Returns 0 on success or errno if the UAR mmap() fails; a
+ * failed BlueFlame mmap() only prints a warning and disables BlueFlame.
+ */
+static inline int mlx4_context_init_mapping(struct mlx4_context *context,
+					    const int cmd_fd,
+					    const __u16 bf_reg_size,
+					    const struct mlx4_device *dev)
+{
+	context->uar = mmap(NULL, dev->page_size, PROT_WRITE,
+			    MAP_SHARED, cmd_fd, 0);
+	if (context->uar == MAP_FAILED)
+		return errno;
+	if (bf_reg_size) {
+		context->bf_page = mmap(NULL, dev->page_size,
+					PROT_WRITE, MAP_SHARED, cmd_fd,
+					dev->page_size);
+		if (context->bf_page == MAP_FAILED) {
+			fprintf(stderr, PFX "Warning: BlueFlame available, "
+				"but failed to mmap() BlueFlame page.\n");
+			context->bf_page = NULL;
+			context->bf_buf_size = 0;
+		} else {
+			context->bf_buf_size = bf_reg_size / 2;
+			context->bf_offset = 0;
+			pthread_spin_init(&context->bf_lock,
+					  PTHREAD_PROCESS_PRIVATE);
+		}
+	} else {
+		context->bf_page = NULL;
+		context->bf_buf_size = 0;
+	}
+	return 0;
+}
+#endif
+
static int mlx4_init_context(struct verbs_device *v_device,
struct ibv_context *ibv_ctx, int cmd_fd)
{
@@ -127,6 +180,7 @@ static int mlx4_init_context(struct verb
__u16 bf_reg_size;
struct mlx4_device *dev = to_mdev(&v_device->device);
struct verbs_context *verbs_ctx = verbs_get_ctx(ibv_ctx);
+ int ret;
/* memory footprint of mlx4_context and verbs_context share
* struct ibv_context.
@@ -168,29 +222,9 @@ static int mlx4_init_context(struct verb
mlx4_init_xsrq_table(&context->xsrq_table, context->num_qps);
pthread_mutex_init(&context->db_list_mutex, NULL);
- context->uar = mmap(NULL, dev->page_size, PROT_WRITE,
- MAP_SHARED, cmd_fd, 0);
- if (context->uar == MAP_FAILED)
- return errno;
-
- if (bf_reg_size) {
- context->bf_page = mmap(NULL, dev->page_size,
- PROT_WRITE, MAP_SHARED, cmd_fd,
- dev->page_size);
- if (context->bf_page == MAP_FAILED) {
- fprintf(stderr, PFX "Warning: BlueFlame available, "
- "but failed to mmap() BlueFlame page.\n");
- context->bf_page = NULL;
- context->bf_buf_size = 0;
- } else {
- context->bf_buf_size = bf_reg_size / 2;
- context->bf_offset = 0;
- pthread_spin_init(&context->bf_lock, PTHREAD_PROCESS_PRIVATE);
- }
- } else {
- context->bf_page = NULL;
- context->bf_buf_size = 0;
- }
+ ret = mlx4_context_init_mapping(context, cmd_fd, bf_reg_size, dev);
+ if (ret)
+ return ret;
pthread_spin_init(&context->uar_lock, PTHREAD_PROCESS_PRIVATE);
ibv_ctx->ops = mlx4_ctx_ops;
@@ -208,6 +242,12 @@ static int mlx4_init_context(struct verb
}
+#ifdef __s390x__
+static void mlx4_uninit_context(struct verbs_device *v_device,
+ struct ibv_context *ibv_ctx)
+{ /* s390x: mlx4_context has no uar/bf_page mappings in this build, so nothing is unmapped. */
+}
+#else
static void mlx4_uninit_context(struct verbs_device *v_device,
struct ibv_context *ibv_ctx)
{
@@ -218,6 +258,7 @@ static void mlx4_uninit_context(struct v
munmap(context->bf_page, to_mdev(&v_device->device)->page_size);
}
+#endif
static struct verbs_device *mlx4_driver_init(const char *uverbs_sys_path, int abi_version)
{
@@ -74,6 +74,8 @@
#define wc_wmb() asm volatile("sfence" ::: "memory")
#elif defined(__ia64__)
#define wc_wmb() asm volatile("fwb" ::: "memory")
+#elif defined(__s390x__)
+#define wc_wmb() asm volatile("" : : : "memory") /* function-like, as in the other branches */
#else
#define wc_wmb() wmb()
#endif
@@ -168,10 +170,14 @@ struct mlx4_db_page;
struct mlx4_context {
struct ibv_context ibv_ctx;
+#ifndef __s390x__
void *uar;
+#endif
pthread_spinlock_t uar_lock;
+#ifndef __s390x__
void *bf_page;
+#endif
int bf_buf_size;
int bf_offset;
pthread_spinlock_t bf_lock;
@@ -173,13 +173,41 @@ static void set_data_seg(struct mlx4_wqe
dseg->byte_count = htonl(sg->length);
}
+#ifdef __s390x__
+/* s390x: pass bytecnt bytes at src to the kernel for the BlueFlame page. */
+static inline void mlx4_bf_copy(struct mlx4_context *ctx,
+				unsigned long *src,
+				unsigned bytecnt)
+{
+	const int fd = ctx->ibv_ctx.cmd_fd;
+
+	/* The result of the command is not checked; this function returns void. */
+	mlx4_kwrite_mmio(fd, IB_UVERBS_KWRITE_MMIO_BF_PAGE, ctx->bf_offset,
+			 bytecnt, src);
+}
+
+/* s390x: have the kernel store the 32-bit value at offset in the UAR page. */
+static inline void mlx4_send_doorbell(struct mlx4_context *ctx,
+				      uint32_t offset, uint32_t value)
+{
+	const int fd = ctx->ibv_ctx.cmd_fd;
+	uint32_t word = value;
+
+	/* The result of the command is not checked; this function returns void. */
+	mlx4_kwrite_mmio(fd, IB_UVERBS_KWRITE_MMIO_UAR, offset, sizeof(word), &word);
+}
+#else
/*
* Avoid using memcpy() to copy to BlueFlame page, since memcpy()
* implementations may use move-string-buffer assembler instructions,
* which do not guarantee order of copying.
*/
-static void mlx4_bf_copy(unsigned long *dst, unsigned long *src, unsigned bytecnt)
+static void mlx4_bf_copy(struct mlx4_context *ctx,
+ unsigned long *src,
+ unsigned bytecnt)
{
+ unsigned long *dst = (unsigned long *)(ctx->bf_page + ctx->bf_offset);
+
while (bytecnt > 0) {
*dst++ = *src++;
*dst++ = *src++;
@@ -187,6 +215,14 @@ static void mlx4_bf_copy(unsigned long *
}
}
+static inline void mlx4_send_doorbell(struct mlx4_context *ctx,
+				      uint32_t offset, uint32_t value)
+{
+	uint32_t *reg = (uint32_t *) (ctx->uar + offset); /* slot in the mapped UAR */
+	*reg = value;
+}
+#endif
+
int mlx4_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
struct ibv_send_wr **bad_wr)
{
@@ -418,8 +454,7 @@ out:
pthread_spin_lock(&ctx->bf_lock);
- mlx4_bf_copy(ctx->bf_page + ctx->bf_offset, (unsigned long *) ctrl,
- align(size * 16, 64));
+ mlx4_bf_copy(ctx, (unsigned long *) ctrl, align(size * 16, 64));
wc_wmb();
ctx->bf_offset ^= ctx->bf_buf_size;
@@ -434,7 +469,7 @@ out:
*/
wmb();
- *(uint32_t *) (ctx->uar + MLX4_SEND_DOORBELL) = qp->doorbell_qpn;
+ mlx4_send_doorbell(ctx, MLX4_SEND_DOORBELL, qp->doorbell_qpn);
}
if (nreq)
Since the s390x platform requires execution of privileged CPU instructions to work with PCI I/O memory, PCI I/O memory cannot be accessed from userspace programs via mapped memory areas. The current implementation of the InfiniBand verbs uses mapped memory areas to write data to the device UAR and BlueFlame page to initiate I/O operations, so these verbs cannot be used on the s390x platform without modification. There are two approaches that could be implemented to solve this problem: * using a page fault handler to intercept mapped memory area access errors and handle them in the handler by issuing the appropriate privileged CPU instructions; * modifying the existing verbs to avoid the use of mapped memory areas on the s390x platform. The page fault handler solution is the more complex one, because it requires not only modification of the virtual memory handling in the Linux kernel but also requires the developer to provide code that interprets every CPU instruction that works with memory. This approach requires many lines of code and adds noticeable overhead during program execution. Modifying the existing verbs is much simpler and more reliable. It requires modification of the libraries provided in the DAPL support packages to replace the use of mapped memory areas for accessing the device UAR and BlueFlame page with device driver write primitive calls that supply a special verb command to kernelspace. The approach of modifying the existing verbs has been chosen for implementation. This patch contains the changes to the libmlx4 userspace Mellanox device driver library required to provide support for the DAPL API on the s390x platform. The code that used mapped memory areas to access the PCI I/O memory of the Mellanox networking device is replaced with kernelspace device driver write primitive system calls that pass the new IB_USER_VERBS_CMD_KWRITE_MMIO verb command to the kernel with the appropriate parameters.
Signed-off-by: Alexey Ishchuk <alexey_ishchuk@ru.ibm.com> --- src/doorbell.h | 67 +++++++++++++++++++++++++++++++++++++++++++ src/mlx4.c | 87 +++++++++++++++++++++++++++++++++++++++++---------------- src/mlx4.h | 6 +++ src/qp.c | 43 +++++++++++++++++++++++++--- 4 files changed, 176 insertions(+), 27 deletions(-) -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html