Soft RoCE driver

Soft RoCE (RXE) - The software RoCE driver

ib_rxe implements the RDMA transport and registers to the RDMA core
device as a kernel verbs provider. It also implements the packet IO
layer. On the other hand ib_rxe registers to the Linux netdev stack
as a udp encapsulating protocol, in that case RDMA, for sending and
receiving packets over any Ethernet device.  This yields a RDMA
transport over the UDP/Ethernet network layer forming a RoCEv2
compatible device.

The configuration procedure of the Soft RoCE drivers requires
binding to any existing Ethernet network device. This is done with
/sys interface.

A userspace Soft RoCE library (librxe) provides user applications
the ability to run with Soft RoCE devices.  The use of rxe verbs ins
user space requires the inclusion of librxe as a device specifics
plug-in to libibverbs. librxe is packaged separately.

Architecture:

     +-----------------------------------------------------------+
     |                          Application                      |
     +-----------------------------------------------------------+
                            +-----------------------------------+
                            |             libibverbs            |
User                        +-----------------------------------+
                            +----------------+ +----------------+
                            | librxe         | | HW RoCE lib    |
                            +----------------+ +----------------+
+---------------------------------------------------------------+
     +--------------+                           +------------+
     | Sockets      |                           | RDMA ULP   |
     +--------------+                           +------------+
     +--------------+                  +---------------------+
     | TCP/IP       |                  | ib_core             |
     +--------------+                  +---------------------+
                             +------------+ +----------------+
Kernel                       | ib_rxe     | | HW RoCE driver |
                             +------------+ +----------------+
     +------------------------------------+
     | NIC driver                         |
     +------------------------------------+

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     +-----------------------------------------------------------+
     |                          Application                      |
     +-----------------------------------------------------------+
                            +-----------------------------------+
                            |             libibverbs            |
User                        +-----------------------------------+
                            +----------------+ +----------------+
                            | librxe         | | HW RoCE lib    |
                            +----------------+ +----------------+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     +--------------+                           +------------+
     | Sockets      |                           | RDMA ULP   |
     +--------------+                           +------------+
     +--------------+                  +---------------------+
     | TCP/IP       |                  | ib_core             |
     +--------------+                  +---------------------+
                             +------------+ +----------------+
Kernel                       | ib_rxe     | | HW RoCE driver |
                             +------------+ +----------------+
     +------------------------------------+
     | NIC driver                         |
     +------------------------------------+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Soft RoCE resources:

[1[ https://github.com/SoftRoCE/librxe-dev librxe - source code in
Github
[2] https://github.com/SoftRoCE/rxe-dev/wiki/rxe-dev:-Home - Soft RoCE
Wiki page
[3] https://github.com/SoftRoCE/librxe-dev - Soft RoCE userspace library

Signed-off-by: Kamal Heib <kamalh@mellanox.com>
Signed-off-by: Amir Vadai <amirv@mellanox.com>
Signed-off-by: Moni Shoua <monis@mellanox.com>
Reviewed-by: Haggai Eran <haggaie@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h
new file mode 100644
index 0000000..cac1d52
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.h
@@ -0,0 +1,480 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *	   Redistribution and use in source and binary forms, with or
+ *	   without modification, are permitted provided that the following
+ *	   conditions are met:
+ *
+ *	- Redistributions of source code must retain the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer.
+ *
+ *	- Redistributions in binary form must reproduce the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer in the documentation and/or other materials
+ *	  provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef RXE_VERBS_H
+#define RXE_VERBS_H
+
+#include <linux/interrupt.h>
+#include <rdma/rdma_user_rxe.h>
+#include "rxe_pool.h"
+#include "rxe_task.h"
+
+static inline int pkey_match(u16 key1, u16 key2)
+{
+	return (((key1 & 0x7fff) != 0) &&
+		((key1 & 0x7fff) == (key2 & 0x7fff)) &&
+		((key1 & 0x8000) || (key2 & 0x8000))) ? 1 : 0;
+}
+
+/* Return >0 if psn_a > psn_b
+ *	   0 if psn_a == psn_b
+ *	  <0 if psn_a < psn_b
+ */
+static inline int psn_compare(u32 psn_a, u32 psn_b)
+{
+	s32 diff;
+
+	diff = (psn_a - psn_b) << 8;
+	return diff;
+}
+
+struct rxe_ucontext {
+	struct rxe_pool_entry	pelem;
+	struct ib_ucontext	ibuc;
+};
+
+struct rxe_pd {
+	struct rxe_pool_entry	pelem;
+	struct ib_pd		ibpd;
+};
+
+struct rxe_ah {
+	struct rxe_pool_entry	pelem;
+	struct ib_ah		ibah;
+	struct rxe_pd		*pd;
+	struct rxe_av		av;
+};
+
+struct rxe_cqe {
+	union {
+		struct ib_wc		ibwc;
+		struct ib_uverbs_wc	uibwc;
+	};
+};
+
+struct rxe_cq {
+	struct rxe_pool_entry	pelem;
+	struct ib_cq		ibcq;
+	struct rxe_queue	*queue;
+	spinlock_t		cq_lock;
+	u8			notify;
+	int			is_user;
+	struct tasklet_struct	comp_task;
+};
+
+enum wqe_state {
+	wqe_state_posted,
+	wqe_state_processing,
+	wqe_state_pending,
+	wqe_state_done,
+	wqe_state_error,
+};
+
+struct rxe_sq {
+	int			max_wr;
+	int			max_sge;
+	int			max_inline;
+	spinlock_t		sq_lock; /* guard queue */
+	struct rxe_queue	*queue;
+};
+
+struct rxe_rq {
+	int			max_wr;
+	int			max_sge;
+	spinlock_t		producer_lock; /* guard queue producer */
+	spinlock_t		consumer_lock; /* guard queue consumer */
+	struct rxe_queue	*queue;
+};
+
+struct rxe_srq {
+	struct rxe_pool_entry	pelem;
+	struct ib_srq		ibsrq;
+	struct rxe_pd		*pd;
+	struct rxe_rq		rq;
+	u32			srq_num;
+
+	int			limit;
+	int			error;
+};
+
+enum rxe_qp_state {
+	QP_STATE_RESET,
+	QP_STATE_INIT,
+	QP_STATE_READY,
+	QP_STATE_DRAIN,		/* req only */
+	QP_STATE_DRAINED,	/* req only */
+	QP_STATE_ERROR
+};
+
+extern char *rxe_qp_state_name[];
+
+struct rxe_req_info {
+	enum rxe_qp_state	state;
+	int			wqe_index;
+	u32			psn;
+	int			opcode;
+	atomic_t		rd_atomic;
+	int			wait_fence;
+	int			need_rd_atomic;
+	int			wait_psn;
+	int			need_retry;
+	int			noack_pkts;
+	struct rxe_task		task;
+};
+
+struct rxe_comp_info {
+	u32			psn;
+	int			opcode;
+	int			timeout;
+	int			timeout_retry;
+	u32			retry_cnt;
+	u32			rnr_retry;
+	struct rxe_task		task;
+};
+
+enum rdatm_res_state {
+	rdatm_res_state_next,
+	rdatm_res_state_new,
+	rdatm_res_state_replay,
+};
+
+struct resp_res {
+	int			type;
+	u32			first_psn;
+	u32			last_psn;
+	u32			cur_psn;
+	enum rdatm_res_state	state;
+
+	union {
+		struct {
+			struct sk_buff	*skb;
+		} atomic;
+		struct {
+			struct rxe_mem	*mr;
+			u64		va_org;
+			u32		rkey;
+			u32		length;
+			u64		va;
+			u32		resid;
+		} read;
+	};
+};
+
+struct rxe_resp_info {
+	enum rxe_qp_state	state;
+	u32			msn;
+	u32			psn;
+	int			opcode;
+	int			drop_msg;
+	int			goto_error;
+	int			sent_psn_nak;
+	enum ib_wc_status	status;
+	u8			aeth_syndrome;
+
+	/* Receive only */
+	struct rxe_recv_wqe	*wqe;
+
+	/* RDMA read / atomic only */
+	u64			va;
+	struct rxe_mem		*mr;
+	u32			resid;
+	u32			rkey;
+	u64			atomic_orig;
+
+	/* SRQ only */
+	struct {
+		struct rxe_recv_wqe	wqe;
+		struct ib_sge		sge[RXE_MAX_SGE];
+	} srq_wqe;
+
+	/* Responder resources. It's a circular list where the oldest
+	 * resource is dropped first.
+	 */
+	struct resp_res		*resources;
+	unsigned int		res_head;
+	unsigned int		res_tail;
+	struct resp_res		*res;
+	struct rxe_task		task;
+};
+
+struct rxe_qp {
+	struct rxe_pool_entry	pelem;
+	struct ib_qp		ibqp;
+	struct ib_qp_attr	attr;
+	unsigned int		valid;
+	unsigned int		mtu;
+	int			is_user;
+
+	struct rxe_pd		*pd;
+	struct rxe_srq		*srq;
+	struct rxe_cq		*scq;
+	struct rxe_cq		*rcq;
+
+	enum ib_sig_type	sq_sig_type;
+
+	struct rxe_sq		sq;
+	struct rxe_rq		rq;
+
+	struct socket		*sk;
+
+	struct rxe_av		pri_av;
+	struct rxe_av		alt_av;
+
+	/* list of mcast groups qp has joined (for cleanup) */
+	struct list_head	grp_list;
+	spinlock_t		grp_lock; /* guard grp_list */
+
+	struct sk_buff_head	req_pkts;
+	struct sk_buff_head	resp_pkts;
+	struct sk_buff_head	send_pkts;
+
+	struct rxe_req_info	req;
+	struct rxe_comp_info	comp;
+	struct rxe_resp_info	resp;
+
+	atomic_t		ssn;
+	atomic_t		skb_out;
+	int			need_req_skb;
+
+	/* Timer for retranmitting packet when ACKs have been lost. RC
+	 * only. The requester sets it when it is not already
+	 * started. The responder resets it whenever an ack is
+	 * received.
+	 */
+	struct timer_list retrans_timer;
+	u64 qp_timeout_jiffies;
+
+	/* Timer for handling RNR NAKS. */
+	struct timer_list rnr_nak_timer;
+
+	spinlock_t		state_lock; /* guard requester and completer */
+};
+
+enum rxe_mem_state {
+	RXE_MEM_STATE_ZOMBIE,
+	RXE_MEM_STATE_INVALID,
+	RXE_MEM_STATE_FREE,
+	RXE_MEM_STATE_VALID,
+};
+
+enum rxe_mem_type {
+	RXE_MEM_TYPE_NONE,
+	RXE_MEM_TYPE_DMA,
+	RXE_MEM_TYPE_MR,
+	RXE_MEM_TYPE_FMR,
+	RXE_MEM_TYPE_MW,
+};
+
+#define RXE_BUF_PER_MAP		(PAGE_SIZE / sizeof(struct rxe_phys_buf))
+
+struct rxe_phys_buf {
+	u64      addr;
+	u64      size;
+};
+
+struct rxe_map {
+	struct rxe_phys_buf	buf[RXE_BUF_PER_MAP];
+};
+
+struct rxe_mem {
+	struct rxe_pool_entry	pelem;
+	union {
+		struct ib_mr		ibmr;
+		struct ib_mw		ibmw;
+	};
+
+	struct rxe_pd		*pd;
+	struct ib_umem		*umem;
+
+	u32			lkey;
+	u32			rkey;
+
+	enum rxe_mem_state	state;
+	enum rxe_mem_type	type;
+	u64			va;
+	u64			iova;
+	size_t			length;
+	u32			offset;
+	int			access;
+
+	int			page_shift;
+	int			page_mask;
+	int			map_shift;
+	int			map_mask;
+
+	u32			num_buf;
+	u32			nbuf;
+
+	u32			max_buf;
+	u32			num_map;
+
+	struct rxe_map		**map;
+};
+
+struct rxe_mc_grp {
+	struct rxe_pool_entry	pelem;
+	spinlock_t		mcg_lock; /* guard group */
+	struct rxe_dev		*rxe;
+	struct list_head	qp_list;
+	union ib_gid		mgid;
+	int			num_qp;
+	u32			qkey;
+	u16			pkey;
+};
+
+struct rxe_mc_elem {
+	struct rxe_pool_entry	pelem;
+	struct list_head	qp_list;
+	struct list_head	grp_list;
+	struct rxe_qp		*qp;
+	struct rxe_mc_grp	*grp;
+};
+
+struct rxe_port {
+	struct ib_port_attr	attr;
+	u16			*pkey_tbl;
+	__be64			port_guid;
+	__be64			subnet_prefix;
+	spinlock_t		port_lock; /* guard port */
+	unsigned int		mtu_cap;
+	/* special QPs */
+	u32			qp_smi_index;
+	u32			qp_gsi_index;
+};
+
+/* callbacks from rdma_rxe to network interface layer */
+struct rxe_ifc_ops {
+	void (*release)(struct rxe_dev *rxe);
+	__be64 (*node_guid)(struct rxe_dev *rxe);
+	__be64 (*port_guid)(struct rxe_dev *rxe);
+	struct device *(*dma_device)(struct rxe_dev *rxe);
+	int (*mcast_add)(struct rxe_dev *rxe, union ib_gid *mgid);
+	int (*mcast_delete)(struct rxe_dev *rxe, union ib_gid *mgid);
+	int (*prepare)(struct rxe_dev *rxe, struct rxe_pkt_info *pkt,
+		       struct sk_buff *skb, u32 *crc);
+	int (*send)(struct rxe_dev *rxe, struct rxe_pkt_info *pkt,
+		    struct sk_buff *skb);
+	int (*loopback)(struct sk_buff *skb);
+	struct sk_buff *(*init_packet)(struct rxe_dev *rxe, struct rxe_av *av,
+				       int paylen, struct rxe_pkt_info *pkt);
+	char *(*parent_name)(struct rxe_dev *rxe, unsigned int port_num);
+	enum rdma_link_layer (*link_layer)(struct rxe_dev *rxe,
+					   unsigned int port_num);
+};
+
+struct rxe_dev {
+	struct ib_device	ib_dev;
+	struct ib_device_attr	attr;
+	int			max_ucontext;
+	int			max_inline_data;
+	struct kref		ref_cnt;
+	struct mutex	usdev_lock;
+
+	struct rxe_ifc_ops	*ifc_ops;
+
+	struct net_device	*ndev;
+
+	int			xmit_errors;
+
+	struct rxe_pool		uc_pool;
+	struct rxe_pool		pd_pool;
+	struct rxe_pool		ah_pool;
+	struct rxe_pool		srq_pool;
+	struct rxe_pool		qp_pool;
+	struct rxe_pool		cq_pool;
+	struct rxe_pool		mr_pool;
+	struct rxe_pool		mw_pool;
+	struct rxe_pool		mc_grp_pool;
+	struct rxe_pool		mc_elem_pool;
+
+	spinlock_t		pending_lock; /* guard pending_mmaps */
+	struct list_head	pending_mmaps;
+
+	spinlock_t		mmap_offset_lock; /* guard mmap_offset */
+	int			mmap_offset;
+
+	struct rxe_port		port;
+	struct list_head	list;
+};
+
+static inline struct rxe_dev *to_rdev(struct ib_device *dev)
+{
+	return dev ? container_of(dev, struct rxe_dev, ib_dev) : NULL;
+}
+
+static inline struct rxe_ucontext *to_ruc(struct ib_ucontext *uc)
+{
+	return uc ? container_of(uc, struct rxe_ucontext, ibuc) : NULL;
+}
+
+static inline struct rxe_pd *to_rpd(struct ib_pd *pd)
+{
+	return pd ? container_of(pd, struct rxe_pd, ibpd) : NULL;
+}
+
+static inline struct rxe_ah *to_rah(struct ib_ah *ah)
+{
+	return ah ? container_of(ah, struct rxe_ah, ibah) : NULL;
+}
+
+static inline struct rxe_srq *to_rsrq(struct ib_srq *srq)
+{
+	return srq ? container_of(srq, struct rxe_srq, ibsrq) : NULL;
+}
+
+static inline struct rxe_qp *to_rqp(struct ib_qp *qp)
+{
+	return qp ? container_of(qp, struct rxe_qp, ibqp) : NULL;
+}
+
+static inline struct rxe_cq *to_rcq(struct ib_cq *cq)
+{
+	return cq ? container_of(cq, struct rxe_cq, ibcq) : NULL;
+}
+
+static inline struct rxe_mem *to_rmr(struct ib_mr *mr)
+{
+	return mr ? container_of(mr, struct rxe_mem, ibmr) : NULL;
+}
+
+static inline struct rxe_mem *to_rmw(struct ib_mw *mw)
+{
+	return mw ? container_of(mw, struct rxe_mem, ibmw) : NULL;
+}
+
+int rxe_register_device(struct rxe_dev *rxe);
+int rxe_unregister_device(struct rxe_dev *rxe);
+
+void rxe_mc_cleanup(void *arg);
+
+#endif /* RXE_VERBS_H */