author     Michal Marek <mmarek@suse.com>   2016-05-06 16:09:47 +0200
committer  Michal Marek <mmarek@suse.com>   2016-05-06 16:09:47 +0200
commit     a229b4173f5521d1137ef98f19371e4953e28023 (patch)
tree       91ce441f57f5c5b1bc51cd5b17a1d1be0209559a
parent     bb2e1a6bda571b667b88e8fd9491f3a26851da93 (diff)
parent     e10abe88de947d43cf75773150c58a0e643f03bd (diff)
Merge branch 'users/jjolly/SLE12-SP2/for-next' into SLE12-SP2 (tag: rpm-4.4.9-36)
Pull SMC (Shared Memory Communications) support from John Jolly (fate#319593, bsc#978258).

Conflicts:
	config/arm64/default
	config/ppc64le/debug
	config/ppc64le/default
	config/s390x/default
	config/s390x/zfcpdump
	config/x86_64/debug
	config/x86_64/default

suse-commit: 0a6fce8dcc63dcb1046ec9aafe5a2dc32012943d
-rw-r--r--  MAINTAINERS                 7
-rw-r--r--  include/linux/socket.h      7
-rw-r--r--  include/net/sock.h          1
-rw-r--r--  net/Kconfig                 1
-rw-r--r--  net/Makefile                1
-rw-r--r--  net/core/sock.c             7
-rw-r--r--  net/ipv4/tcp_ipv4.c         1
-rw-r--r--  net/ipv4/tcp_timer.c        1
-rw-r--r--  net/ipv6/tcp_ipv6.c         1
-rw-r--r--  net/smc/Kconfig            11
-rw-r--r--  net/smc/Makefile            2
-rw-r--r--  net/smc/af_smc.c         1672
-rw-r--r--  net/smc/smc.h             253
-rw-r--r--  net/smc/smc_cdc.c         259
-rw-r--r--  net/smc/smc_cdc.h         162
-rw-r--r--  net/smc/smc_clc.c         273
-rw-r--r--  net/smc/smc_clc.h         113
-rw-r--r--  net/smc/smc_core.c        571
-rw-r--r--  net/smc/smc_core.h        178
-rw-r--r--  net/smc/smc_ib.c          471
-rw-r--r--  net/smc/smc_ib.h           66
-rw-r--r--  net/smc/smc_llc.c         152
-rw-r--r--  net/smc/smc_llc.h          63
-rw-r--r--  net/smc/smc_pnet.c        605
-rw-r--r--  net/smc/smc_pnet.h         26
-rw-r--r--  net/smc/smc_rx.c          189
-rw-r--r--  net/smc/smc_rx.h           24
-rw-r--r--  net/smc/smc_tx.c          516
-rw-r--r--  net/smc/smc_tx.h           36
-rw-r--r--  net/smc/smc_wr.c          557
-rw-r--r--  net/smc/smc_wr.h          101
31 files changed, 6321 insertions, 6 deletions
diff --git a/MAINTAINERS b/MAINTAINERS
index 02d7deb5292b..a7b87fc1eee0 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9378,6 +9378,13 @@ L: linux-serial@vger.kernel.org
S: Maintained
F: drivers/tty/serial/
+SHARED MEMORY COMMUNICATIONS (SMC) SOCKETS
+M: Ursula Braun <ursula.braun@linux.vnet.ibm.com>
+L: linux-s390@vger.kernel.org
+W: http://www.ibm.com/developerworks/linux/linux390/
+S: Supported
+F: net/smc/
+
SYNOPSYS DESIGNWARE DMAC DRIVER
M: Viresh Kumar <vireshk@kernel.org>
M: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
diff --git a/include/linux/socket.h b/include/linux/socket.h
index 5bf59c8493b7..1822ee0a4298 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -200,7 +200,11 @@ struct ucred {
#define AF_ALG 38 /* Algorithm sockets */
#define AF_NFC 39 /* NFC sockets */
#define AF_VSOCK 40 /* vSockets */
-#define AF_MAX 41 /* For now.. */
+#define AF_SMC 41 /* smc sockets: reserve number for
+ * PF_SMC protocol family that
+ * reuses AF_INET address family
+ */
+#define AF_MAX 42 /* For now.. */
/* Protocol families, same as address families. */
#define PF_UNSPEC AF_UNSPEC
@@ -246,6 +250,7 @@ struct ucred {
#define PF_ALG AF_ALG
#define PF_NFC AF_NFC
#define PF_VSOCK AF_VSOCK
+#define PF_SMC AF_SMC
#define PF_MAX AF_MAX
/* Maximum queue length specifiable by listen. */
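
[Editor's illustration, not part of the patch] The hunk above only reserves the numbers: a PF_SMC socket keeps using AF_INET-style addresses, as the in-line comment notes. A minimal userspace sketch of opening such a socket once the net/smc code added below is built and loaded; AF_SMC may not yet be in the installed libc headers, hence the fallback define:

/* sketch_smc_socket.c - illustrative only */
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <unistd.h>
#include <sys/socket.h>

#ifndef AF_SMC
#define AF_SMC 41	/* value reserved by the hunk above */
#endif

int main(void)
{
	/* protocol 0 (IPPROTO_IP) is accepted by smc_create() further down */
	int fd = socket(AF_SMC, SOCK_STREAM, 0);

	if (fd < 0) {
		/* EAFNOSUPPORT: CONFIG_SMC disabled or smc module not loaded */
		fprintf(stderr, "socket(AF_SMC): %s\n", strerror(errno));
		return 1;
	}
	/* from here on, bind/connect/listen take ordinary sockaddr_in */
	close(fd);
	return 0;
}

If the RDMA setup later fails, af_smc.c below switches the connection to the internal TCP socket (use_fallback), so the application still gets a working stream socket.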
diff --git a/include/net/sock.h b/include/net/sock.h
index a26c300a8c0c..81de87b416df 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -976,6 +976,7 @@ struct proto {
int (*getsockopt)(struct sock *sk, int level,
int optname, char __user *optval,
int __user *option);
+ void (*keepalive)(struct sock *sk, int valbool);
#ifdef CONFIG_COMPAT
int (*compat_setsockopt)(struct sock *sk,
int level,
diff --git a/net/Kconfig b/net/Kconfig
index 127da94ae25e..7529f4f77a69 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -54,6 +54,7 @@ source "net/packet/Kconfig"
source "net/unix/Kconfig"
source "net/xfrm/Kconfig"
source "net/iucv/Kconfig"
+source "net/smc/Kconfig"
config INET
bool "TCP/IP networking"
diff --git a/net/Makefile b/net/Makefile
index a5d04098dfce..f94c871c563a 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -49,6 +49,7 @@ obj-$(CONFIG_MAC80211) += mac80211/
obj-$(CONFIG_TIPC) += tipc/
obj-$(CONFIG_NETLABEL) += netlabel/
obj-$(CONFIG_IUCV) += iucv/
+obj-$(CONFIG_SMC) += smc/
obj-$(CONFIG_RFKILL) += rfkill/
obj-$(CONFIG_NET_9P) += 9p/
obj-$(CONFIG_CAIF) += caif/
diff --git a/net/core/sock.c b/net/core/sock.c
index 0d91f7dca751..cc35543b9e6d 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -792,11 +792,8 @@ set_rcvbuf:
goto set_rcvbuf;
case SO_KEEPALIVE:
-#ifdef CONFIG_INET
- if (sk->sk_protocol == IPPROTO_TCP &&
- sk->sk_type == SOCK_STREAM)
- tcp_set_keepalive(sk, valbool);
-#endif
+ if (sk->sk_prot->keepalive)
+ sk->sk_prot->keepalive(sk, valbool);
sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
break;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 048418b049d8..12671495f7d2 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2316,6 +2316,7 @@ struct proto tcp_prot = {
.shutdown = tcp_shutdown,
.setsockopt = tcp_setsockopt,
.getsockopt = tcp_getsockopt,
+ .keepalive = tcp_set_keepalive,
.recvmsg = tcp_recvmsg,
.sendmsg = tcp_sendmsg,
.sendpage = tcp_sendpage,
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 193ba1fa8a9a..2113165667e3 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -569,6 +569,7 @@ void tcp_set_keepalive(struct sock *sk, int val)
else if (!val)
inet_csk_delete_keepalive_timer(sk);
}
+EXPORT_SYMBOL(tcp_set_keepalive);
static void tcp_keepalive_timer (unsigned long data)
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index b8d405623f4f..b4ad7b9dc229 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1884,6 +1884,7 @@ struct proto tcpv6_prot = {
.rsk_prot = &tcp6_request_sock_ops,
.h.hashinfo = &tcp_hashinfo,
.no_autobind = true,
+ .keepalive = tcp_set_keepalive,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_tcp_setsockopt,
.compat_getsockopt = compat_tcp_getsockopt,
diff --git a/net/smc/Kconfig b/net/smc/Kconfig
new file mode 100644
index 000000000000..bc029803e728
--- /dev/null
+++ b/net/smc/Kconfig
@@ -0,0 +1,11 @@
+config SMC
+ tristate "SMC socket protocol family"
+ depends on INET && INFINIBAND
+ ---help---
+ SMC-R provides a "sockets over RDMA" solution making use of
+ RDMA over Converged Ethernet (RoCE) technology to upgrade
+ AF_INET TCP connections transparently.
+ The Linux implementation of the SMC-R solution is designed as
+ a separate socket family SMC.
+
+ Select this option if you want to run SMC socket applications
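
[Editor's illustration, not part of the patch] The "depends on INET && INFINIBAND" line above means the new family only becomes selectable when TCP/IP and InfiniBand support are enabled; a plausible fragment for a modular build might look like this:

CONFIG_INET=y
CONFIG_INFINIBAND=m
CONFIG_SMC=m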
diff --git a/net/smc/Makefile b/net/smc/Makefile
new file mode 100644
index 000000000000..c009323845d3
--- /dev/null
+++ b/net/smc/Makefile
@@ -0,0 +1,2 @@
+obj-$(CONFIG_SMC) += smc.o
+smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o smc_cdc.o smc_tx.o smc_rx.o
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
new file mode 100644
index 000000000000..4d2610a31fb2
--- /dev/null
+++ b/net/smc/af_smc.c
@@ -0,0 +1,1672 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * AF_SMC protocol family socket handler keeping the AF_INET sock address type
+ * applies to SOCK_STREAM sockets only
+ * offers an alternative communication option for TCP-protocol sockets
+ * applicable with RoCE-cards only
+ *
+ * Initial restrictions:
+ * - non-blocking connect postponed
+ * - IPv6 support postponed
+ * - support for alternate links postponed
+ * - partial support for non-blocking sockets only
+ * - support for urgent data postponed
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
+ * based on prototype from Frank Blaschka
+ */
+
+#define KMSG_COMPONENT "smc"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/socket.h>
+#include <linux/inetdevice.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+
+#include "smc.h"
+#include "smc_clc.h"
+#include "smc_llc.h"
+#include "smc_cdc.h"
+#include "smc_core.h"
+#include "smc_ib.h"
+#include "smc_pnet.h"
+#include "smc_tx.h"
+#include "smc_rx.h"
+
+#define SMC_LISTEN_WORK_WAIT 20
+#define SMC_WAIT_TX_PENDS_TIME (5 * HZ)
+#define SMC_TIMEWAIT_LEN TCP_TIMEWAIT_LEN
+
+DEFINE_MUTEX(smc_create_lgr_pending); /* serialize link group creation */
+
+struct smc_lgr_list smc_lgr_list = { /* established link groups */
+ .lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock),
+ .list = LIST_HEAD_INIT(smc_lgr_list.list),
+};
+
+static void smc_tcp_listen_worker(struct work_struct *);
+
+static void smc_set_keepalive(struct sock *sk, int val)
+{
+ struct smc_sock *smc = smc_sk(sk);
+
+ smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val);
+}
+
+static struct proto smc_proto = {
+ .name = "SMC",
+ .owner = THIS_MODULE,
+ .keepalive = smc_set_keepalive,
+ .obj_size = sizeof(struct smc_sock),
+ .slab_flags = SLAB_DESTROY_BY_RCU,
+};
+
+static void smc_destruct_non_accepted(struct sock *sk);
+static struct sock *smc_accept_dequeue(struct sock *, struct socket *);
+
+static void smc_sock_cleanup_listen(struct sock *parent)
+{
+ struct sock *sk;
+
+ /* Close non-accepted connections */
+ while ((sk = smc_accept_dequeue(parent, NULL)))
+ smc_destruct_non_accepted(sk);
+}
+
+static int smc_wait_tx_pends(struct smc_sock *smc)
+{
+ struct smc_connection *conn = &smc->conn;
+ struct sock *sk = &smc->sk;
+ signed long timeout;
+ DEFINE_WAIT(wait);
+ int rc = 0;
+
+ timeout = SMC_WAIT_TX_PENDS_TIME;
+ if (smc_cdc_wr_tx_pends(conn) && !(current->flags & PF_EXITING)) {
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+ do {
+ prepare_to_wait(sk_sleep(sk), &wait,
+ TASK_INTERRUPTIBLE);
+ if (sk_wait_event(sk, &timeout,
+ !smc_cdc_wr_tx_pends(conn)))
+ break;
+ } while (!signal_pending(current) && timeout);
+ clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+ finish_wait(sk_sleep(sk), &wait);
+ }
+ if (!timeout) { /* timeout reached, kill tx_pends */
+ smc_cdc_put_conn_slots(conn);
+ rc = -ETIME;
+ }
+ return rc;
+}
+
+static void smc_wait_close_tx_prepared(struct smc_sock *smc, long timeout)
+{
+ struct sock *sk = &smc->sk;
+
+ if (timeout) {
+ DEFINE_WAIT(wait);
+
+ do {
+ prepare_to_wait(sk_sleep(sk), &wait,
+ TASK_INTERRUPTIBLE);
+ if (sk_wait_event(sk, &timeout,
+ !smc_tx_prepared_sends(&smc->conn)))
+ break;
+ } while (!signal_pending(current) && timeout);
+
+ finish_wait(sk_sleep(sk), &wait);
+ }
+}
+
+void smc_wake_close_tx_prepared(struct smc_sock *smc)
+{
+ if (smc->sk.sk_state == SMC_PEERCLW1)
+ /* wake up socket closing */
+ smc->sk.sk_state_change(&smc->sk);
+}
+
+static inline int smc_stream_closing(struct smc_connection *conn)
+{
+ return (!smc_cdc_wr_tx_pends(conn) &&
+ smc_close_received(conn));
+}
+
+static void smc_stream_wait_close(struct smc_sock *smc, long lingertime)
+{
+ struct sock *sk = &smc->sk;
+
+ if (lingertime) {
+ DEFINE_WAIT(wait);
+
+ do {
+ prepare_to_wait(sk_sleep(sk), &wait,
+ TASK_INTERRUPTIBLE);
+ if (sk_wait_event(sk, &lingertime,
+ smc_stream_closing(&smc->conn)))
+ break;
+ } while (!signal_pending(current) && lingertime);
+
+ finish_wait(sk_sleep(sk), &wait);
+ }
+}
+
+static int smc_conn_release(struct smc_sock *smc)
+{
+ struct smc_connection *conn = &smc->conn;
+ long timeout = MAX_SCHEDULE_TIMEOUT;
+ struct sock *sk = &smc->sk;
+ long lingertime = 0;
+ int old_state;
+ int rc = 0;
+
+ if (sock_flag(sk, SOCK_LINGER) &&
+ !(current->flags & PF_EXITING)) {
+ lingertime = sk->sk_lingertime;
+ timeout = sk->sk_lingertime;
+ }
+
+ old_state = sk->sk_state;
+ switch (old_state) {
+ case SMC_INIT:
+ sk->sk_state = SMC_CLOSED;
+ schedule_delayed_work(&smc->fin_work, SMC_TIMEWAIT_LEN);
+ break;
+ case SMC_LISTEN:
+ sk->sk_state = SMC_CLOSED;
+ sk->sk_state_change(sk);
+ old_state = SMC_CLOSED;
+ if (smc->clcsock && smc->clcsock->sk) {
+ rc = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR);
+ /* wake up kernel_accept of smc_tcp_listen_worker */
+ smc->clcsock->sk->sk_data_ready(smc->clcsock->sk);
+ }
+ release_sock(sk);
+ smc_sock_cleanup_listen(sk);
+ flush_work(&smc->tcp_listen_work);
+ flush_work(&smc->smc_listen_work);
+ lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
+ schedule_delayed_work(&smc->fin_work, SMC_TIMEWAIT_LEN);
+ break;
+ case SMC_ACTIVE:
+ /* active close */
+ /* wait for sndbuf data being posted */
+ /* SLD: postpone smc_tx_close, return immediately, no wait ???*/
+ smc_wait_close_tx_prepared(smc, timeout);
+ /* wait for confirmation of previous postings */
+ smc_wait_tx_pends(smc);
+ /* send close request */
+ rc = smc_tx_close(conn);
+ if (conn->local_rx_ctrl.conn_state_flags.sending_done)
+ sk->sk_state = SMC_PEERCLW2;
+ else
+ sk->sk_state = SMC_PEERCLW1;
+ /* fall through */
+ case SMC_PEERCLW1:
+ case SMC_PEERCLW2:
+ /* wait for confirmation of close request posting */
+ smc_wait_tx_pends(smc);
+ /* wait for close request from peer - comparable to
+ * sk_stream_wait_close call of tcp
+ */
+ smc_stream_wait_close(smc, lingertime);
+ if (smc_close_received(conn)) {
+ sk->sk_state = SMC_CLOSED;
+ schedule_delayed_work(&smc->fin_work, SMC_TIMEWAIT_LEN);
+ }
+ break;
+ case SMC_APPLFINCLW:
+ /* socket already shutdown wr or both (active close) */
+ sk->sk_state = SMC_CLOSED;
+ schedule_delayed_work(&smc->fin_work, SMC_TIMEWAIT_LEN);
+ break;
+ case SMC_APPLCLW1:
+ case SMC_APPLCLW2:
+ /* passive close */
+ if (!smc_close_received(conn))
+ /* wait for sndbuf data being posted */
+ smc_wait_close_tx_prepared(smc, timeout);
+ /* wait for confirmation of previous postings */
+ smc_wait_tx_pends(smc);
+ /* confirm close from peer */
+ rc = smc_tx_close(conn);
+ /* wait for confirmation of close request posting */
+ smc_wait_tx_pends(smc);
+ if (smc_close_received(conn)) {
+ sk->sk_state = SMC_CLOSED;
+ schedule_delayed_work(&smc->fin_work, SMC_TIMEWAIT_LEN);
+ } else {
+ sk->sk_state = SMC_PEERFINCLW;
+ }
+ break;
+ case SMC_PEERFINCLW:
+ case SMC_CLOSED:
+ default:
+ break;
+ }
+
+ if (old_state != sk->sk_state)
+ sk->sk_state_change(&smc->sk);
+ return rc;
+}
+
+static int smc_release(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+ struct smc_sock *smc;
+ int rc = 0;
+
+ if (!sk || (sk->sk_state == SMC_DESTRUCT))
+ goto out;
+
+ smc = smc_sk(sk);
+ sock_hold(sk);
+ if (sk->sk_state == SMC_LISTEN)
+ lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
+ else
+ lock_sock(sk);
+
+ if (smc->use_fallback) {
+ sk->sk_state = SMC_CLOSED;
+ sk->sk_state_change(sk);
+ } else {
+ sock_set_flag(sk, SOCK_DEAD);
+ sk->sk_shutdown = SHUTDOWN_MASK;
+ rc = smc_conn_release(smc);
+ }
+ if (smc->clcsock) {
+ sock_release(smc->clcsock);
+ smc->clcsock = NULL;
+ }
+
+ /* detach socket */
+ sock_set_flag(sk, SOCK_ZAPPED);
+ sock_orphan(sk);
+ sock->sk = NULL;
+ release_sock(sk);
+
+ sock_put(sk);
+out:
+ return rc;
+}
+
+static void smc_accept_unlink(struct sock *);
+
+/* some kind of closing has been received - normal, abnormal, or sending_done */
+void smc_conn_release_handler(struct smc_sock *smc)
+{
+ struct smc_connection *conn = &smc->conn;
+ struct sock *sk = &smc->sk;
+ int old_state;
+
+ old_state = sk->sk_state;
+ switch (sk->sk_state) {
+ /* Normal termination - Passive close part */
+ case SMC_INIT:
+ case SMC_ACTIVE:
+ if (conn->local_rx_ctrl.conn_state_flags.sending_done ||
+ conn->local_rx_ctrl.conn_state_flags.closed_conn) {
+ /* complete any outstanding recv with zero-length
+ * if peerclosedconn and pending data to be written
+ * then reset conn
+ */
+ sk->sk_state = SMC_APPLCLW1;
+ }
+ break;
+ case SMC_PEERFINCLW:
+ if (conn->local_rx_ctrl.conn_state_flags.closed_conn)
+ sk->sk_state = SMC_CLOSED;
+ break;
+ /* Normal termination - Active close part */
+ case SMC_PEERCLW1:
+ if (conn->local_rx_ctrl.conn_state_flags.sending_done) {
+ /* complete any outstanding recv with zero-length */
+ sk->sk_state = SMC_PEERCLW2;
+ } /* fall through */
+ case SMC_PEERCLW2:
+ if (conn->local_rx_ctrl.conn_state_flags.closed_conn) {
+ struct smc_host_cdc_msg *tx_ctrl = &conn->local_tx_ctrl;
+ /* complete any outstanding recv with zero-length */
+ if (sk->sk_shutdown == SHUTDOWN_MASK &&
+ (tx_ctrl->conn_state_flags.closed_conn ||
+ tx_ctrl->conn_state_flags.abnormal_close)) {
+ sk->sk_state = SMC_CLOSED;
+ } else {
+ sk->sk_state = SMC_APPLFINCLW;
+ }
+ }
+ break;
+ default:
+ break;
+ }
+
+ sock_set_flag(&smc->sk, SOCK_DONE);
+ if (smc_stop_received(conn)) {
+ sk->sk_shutdown = sk->sk_shutdown | RCV_SHUTDOWN;
+ if (smc->clcsock && smc->clcsock->sk) {
+ struct sock *tcpsk;
+
+ tcpsk = smc->clcsock->sk;
+ tcpsk->sk_shutdown = tcpsk->sk_shutdown | RCV_SHUTDOWN;
+ }
+ }
+ if (smc_close_received(conn) &&
+ (sk->sk_state == SMC_CLOSED) &&
+ sock_flag(sk, SOCK_DEAD) &&
+ !smc_cdc_wr_tx_pends(conn)) /* make sure socket is freed */
+ schedule_delayed_work(&smc->fin_work, SMC_TIMEWAIT_LEN);
+ if ((old_state != sk->sk_state) &&
+ (old_state != SMC_INIT))
+ sk->sk_state_change(sk);
+
+ smc->sk.sk_data_ready(&smc->sk);
+ smc->sk.sk_write_space(&smc->sk);
+}
+
+static void smc_destruct(struct sock *sk)
+{
+ struct smc_sock *smc = smc_sk(sk);
+
+ if (sk->sk_state != SMC_CLOSED) {
+ pr_err("Attempt to release SMC socket in state %d %p\n",
+ sk->sk_state, sk);
+ return;
+ }
+ if (!sock_flag(sk, SOCK_DEAD)) {
+ pr_err("Attempt to release alive smc socket %p\n", sk);
+ return;
+ }
+
+ sk->sk_state = SMC_DESTRUCT;
+ if (smc->conn.lgr)
+ smc_conn_free(&smc->conn);
+
+ sk_refcnt_debug_dec(sk);
+}
+
+static void smc_fin_worker(struct work_struct *work)
+{
+ struct smc_sock *smc =
+ container_of(work, struct smc_sock, fin_work.work);
+
+ cancel_delayed_work(&smc->fin_work);
+ sock_put(&smc->sk);
+}
+
+static struct sock *smc_sock_alloc(struct net *net, struct socket *sock)
+{
+ struct smc_sock *smc;
+ struct sock *sk;
+
+ sk = sk_alloc(net, PF_SMC, GFP_KERNEL, &smc_proto, 0);
+ if (!sk)
+ return NULL;
+
+ sock_init_data(sock, sk); /* sets sk_refcnt to 1 */
+ sk->sk_state = SMC_INIT;
+ sk->sk_destruct = smc_destruct;
+ sk->sk_protocol = SMCPROTO_SMC;
+ sk_refcnt_debug_inc(sk);
+
+ smc = smc_sk(sk);
+ smc->clcsock = NULL;
+ smc->use_fallback = 0;
+ memset(&smc->conn, 0, sizeof(smc->conn));
+ smc->addr = NULL;
+ smc->listen_smc = NULL;
+ INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_worker);
+ INIT_LIST_HEAD(&smc->accept_q);
+ spin_lock_init(&smc->accept_q_lock);
+ INIT_DELAYED_WORK(&smc->fin_work, smc_fin_worker);
+
+ return sk;
+}
+
+static int smc_bind(struct socket *sock, struct sockaddr *uaddr,
+ int addr_len)
+{
+ struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
+ struct sock *sk = sock->sk;
+ struct smc_sock *smc;
+ int rc;
+
+ smc = smc_sk(sk);
+
+ /* replicate tests from inet_bind(), to be safe wrt. future changes */
+ rc = -EINVAL;
+ if (addr_len < sizeof(struct sockaddr_in))
+ goto out;
+
+ rc = -EAFNOSUPPORT;
+ /* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is INADDR_ANY */
+ if ((addr->sin_family != AF_INET) &&
+ ((addr->sin_family != AF_UNSPEC) ||
+ (addr->sin_addr.s_addr != htonl(INADDR_ANY))))
+ goto out;
+
+ lock_sock(sk);
+
+ /* Check if socket is already active */
+ rc = -EINVAL;
+ if (sk->sk_state != SMC_INIT)
+ goto out_rel;
+
+ smc->clcsock->sk->sk_reuse = sk->sk_reuse;
+ rc = kernel_bind(smc->clcsock, uaddr, addr_len);
+
+out_rel:
+ release_sock(sk);
+out:
+ return rc;
+}
+
+static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
+ unsigned long mask)
+{
+ /* options we don't get control via setsockopt for */
+ nsk->sk_type = osk->sk_type;
+ nsk->sk_sndbuf = osk->sk_sndbuf;
+ nsk->sk_rcvbuf = osk->sk_rcvbuf;
+ nsk->sk_sndtimeo = osk->sk_sndtimeo;
+ nsk->sk_rcvtimeo = osk->sk_rcvtimeo;
+ nsk->sk_mark = osk->sk_mark;
+ nsk->sk_priority = osk->sk_priority;
+ nsk->sk_rcvlowat = osk->sk_rcvlowat;
+ nsk->sk_bound_dev_if = osk->sk_bound_dev_if;
+ nsk->sk_err = osk->sk_err;
+
+ nsk->sk_flags &= ~mask;
+ nsk->sk_flags |= osk->sk_flags & mask;
+}
+
+#define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \
+ (1UL << SOCK_KEEPOPEN) | \
+ (1UL << SOCK_LINGER) | \
+ (1UL << SOCK_BROADCAST) | \
+ (1UL << SOCK_TIMESTAMP) | \
+ (1UL << SOCK_DBG) | \
+ (1UL << SOCK_RCVTSTAMP) | \
+ (1UL << SOCK_RCVTSTAMPNS) | \
+ (1UL << SOCK_LOCALROUTE) | \
+ (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \
+ (1UL << SOCK_RXQ_OVFL) | \
+ (1UL << SOCK_WIFI_STATUS) | \
+ (1UL << SOCK_NOFCS) | \
+ (1UL << SOCK_FILTER_LOCKED))
+/* copy only relevant settings and flags of SOL_SOCKET level from smc to
+ * clc socket (since smc is not called for these options from net/core)
+ */
+static void smc_copy_sock_settings_to_clc(struct smc_sock *smc)
+{
+ smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC);
+}
+
+#define SK_FLAGS_CLC_TO_SMC ((1UL << SOCK_URGINLINE) | \
+ (1UL << SOCK_KEEPOPEN) | \
+ (1UL << SOCK_LINGER) | \
+ (1UL << SOCK_DBG))
+/* copy only settings and flags relevant for smc from clc to smc socket */
+static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
+{
+ smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
+}
+
+/* determine subnet and mask of internal TCP socket */
+int smc_netinfo_by_tcpsk(struct socket *clcsock,
+ __be32 *subnet, u8 *prefix_len)
+{
+ struct dst_entry *dst = sk_dst_get(clcsock->sk);
+ struct sockaddr_in addr;
+ int rc = 0;
+ int len;
+
+ if (!dst) {
+ rc = -ENOTCONN;
+ goto out;
+ }
+ if (!dst->dev) {
+ rc = -ENODEV;
+ goto out_rel;
+ }
+
+ /* get address to which the internal TCP socket is bound */
+ kernel_getsockname(clcsock, (struct sockaddr *)&addr, &len);
+ /* analyze IPv4 specific data of net_device belonging to TCP socket */
+ for_ifa(dst->dev->ip_ptr) {
+ if (ifa->ifa_address != addr.sin_addr.s_addr)
+ continue;
+ *prefix_len = inet_mask_len(ifa->ifa_mask);
+ *subnet = ifa->ifa_address & ifa->ifa_mask;
+ rc = 0;
+ break;
+ } endfor_ifa(dst->dev->ip_ptr);
+
+out_rel:
+ dst_release(dst);
+out:
+ return rc;
+}
+
+static int smc_clnt_conf_first_link(struct smc_sock *smc, union ib_gid *gid)
+{
+ struct smc_link_group *lgr = smc->conn.lgr;
+ struct smc_link *link;
+ int rest;
+ int rc;
+
+ link = &lgr->lnk[SMC_SINGLE_LINK];
+ /* receive CONFIRM LINK request from server over RoCE fabric */
+ rest = wait_for_completion_interruptible_timeout(
+ &link->llc_confirm,
+ SMC_LLC_WAIT_FIRST_TIME);
+ if (rest <= 0) {
+ struct smc_clc_msg_decline dclc;
+
+ rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
+ SMC_CLC_DECLINE);
+ return rc;
+ }
+
+ rc = smc_ib_modify_qp_rts(link);
+ if (rc)
+ return SMC_CLC_DECL_INTERR;
+
+ smc_wr_remember_qp_attr(link);
+ /* send CONFIRM LINK response over RoCE fabric */
+ rc = smc_llc_send_confirm_link(link,
+ link->smcibdev->mac[link->ibport - 1],
+ gid, SMC_LLC_RESP);
+ if (rc < 0)
+ return SMC_CLC_DECL_TCL;
+
+ return rc;
+}
+
+static void smc_conn_save_peer_info(struct smc_sock *smc,
+ struct smc_clc_msg_accept_confirm *clc)
+{
+ smc->conn.peer_conn_idx = clc->conn_idx;
+ smc->conn.local_tx_ctrl.token = ntohl(clc->rmbe_alert_token);
+ smc->conn.peer_rmbe_len = smc_uncompress_bufsize(clc->rmbe_size);
+ atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_len);
+}
+
+static void smc_link_save_peer_info(struct smc_link *link,
+ struct smc_clc_msg_accept_confirm *clc)
+{
+ link->peer_qpn = ntoh24(clc->qpn);
+ memcpy(link->peer_gid, clc->lcl.gid.raw, SMC_GID_SIZE);
+ memcpy(link->peer_mac, clc->lcl.mac, sizeof(link->peer_mac));
+ link->peer_psn = ntoh24(clc->psn);
+ link->peer_mtu = clc->qp_mtu;
+}
+
+/* setup for RDMA connection of client */
+static int smc_connect_rdma(struct smc_sock *smc)
+{
+ struct sockaddr_in *inaddr = (struct sockaddr_in *)smc->addr;
+ struct smc_clc_msg_accept_confirm aclc;
+ int local_contact = SMC_FIRST_CONTACT;
+ struct smc_ib_device *smcibdev;
+ struct smc_link *link;
+ u8 srv_first_contact;
+ int reason_code = 0;
+ int rc = 0;
+ u8 ibport;
+
+ if (smc->clc_started)
+ return rc;
+ smc->clc_started = 1;
+
+ /* IPSec connections opt out of SMC-R optimizations */
+ if (using_ipsec(smc)) {
+ reason_code = SMC_CLC_DECL_IPSEC;
+ goto decline_rdma;
+ }
+
+ /* PNET table look up: search active ib_device and port
+ * within same PNETID that also contains the ethernet device
+ * used for the internal TCP socket
+ */
+ smc_pnet_find_roce_resource(smc->clcsock->sk, &smcibdev, &ibport);
+ if (!smcibdev) {
+ reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
+ goto decline_rdma;
+ }
+
+ /* do inband token exchange */
+ reason_code = smc_clc_send_proposal(smc, smcibdev, ibport);
+ if (reason_code < 0) {
+ rc = reason_code;
+ goto out_err;
+ }
+ if (reason_code > 0) /* configuration error */
+ goto decline_rdma;
+ /* receive SMC Accept CLC message */
+ reason_code = smc_clc_wait_msg(smc, &aclc, sizeof(aclc),
+ SMC_CLC_ACCEPT);
+ if (reason_code < 0) {
+ rc = reason_code;
+ goto out_err;
+ }
+ if (reason_code > 0)
+ goto decline_rdma;
+
+ srv_first_contact = aclc.hdr.flag;
+ mutex_lock(&smc_create_lgr_pending);
+ local_contact = smc_conn_create(smc, inaddr->sin_addr.s_addr, smcibdev,
+ ibport, &aclc.lcl, srv_first_contact);
+ if (local_contact < 0) {
+ rc = local_contact;
+ if (rc == -ENOMEM)
+ reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/
+ else if (rc == -ENOLINK)
+ reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */
+ goto decline_rdma_unlock;
+ }
+ link = &smc->conn.lgr->lnk[SMC_SINGLE_LINK];
+
+ smc_conn_save_peer_info(smc, &aclc);
+
+ rc = smc_sndbuf_create(smc);
+ if (rc) {
+ reason_code = SMC_CLC_DECL_MEM;
+ goto decline_rdma_unlock;
+ }
+ rc = smc_rmb_create(smc);
+ if (rc) {
+ reason_code = SMC_CLC_DECL_MEM;
+ goto decline_rdma_unlock;
+ }
+
+ if (local_contact == SMC_FIRST_CONTACT)
+ smc_link_save_peer_info(link, &aclc);
+
+ rc = smc_rmb_rtoken_handling(&smc->conn, &aclc);
+ if (rc) {
+ reason_code = SMC_CLC_DECL_INTERR;
+ goto decline_rdma_unlock;
+ }
+
+ if (local_contact == SMC_FIRST_CONTACT) {
+ rc = smc_ib_ready_link(link);
+ if (rc) {
+ reason_code = SMC_CLC_DECL_INTERR;
+ goto decline_rdma_unlock;
+ }
+ }
+
+ rc = smc_clc_send_confirm(smc);
+ if (rc)
+ goto out_err_unlock;
+
+ if (local_contact == SMC_FIRST_CONTACT) {
+ /* QP confirmation over RoCE fabric */
+ reason_code = smc_clnt_conf_first_link(
+ smc, &smcibdev->gid[ibport - 1]);
+ if (reason_code < 0) {
+ rc = reason_code;
+ goto out_err_unlock;
+ }
+ if (reason_code > 0)
+ goto decline_rdma_unlock;
+ }
+
+ mutex_unlock(&smc_create_lgr_pending);
+out_connected:
+ smc_copy_sock_settings_to_clc(smc);
+ smc->sk.sk_state = SMC_ACTIVE;
+ smc_tx_init(smc);
+ smc_rx_init(smc);
+
+ return rc ? rc : local_contact;
+
+decline_rdma_unlock:
+ mutex_unlock(&smc_create_lgr_pending);
+ smc_conn_free(&smc->conn);
+decline_rdma:
+ /* RDMA setup failed, switch back to TCP */
+ smc->use_fallback = 1;
+ if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
+ rc = smc_clc_send_decline(smc, reason_code, 0);
+ if (rc < sizeof(struct smc_clc_msg_decline))
+ goto out_err;
+ }
+ goto out_connected;
+
+out_err_unlock:
+ mutex_unlock(&smc_create_lgr_pending);
+ smc_conn_free(&smc->conn);
+out_err:
+ return rc;
+}
+
+static int smc_connect(struct socket *sock, struct sockaddr *addr,
+ int alen, int flags)
+{
+ struct sock *sk = sock->sk;
+ struct smc_sock *smc;
+ int rc = -EINVAL;
+
+ smc = smc_sk(sk);
+
+ /* separate smc parameter checking to be safe */
+ if (alen < sizeof(addr->sa_family))
+ goto out_err;
+ if (addr->sa_family != AF_INET)
+ goto out_err;
+ smc->addr = addr; /* needed for nonblocking connect */
+
+ lock_sock(sk);
+ switch (sk->sk_state) {
+ default:
+ goto out;
+ case SMC_ACTIVE:
+ rc = -EISCONN;
+ goto out;
+ case SMC_INIT:
+ rc = 0;
+ break;
+ }
+
+ smc_copy_sock_settings_to_clc(smc);
+ rc = kernel_connect(smc->clcsock, addr, alen, flags);
+ if (rc)
+ goto out;
+
+ /* setup RDMA connection */
+ rc = smc_connect_rdma(smc);
+ if (rc < 0)
+ goto out;
+ else
+ rc = 0; /* success cases including fallback */
+
+out:
+ release_sock(sk);
+out_err:
+ return rc;
+}
+
+static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
+{
+ struct sock *sk = &lsmc->sk;
+ struct socket *new_clcsock;
+ struct sock *new_sk;
+ int rc;
+
+ release_sock(&lsmc->sk);
+ new_sk = smc_sock_alloc(sock_net(sk), NULL);
+ if (!new_sk) {
+ rc = -ENOMEM;
+ lsmc->sk.sk_err = ENOMEM;
+ lsmc->sk.sk_state = SMC_CLOSED;
+ *new_smc = NULL;
+ lock_sock(&lsmc->sk);
+ goto out;
+ }
+ *new_smc = smc_sk(new_sk);
+
+ rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0);
+ lock_sock(&lsmc->sk);
+ if (rc < 0) {
+ lsmc->sk.sk_err = -rc;
+ new_sk->sk_state = SMC_CLOSED;
+ sock_set_flag(sk, SOCK_DEAD);
+ sock_put(new_sk);
+ *new_smc = NULL;
+ goto out;
+ }
+ if (lsmc->sk.sk_state == SMC_CLOSED) {
+ if (new_clcsock)
+ sock_release(new_clcsock);
+ new_sk->sk_state = SMC_CLOSED;
+ sock_set_flag(sk, SOCK_DEAD);
+ sock_put(new_sk);
+ *new_smc = NULL;
+ goto out;
+ }
+
+ (*new_smc)->clcsock = new_clcsock;
+out:
+ return rc;
+}
+
+/* add a just created sock to the accept queue of the listen sock as
+ * candidate for a following socket accept call from user space
+ */
+static void smc_accept_enqueue(struct sock *parent, struct sock *sk)
+{
+ struct smc_sock *par = smc_sk(parent);
+
+ sock_hold(sk);
+ spin_lock(&par->accept_q_lock);
+ list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q);
+ spin_unlock(&par->accept_q_lock);
+ sk_acceptq_added(parent);
+}
+
+/* remove a socket from the accept queue of its parental listening socket */
+static void smc_accept_unlink(struct sock *sk)
+{
+ struct smc_sock *par = smc_sk(sk)->listen_smc;
+
+ spin_lock(&par->accept_q_lock);
+ list_del_init(&smc_sk(sk)->accept_q);
+ spin_unlock(&par->accept_q_lock);
+ sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk);
+ sock_put(sk);
+}
+
+/* remove a sock from the accept queue to bind it to a new socket created
+ * for a socket accept call from user space
+ */
+static struct sock *smc_accept_dequeue(struct sock *parent,
+ struct socket *new_sock)
+{
+ struct smc_sock *isk, *n;
+ struct sock *new_sk;
+
+ list_for_each_entry_safe(isk, n, &smc_sk(parent)->accept_q, accept_q) {
+ new_sk = (struct sock *)isk;
+
+ smc_accept_unlink(new_sk);
+ if (new_sk->sk_state == SMC_CLOSED) {
+ /* tbd in follow-on patch: close this sock */
+ continue;
+ }
+ if (new_sock)
+ sock_graft(new_sk, new_sock);
+ return new_sk;
+ }
+ return NULL;
+}
+
+/* clean up for a created but never accepted sock */
+static void smc_destruct_non_accepted(struct sock *sk)
+{
+ struct smc_sock *smc = smc_sk(sk);
+
+ sock_hold(sk);
+ lock_sock(sk);
+ if (!sk->sk_lingertime)
+ /* wait long for peer closing */
+ sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
+ smc_conn_release(smc);
+ if (smc->clcsock) {
+ struct socket *tcp;
+
+ tcp = smc->clcsock;
+ smc->clcsock = NULL;
+ sock_release(tcp);
+ }
+ release_sock(sk);
+ sock_set_flag(sk, SOCK_ZAPPED);
+ sock_set_flag(sk, SOCK_DEAD);
+ sock_put(sk);
+}
+
+static int smc_serv_conf_first_link(struct smc_sock *smc)
+{
+ struct smc_link_group *lgr = smc->conn.lgr;
+ struct smc_link *link;
+ int rest;
+ int rc;
+
+ link = &lgr->lnk[SMC_SINGLE_LINK];
+ /* send CONFIRM LINK request to client over the RoCE fabric */
+ rc = smc_llc_send_confirm_link(link,
+ link->smcibdev->mac[link->ibport - 1],
+ &link->smcibdev->gid[link->ibport - 1],
+ SMC_LLC_REQ);
+ if (rc < 0)
+ return SMC_CLC_DECL_TCL;
+
+ /* receive CONFIRM LINK response from client over the RoCE fabric */
+ rest = wait_for_completion_interruptible_timeout(
+ &link->llc_confirm_resp,
+ SMC_LLC_WAIT_FIRST_TIME);
+ if (rest <= 0) {
+ struct smc_clc_msg_decline dclc;
+
+ rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
+ SMC_CLC_DECLINE);
+ }
+
+ return rc;
+}
+
+/* setup for RDMA connection of server */
+static void smc_listen_worker(struct work_struct *work)
+{
+ struct smc_sock *new_smc = container_of(work, struct smc_sock,
+ smc_listen_work);
+ struct socket *newclcsock = new_smc->clcsock;
+ struct smc_sock *lsmc = new_smc->listen_smc;
+ struct smc_clc_msg_accept_confirm cclc;
+ int local_contact = SMC_REUSE_CONTACT;
+ struct sock *newsmcsk = &new_smc->sk;
+ struct smc_clc_msg_proposal pclc;
+ struct smc_ib_device *smcibdev;
+ struct sockaddr_in peeraddr;
+ struct smc_link *link;
+ int reason_code = 0;
+ int rc = 0, len;
+ __be32 subnet;
+ u8 prefix_len;
+ u8 ibport;
+
+ /* do inband token exchange -
+ *wait for and receive SMC Proposal CLC message
+ */
+ reason_code = smc_clc_wait_msg(new_smc, &pclc, sizeof(pclc),
+ SMC_CLC_PROPOSAL);
+ if (reason_code < 0)
+ goto out_err;
+ if (reason_code > 0)
+ goto decline_rdma;
+
+ /* IPSec connections opt out of SMC-R optimizations */
+ if (using_ipsec(new_smc)) {
+ reason_code = SMC_CLC_DECL_IPSEC;
+ goto decline_rdma;
+ }
+
+ /* PNET table look up: search active ib_device and port
+ * within same PNETID that also contains the ethernet device
+ * used for the internal TCP socket
+ */
+ smc_pnet_find_roce_resource(newclcsock->sk, &smcibdev, &ibport);
+ if (!smcibdev) {
+ reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
+ goto decline_rdma;
+ }
+
+ /* determine subnet and mask from internal TCP socket */
+ rc = smc_netinfo_by_tcpsk(newclcsock, &subnet, &prefix_len);
+ if (rc) {
+ reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
+ goto decline_rdma;
+ }
+ if ((pclc.outgoing_subnet != subnet) ||
+ (pclc.prefix_len != prefix_len)) {
+ reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
+ goto decline_rdma;
+ }
+
+ /* get address of the peer connected to the internal TCP socket */
+ kernel_getpeername(newclcsock, (struct sockaddr *)&peeraddr, &len);
+
+ /* allocate connection / link group */
+ mutex_lock(&smc_create_lgr_pending);
+ local_contact = smc_conn_create(new_smc, peeraddr.sin_addr.s_addr,
+ smcibdev, ibport, &pclc.lcl, 0);
+ if (local_contact == SMC_REUSE_CONTACT)
+ /* lock no longer needed, free it due to following
+ * smc_clc_wait_msg() call
+ */
+ mutex_unlock(&smc_create_lgr_pending);
+ if (local_contact < 0) {
+ rc = local_contact;
+ if (rc == -ENOMEM)
+ reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/
+ else if (rc == -ENOLINK)
+ reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */
+ goto decline_rdma;
+ }
+ link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];
+
+ rc = smc_sndbuf_create(new_smc);
+ if (rc) {
+ reason_code = SMC_CLC_DECL_MEM;
+ goto decline_rdma;
+ }
+ rc = smc_rmb_create(new_smc);
+ if (rc) {
+ reason_code = SMC_CLC_DECL_MEM;
+ goto decline_rdma;
+ }
+
+ rc = smc_clc_send_accept(new_smc, local_contact);
+ if (rc)
+ goto out_err;
+
+ /* receive SMC Confirm CLC message */
+ reason_code = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc),
+ SMC_CLC_CONFIRM);
+ if (reason_code < 0)
+ goto out_err;
+ if (reason_code > 0)
+ goto decline_rdma;
+ smc_conn_save_peer_info(new_smc, &cclc);
+ if (local_contact == SMC_FIRST_CONTACT)
+ smc_link_save_peer_info(link, &cclc);
+
+ rc = smc_rmb_rtoken_handling(&new_smc->conn, &cclc);
+ if (rc) {
+ reason_code = SMC_CLC_DECL_INTERR;
+ goto decline_rdma;
+ }
+
+ if (local_contact == SMC_FIRST_CONTACT) {
+ rc = smc_ib_ready_link(link);
+ if (rc) {
+ reason_code = SMC_CLC_DECL_INTERR;
+ goto decline_rdma;
+ }
+ /* QP confirmation over RoCE fabric */
+ reason_code = smc_serv_conf_first_link(new_smc);
+ if (reason_code < 0) {
+ /* peer is not aware of a problem */
+ rc = reason_code;
+ goto out_err;
+ }
+ if (reason_code > 0)
+ goto decline_rdma;
+ }
+
+out_connected:
+ sk_refcnt_debug_inc(newsmcsk);
+ newsmcsk->sk_state = SMC_ACTIVE;
+ smc_tx_init(new_smc);
+ smc_rx_init(new_smc);
+enqueue:
+ if (local_contact == SMC_FIRST_CONTACT)
+ mutex_unlock(&smc_create_lgr_pending);
+ lock_sock(&lsmc->sk);
+ if (lsmc->sk.sk_state == SMC_LISTEN) {
+ smc_accept_enqueue(&lsmc->sk, newsmcsk);
+ } else { /* no longer listening */
+ smc_destruct_non_accepted(newsmcsk);
+ }
+ release_sock(&lsmc->sk);
+
+ /* Wake up accept */
+ lsmc->sk.sk_data_ready(&lsmc->sk);
+ sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_worker */
+ return;
+
+decline_rdma:
+ /* RDMA setup failed, switch back to TCP */
+ smc_conn_free(&new_smc->conn);
+ new_smc->use_fallback = 1;
+ if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
+ rc = smc_clc_send_decline(new_smc, reason_code, 0);
+ if (rc < sizeof(struct smc_clc_msg_decline))
+ goto out_err;
+ }
+ goto out_connected;
+
+out_err:
+ newsmcsk->sk_state = SMC_CLOSED;
+ schedule_delayed_work(&new_smc->fin_work, TCP_TIMEWAIT_LEN);
+ goto enqueue; /* queue new sock with sk_err set */
+}
+
+static void smc_tcp_listen_worker(struct work_struct *work)
+{
+ struct smc_sock *lsmc = container_of(work, struct smc_sock,
+ tcp_listen_work);
+ struct smc_sock *new_smc;
+ int rc = 0;
+
+ lock_sock(&lsmc->sk);
+ while (lsmc->sk.sk_state == SMC_LISTEN) {
+ rc = smc_clcsock_accept(lsmc, &new_smc);
+ if (rc)
+ goto out;
+ if (!new_smc)
+ continue;
+
+ new_smc->listen_smc = lsmc;
+ new_smc->use_fallback = 0; /* assume rdma capability first */
+ sock_hold(&lsmc->sk); /* sock_put in smc_listen_worker */
+ INIT_WORK(&new_smc->smc_listen_work, smc_listen_worker);
+ smc_copy_sock_settings_to_smc(new_smc);
+ schedule_work(&new_smc->smc_listen_work);
+ }
+
+out:
+ release_sock(&lsmc->sk);
+ lsmc->sk.sk_data_ready(&lsmc->sk); /* no more listening, wake accept */
+}
+
+static int smc_listen(struct socket *sock, int backlog)
+{
+ struct sock *sk = sock->sk;
+ struct smc_sock *smc;
+ int rc;
+
+ smc = smc_sk(sk);
+ lock_sock(sk);
+
+ rc = -EINVAL;
+ if ((sk->sk_state != SMC_INIT) && (sk->sk_state != SMC_LISTEN))
+ goto out;
+
+ rc = 0;
+ if (sk->sk_state == SMC_LISTEN) {
+ sk->sk_max_ack_backlog = backlog;
+ goto out;
+ }
+ /* some socket options are handled in core, so we could not apply
+ * them to the clc socket -- copy smc socket options to clc socket
+ */
+ smc_copy_sock_settings_to_clc(smc);
+
+ rc = kernel_listen(smc->clcsock, backlog);
+ if (rc)
+ goto out;
+ sk->sk_max_ack_backlog = backlog;
+ sk->sk_ack_backlog = 0;
+ sk->sk_state = SMC_LISTEN;
+ INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_worker);
+ schedule_work(&smc->tcp_listen_work);
+
+out:
+ release_sock(sk);
+ return rc;
+}
+
+static int smc_accept(struct socket *sock, struct socket *new_sock,
+ int flags)
+{
+ struct sock *sk = sock->sk, *nsk;
+ DECLARE_WAITQUEUE(wait, current);
+ struct smc_sock *lsmc;
+ long timeo;
+ int rc = 0;
+
+ lsmc = smc_sk(sk);
+ lock_sock(sk);
+
+ if (lsmc->sk.sk_state != SMC_LISTEN) {
+ rc = -EINVAL;
+ goto out;
+ }
+
+ /* Wait for an incoming connection */
+ timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
+ add_wait_queue_exclusive(sk_sleep(sk), &wait);
+ while (!(nsk = smc_accept_dequeue(sk, new_sock))) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (!timeo) {
+ rc = -EAGAIN;
+ break;
+ }
+ release_sock(sk);
+ timeo = schedule_timeout(timeo);
+ /* wakeup by sk_data_ready in smc_listen_worker() */
+ sched_annotate_sleep();
+ lock_sock(sk);
+ if (signal_pending(current)) {
+ rc = sock_intr_errno(timeo);
+ break;
+ }
+ }
+ set_current_state(TASK_RUNNING);
+ remove_wait_queue(sk_sleep(sk), &wait);
+
+ if (rc)
+ goto out;
+ rc = sock_error(nsk);
+
+out:
+ release_sock(sk);
+ return rc;
+}
+
+static int smc_getname(struct socket *sock, struct sockaddr *addr,
+ int *len, int peer)
+{
+ struct smc_sock *smc;
+
+ if (peer && (sock->sk->sk_state != SMC_ACTIVE))
+ return -ENOTCONN;
+
+ smc = smc_sk(sock->sk);
+
+ return smc->clcsock->ops->getname(smc->clcsock, addr, len, peer);
+}
+
+static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
+{
+ struct sock *sk = sock->sk;
+ struct smc_sock *smc;
+ int rc = -EPIPE;
+
+ smc = smc_sk(sk);
+ lock_sock(sk);
+ if (sk->sk_state != SMC_ACTIVE)
+ goto out;
+ if (smc->use_fallback)
+ rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len);
+ else
+ rc = smc_tx_sendmsg(smc, msg, len);
+out:
+ release_sock(sk);
+ return rc;
+}
+
+static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
+ int flags)
+{
+ struct sock *sk = sock->sk;
+ struct smc_sock *smc;
+ int rc = -ENOTCONN;
+
+ smc = smc_sk(sk);
+ lock_sock(sk);
+ if ((sk->sk_state != SMC_ACTIVE) &&
+ (sk->sk_state != SMC_PEERCLW1) &&
+ (sk->sk_state != SMC_PEERCLW2) &&
+ (sk->sk_state != SMC_APPLCLW1) &&
+ (sk->sk_state != SMC_APPLCLW2) &&
+ (sk->sk_state != SMC_PEERABORTW) &&
+ (sk->sk_state != SMC_PROCESSABORT))
+ goto out;
+
+ if (smc->use_fallback)
+ rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags);
+ else
+ rc = smc_rx_recvmsg(smc, msg, len, flags);
+out:
+ release_sock(sk);
+ return rc;
+}
+
+static unsigned int smc_accept_poll(struct sock *parent)
+{
+ struct smc_sock *isk;
+ struct sock *sk;
+
+ lock_sock(parent);
+ list_for_each_entry(isk, &smc_sk(parent)->accept_q, accept_q) {
+ sk = (struct sock *)isk;
+
+ if (sk->sk_state == SMC_ACTIVE) {
+ release_sock(parent);
+ return POLLIN | POLLRDNORM;
+ }
+ }
+ release_sock(parent);
+
+ return 0;
+}
+
+static unsigned int smc_poll(struct file *file, struct socket *sock,
+ poll_table *wait)
+{
+ struct sock *sk = sock->sk;
+ unsigned int mask = 0;
+ struct smc_sock *smc;
+ int rc;
+
+ smc = smc_sk(sock->sk);
+ if ((sk->sk_state == SMC_INIT) || smc->use_fallback) {
+ /* delegate to CLC child sock */
+ mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
+ /* if non-blocking connect finished ... */
+ lock_sock(sk);
+ if ((sk->sk_state == SMC_INIT) && (mask & POLLOUT)) {
+ sk->sk_err = smc->clcsock->sk->sk_err;
+ if (sk->sk_err) {
+ mask |= POLLERR;
+ } else {
+ rc = smc_connect_rdma(smc);
+ if (rc < 0)
+ mask |= POLLERR;
+ else
+ /* success cases including fallback */
+ mask |= POLLOUT | POLLWRNORM;
+ }
+ }
+ release_sock(sk);
+ } else {
+ sock_poll_wait(file, sk_sleep(sk), wait);
+ if (sk->sk_state == SMC_LISTEN)
+ /* woken up by sk_data_ready in smc_listen_worker() */
+ mask |= smc_accept_poll(sk);
+ if (sk->sk_err)
+ mask |= POLLERR;
+ if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
+ (sk->sk_state == SMC_CLOSED))
+ mask |= POLLHUP;
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ mask |= POLLIN | POLLRDNORM | POLLRDHUP;
+ if (atomic_read(&smc->conn.bytes_to_rcv))
+ mask |= POLLIN | POLLRDNORM; /* in earlier patch */
+ if (sk->sk_state == SMC_APPLCLW1)
+ mask |= POLLIN;
+ if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { /* in earlier patch */
+ if (atomic_read(&smc->conn.sndbuf_space)) {
+ mask |= POLLOUT | POLLWRNORM;
+ } else {
+ sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+ }
+ } else {
+ mask |= POLLOUT | POLLWRNORM;
+ }
+ }
+
+ return mask;
+}
+
+static int smc_conn_shutdown_write(struct smc_sock *smc)
+{
+ struct smc_connection *conn = &smc->conn;
+ long timeout = MAX_SCHEDULE_TIMEOUT;
+ struct sock *sk = &smc->sk;
+ int old_state;
+ int rc = 0;
+
+ if (sock_flag(sk, SOCK_LINGER))
+ timeout = sk->sk_lingertime;
+
+ old_state = sk->sk_state;
+ switch (old_state) {
+ case SMC_ACTIVE:
+ /* active close */
+ /* wait for sndbuf data being posted */
+ smc_wait_close_tx_prepared(smc, timeout);
+ rc = smc_tx_close_wr(conn);
+ if (conn->local_rx_ctrl.conn_state_flags.sending_done)
+ sk->sk_state = SMC_PEERCLW2;
+ else
+ sk->sk_state = SMC_PEERCLW1;
+ sk->sk_state_change(sk);
+ break;
+ case SMC_APPLCLW1:
+ /* passive close */
+ if (!smc_close_received(conn))
+ /* wait for sndbuf data being posted */
+ smc_wait_close_tx_prepared(smc, timeout);
+ /* confirm close from peer */
+ rc = smc_tx_close_wr(conn);
+ sk->sk_state = SMC_APPLCLW2;
+ break;
+ default:
+ break;
+ }
+
+ if (old_state != sk->sk_state)
+ sk->sk_state_change(&smc->sk);
+ return rc;
+}
+
+static int smc_shutdown(struct socket *sock, int how)
+{
+ struct sock *sk = sock->sk;
+ struct smc_sock *smc;
+ int rc = -EINVAL;
+
+ smc = smc_sk(sk);
+
+ if ((how < SHUT_RD) || (how > SHUT_RDWR))
+ goto out_err;
+
+ lock_sock(sk);
+
+ rc = -ENOTCONN;
+ if ((sk->sk_state != SMC_ACTIVE) &&
+ (sk->sk_state != SMC_PEERCLW1) &&
+ (sk->sk_state != SMC_PEERCLW2) &&
+ (sk->sk_state != SMC_APPLCLW1) &&
+ (sk->sk_state != SMC_APPLCLW2))
+ goto out;
+ if (smc->use_fallback) {
+ rc = kernel_sock_shutdown(smc->clcsock, how);
+ sk->sk_shutdown = smc->clcsock->sk->sk_shutdown;
+ if (sk->sk_shutdown == SHUTDOWN_MASK)
+ sk->sk_state = SMC_CLOSED;
+ } else {
+ switch (how) {
+ case SHUT_RDWR: /* shutdown in both directions */
+ rc = smc_conn_release(smc);
+ break;
+ case SHUT_WR:
+ rc = smc_conn_shutdown_write(smc);
+ break;
+ case SHUT_RD:
+ break;
+ }
+ rc = kernel_sock_shutdown(smc->clcsock, how);
+ sk->sk_shutdown |= ++how;
+ }
+
+out:
+ release_sock(sk);
+
+out_err:
+ return rc;
+}
+
+static int smc_setsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ struct sock *sk = sock->sk;
+ struct smc_sock *smc;
+
+ smc = smc_sk(sk);
+
+ /* generic setsockopts reaching us here always apply to the
+ * CLC socket
+ */
+ return smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
+ optval, optlen);
+}
+
+static int smc_getsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ struct smc_sock *smc;
+
+ smc = smc_sk(sock->sk);
+ /* socket options apply to the CLC socket */
+ return smc->clcsock->ops->getsockopt(smc->clcsock, level, optname,
+ optval, optlen);
+}
+
+static int smc_ioctl(struct socket *sock, unsigned int cmd,
+ unsigned long arg)
+{
+ struct smc_sock *smc;
+
+ smc = smc_sk(sock->sk);
+ if (smc->use_fallback)
+ return smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg);
+ else
+ return sock_no_ioctl(sock, cmd, arg);
+}
+
+static ssize_t smc_sendpage(struct socket *sock, struct page *page,
+ int offset, size_t size, int flags)
+{
+ struct sock *sk = sock->sk;
+ struct smc_sock *smc;
+ int rc = -EPIPE;
+
+ smc = smc_sk(sk);
+ lock_sock(sk);
+ if (sk->sk_state != SMC_ACTIVE)
+ goto out;
+ if (smc->use_fallback)
+ rc = kernel_sendpage(smc->clcsock, page, offset,
+ size, flags);
+ else
+ rc = sock_no_sendpage(sock, page, offset, size, flags);
+
+out:
+ release_sock(sk);
+ return rc;
+}
+
+static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos,
+ struct pipe_inode_info *pipe, size_t len,
+ unsigned int flags)
+{
+ struct sock *sk = sock->sk;
+ struct smc_sock *smc;
+ int rc = -ENOTCONN;
+
+ smc = smc_sk(sk);
+ lock_sock(sk);
+ if ((sk->sk_state != SMC_ACTIVE) && (sk->sk_state != SMC_CLOSED))
+ goto out;
+ if (smc->use_fallback) {
+ rc = smc->clcsock->ops->splice_read(smc->clcsock, ppos,
+ pipe, len, flags);
+ } else {
+ rc = -EOPNOTSUPP;
+ }
+out:
+ release_sock(sk);
+ return rc;
+}
+
+/* must look like tcp */
+static const struct proto_ops smc_sock_ops = {
+ .family = PF_SMC,
+ .owner = THIS_MODULE,
+ .release = smc_release,
+ .bind = smc_bind,
+ .connect = smc_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = smc_accept,
+ .getname = smc_getname,
+ .poll = smc_poll,
+ .ioctl = smc_ioctl,
+ .listen = smc_listen,
+ .shutdown = smc_shutdown,
+ .setsockopt = smc_setsockopt,
+ .getsockopt = smc_getsockopt,
+ .sendmsg = smc_sendmsg,
+ .recvmsg = smc_recvmsg,
+ .mmap = sock_no_mmap,
+ .sendpage = smc_sendpage,
+ .splice_read = smc_splice_read,
+};
+
+static int smc_create(struct net *net, struct socket *sock, int protocol,
+ int kern)
+{
+ struct smc_sock *smc;
+ struct sock *sk;
+ int rc;
+
+ rc = -ESOCKTNOSUPPORT;
+ if (sock->type != SOCK_STREAM)
+ goto out;
+
+ rc = -EPROTONOSUPPORT;
+ if ((protocol != IPPROTO_IP) && (protocol != IPPROTO_TCP))
+ goto out;
+
+ rc = -ENOBUFS;
+ sock->ops = &smc_sock_ops;
+ sk = smc_sock_alloc(net, sock);
+ if (!sk)
+ goto out;
+
+ /* create internal TCP socket for CLC handshake and fallback */
+ smc = smc_sk(sk);
+ smc->use_fallback = 0; /* assume rdma capability first */
+ rc = sock_create_kern(net, PF_INET, SOCK_STREAM,
+ IPPROTO_TCP, &smc->clcsock);
+ if (rc)
+ sk_common_release(sk);
+ smc->sk.sk_sndbuf = max(smc->clcsock->sk->sk_sndbuf, SMC_BUF_MIN_SIZE);
+ smc->sk.sk_rcvbuf = max(smc->clcsock->sk->sk_rcvbuf, SMC_BUF_MIN_SIZE);
+
+out:
+ return rc;
+}
+
+static const struct net_proto_family smc_sock_family_ops = {
+ .family = PF_SMC,
+ .owner = THIS_MODULE,
+ .create = smc_create,
+};
+
+static int __init smc_init(void)
+{
+ int rc;
+
+ rc = smc_pnet_init();
+ if (rc)
+ return rc;
+
+ rc = smc_llc_init();
+ if (rc) {
+ pr_err("%s: smc_llc_init fails with %d\n", __func__, rc);
+ goto out_pnet;
+ }
+
+ rc = smc_cdc_init();
+ if (rc) {
+ pr_err("%s: smc_cdc_init fails with %d\n", __func__, rc);
+ goto out_pnet;
+ }
+
+ rc = proto_register(&smc_proto, 1);
+ if (rc) {
+ pr_err("%s: proto_register fails with %d\n", __func__, rc);
+ goto out_pnet;
+ }
+
+ rc = sock_register(&smc_sock_family_ops);
+ if (rc) {
+ pr_err("%s: sock_register fails with %d\n", __func__, rc);
+ goto out_proto;
+ }
+
+ rc = smc_ib_register_client();
+ if (rc) {
+ pr_err("%s: ib_register fails with %d\n", __func__, rc);
+ goto out_sock;
+ }
+
+ return 0;
+
+out_sock:
+ sock_unregister(PF_SMC);
+out_proto:
+ proto_unregister(&smc_proto);
+out_pnet:
+ smc_pnet_exit();
+ return rc;
+}
+
+static void __exit smc_exit(void)
+{
+ LIST_HEAD(lgr_freeing_list);
+ struct smc_link_group *lgr, *lg;
+
+ spin_lock(&smc_lgr_list.lock);
+ if (!list_empty(&smc_lgr_list.list))
+ list_splice_init(&smc_lgr_list.list, &lgr_freeing_list);
+ spin_unlock(&smc_lgr_list.lock);
+ list_for_each_entry_safe(lgr, lg, &lgr_freeing_list, list) {
+ cancel_delayed_work_sync(&lgr->free_work);
+ list_del_init(&lgr->list);
+ smc_lgr_free(lgr); /* free link group */
+ }
+ smc_ib_unregister_client();
+ sock_unregister(PF_SMC);
+ proto_unregister(&smc_proto);
+ smc_pnet_exit();
+}
+
+module_init(smc_init);
+module_exit(smc_exit);
+
+MODULE_AUTHOR("Ursula Braun <ubraun@linux.vnet.ibm.com>");
+MODULE_DESCRIPTION("smc socket address family");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NETPROTO(PF_SMC);
diff --git a/net/smc/smc.h b/net/smc/smc.h
new file mode 100644
index 000000000000..4e5a42665c67
--- /dev/null
+++ b/net/smc/smc.h
@@ -0,0 +1,253 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Definitions for the SMC module (socket related)
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ursula.braun@linux.vnet.ibm.com>
+ */
+#ifndef _SMC_H
+#define _SMC_H
+
+#include <linux/socket.h>
+#include <linux/types.h>
+#include <net/sock.h>
+
+#include "smc_ib.h"
+
+#define SMCPROTO_SMC 0 /* SMC protocol */
+
+#define smc_stop_received(conn) \
+ (conn->local_rx_ctrl.conn_state_flags.sending_done || \
+ conn->local_rx_ctrl.conn_state_flags.abnormal_close || \
+ conn->local_rx_ctrl.conn_state_flags.closed_conn)
+
+#define smc_close_received(conn) \
+ (conn->local_rx_ctrl.conn_state_flags.abnormal_close || \
+ conn->local_rx_ctrl.conn_state_flags.closed_conn)
+
+enum smc_state { /* possible states of an SMC socket */
+ SMC_ACTIVE = 1,
+ SMC_INIT = 2,
+ SMC_CLOSED = 7,
+ SMC_LISTEN = 10,
+ SMC_PEERCLW1 = 20,
+ SMC_PEERCLW2 = 21,
+ SMC_APPLCLW1 = 22,
+ SMC_APPLCLW2 = 23,
+ SMC_APPLFINCLW = 24,
+ SMC_PEERFINCLW = 25,
+ SMC_PEERABORTW = 26,
+ SMC_PROCESSABORT = 27,
+ SMC_DESTRUCT = 32
+};
+
+struct smc_link_group;
+
+struct smc_wr_rx_hdr { /* common prefix part of LLC and CDC to demultiplex */
+ u8 type;
+} __packed;
+
+struct smc_cdc_conn_state_flags {
+#if defined(__BIG_ENDIAN_BITFIELD)
+ u8 sending_done : 1; /* Sending done indicator */
+ u8 closed_conn : 1; /* Peer connection closed indicator */
+ u8 abnormal_close : 1; /* Abnormal close indicator */
+ u8 reserved : 5;
+#elif defined(__LITTLE_ENDIAN_BITFIELD)
+ u8 reserved : 5;
+ u8 abnormal_close : 1;
+ u8 closed_conn : 1;
+ u8 sending_done : 1;
+#endif
+} __packed;
+
+struct smc_cdc_producer_flags {
+#if defined(__BIG_ENDIAN_BITFIELD)
+ u8 write_blocked : 1; /* Writing Blocked, no rx buf space */
+ u8 urg_data_pending : 1; /* Urgent Data Pending */
+ u8 urg_data_present : 1; /* Urgent Data Present */
+ u8 cons_curs_upd_req : 1; /* cursor update requested */
+ u8 failover_validation : 1;/* message replay due to failover */
+ u8 reserved : 3;
+#elif defined(__LITTLE_ENDIAN_BITFIELD)
+ u8 reserved : 3;
+ u8 failover_validation : 1;
+ u8 cons_curs_upd_req : 1;
+ u8 urg_data_present : 1;
+ u8 urg_data_pending : 1;
+ u8 write_blocked : 1;
+#endif
+} __packed;
+
+/* in host byte order */
+struct smc_host_cursor { /* SMC cursor - an offset in an RMBE */
+ u16 reserved;
+ u16 wrap; /* window wrap sequence number */
+ u32 count; /* cursor (= offset) part */
+} __aligned(8);
+
+/* in host byte order */
+union smc_host_cursor_ovl { /* overlay for atomic cursor handling */
+ struct smc_host_cursor curs;
+ u64 acurs;
+} __aligned(8);
+
+/* in host byte order, except for flag bitfields in network byte order */
+struct smc_host_cdc_msg { /* Connection Data Control message */
+ struct smc_wr_rx_hdr common; /* .type = 0xFE */
+ u8 len; /* length = 44 */
+ u16 seqno; /* connection seq # */
+ u32 token; /* alert_token */
+ union smc_host_cursor_ovl prod; /* producer cursor */
+ union smc_host_cursor_ovl cons; /* consumer cursor,
+ * piggy backed "ack"
+ */
+ struct smc_cdc_producer_flags prod_flags; /* conn. tx/rx status */
+ struct smc_cdc_conn_state_flags conn_state_flags; /* peer conn. status*/
+ u8 reserved[18];
+} __packed __aligned(8);
+
+struct smc_connection {
+ struct rb_node alert_node;
+ struct smc_link_group *lgr; /* link group of connection */
+ u32 alert_token_local; /* unique conn. id */
+ u8 peer_conn_idx; /* from tcp handshake */
+ int peer_rmbe_len; /* size of peer rx buffer */
+ atomic_t peer_rmbe_space;/* remaining free bytes in peer
+ * rmbe
+ */
+ int rtoken_idx; /* idx to peer RMB rkey/addr */
+
+ struct smc_buf_desc *sndbuf_desc; /* send buffer descriptor */
+ int sndbuf_size; /* sndbuf size <== sock wmem */
+ struct smc_buf_desc *rmb_desc; /* RMBE descriptor */
+ int rmbe_size; /* RMBE size <== sock rmem */
+ int rmbe_size_short;/* compressed notation */
+ int rmbe_update_limit;
+ /* lower limit for consumer
+ * cursor update
+ */
+
+ struct smc_host_cdc_msg local_tx_ctrl; /* host byte order staging
+ * buffer for CDC msg send
+ * .prod cf. TCP snd_nxt
+ * .cons cf. TCP sends ack
+ */
+ union smc_host_cursor_ovl tx_curs_prep; /* tx - prepared data
+ * snd_max..wmem_alloc
+ */
+ union smc_host_cursor_ovl tx_curs_sent; /* tx - sent data
+ * snd_nxt ?
+ */
+ union smc_host_cursor_ovl tx_curs_fin; /* tx - confirmed by peer
+ * snd-wnd-begin ?
+ */
+ atomic_t sndbuf_space; /* remaining space in sndbuf */
+ u16 tx_cdc_seq; /* sequence # for CDC send */
+ spinlock_t send_lock; /* protect wr_sends */
+ struct delayed_work tx_work; /* retry of smc_cdc_msg_send */
+
+ struct smc_host_cdc_msg local_rx_ctrl; /* filled during event_handl.
+ * .prod cf. TCP rcv_nxt
+ * .cons cf. TCP snd_una
+ */
+ union smc_host_cursor_ovl rx_curs_confirmed; /* confirmed to peer
+ * source of snd_una ?
+ */
+ atomic_t bytes_to_rcv; /* arrived data,
+ * not yet received
+ */
+};
+
+struct smc_sock { /* smc sock container */
+ struct sock sk;
+ struct socket *clcsock; /* internal tcp socket */
+ struct smc_connection conn; /* smc connection */
+ struct sockaddr *addr; /* inet connect address */
+ struct smc_sock *listen_smc; /* listen parent */
+ struct work_struct tcp_listen_work;/* handle tcp socket accepts */
+ struct work_struct smc_listen_work;/* prepare new accept socket */
+ struct list_head accept_q; /* sockets to be accepted */
+ spinlock_t accept_q_lock; /* protects accept_q */
+ struct delayed_work fin_work; /* final socket freeing */
+ u8 use_fallback : 1, /* fallback to tcp */
+ clc_started : 1;/* smc_connect_rdma ran */
+};
+
+static inline struct smc_sock *smc_sk(const struct sock *sk)
+{
+ return (struct smc_sock *)sk;
+}
+
+#define SMC_SYSTEMID_LEN 8
+
+extern u8 local_systemid[SMC_SYSTEMID_LEN]; /* unique system identifier */
+
+extern struct mutex smc_create_lgr_pending;
+
+/* convert an u32 value into network byte order, store it into a 3 byte field */
+static inline void hton24(u8 *net, u32 host)
+{
+ __be32 t;
+
+ t = cpu_to_be32(host);
+ memcpy(net, ((u8 *)&t) + 1, 3);
+}
+
+/* convert a received 3 byte field into host byte order*/
+static inline u32 ntoh24(u8 *net)
+{
+ __be32 t = 0;
+
+ memcpy(((u8 *)&t) + 1, net, 3);
+ return be32_to_cpu(t);
+}
+
+#define SMC_RMB_SIZES 16 /* number of distinct sizes for an RMB*/
+
+/* convert the RMB size into the compressed notation - minimum 16K */
+static inline u8 smc_compress_bufsize(int size)
+{
+ u8 compressed = 0;
+
+ size = (size - 1) >> 14;
+ compressed = ilog2(size) + 1;
+ if (compressed >= SMC_RMB_SIZES)
+ compressed = SMC_RMB_SIZES - 1;
+ return compressed;
+}
+
+/* convert the RMB size from compressed notation into integer */
+static inline int smc_uncompress_bufsize(u8 compressed)
+{
+ u32 size;
+
+ size = 0x00000001 << (((int)compressed) + 14);
+ return (int)size;
+}
+
+#ifdef CONFIG_XFRM
+static inline bool using_ipsec(struct smc_sock *smc)
+{
+ return (smc->clcsock->sk->sk_policy[0] ||
+ smc->clcsock->sk->sk_policy[1]) ? 1 : 0;
+}
+#else
+static inline bool using_ipsec(struct smc_sock *smc)
+{
+ return 0;
+}
+#endif
+
+struct smc_clc_msg_local;
+
+int smc_netinfo_by_tcpsk(struct socket *, __be32 *, u8 *);
+void smc_conn_free(struct smc_connection *);
+int smc_conn_create(struct smc_sock *, __be32, struct smc_ib_device *, u8,
+ struct smc_clc_msg_local *, int);
+void smc_conn_release_handler(struct smc_sock *);
+void smc_wake_close_tx_prepared(struct smc_sock *);
+
+#endif /* _SMC_H */
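
[Editor's illustration, not part of the patch] The smc_compress_bufsize()/smc_uncompress_bufsize() helpers above encode RMB sizes in a one-byte power-of-two notation: a compressed value n stands for 2^(n+14) bytes, so the minimum is 16 KiB and the CLC accept/confirm message can carry the buffer size in its single rmbe_size byte (see smc_conn_save_peer_info() above). A standalone userspace sketch of the same round trip, re-implemented here for illustration (the kernel code uses ilog2() instead of the open-coded loop):

#include <stdio.h>

#define SMC_RMB_SIZES	16	/* number of distinct sizes, as in smc.h above */

/* same arithmetic as smc_compress_bufsize(), with an open-coded fls() */
static unsigned char compress_bufsize(int size)
{
	unsigned char compressed = 0;

	size = (size - 1) >> 14;
	while (size) {			/* fls(size) == ilog2(size) + 1 */
		size >>= 1;
		compressed++;
	}
	if (compressed >= SMC_RMB_SIZES)
		compressed = SMC_RMB_SIZES - 1;
	return compressed;
}

/* same as smc_uncompress_bufsize() */
static int uncompress_bufsize(unsigned char compressed)
{
	return 1 << (compressed + 14);
}

int main(void)
{
	int sizes[] = { 16384, 32768, 65536, 262144 };
	unsigned int i;

	for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
		unsigned char c = compress_bufsize(sizes[i]);

		/* prints e.g. "65536 -> 2 -> 65536" */
		printf("%d -> %u -> %d\n", sizes[i], c, uncompress_bufsize(c));
	}
	return 0;
}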
diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c
new file mode 100644
index 000000000000..f3cef65b8da1
--- /dev/null
+++ b/net/smc/smc_cdc.c
@@ -0,0 +1,259 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Connection Data Control (CDC)
+ * handles flow control
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ursula.braun@de.ibm.com>
+ */
+
+#include <linux/spinlock.h>
+
+#include "smc.h"
+#include "smc_wr.h"
+#include "smc_cdc.h"
+#include "smc_tx.h"
+#include "smc_rx.h"
+
+struct smc_cdc_tx_pend {
+ struct smc_connection *conn; /* socket connection */
+ union smc_host_cursor_ovl cursor; /* tx sndbuf cursor sent */
+ union smc_host_cursor_ovl p_cursor; /* rx RMBE cursor produced */
+ u16 ctrl_seq; /* conn. tx sequence # */
+};
+
+/* handler for send/transmission completion of a CDC msg */
+static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd,
+ struct smc_link *link,
+ enum ib_wc_status wc_status)
+{
+ struct smc_cdc_tx_pend *cdcpend = (struct smc_cdc_tx_pend *)pnd_snd;
+ struct smc_link_group *lgr;
+ struct smc_sock *smc;
+ int diff;
+
+ smc = container_of(cdcpend->conn, struct smc_sock, conn);
+ lgr = container_of(link, struct smc_link_group, lnk[SMC_SINGLE_LINK]);
+ bh_lock_sock(&smc->sk);
+ if (!wc_status) {
+ diff = smc_curs_diff(cdcpend->conn->sndbuf_size,
+ &cdcpend->conn->tx_curs_fin,
+ &cdcpend->cursor);
+ smp_mb__before_atomic();
+ atomic_add(diff, &cdcpend->conn->sndbuf_space);
+ smp_mb__after_atomic();
+ xchg(&cdcpend->conn->tx_curs_fin.acurs,
+ cdcpend->cursor.acurs);
+ }
+ smc_tx_sndbuf_nonfull(smc);
+ bh_unlock_sock(&smc->sk);
+}
+
+int smc_cdc_get_free_slot(struct smc_link *link,
+ struct smc_wr_buf **wr_buf,
+ struct smc_cdc_tx_pend **pend)
+{
+ return smc_wr_tx_get_free_slot(
+ link, smc_cdc_tx_handler, wr_buf,
+ (struct smc_wr_tx_pend_priv **)pend);
+}
+
+static inline void smc_cdc_add_pending_send(struct smc_connection *conn,
+ struct smc_cdc_tx_pend *pend)
+{
+ BUILD_BUG_ON_MSG(
+ sizeof(struct smc_cdc_msg) > SMC_WR_BUF_SIZE,
+ "must increase SMC_WR_BUF_SIZE to at least sizeof(struct smc_cdc_msg)");
+ BUILD_BUG_ON_MSG(
+ sizeof(struct smc_cdc_msg) != SMC_WR_TX_SIZE,
+ "must adapt SMC_WR_TX_SIZE to sizeof(struct smc_cdc_msg); if not all smc_wr upper layer protocols use the same message size any more, must start to set link->wr_tx_sges[i].length on each individual smc_wr_tx_send()");
+ BUILD_BUG_ON_MSG(
+ sizeof(struct smc_cdc_tx_pend) > SMC_WR_TX_PEND_PRIV_SIZE,
+ "must increase SMC_WR_TX_PEND_PRIV_SIZE to at least sizeof(struct smc_cdc_tx_pend)");
+ pend->conn = conn;
+ pend->cursor.curs = conn->tx_curs_sent.curs;
+ pend->p_cursor.curs = conn->local_tx_ctrl.prod.curs;
+ pend->ctrl_seq = conn->tx_cdc_seq;
+}
+
+int smc_cdc_msg_send(struct smc_connection *conn,
+ struct smc_wr_buf *wr_buf,
+ struct smc_cdc_tx_pend *pend)
+{
+ struct smc_link *link;
+ int rc;
+
+ link = &conn->lgr->lnk[SMC_SINGLE_LINK];
+
+ smc_cdc_add_pending_send(conn, pend);
+
+ conn->tx_cdc_seq++;
+ conn->local_tx_ctrl.seqno = conn->tx_cdc_seq;
+ smc_host_msg_to_cdc((struct smc_cdc_msg *)wr_buf, &conn->local_tx_ctrl);
+ rc = smc_wr_tx_send(link, conn, (struct smc_wr_tx_pend_priv *)pend);
+ if (rc)
+ goto out;
+ xchg(&conn->rx_curs_confirmed.acurs,
+ smc_curs_read(conn->local_tx_ctrl.cons.acurs));
+
+out:
+ return rc;
+}
+
+int smc_cdc_wr_tx_pends(struct smc_connection *conn)
+{
+ struct smc_link *link = &conn->lgr->lnk[SMC_SINGLE_LINK];
+ int i;
+
+ for_each_set_bit(i, link->wr_tx_mask, link->wr_tx_cnt) {
+ struct smc_cdc_tx_pend *tx_pend;
+
+ tx_pend = (struct smc_cdc_tx_pend *)&link->wr_tx_pends[i].priv;
+ if (tx_pend->conn == conn)
+ return 1;
+ }
+ return 0;
+}
+
+void smc_cdc_put_conn_slots(struct smc_connection *conn)
+{
+ struct smc_link *link = &conn->lgr->lnk[SMC_SINGLE_LINK];
+ int i;
+
+ for_each_set_bit(i, link->wr_tx_mask, link->wr_tx_cnt) {
+ struct smc_wr_tx_pend_priv *tx_pend;
+
+ tx_pend = &link->wr_tx_pends[i].priv;
+ if (((struct smc_cdc_tx_pend *)tx_pend)->conn == conn)
+ smc_wr_tx_put_slot(link, tx_pend);
+ }
+}
+
+static inline bool smc_cdc_before(u16 seq1, u16 seq2)
+{
+ return (s16)(seq1 - seq2) < 0;
+}
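+
+/* smc_cdc_before() is the u16 analogue of TCP's before(): the cast to s16
+ * makes the comparison wrap-safe. Example: seq1 == 0xfffe, seq2 == 0x0003
+ * gives (s16)(0xfffe - 0x0003) == (s16)0xfffb == -5 < 0, so 0xfffe still
+ * counts as "before" 3 across the wrap boundary.
+ */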
+
+static void smc_cdc_msg_recv_action(struct smc_sock *smc,
+ struct smc_link *link,
+ struct smc_cdc_msg *cdc)
+{
+ union smc_host_cursor_ovl cons_old, prod_old;
+ struct smc_connection *conn = &smc->conn;
+ int diff_cons, diff_prod;
+
+ if (!cdc->prod_flags.failover_validation) {
+ if (smc_cdc_before(ntohs(cdc->seqno),
+ conn->local_rx_ctrl.seqno))
+ /* received seqno is old */
+ return;
+ }
+ prod_old.acurs = smc_curs_read(conn->local_rx_ctrl.prod.acurs);
+ cons_old.acurs = smc_curs_read(conn->local_rx_ctrl.cons.acurs);
+ smc_cdc_msg_to_host(&conn->local_rx_ctrl, cdc);
+
+ diff_cons = smc_curs_diff(conn->peer_rmbe_len, &cons_old,
+ &conn->local_rx_ctrl.cons);
+ if (diff_cons) {
+ smp_mb__before_atomic();
+ atomic_add(diff_cons, &conn->peer_rmbe_space);
+ smp_mb__after_atomic();
+ smc_rx_handler(smc);
+ }
+
+ diff_prod = smc_curs_diff(conn->rmbe_size, &prod_old,
+ &conn->local_rx_ctrl.prod);
+ if (diff_prod) {
+ smp_mb__before_atomic();
+ atomic_add(diff_prod, &conn->bytes_to_rcv);
+ smp_mb__after_atomic();
+ }
+
+ if (conn->local_rx_ctrl.conn_state_flags.abnormal_close)
+ smc->sk.sk_err = ECONNRESET;
+ if (smc_stop_received(conn))
+ smc_conn_release_handler(smc);
+
+	/* piggybacked tx info */
+ /* trigger sndbuf consumer: RDMA write into peer RMBE and CDC */
+ if (diff_cons && smc_tx_prepared_sends(conn)) {
+ smc_tx_sndbuf_nonempty(conn);
+ smc_wake_close_tx_prepared(smc);
+ }
+
+ /* socket connected but not accepted */
+ if (!smc->sk.sk_socket)
+ return;
+
+ /* data available */
+ if ((conn->local_rx_ctrl.prod_flags.write_blocked) ||
+ (conn->local_rx_ctrl.prod_flags.cons_curs_upd_req))
+ smc_tx_consumer_update(conn);
+}
+
+/* called under tasklet context */
+static inline void smc_cdc_msg_recv(struct smc_cdc_msg *cdc,
+ struct smc_link *link, u64 wr_id)
+{
+ struct smc_link_group *lgr = container_of(link, struct smc_link_group,
+ lnk[SMC_SINGLE_LINK]);
+ struct smc_connection *connection;
+ struct smc_sock *smc;
+
+ /* lookup connection */
+ read_lock_bh(&lgr->conns_lock);
+ connection = smc_lgr_find_conn(ntohl(cdc->token), lgr);
+ if (!connection) {
+ read_unlock_bh(&lgr->conns_lock);
+ return;
+ }
+ smc = container_of(connection, struct smc_sock, conn);
+ if (smc->sk.sk_state == SMC_DESTRUCT) {
+ read_unlock_bh(&lgr->conns_lock);
+ return;
+ }
+ sock_hold(&smc->sk);
+ read_unlock_bh(&lgr->conns_lock);
+ bh_lock_sock(&smc->sk);
+ smc_cdc_msg_recv_action(smc, link, cdc);
+ bh_unlock_sock(&smc->sk);
+ sock_put(&smc->sk); /* no free sk in softirq-context */
+}
+
+static void smc_cdc_rx_handler(struct ib_wc *wc, void *buf)
+{
+ struct smc_link *link = (struct smc_link *)wc->qp->qp_context;
+ struct smc_cdc_msg *cdc = buf;
+
+ if (wc->byte_len < sizeof(*cdc))
+ return; /* short message */
+ if (cdc->len != sizeof(*cdc))
+ return; /* invalid message */
+ smc_cdc_msg_recv(cdc, link, wc->wr_id);
+}
+
+static struct smc_wr_rx_handler smc_cdc_rx_handlers[] = {
+ {
+ .handler = smc_cdc_rx_handler,
+ .type = SMC_CDC_MSG_TYPE
+ },
+ {
+ .handler = NULL,
+ }
+};
+
+int __init smc_cdc_init(void)
+{
+ struct smc_wr_rx_handler *handler;
+ int rc = 0;
+
+ for (handler = smc_cdc_rx_handlers; handler->handler; handler++) {
+ INIT_HLIST_NODE(&handler->list);
+ rc = smc_wr_rx_register_handler(handler);
+ if (rc)
+ break;
+ }
+ return rc;
+}
diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h
new file mode 100644
index 000000000000..7269794a7de2
--- /dev/null
+++ b/net/smc/smc_cdc.h
@@ -0,0 +1,162 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Connection Data Control (CDC)
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ursula.braun@de.ibm.com>
+ */
+
+#ifndef SMC_CDC_H
+#define SMC_CDC_H
+
+#include <linux/kernel.h> /* max_t */
+#include <linux/compiler.h> /* __packed */
+#include <linux/atomic.h> /* xchg */
+
+#include "smc.h"
+#include "smc_core.h"
+#include "smc_wr.h"
+
+#define SMC_CDC_MSG_TYPE 0xFE
+
+/* in network byte order */
+struct smc_cdc_cursor { /* SMC cursor */
+ __be16 reserved;
+ __be16 wrap;
+ __be32 count;
+} __packed __aligned(8);
+
+/* in network byte order */
+union smc_cdc_cursor_ovl {
+ struct smc_cdc_cursor curs;
+ __be64 acurs;
+} __packed __aligned(8);
+
+/* in network byte order */
+struct smc_cdc_msg {
+ struct smc_wr_rx_hdr common; /* .type = 0xFE */
+ u8 len; /* 44 */
+ __be16 seqno;
+ __be32 token;
+ union smc_cdc_cursor_ovl prod;
+	union smc_cdc_cursor_ovl cons; /* piggybacked "ack" */
+ struct smc_cdc_producer_flags prod_flags;
+ struct smc_cdc_conn_state_flags conn_state_flags;
+ u8 reserved[18];
+} __packed;
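+
+/* Wire-size cross-check (assuming a 1-byte smc_wr_rx_hdr and 1-byte flag
+ * structs, as the fixed len of 44 implies):
+ *   1 (type) + 1 (len) + 2 (seqno) + 4 (token) + 8 (prod) + 8 (cons)
+ *   + 1 (prod_flags) + 1 (conn_state_flags) + 18 (reserved) == 44 bytes,
+ * matching SMC_WR_TX_SIZE per the BUILD_BUG_ON_MSG checks in smc_cdc.c.
+ */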
+
+static inline void smc_curs_add(int size, struct smc_host_cursor *curs,
+ int value)
+{
+ curs->wrap += (curs->count + value) / size;
+ curs->count = (curs->count + value) % size;
+}
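+
+/* Example: with size == 16384 a cursor {wrap 2, count 16000} advanced by
+ * value == 1000 becomes {wrap 3, count 616}; count moves modulo the buffer
+ * size while wrap counts the completed laps through the buffer.
+ */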
+
+static inline u64 smc_curs_read(u64 c)
+{
+#if BITS_PER_LONG != 64
+ /* We must enforce atomic readout on 32bit, otherwise the
+	 * update on another cpu can hit in between the readout of
+ * the low 32bit and the high 32bit portion.
+ */
+ return cmpxchg64(&c, 0, 0);
+#else
+ /* On 64 bit the cursor read is atomic versus the update */
+ return c;
+#endif
+}
+
+static inline __be64 smc_curs_read_net(__be64 c)
+{
+#if BITS_PER_LONG != 64
+ /* We must enforce atomic readout on 32bit, otherwise the
+	 * update on another cpu can hit in between the readout of
+ * the low 32bit and the high 32bit portion.
+ */
+ return cmpxchg64(&c, 0, 0);
+#else
+ /* On 64 bit the cursor read is atomic versus the update */
+ return c;
+#endif
+}
+
+/* calculate cursor difference between old and new, where old <= new */
+static inline int smc_curs_diff(unsigned int size,
+ union smc_host_cursor_ovl *old,
+ union smc_host_cursor_ovl *new)
+{
+ if (old->curs.wrap != new->curs.wrap)
+ return max_t(int, 0,
+ ((size - old->curs.count) + new->curs.count));
+
+ return max_t(int, 0, (new->curs.count - old->curs.count));
+}
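+
+/* Worked example for a 16KB buffer (size == 16384): old {wrap 3,
+ * count 16000} and new {wrap 4, count 1000} differ by
+ * (16384 - 16000) + 1000 == 1384 bytes; with equal wrap counts the
+ * difference is simply new->count - old->count. The max_t(int, 0, ...)
+ * clamps the result to 0 should "old" ever lie beyond "new".
+ */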
+
+static inline void smc_host_cursor_to_cdc(struct smc_cdc_cursor *peer,
+ union smc_host_cursor_ovl *local)
+{
+ union smc_host_cursor_ovl temp;
+
+ temp.acurs = smc_curs_read(local->acurs);
+ peer->count = htonl(temp.curs.count);
+ peer->wrap = htons(temp.curs.wrap);
+ /* peer->reserved = htons(0); must be ensured by caller */
+}
+
+static inline void smc_host_msg_to_cdc(struct smc_cdc_msg *peer,
+ struct smc_host_cdc_msg *local)
+{
+ peer->common.type = local->common.type;
+ peer->len = local->len;
+ peer->seqno = htons(local->seqno);
+ peer->token = htonl(local->token);
+ smc_host_cursor_to_cdc(&peer->prod.curs, &local->prod);
+ smc_host_cursor_to_cdc(&peer->cons.curs, &local->cons);
+ peer->prod_flags = local->prod_flags;
+ peer->conn_state_flags = local->conn_state_flags;
+}
+
+static inline void smc_cdc_cursor_to_host(union smc_host_cursor_ovl *local,
+ union smc_cdc_cursor_ovl *peer)
+{
+ union smc_host_cursor_ovl temp, old;
+ union smc_cdc_cursor_ovl net;
+
+ old.acurs = smc_curs_read(local->acurs);
+ net.acurs = smc_curs_read_net(peer->acurs);
+ temp.curs.count = ntohl(net.curs.count);
+ temp.curs.wrap = ntohs(net.curs.wrap);
+ if ((old.curs.wrap > temp.curs.wrap) && temp.curs.wrap)
+ return;
+ if ((old.curs.wrap == temp.curs.wrap) &&
+ (old.curs.count > temp.curs.count))
+ return;
+ xchg(&local->acurs, temp.acurs);
+}
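+
+/* Example of the staleness check above: with a local cursor of
+ * {wrap 5, count 100}, a peer update decoding to {wrap 5, count 90} is
+ * discarded as out of date, while {wrap 5, count 150} or {wrap 6, count 10}
+ * is applied; older cursor values must never overwrite newer ones.
+ */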
+
+static inline void smc_cdc_msg_to_host(struct smc_host_cdc_msg *local,
+ struct smc_cdc_msg *peer)
+{
+ local->common.type = peer->common.type;
+ local->len = peer->len;
+ local->seqno = ntohs(peer->seqno);
+ local->token = ntohl(peer->token);
+ smc_cdc_cursor_to_host(&local->prod, &peer->prod);
+ smc_cdc_cursor_to_host(&local->cons, &peer->cons);
+ local->prod_flags = peer->prod_flags;
+ local->conn_state_flags = peer->conn_state_flags;
+}
+
+struct smc_cdc_tx_pend;
+
+int smc_cdc_get_free_slot(struct smc_link *, struct smc_wr_buf **,
+ struct smc_cdc_tx_pend **);
+void smc_cdc_put_conn_slots(struct smc_connection *conn);
+int smc_cdc_msg_send(struct smc_connection *, struct smc_wr_buf *,
+ struct smc_cdc_tx_pend *);
+int smc_cdc_wr_tx_pends(struct smc_connection *);
+int smc_cdc_init(void) __init;
+
+#endif /* SMC_CDC_H */
diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c
new file mode 100644
index 000000000000..58742ee3de06
--- /dev/null
+++ b/net/smc/smc_clc.c
@@ -0,0 +1,273 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * CLC (connection layer control) handshake over initial TCP socket to
+ * prepare for RDMA traffic
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
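+
+/* Informal sketch of the CLC handshake carried over the clcsock
+ * (see the send/wait helpers below):
+ *
+ *	client				server
+ *	  PROPOSAL  ----------------->
+ *		    <-----------------	ACCEPT (or DECLINE)
+ *	  CONFIRM   ----------------->
+ *
+ * Either side may answer with DECLINE instead, upon which both peers
+ * fall back to plain TCP.
+ */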
+
+#include <net/sock.h>
+#include <net/tcp.h>
+
+#include "smc.h"
+#include "smc_core.h"
+#include "smc_clc.h"
+#include "smc_ib.h"
+
+/* Wait for data on the tcp-socket, analyze received data
+ * Returns:
+ * 0 on success, if the message received was not a decline.
+ * SMC_CLC_DECL_REPLY if decline received for fallback w/o another decl send.
+ * clcsock error, -EINTR, -ECONNRESET, -EPROTO otherwise.
+ */
+int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
+ u8 expected_type)
+{
+ struct smc_clc_msg_hdr *clcm = buf;
+ struct sock *clc_sk = smc->clcsock->sk;
+ struct msghdr msg = {NULL, 0};
+ int reason_code = 0;
+ DEFINE_WAIT(wait);
+ struct kvec vec;
+ int len, datlen;
+ int krflags;
+
+ /* peek the first few bytes to determine length of data to receive
+ * so we don't consume any subsequent CLC message or payload data
+ * in the TCP byte stream
+ */
+ vec.iov_base = buf;
+ vec.iov_len = buflen;
+ krflags = MSG_PEEK | MSG_WAITALL;
+ smc->clcsock->sk->sk_rcvtimeo = CLC_WAIT_TIME;
+ len = kernel_recvmsg(smc->clcsock, &msg, &vec, 1,
+ sizeof(struct smc_clc_msg_hdr), krflags);
+ if (signal_pending(current)) {
+ reason_code = -EINTR;
+ clc_sk->sk_err = -reason_code;
+ smc->sk.sk_err = clc_sk->sk_err;
+ goto out;
+ }
+ if (clc_sk->sk_err) {
+ reason_code = -clc_sk->sk_err;
+ smc->sk.sk_err = clc_sk->sk_err;
+ goto out;
+ }
+ if (!len) { /* peer has performed orderly shutdown */
+ smc->sk.sk_err = ECONNRESET;
+ reason_code = -ECONNRESET;
+ goto out;
+ }
+ datlen = ntohs(clcm->length);
+ if ((len < sizeof(struct smc_clc_msg_hdr)) ||
+ (datlen < sizeof(struct smc_clc_msg_decline)) ||
+ memcmp(clcm->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)) ||
+ ((clcm->type != SMC_CLC_DECLINE) &&
+ (clcm->type != expected_type))) {
+ smc->sk.sk_err = EPROTO;
+ reason_code = -EPROTO;
+ goto out;
+ }
+
+ /* receive the complete CLC message */
+ vec.iov_base = buf;
+ vec.iov_len = buflen;
+ memset(&msg, 0, sizeof(struct msghdr));
+ krflags = MSG_WAITALL;
+ smc->clcsock->sk->sk_rcvtimeo = CLC_WAIT_TIME;
+ len = kernel_recvmsg(smc->clcsock, &msg, &vec, 1, datlen, krflags);
+ if (len < datlen) {
+ smc->sk.sk_err = EPROTO;
+ reason_code = -EPROTO;
+ goto out;
+ }
+ if (clcm->type == SMC_CLC_DECLINE)
+ reason_code = SMC_CLC_DECL_REPLY;
+out:
+ return reason_code;
+}
+
+/* send CLC DECLINE message across internal TCP socket */
+int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info,
+ u8 out_of_sync)
+{
+ struct smc_clc_msg_decline dclc;
+ struct msghdr msg;
+ struct kvec vec;
+ int len;
+
+ memset(&dclc, 0, sizeof(dclc));
+ memcpy(dclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
+ dclc.hdr.type = SMC_CLC_DECLINE;
+ dclc.hdr.length = htons(sizeof(struct smc_clc_msg_decline));
+ dclc.hdr.version = SMC_CLC_V1;
+ dclc.hdr.flag = out_of_sync ? 1 : 0;
+ memcpy(dclc.id_for_peer, local_systemid, sizeof(local_systemid));
+ dclc.peer_diagnosis = htonl(peer_diag_info);
+ memcpy(dclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
+
+ memset(&msg, 0, sizeof(msg));
+ vec.iov_base = &dclc;
+ vec.iov_len = sizeof(struct smc_clc_msg_decline);
+ len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1,
+ sizeof(struct smc_clc_msg_decline));
+ if (len < sizeof(struct smc_clc_msg_decline))
+ smc->sk.sk_err = EPROTO;
+ if (len < 0)
+ smc->sk.sk_err = -len;
+ return len;
+}
+
+/* send CLC PROPOSAL message across internal TCP socket */
+int smc_clc_send_proposal(struct smc_sock *smc,
+ struct smc_ib_device *smcibdev,
+ u8 ibport)
+{
+ struct smc_clc_msg_proposal pclc;
+ int reason_code = 0;
+ struct msghdr msg;
+ struct kvec vec;
+ int len, rc;
+
+ /* send SMC Proposal CLC message */
+ memset(&pclc, 0, sizeof(pclc));
+ memcpy(pclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
+ pclc.hdr.type = SMC_CLC_PROPOSAL;
+ pclc.hdr.length = htons(sizeof(pclc));
+ pclc.hdr.version = SMC_CLC_V1; /* SMC version */
+ memcpy(pclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid));
+ memcpy(&pclc.lcl.gid, &smcibdev->gid[ibport - 1], SMC_GID_SIZE);
+ memcpy(&pclc.lcl.mac, &smcibdev->mac[ibport - 1],
+ sizeof(smcibdev->mac[ibport - 1]));
+
+ /* determine subnet and mask from internal TCP socket */
+ rc = smc_netinfo_by_tcpsk(smc->clcsock, &pclc.outgoing_subnet,
+ &pclc.prefix_len);
+ if (rc)
+ return SMC_CLC_DECL_CNFERR; /* configuration error */
+ memcpy(&pclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
+ memset(&msg, 0, sizeof(msg));
+ vec.iov_base = &pclc;
+ vec.iov_len = sizeof(pclc);
+	/* due to the few bytes needed for the CLC handshake, this cannot block */
+ len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1, sizeof(pclc));
+ if (len < sizeof(pclc)) {
+ if (len >= 0) {
+ reason_code = -ENETUNREACH;
+ smc->sk.sk_err = -reason_code;
+ } else {
+ smc->sk.sk_err = smc->clcsock->sk->sk_err;
+ reason_code = -smc->sk.sk_err;
+ }
+ }
+
+ return reason_code;
+}
+
+/* send CLC CONFIRM message across internal TCP socket */
+int smc_clc_send_confirm(struct smc_sock *smc)
+{
+ struct smc_connection *conn = &smc->conn;
+ struct smc_clc_msg_accept_confirm cclc;
+ struct smc_link *link;
+ int reason_code = 0;
+ struct msghdr msg;
+ struct kvec vec;
+ int len;
+
+ link = &conn->lgr->lnk[SMC_SINGLE_LINK];
+ /* send SMC Confirm CLC msg */
+ memset(&cclc, 0, sizeof(cclc));
+ memcpy(cclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
+ cclc.hdr.type = SMC_CLC_CONFIRM;
+ cclc.hdr.length = htons(sizeof(cclc));
+ cclc.hdr.version = SMC_CLC_V1; /* SMC version */
+ memcpy(cclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid));
+ memcpy(&cclc.lcl.gid, &link->smcibdev->gid[link->ibport - 1],
+ SMC_GID_SIZE);
+ memcpy(&cclc.lcl.mac, &link->smcibdev->mac[link->ibport - 1],
+	       sizeof(link->smcibdev->mac[link->ibport - 1]));
+ hton24(cclc.qpn, link->roce_qp->qp_num);
+ cclc.rmb_rkey =
+ htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey);
+ cclc.conn_idx = 1; /* for now: 1 RMB = 1 RMBE */
+ cclc.rmbe_alert_token = htonl(conn->alert_token_local);
+ cclc.qp_mtu = min(link->path_mtu, link->peer_mtu);
+ cclc.rmbe_size = conn->rmbe_size_short;
+ cclc.rmb_dma_addr =
+ cpu_to_be64((u64)conn->rmb_desc->dma_addr[SMC_SINGLE_LINK]);
+ hton24(cclc.psn, link->psn_initial);
+
+ memcpy(cclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
+
+ memset(&msg, 0, sizeof(msg));
+ vec.iov_base = &cclc;
+ vec.iov_len = sizeof(cclc);
+ len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1, sizeof(cclc));
+ if (len < sizeof(cclc)) {
+ if (len >= 0) {
+ reason_code = -ENETUNREACH;
+ smc->sk.sk_err = -reason_code;
+ } else {
+ smc->sk.sk_err = smc->clcsock->sk->sk_err;
+ reason_code = -smc->sk.sk_err;
+ }
+ }
+ return reason_code;
+}
+
+/* send CLC ACCEPT message across internal TCP socket */
+int smc_clc_send_accept(struct smc_sock *new_smc, int srv_first_contact)
+{
+ struct smc_connection *conn = &new_smc->conn;
+ struct smc_clc_msg_accept_confirm aclc;
+ struct smc_link *link;
+ struct msghdr msg;
+ struct kvec vec;
+ int rc = 0;
+ int len;
+
+ link = &conn->lgr->lnk[SMC_SINGLE_LINK];
+ memset(&aclc, 0, sizeof(aclc));
+ memcpy(aclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
+ aclc.hdr.type = SMC_CLC_ACCEPT;
+ aclc.hdr.length = htons(sizeof(aclc));
+ aclc.hdr.version = SMC_CLC_V1; /* SMC version */
+ if (srv_first_contact)
+ aclc.hdr.flag = 1;
+ memcpy(aclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid));
+ memcpy(&aclc.lcl.gid, &link->smcibdev->gid[link->ibport - 1],
+ SMC_GID_SIZE);
+ memcpy(&aclc.lcl.mac, link->smcibdev->mac[link->ibport - 1],
+ sizeof(link->smcibdev->mac[link->ibport - 1]));
+ hton24(aclc.qpn, link->roce_qp->qp_num);
+ aclc.rmb_rkey =
+ htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey);
+ aclc.conn_idx = 1; /* as long as 1 RMB = 1 RMBE */
+ aclc.rmbe_alert_token = htonl(conn->alert_token_local);
+ aclc.qp_mtu = link->path_mtu;
+	aclc.rmbe_size = conn->rmbe_size_short;
+ aclc.rmb_dma_addr =
+ cpu_to_be64((u64)conn->rmb_desc->dma_addr[SMC_SINGLE_LINK]);
+ hton24(aclc.psn, link->psn_initial);
+ memcpy(aclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
+
+ memset(&msg, 0, sizeof(msg));
+ vec.iov_base = &aclc;
+ vec.iov_len = sizeof(aclc);
+ len = kernel_sendmsg(new_smc->clcsock, &msg, &vec, 1, sizeof(aclc));
+ if (len < sizeof(aclc)) {
+ if (len >= 0) {
+ rc = -EPROTO;
+ new_smc->sk.sk_err = -rc;
+ } else {
+ new_smc->sk.sk_err = new_smc->clcsock->sk->sk_err;
+ rc = -new_smc->sk.sk_err;
+ }
+ }
+
+ return rc;
+}
diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h
new file mode 100644
index 000000000000..896afb2acfae
--- /dev/null
+++ b/net/smc/smc_clc.h
@@ -0,0 +1,113 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * CLC (connection layer control) handshake over initial TCP socket to
+ * prepare for RDMA traffic
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#ifndef _SMC_CLC_H
+#define _SMC_CLC_H
+
+#include <rdma/ib_verbs.h>
+
+#include "smc.h"
+
+#define SMC_CLC_PROPOSAL 0x01
+#define SMC_CLC_ACCEPT 0x02
+#define SMC_CLC_CONFIRM 0x03
+#define SMC_CLC_DECLINE 0x04
+
+/* eye catcher "SMCR" EBCDIC for CLC messages */
+static const char SMC_EYECATCHER[4] = {'\xe2', '\xd4', '\xc3', '\xd9'};
+
+#define SMC_CLC_V1 0x1 /* SMC version */
+#define CLC_WAIT_TIME (6 * HZ) /* max. wait time on clcsock */
+#define SMC_CLC_DECL_MEM 0x01010000 /* insufficient memory resources */
+#define SMC_CLC_DECL_TIMEOUT 0x02000000 /* timeout */
+#define SMC_CLC_DECL_CNFERR 0x03000000 /* configuration error */
+#define SMC_CLC_DECL_IPSEC 0x03030000 /* IPsec usage */
+#define SMC_CLC_DECL_SYNCERR 0x04000000 /* synchronization error */
+#define SMC_CLC_DECL_REPLY 0x06000000 /* reply to a received decline */
+#define SMC_CLC_DECL_INTERR 0x99990000 /* internal error */
+#define SMC_CLC_DECL_TCL 0x02040000 /* timeout waiting for QP confirm */
+#define SMC_CLC_DECL_SEND 0x07000000 /* sending problem */
+
+struct smc_clc_msg_hdr { /* header1 of clc messages */
+ u8 eyecatcher[4]; /* eye catcher */
+ u8 type; /* proposal / accept / confirm / decline */
+ __be16 length;
+#if defined(__BIG_ENDIAN_BITFIELD)
+ u8 version : 4,
+ flag : 1,
+ rsvd : 3;
+#elif defined(__LITTLE_ENDIAN_BITFIELD)
+ u8 rsvd : 3,
+ flag : 1,
+ version : 4;
+#endif
+} __packed;
+
+struct smc_clc_msg_trail { /* trailer of clc messages */
+ u8 eyecatcher[4];
+} __packed;
+
+struct smc_clc_msg_local { /* header2 of clc messages */
+ u8 id_for_peer[SMC_SYSTEMID_LEN]; /* unique system id */
+ union ib_gid gid; /* gid of ib_device port */
+ u8 mac[6]; /* mac of ib_device port */
+} __packed;
+
+struct smc_clc_msg_proposal { /* clc proposal message */
+ struct smc_clc_msg_hdr hdr;
+ struct smc_clc_msg_local lcl;
+ __be16 iparea_offset; /* offset to IP address information area */
+ __be32 outgoing_subnet; /* subnet mask */
+ u8 prefix_len; /* number of significant bits in mask */
+ u8 reserved[2];
+ u8 ipv6_prefixes_cnt; /* number of IPv6 prefixes in prefix array */
+ struct smc_clc_msg_trail trl; /* eye catcher "SMCR" EBCDIC */
+} __packed;
+
+struct smc_clc_msg_accept_confirm { /* clc accept / confirm message */
+ struct smc_clc_msg_hdr hdr;
+ struct smc_clc_msg_local lcl;
+ u8 qpn[3]; /* QP number */
+ __be32 rmb_rkey; /* RMB rkey */
+ u8 conn_idx; /* Connection index, which RMBE in RMB */
+ __be32 rmbe_alert_token;/* unique connection id */
+#if defined(__BIG_ENDIAN_BITFIELD)
+	u8 rmbe_size : 4, /* server's RMB buf size (compressed notation) */
+ qp_mtu : 4; /* QP mtu */
+#elif defined(__LITTLE_ENDIAN_BITFIELD)
+ u8 qp_mtu : 4,
+ rmbe_size : 4;
+#endif
+ u8 reserved;
+ __be64 rmb_dma_addr; /* RMB virtual address */
+ u8 reserved2;
+ u8 psn[3]; /* initial packet sequence number */
+ struct smc_clc_msg_trail trl; /* eye catcher "SMCR" EBCDIC */
+} __packed;
+
+struct smc_clc_msg_decline { /* clc decline message */
+ struct smc_clc_msg_hdr hdr;
+ u8 id_for_peer[SMC_SYSTEMID_LEN]; /* sender peer_id */
+ __be32 peer_diagnosis; /* diagnosis information */
+ u8 reserved2[4];
+ struct smc_clc_msg_trail trl; /* eye catcher "SMCR" EBCDIC */
+} __packed;
+
+struct smc_sock;
+struct smc_ib_device;
+
+int smc_clc_wait_msg(struct smc_sock *, void *, int, u8);
+int smc_clc_send_decline(struct smc_sock *, u32, u8);
+int smc_clc_send_proposal(struct smc_sock *, struct smc_ib_device *, u8);
+int smc_clc_send_confirm(struct smc_sock *);
+int smc_clc_send_accept(struct smc_sock *, int);
+
+#endif
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
new file mode 100644
index 000000000000..81adc015f69b
--- /dev/null
+++ b/net/smc/smc_core.c
@@ -0,0 +1,571 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Basic Transport Functions exploiting Infiniband API
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#include <linux/socket.h>
+#include <linux/if_vlan.h>
+#include <linux/random.h>
+#include <net/tcp.h>
+#include <net/sock.h>
+#include <rdma/ib_verbs.h>
+
+#include "smc.h"
+#include "smc_clc.h"
+#include "smc_core.h"
+#include "smc_ib.h"
+#include "smc_wr.h"
+#include "smc_llc.h"
+#include "smc_cdc.h"
+
+#define SMC_LGR_NUM_INCR 256
+
+static u32 smc_lgr_num; /* unique link group number */
+
+/* Register connection's alert token in our lookup structure.
+ * To use rbtrees we have to implement our own insert core.
+ * Requires @conns_lock
+ * @conn connection to register
+ */
+static void smc_lgr_add_alert_token(struct smc_connection *conn)
+{
+ struct rb_node **link, *parent = NULL;
+ u32 token = conn->alert_token_local;
+
+ link = &conn->lgr->conns_all.rb_node;
+ while (*link) {
+ struct smc_connection *cur = rb_entry(*link,
+ struct smc_connection, alert_node);
+
+ parent = *link;
+ if (cur->alert_token_local > token)
+ link = &parent->rb_left;
+ else
+ link = &parent->rb_right;
+ }
+ /* Put the new node there */
+ rb_link_node(&conn->alert_node, parent, link);
+ rb_insert_color(&conn->alert_node, &conn->lgr->conns_all);
+}
+
+/* Register connection in link group by assigning an alert token
+ * registered in a search tree.
+ * Requires @conns_lock
+ * Note that '0' is a reserved value and not assigned.
+ */
+static void smc_lgr_register_conn(struct smc_connection *conn)
+{
+ static atomic_t nexttoken = ATOMIC_INIT(0);
+
+ /* find a new alert_token_local value not yet used by some connection
+ * in this link group
+ */
+ while (!conn->alert_token_local) {
+ conn->alert_token_local = atomic_inc_return(&nexttoken);
+ if (smc_lgr_find_conn(conn->alert_token_local, conn->lgr))
+ conn->alert_token_local = 0;
+ }
+ smc_lgr_add_alert_token(conn);
+ conn->lgr->conns_num++;
+}
+
+/* Unregister connection and remove its alert token from the link group's
+ * lookup tree
+ */
+static void smc_lgr_unregister_conn(struct smc_connection *conn)
+{
+ struct smc_link_group *lgr = conn->lgr;
+
+ write_lock_bh(&lgr->conns_lock);
+ rb_erase(&conn->alert_node, &lgr->conns_all);
+ lgr->conns_num--;
+ write_unlock_bh(&lgr->conns_lock);
+}
+
+/* create a new SMC link group */
+static int smc_lgr_create(struct smc_sock *smc, __be32 peer_in_addr,
+ struct smc_ib_device *smcibdev, u8 ibport,
+ char *peer_systemid, unsigned short vlan_id)
+{
+ struct smc_link_group *lgr;
+ struct smc_link *lnk;
+ u8 rndvec[3];
+ int rc = 0;
+ int i;
+
+ lgr = kzalloc(sizeof(*lgr), GFP_KERNEL);
+ if (!lgr) {
+ rc = -ENOMEM;
+ goto out;
+ }
+ lgr->role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
+ lgr->daddr = peer_in_addr;
+ memcpy(lgr->peer_systemid, peer_systemid, SMC_SYSTEMID_LEN);
+ lgr->vlan_id = vlan_id;
+ rwlock_init(&lgr->sndbufs_lock);
+ rwlock_init(&lgr->rmbs_lock);
+ for (i = 0; i < SMC_RMB_SIZES; i++) {
+ INIT_LIST_HEAD(&lgr->sndbufs[i]);
+ INIT_LIST_HEAD(&lgr->rmbs[i]);
+ }
+ smc_lgr_num += SMC_LGR_NUM_INCR;
+ lgr->id = smc_lgr_num;
+
+ lnk = &lgr->lnk[SMC_SINGLE_LINK];
+ /* initialize link */
+ lnk->smcibdev = smcibdev;
+ lnk->ibport = ibport;
+ lnk->path_mtu = smcibdev->pattr[ibport - 1].active_mtu;
+ get_random_bytes(rndvec, sizeof(rndvec));
+ lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) + (rndvec[2] << 16);
+ rc = smc_wr_alloc_link_mem(lnk);
+ if (rc)
+ goto free_lgr;
+ init_waitqueue_head(&lnk->wr_tx_wait);
+ rc = smc_ib_create_protection_domain(lnk);
+ if (rc)
+ goto free_link_mem;
+ rc = smc_ib_get_memory_region(lnk->roce_pd, IB_ACCESS_LOCAL_WRITE,
+ &lnk->mr_tx);
+ if (rc)
+ goto dealloc_pd;
+ rc = smc_ib_create_queue_pair(lnk);
+ if (rc)
+ goto dereg_mr;
+ rc = smc_wr_create_lgr(lnk);
+ if (rc)
+ goto destroy_qp;
+ init_completion(&lnk->llc_confirm);
+ init_completion(&lnk->llc_confirm_resp);
+
+ smc->conn.lgr = lgr;
+ rwlock_init(&lgr->conns_lock);
+ spin_lock(&smc_lgr_list.lock);
+ list_add_tail(&lgr->list, &smc_lgr_list.list);
+ spin_unlock(&smc_lgr_list.lock);
+ return 0;
+
+destroy_qp:
+ smc_ib_destroy_queue_pair(lnk);
+dereg_mr:
+ smc_ib_dereg_memory_region(lnk->mr_tx);
+dealloc_pd:
+ smc_ib_dealloc_protection_domain(lnk);
+free_link_mem:
+ smc_wr_free_link_mem(lnk);
+free_lgr:
+ kfree(lgr);
+out:
+ return rc;
+}
+
+static void smc_sndbuf_free(struct smc_connection *conn)
+{
+ if (conn->sndbuf_desc) {
+ xchg(&conn->sndbuf_desc->used, 0);
+ conn->sndbuf_size = 0;
+ }
+}
+
+static void smc_rmb_free(struct smc_connection *conn)
+{
+ if (conn->rmb_desc) {
+ xchg(&conn->rmb_desc->used, 0);
+ conn->rmbe_size = 0;
+ }
+}
+
+/* remove a finished connection from its link group */
+void smc_conn_free(struct smc_connection *conn)
+{
+ struct smc_link_group *lgr = conn->lgr;
+
+ if (!lgr)
+ return;
+ smc_lgr_unregister_conn(conn);
+ conn->lgr = NULL;
+ smc_rmb_free(conn);
+ smc_sndbuf_free(conn);
+}
+
+static void smc_link_clear(struct smc_link_group *lgr)
+{
+ struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];
+
+ lnk->peer_qpn = 0;
+}
+
+/* remove a link group */
+void smc_lgr_free(struct smc_link_group *lgr)
+{
+ smc_link_clear(lgr);
+ kfree(lgr);
+}
+
+/* The client checks whether its link group decision (create vs. reuse)
+ * matches the server's.
+ * Returns true iff client and server disagree.
+ */
+static bool smc_lgr_clt_srv_disagree(struct smc_connection *conn, int new_lgr,
+ int srv_first_contact)
+{
+ if (!srv_first_contact && new_lgr) {
+ /* Server reuses a link group, but Client wants to start
+ * a new one
+ */
+ return true;
+ }
+ if (srv_first_contact && !new_lgr) {
+ /* Server starts a new link group, but Client wants to reuse
+ * an existing link group
+ */
+ spin_lock(&smc_lgr_list.lock);
+ list_del_init(&conn->lgr->list);
+ spin_unlock(&smc_lgr_list.lock);
+ smc_lgr_unregister_conn(conn); /* takes conns_lock */
+ /* tbd: terminate existing connections */
+ return true;
+ }
+ return false;
+}
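+
+/* Decision table for the check above (client side):
+ *	srv_first_contact	new_lgr		disagree?
+ *	0			0		no  (both reuse)
+ *	0			1		yes (only client creates)
+ *	1			0		yes (only server creates)
+ *	1			1		no  (both first contact)
+ */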
+
+/* Determine vlan of internal TCP socket.
+ * @vlan_id: address to store the determined vlan id into
+ */
+static int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id)
+{
+ struct dst_entry *dst = sk_dst_get(clcsock->sk);
+ int rc = 0;
+
+ *vlan_id = 0;
+ if (!dst) {
+ rc = -ENOTCONN;
+ goto out;
+ }
+ if (!dst->dev) {
+ rc = -ENODEV;
+ goto out_rel;
+ }
+
+ if (is_vlan_dev(dst->dev))
+ *vlan_id = vlan_dev_vlan_id(dst->dev);
+
+out_rel:
+ dst_release(dst);
+out:
+ return rc;
+}
+
+/* determine the link gid matching the vlan id of the link group */
+static int smc_link_determine_gid(struct smc_link_group *lgr)
+{
+ struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];
+ struct ib_gid_attr gattr;
+ union ib_gid gid;
+ int i;
+
+ if (!lgr->vlan_id) {
+ lnk->gid = lnk->smcibdev->gid[lnk->ibport - 1];
+ return 0;
+ }
+
+ for (i = 0; i < lnk->smcibdev->pattr[lnk->ibport - 1].gid_tbl_len;
+ i++) {
+ ib_query_gid(lnk->smcibdev->ibdev, lnk->ibport, i, &gid,
+ &gattr);
+ if (gattr.ndev &&
+ (vlan_dev_vlan_id(gattr.ndev) == lgr->vlan_id)) {
+ lnk->gid = gid;
+ return 0;
+ }
+ }
+ return -ENODEV;
+}
+
+/* create a new SMC connection (and a new link group if necessary) */
+int smc_conn_create(struct smc_sock *smc, __be32 peer_in_addr,
+ struct smc_ib_device *smcibdev, u8 ibport,
+ struct smc_clc_msg_local *lcl, int srv_first_contact)
+{
+ struct smc_connection *conn = &smc->conn;
+ struct smc_link_group *lgr;
+ unsigned short vlan_id;
+ enum smc_lgr_role role;
+ int local_contact = SMC_FIRST_CONTACT;
+ int rc = 0;
+
+	/* if another process is already creating a connection, wait
+	 * until this process can either reuse an existing link group
+	 * or has finished creating a new one
+	 */
+ role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
+ rc = smc_vlan_by_tcpsk(smc->clcsock, &vlan_id);
+ if (rc)
+ return rc;
+
+ /* determine if an existing link group can be reused */
+ spin_lock(&smc_lgr_list.lock);
+ list_for_each_entry(lgr, &smc_lgr_list.list, list) {
+ write_lock_bh(&lgr->conns_lock);
+ if (!memcmp(lgr->peer_systemid, lcl->id_for_peer,
+ SMC_SYSTEMID_LEN) &&
+ !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_gid, &lcl->gid,
+ SMC_GID_SIZE) &&
+ !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_mac, lcl->mac,
+ sizeof(lcl->mac)) &&
+ (lgr->role == role) &&
+ (lgr->vlan_id == vlan_id)) {
+ /* link group found */
+ local_contact = SMC_REUSE_CONTACT;
+ conn->lgr = lgr;
+ smc_lgr_register_conn(conn); /* add smc conn to lgr */
+ write_unlock_bh(&lgr->conns_lock);
+ break;
+ }
+ write_unlock_bh(&lgr->conns_lock);
+ }
+ spin_unlock(&smc_lgr_list.lock);
+
+ if (role == SMC_CLNT) {
+ if (smc_lgr_clt_srv_disagree(conn, local_contact,
+ srv_first_contact)) {
+ /* send out_of_sync decline, reason synchr. error */
+ smc->sk.sk_err = ENOLINK;
+ return -ENOLINK;
+ }
+ }
+
+ if (local_contact == SMC_FIRST_CONTACT) {
+ rc = smc_lgr_create(smc, peer_in_addr, smcibdev, ibport,
+ lcl->id_for_peer, vlan_id);
+ if (rc)
+ goto out;
+ smc_lgr_register_conn(conn); /* add smc conn to lgr */
+ rc = smc_link_determine_gid(conn->lgr);
+ }
+ conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE;
+ conn->local_tx_ctrl.len = sizeof(struct smc_cdc_msg);
+
+out:
+ return rc ? rc : local_contact;
+}
+
+/* try to reuse a sndbuf description slot of the sndbufs list for a certain
+ * buf_size; if not available, return NULL
+ */
+static inline
+struct smc_buf_desc *smc_sndbuf_get_slot(struct smc_link_group *lgr,
+ int compressed_bufsize)
+{
+ struct smc_buf_desc *sndbuf_slot;
+
+ read_lock_bh(&lgr->sndbufs_lock);
+ list_for_each_entry(sndbuf_slot, &lgr->sndbufs[compressed_bufsize],
+ list) {
+ if (cmpxchg(&sndbuf_slot->used, 0, 1) == 0) {
+ read_unlock_bh(&lgr->sndbufs_lock);
+ return sndbuf_slot;
+ }
+ }
+ read_unlock_bh(&lgr->sndbufs_lock);
+ return NULL;
+}
+
+/* try to reuse an rmb description slot of the rmbs list for a certain
+ * rmbe_size; if not available, return NULL
+ */
+static inline
+struct smc_buf_desc *smc_rmb_get_slot(struct smc_link_group *lgr,
+ int compressed_bufsize)
+{
+ struct smc_buf_desc *rmb_slot;
+
+ read_lock_bh(&lgr->rmbs_lock);
+ list_for_each_entry(rmb_slot, &lgr->rmbs[compressed_bufsize],
+ list) {
+ if (cmpxchg(&rmb_slot->used, 0, 1) == 0) {
+ read_unlock_bh(&lgr->rmbs_lock);
+ return rmb_slot;
+ }
+ }
+ read_unlock_bh(&lgr->rmbs_lock);
+ return NULL;
+}
+
+/* create the tx buffer for an SMC socket */
+int smc_sndbuf_create(struct smc_sock *smc)
+{
+ struct smc_connection *conn = &smc->conn;
+ struct smc_link_group *lgr = conn->lgr;
+ int tmp_bufsize, tmp_bufsize_short;
+ struct smc_buf_desc *sndbuf_desc;
+ int rc;
+
+ /* use socket send buffer size (w/o overhead) as start value */
+ for (tmp_bufsize_short = smc_compress_bufsize(smc->sk.sk_sndbuf / 2);
+ tmp_bufsize_short >= 0; tmp_bufsize_short--) {
+ tmp_bufsize = smc_uncompress_bufsize(tmp_bufsize_short);
+ /* check for reusable sndbuf_slot in the link group */
+ sndbuf_desc = smc_sndbuf_get_slot(lgr, tmp_bufsize_short);
+ if (sndbuf_desc) {
+			memset(sndbuf_desc->cpu_addr, 0, tmp_bufsize);
+ break; /* found reusable slot */
+ }
+ /* try to alloc a new send buffer */
+ sndbuf_desc = kzalloc(sizeof(*sndbuf_desc), GFP_KERNEL);
+ if (!sndbuf_desc)
+ break; /* give up with -ENOMEM */
+ sndbuf_desc->cpu_addr = kzalloc(tmp_bufsize,
+ GFP_KERNEL | __GFP_NOWARN |
+ __GFP_NOMEMALLOC |
+ __GFP_NORETRY);
+ if (!sndbuf_desc->cpu_addr) {
+ kfree(sndbuf_desc);
+ /* if send buffer allocation has failed,
+ * try a smaller one
+ */
+ continue;
+ }
+ rc = smc_ib_buf_map(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
+ tmp_bufsize, sndbuf_desc,
+ DMA_TO_DEVICE);
+ if (rc) {
+ kfree(sndbuf_desc->cpu_addr);
+ kfree(sndbuf_desc);
+ continue; /* if mapping failed, try smaller one */
+ }
+ sndbuf_desc->used = 1;
+ write_lock_bh(&lgr->sndbufs_lock);
+ list_add(&sndbuf_desc->list,
+ &lgr->sndbufs[tmp_bufsize_short]);
+ write_unlock_bh(&lgr->sndbufs_lock);
+ }
+ if (sndbuf_desc && sndbuf_desc->cpu_addr) {
+ conn->sndbuf_desc = sndbuf_desc;
+ conn->sndbuf_size = tmp_bufsize;
+ smc->sk.sk_sndbuf = tmp_bufsize * 2;
+ atomic_set(&conn->sndbuf_space, tmp_bufsize);
+ return 0;
+ } else {
+ return -ENOMEM;
+ }
+}
+
+/* create the RMB for an SMC socket (even though the SMC protocol
+ * allows more than one RMB-element per RMB, the Linux implementation
+ * uses just one RMB-element per RMB, i.e. uses an extra RMB for every
+ * connection in a link group)
+ */
+int smc_rmb_create(struct smc_sock *smc)
+{
+ struct smc_connection *conn = &smc->conn;
+ struct smc_link_group *lgr = conn->lgr;
+ int tmp_bufsize, tmp_bufsize_short;
+ struct smc_buf_desc *rmb_desc;
+ int rc;
+
+ /* use socket recv buffer size (w/o overhead) as start value */
+ for (tmp_bufsize_short = smc_compress_bufsize(smc->sk.sk_rcvbuf / 2);
+ tmp_bufsize_short >= 0; tmp_bufsize_short--) {
+ tmp_bufsize = smc_uncompress_bufsize(tmp_bufsize_short);
+ /* check for reusable rmb_slot in the link group */
+ rmb_desc = smc_rmb_get_slot(lgr, tmp_bufsize_short);
+ if (rmb_desc) {
+			memset(rmb_desc->cpu_addr, 0, tmp_bufsize);
+ break; /* found reusable slot */
+ }
+ /* try to alloc a new RMB */
+ rmb_desc = kzalloc(sizeof(*rmb_desc), GFP_KERNEL);
+ if (!rmb_desc)
+ break; /* give up with -ENOMEM */
+ rmb_desc->cpu_addr = kzalloc(tmp_bufsize,
+ GFP_KERNEL | __GFP_NOWARN |
+ __GFP_NOMEMALLOC |
+ __GFP_NORETRY);
+ if (!rmb_desc->cpu_addr) {
+ kfree(rmb_desc);
+ /* if RMB allocation has failed,
+ * try a smaller one
+ */
+ continue;
+ }
+ rc = smc_ib_buf_map(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
+ tmp_bufsize, rmb_desc,
+ DMA_FROM_DEVICE);
+ if (rc) {
+ kfree(rmb_desc->cpu_addr);
+ kfree(rmb_desc);
+ continue; /* if mapping failed, try smaller one */
+ }
+ rc = smc_ib_get_memory_region(lgr->lnk[SMC_SINGLE_LINK].roce_pd,
+ IB_ACCESS_REMOTE_WRITE |
+ IB_ACCESS_LOCAL_WRITE,
+ &rmb_desc->mr_rx[SMC_SINGLE_LINK]);
+ if (rc) {
+ smc_ib_buf_unmap(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
+ tmp_bufsize, rmb_desc,
+ DMA_FROM_DEVICE);
+ kfree(rmb_desc->cpu_addr);
+ kfree(rmb_desc);
+ continue;
+ }
+ rmb_desc->used = 1;
+ write_lock_bh(&lgr->rmbs_lock);
+ list_add(&rmb_desc->list,
+ &lgr->rmbs[tmp_bufsize_short]);
+ write_unlock_bh(&lgr->rmbs_lock);
+ }
+ if (rmb_desc && rmb_desc->cpu_addr) {
+ conn->rmb_desc = rmb_desc;
+ conn->rmbe_size = tmp_bufsize;
+ conn->rmbe_size_short = tmp_bufsize_short;
+ smc->sk.sk_rcvbuf = tmp_bufsize * 2;
+ atomic_set(&conn->bytes_to_rcv, 0);
+ conn->rmbe_update_limit =
+ min_t(int, conn->rmbe_size / 10,
+ SOCK_MIN_SNDBUF / 2);
+ return 0;
+ } else {
+ return -ENOMEM;
+ }
+}
+
+static inline int smc_rmb_reserve_rtoken_idx(struct smc_link_group *lgr)
+{
+ int i;
+
+ for_each_clear_bit(i, lgr->rtokens_used_mask, SMC_RMBS_PER_LGR_MAX) {
+ if (!test_and_set_bit(i, lgr->rtokens_used_mask))
+ return i;
+ }
+ return -ENOSPC;
+}
+
+/* save rkey and dma_addr received from peer during clc handshake */
+int smc_rmb_rtoken_handling(struct smc_connection *conn,
+ struct smc_clc_msg_accept_confirm *clc)
+{
+ u64 dma_addr = be64_to_cpu(clc->rmb_dma_addr);
+ struct smc_link_group *lgr = conn->lgr;
+ u32 rkey = ntohl(clc->rmb_rkey);
+ int i;
+
+ for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) {
+ if ((lgr->rtokens[i][SMC_SINGLE_LINK].rkey == rkey) &&
+ test_bit(i, lgr->rtokens_used_mask)) {
+ conn->rtoken_idx = i;
+ return 0;
+ }
+ }
+ conn->rtoken_idx = smc_rmb_reserve_rtoken_idx(lgr);
+ if (conn->rtoken_idx < 0)
+ return conn->rtoken_idx;
+ lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].rkey = rkey;
+ lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].dma_addr = dma_addr;
+ return 0;
+}
diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
new file mode 100644
index 000000000000..4b2878cf8dfd
--- /dev/null
+++ b/net/smc/smc_core.h
@@ -0,0 +1,178 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Definitions for SMC Connections, Link Groups and Links
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#ifndef _SMC_CORE_H
+#define _SMC_CORE_H
+
+#include <rdma/ib_verbs.h>
+
+#include "smc.h"
+#include "smc_ib.h"
+
+#define SMC_BUF_MIN_SIZE 16384 /* minimum size of an RMB */
+#define SMC_RMBS_PER_LGR_MAX 255 /* max. # of RMBs per link group */
+
+struct smc_lgr_list { /* anchor for the list of link groups */
+ struct list_head list;
+ spinlock_t lock; /* protects list of link groups */
+	u32 conn_pending; /* delay 2nd connection creation */
+};
+
+extern struct smc_lgr_list smc_lgr_list; /* list of link groups */
+
+enum smc_lgr_role { /* possible roles of a link group */
+ SMC_CLNT, /* client */
+ SMC_SERV /* server */
+};
+
+#define SMC_WR_BUF_SIZE 48 /* size of work request buffer */
+
+struct smc_wr_buf {
+ u8 raw[SMC_WR_BUF_SIZE];
+} __packed;
+
+struct smc_link {
+ struct smc_ib_device *smcibdev; /* ib-device */
+ u8 ibport; /* port - values 1 | 2 */
+ struct ib_pd *roce_pd; /* IB protection domain,
+ * unique for every RoCE QP
+ */
+ struct ib_qp *roce_qp; /* IB queue pair */
+ struct ib_qp_attr qp_attr; /* IB queue pair attributes */
+
+ struct ib_mr *mr_tx; /* send IB DMA memory region */
+
+ struct smc_wr_buf *wr_tx_bufs; /* WR send payload buffers */
+ struct ib_send_wr *wr_tx_ibs; /* WR send meta data */
+ struct ib_sge *wr_tx_sges; /* WR send gather meta data */
+ struct smc_wr_tx_pend *wr_tx_pends; /* WR send waiting for CQE */
+ /* above four vectors have wr_tx_cnt elements and use the same index */
+ dma_addr_t wr_tx_dma_addr; /* DMA address of wr_tx_bufs */
+ atomic64_t wr_tx_id; /* seq # of last sent WR */
+ unsigned long *wr_tx_mask; /* bit mask of used indexes */
+ u32 wr_tx_cnt; /* number of WR send buffers */
+ wait_queue_head_t wr_tx_wait; /* wait for free WR send buf */
+
+ struct smc_wr_buf *wr_rx_bufs; /* WR recv payload buffers */
+ struct ib_recv_wr *wr_rx_ibs; /* WR recv meta data */
+ struct ib_sge *wr_rx_sges; /* WR recv scatter meta data */
+ /* above three vectors have wr_rx_cnt elements and use the same index */
+ dma_addr_t wr_rx_dma_addr; /* DMA address of wr_rx_bufs */
+ u64 wr_rx_id; /* seq # of last recv WR */
+ u32 wr_rx_cnt; /* number of WR recv buffers */
+
+ union ib_gid gid; /* gid matching used vlan id */
+ u32 peer_qpn; /* QP number of peer */
+ enum ib_mtu path_mtu; /* used mtu */
+ enum ib_mtu peer_mtu; /* mtu size of peer */
+ u32 psn_initial; /* QP tx initial packet seqno */
+ u32 peer_psn; /* QP rx initial packet seqno */
+ u8 peer_mac[ETH_ALEN]; /* = gid[8:10||13:15] */
+	u8 peer_gid[sizeof(union ib_gid)]; /* gid of peer */
+ u8 link_id; /* unique # within link group */
+ struct completion llc_confirm; /* wait for rx of conf link */
+	struct completion llc_confirm_resp; /* wait for rx of cnf link rsp */
+};
+
+/* For now we just allow one parallel link per link group. The SMC protocol
+ * allows more (up to 8).
+ */
+#define SMC_LINKS_PER_LGR_MAX 1
+#define SMC_SINGLE_LINK 0
+
+#define SMC_FIRST_CONTACT 1 /* first contact to a peer */
+#define SMC_REUSE_CONTACT 0 /* follow-on contact to a peer*/
+
+/* tx/rx buffer list element for sndbufs list and rmbs list of a lgr */
+struct smc_buf_desc {
+ struct list_head list;
+ u64 dma_addr[SMC_LINKS_PER_LGR_MAX];
+ /* mapped address of buffer */
+ void *cpu_addr; /* virtual address of buffer */
+ struct ib_mr *mr_rx[SMC_LINKS_PER_LGR_MAX]; /* for rmb only:
+ * rkey provided to peer
+ */
+ u32 used; /* currently used / unused */
+};
+
+struct smc_rtoken { /* address/key of remote RMB */
+ u64 dma_addr;
+ u32 rkey;
+};
+
+struct smc_link_group {
+ struct list_head list;
+ enum smc_lgr_role role; /* client or server */
+ __be32 daddr; /* destination ip address */
+ struct smc_link lnk[SMC_LINKS_PER_LGR_MAX]; /* smc link */
+ char peer_systemid[SMC_SYSTEMID_LEN];
+ /* unique system_id of peer */
+ struct rb_root conns_all; /* connection tree */
+ rwlock_t conns_lock; /* protects conns_all */
+ unsigned int conns_num; /* current # of connections */
+ unsigned short vlan_id; /* vlan id of link group */
+
+ struct list_head sndbufs[SMC_RMB_SIZES]; /* tx buffers */
+ rwlock_t sndbufs_lock; /* protects tx buffers */
+ struct list_head rmbs[SMC_RMB_SIZES]; /* rx buffers */
+ rwlock_t rmbs_lock; /* protects rx buffers */
+ struct smc_rtoken rtokens[SMC_RMBS_PER_LGR_MAX]
+ [SMC_LINKS_PER_LGR_MAX];
+ /* remote addr/key pairs */
+ unsigned long rtokens_used_mask[BITS_TO_LONGS(
+ SMC_RMBS_PER_LGR_MAX)];
+ /* used rtoken elements */
+
+ u32 id; /* unique lgr id */
+ struct delayed_work free_work; /* delayed freeing of an lgr */
+};
+
+/* Find the connection associated with the given alert token in the link group.
+ * To use rbtrees we have to implement our own search core.
+ * Requires @conns_lock
+ * @token alert token to search for
+ * @lgr link group to search in
+ * Returns connection associated with token if found, NULL otherwise.
+ */
+static inline struct smc_connection *smc_lgr_find_conn(
+ u32 token, struct smc_link_group *lgr)
+{
+ struct smc_connection *res = NULL;
+ struct rb_node *node;
+
+ node = lgr->conns_all.rb_node;
+ while (node) {
+ struct smc_connection *cur = rb_entry(node,
+ struct smc_connection, alert_node);
+
+ if (cur->alert_token_local > token) {
+ node = node->rb_left;
+ } else {
+ if (cur->alert_token_local < token) {
+ node = node->rb_right;
+ } else {
+ res = cur;
+ break;
+ }
+ }
+ }
+
+ return res;
+}
+
+struct smc_clc_msg_accept_confirm;
+
+void smc_lgr_free(struct smc_link_group *);
+int smc_sndbuf_create(struct smc_sock *);
+int smc_rmb_create(struct smc_sock *);
+int smc_rmb_rtoken_handling(struct smc_connection *,
+ struct smc_clc_msg_accept_confirm *);
+
+#endif
diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c
new file mode 100644
index 000000000000..4d5aeda4297f
--- /dev/null
+++ b/net/smc/smc_ib.c
@@ -0,0 +1,471 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * IB infrastructure:
+ * Establish SMC-R as an Infiniband Client to be notified about added and
+ * removed IB devices of type RDMA.
+ * Determine device and port characteristics for these IB devices.
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#include <linux/random.h>
+#include <rdma/ib_verbs.h>
+
+#include "smc_pnet.h"
+#include "smc_ib.h"
+#include "smc_core.h"
+#include "smc_wr.h"
+#include "smc.h"
+
+#define SMC_QP_MIN_RNR_TIMER 5
+#define SMC_QP_TIMEOUT 15 /* 4.096 usec * 2^timeout, i.e. ~134 msec */
+#define SMC_QP_RETRY_CNT 7 /* 7: infinite */
+#define SMC_QP_RNR_RETRY 7 /* 7: infinite */
+
+struct smc_ib_devices smc_ib_devices = { /* smc-registered ib devices */
+ .lock = __SPIN_LOCK_UNLOCKED(smc_ib_devices.lock),
+ .list = LIST_HEAD_INIT(smc_ib_devices.list),
+};
+
+#define SMC_LOCAL_SYSTEMID_RESET "%%%%%%%"
+
+u8 local_systemid[SMC_SYSTEMID_LEN] = SMC_LOCAL_SYSTEMID_RESET; /* unique system
+ * identifier
+ */
+
+void smc_ib_dereg_memory_region(struct ib_mr *mr)
+{
+ ib_dereg_mr(mr);
+}
+
+int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags,
+ struct ib_mr **mr)
+{
+ if (*mr)
+ return 0; /* already done */
+ /* obtain unique key -
+ * next invocation of ib_get_dma_mr returns a different key!
+ */
+ *mr = ib_get_dma_mr(pd, access_flags);
+ if (IS_ERR(*mr))
+ *mr = NULL;
+ return PTR_ERR_OR_ZERO(*mr);
+}
+
+static int smc_ib_modify_qp_init(struct smc_link *lnk)
+{
+ struct ib_qp_attr qp_attr;
+ int rc = 0;
+
+ memset(&qp_attr, 0, sizeof(qp_attr));
+ qp_attr.qp_state = IB_QPS_INIT;
+ qp_attr.pkey_index = 0;
+ qp_attr.port_num = lnk->ibport;
+ qp_attr.qp_access_flags = IB_ACCESS_LOCAL_WRITE
+ | IB_ACCESS_REMOTE_WRITE;
+ rc = ib_modify_qp(lnk->roce_qp, &qp_attr,
+ IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_ACCESS_FLAGS |
+ IB_QP_PORT);
+ return rc;
+}
+
+static int smc_ib_modify_qp_rtr(struct smc_link *lnk)
+{
+ enum ib_qp_attr_mask qp_attr_mask =
+ IB_QP_STATE | IB_QP_AV | IB_QP_PATH_MTU | IB_QP_DEST_QPN |
+ IB_QP_RQ_PSN | IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER;
+ struct ib_qp_attr qp_attr;
+ int rc = 0;
+
+ memset(&qp_attr, 0, sizeof(qp_attr));
+ qp_attr.qp_state = IB_QPS_RTR;
+ qp_attr.path_mtu = min(lnk->path_mtu, lnk->peer_mtu);
+ qp_attr.ah_attr.port_num = lnk->ibport;
+ qp_attr.ah_attr.ah_flags = IB_AH_GRH;
+ qp_attr.ah_attr.grh.hop_limit = 1;
+ memcpy(&qp_attr.ah_attr.grh.dgid, lnk->peer_gid,
+ sizeof(lnk->peer_gid));
+ memcpy(&qp_attr.ah_attr.dmac, lnk->peer_mac,
+ sizeof(lnk->peer_mac));
+ qp_attr.dest_qp_num = lnk->peer_qpn;
+ qp_attr.rq_psn = lnk->peer_psn; /* starting receive packet seq # */
+ qp_attr.max_dest_rd_atomic = 1; /* max # of resources for incoming
+ * requests
+ */
+ qp_attr.min_rnr_timer = SMC_QP_MIN_RNR_TIMER;
+
+ rc = ib_modify_qp(lnk->roce_qp, &qp_attr, qp_attr_mask);
+ return rc;
+}
+
+int smc_ib_modify_qp_rts(struct smc_link *lnk)
+{
+ struct ib_qp_attr qp_attr;
+ int rc = 0;
+
+ memset(&qp_attr, 0, sizeof(qp_attr));
+ qp_attr.qp_state = IB_QPS_RTS;
+ qp_attr.timeout = SMC_QP_TIMEOUT; /* local ack timeout */
+ qp_attr.retry_cnt = SMC_QP_RETRY_CNT; /* retry count */
+ qp_attr.rnr_retry = SMC_QP_RNR_RETRY; /* RNR retries, 7=infinite */
+ qp_attr.sq_psn = lnk->psn_initial; /* starting send packet seq # */
+ qp_attr.max_rd_atomic = 1; /* # of outstanding RDMA reads and
+ * atomic ops allowed
+ */
+ rc = ib_modify_qp(lnk->roce_qp, &qp_attr,
+ IB_QP_STATE | IB_QP_TIMEOUT | IB_QP_RETRY_CNT |
+ IB_QP_SQ_PSN | IB_QP_RNR_RETRY |
+ IB_QP_MAX_QP_RD_ATOMIC);
+ return rc;
+}
+
+int smc_ib_ready_link(struct smc_link *lnk)
+{
+ struct smc_link_group *lgr =
+ container_of(lnk, struct smc_link_group, lnk[0]);
+ int rc = 0;
+
+ rc = smc_ib_modify_qp_init(lnk);
+ if (rc)
+ goto out;
+
+ rc = smc_ib_modify_qp_rtr(lnk);
+ if (rc)
+ goto out;
+ smc_wr_remember_qp_attr(lnk);
+ rc = ib_req_notify_cq(lnk->smcibdev->roce_cq_recv,
+ IB_CQ_SOLICITED_MASK);
+ if (rc)
+ goto out;
+ rc = smc_wr_rx_post_init(lnk);
+ if (rc)
+ goto out;
+ smc_wr_remember_qp_attr(lnk);
+
+ if (lgr->role == SMC_SERV) {
+ rc = smc_ib_modify_qp_rts(lnk);
+ if (rc)
+ goto out;
+ smc_wr_remember_qp_attr(lnk);
+ }
+out:
+ return rc;
+}
+
+/* process context wrapper, since smc_ib_remember_port_attr() might sleep */
+static void smc_ib_port_event_work(struct work_struct *work)
+{
+ struct smc_ib_device *smcibdev = container_of(
+ work, struct smc_ib_device, port_event_work);
+ u8 port_idx;
+
+ for_each_set_bit(port_idx, &smcibdev->port_event_mask, SMC_MAX_PORTS) {
+ smc_ib_remember_port_attr(smcibdev, port_idx + 1);
+ clear_bit(port_idx, &smcibdev->port_event_mask);
+ }
+}
+
+/* can be called in IRQ context */
+static void smc_ib_global_event_handler(struct ib_event_handler *handler,
+ struct ib_event *ibevent)
+{
+ struct smc_ib_device *smcibdev;
+ u8 port_idx;
+
+ smcibdev = container_of(handler, struct smc_ib_device, event_handler);
+ switch (ibevent->event) {
+ case IB_EVENT_PORT_ERR:
+ port_idx = ibevent->element.port_num - 1;
+ set_bit(port_idx, &smcibdev->port_event_mask);
+ schedule_work(&smcibdev->port_event_work);
+ /* fall through */
+ case IB_EVENT_DEVICE_FATAL:
+ /* tbd in follow-on patch:
+ * abnormal close of corresponding connections
+ */
+ break;
+ case IB_EVENT_PORT_ACTIVE:
+ port_idx = ibevent->element.port_num - 1;
+ set_bit(port_idx, &smcibdev->port_event_mask);
+ schedule_work(&smcibdev->port_event_work);
+ break;
+ default:
+ break;
+ }
+}
+
+long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev)
+{
+ struct ib_cq_init_attr cqattr = {
+ .cqe = SMC_WR_MAX_CQE, .comp_vector = 0 };
+ long rc = 0;
+
+ smcibdev->roce_cq_send = ib_create_cq(smcibdev->ibdev,
+ smc_wr_tx_cq_handler, NULL,
+ smcibdev, &cqattr);
+ if (IS_ERR(smcibdev->roce_cq_send)) {
+ rc = PTR_ERR(smcibdev->roce_cq_send);
+ goto err;
+ }
+ smcibdev->roce_cq_recv = ib_create_cq(smcibdev->ibdev,
+ smc_wr_rx_cq_handler, NULL,
+ smcibdev, &cqattr);
+ if (IS_ERR(smcibdev->roce_cq_recv)) {
+ rc = PTR_ERR(smcibdev->roce_cq_recv);
+ goto err_cq;
+ }
+ INIT_IB_EVENT_HANDLER(&smcibdev->event_handler, smcibdev->ibdev,
+ smc_ib_global_event_handler);
+ ib_register_event_handler(&smcibdev->event_handler);
+ smc_wr_add_dev(smcibdev);
+ return rc;
+
+err_cq:
+ ib_destroy_cq(smcibdev->roce_cq_send);
+err:
+ return rc;
+}
+
+void smc_ib_dealloc_protection_domain(struct smc_link *lnk)
+{
+ ib_dealloc_pd(lnk->roce_pd);
+ lnk->roce_pd = NULL;
+}
+
+int smc_ib_create_protection_domain(struct smc_link *lnk)
+{
+ lnk->roce_pd = ib_alloc_pd(lnk->smcibdev->ibdev);
+ if (IS_ERR(lnk->roce_pd)) {
+ lnk->roce_pd = NULL;
+ return (int)PTR_ERR(lnk->roce_pd);
+ }
+ return 0;
+}
+
+static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv)
+{
+ switch (ibevent->event) {
+ case IB_EVENT_DEVICE_FATAL:
+ case IB_EVENT_GID_CHANGE:
+ case IB_EVENT_PORT_ERR:
+ case IB_EVENT_QP_ACCESS_ERR:
+ /* tbd in follow-on patch:
+ * abnormal close of corresponding connections
+ */
+ break;
+ default:
+ break;
+ }
+}
+
+void smc_ib_destroy_queue_pair(struct smc_link *lnk)
+{
+ ib_destroy_qp(lnk->roce_qp);
+ lnk->roce_qp = NULL;
+}
+
+/* create a queue pair within the protection domain for a link */
+int smc_ib_create_queue_pair(struct smc_link *lnk)
+{
+ struct ib_qp_init_attr qp_attr = {
+ .event_handler = smc_ib_qp_event_handler,
+ .qp_context = lnk,
+ .send_cq = lnk->smcibdev->roce_cq_send,
+ .recv_cq = lnk->smcibdev->roce_cq_recv,
+ .srq = NULL,
+ .cap = {
+ .max_send_wr = SMC_WR_BUF_CNT,
+ /* include unsolicited rdma_writes as well,
+ * there are max. 2 RDMA_WRITE per 1 WR_SEND
+ */
+ .max_recv_wr = SMC_WR_BUF_CNT * 3,
+ .max_send_sge = SMC_IB_MAX_SEND_SGE,
+ .max_recv_sge = 1,
+ .max_inline_data = SMC_WR_TX_SIZE,
+ },
+ .sq_sig_type = IB_SIGNAL_REQ_WR,
+ .qp_type = IB_QPT_RC,
+ };
+
+ lnk->roce_qp = ib_create_qp(lnk->roce_pd, &qp_attr);
+ if (IS_ERR(lnk->roce_qp)) {
+ lnk->roce_qp = NULL;
+ return (int)PTR_ERR(lnk->roce_qp);
+ }
+ smc_wr_remember_qp_attr(lnk);
+ return 0;
+}
+
+/* map a new TX or RX buffer to DMA */
+int smc_ib_buf_map(struct smc_ib_device *smcibdev, int buf_size,
+ struct smc_buf_desc *buf_slot,
+ enum dma_data_direction data_direction)
+{
+ int rc = 0;
+
+ if (buf_slot->dma_addr[SMC_SINGLE_LINK])
+ return rc; /* already mapped */
+ buf_slot->dma_addr[SMC_SINGLE_LINK] =
+ ib_dma_map_single(smcibdev->ibdev, buf_slot->cpu_addr,
+ buf_size, data_direction);
+ if (ib_dma_mapping_error(smcibdev->ibdev,
+ buf_slot->dma_addr[SMC_SINGLE_LINK]))
+ rc = -EIO;
+ return rc;
+}
+
+void smc_ib_buf_unmap(struct smc_ib_device *smcibdev, int buf_size,
+ struct smc_buf_desc *buf_slot,
+ enum dma_data_direction data_direction)
+{
+ if (!buf_slot->used)
+ return; /* already unmapped */
+	ib_dma_unmap_single(smcibdev->ibdev,
+			    buf_slot->dma_addr[SMC_SINGLE_LINK],
+			    buf_size, data_direction);
+ buf_slot->dma_addr[SMC_SINGLE_LINK] = 0;
+}
+
+static int smc_ib_fill_gid_and_mac(struct smc_ib_device *smcibdev, u8 ibport)
+{
+ struct net_device *ndev;
+ int rc;
+
+ rc = ib_query_gid(smcibdev->ibdev, ibport, 0,
+ &smcibdev->gid[ibport - 1], NULL);
+ /* the SMC protocol requires specification of the roce MAC address;
+ * if net_device cannot be determined, it can be derived from gid 0
+ */
+ ndev = smcibdev->ibdev->get_netdev(smcibdev->ibdev, ibport);
+	if (ndev) {
+		memcpy(smcibdev->mac[ibport - 1], ndev->dev_addr, ETH_ALEN);
+		dev_put(ndev); /* reference taken by get_netdev() */
+	} else if (!rc) {
+ memcpy(&smcibdev->mac[ibport - 1][0],
+ &smcibdev->gid[ibport - 1].raw[8], 3);
+ memcpy(&smcibdev->mac[ibport - 1][3],
+ &smcibdev->gid[ibport - 1].raw[13], 3);
+ smcibdev->mac[ibport - 1][0] &= ~0x02;
+ }
+ return rc;
+}
+
+/* Create an identifier unique for this instance of SMC-R.
+ * The MAC-address of the first active registered IB device
+ * plus a random 2-byte number is used to create this identifier.
+ * This name is delivered to the peer during connection initialization.
+ */
+static inline void smc_ib_define_local_systemid(struct smc_ib_device *smcibdev,
+ u8 ibport)
+{
+ memcpy(&local_systemid[2], &smcibdev->mac[ibport - 1],
+ sizeof(smcibdev->mac[ibport - 1]));
+ get_random_bytes(&local_systemid[0], 2);
+}
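+
+/* Example layout after smc_ib_define_local_systemid() (random bytes chosen
+ * for illustration): local_systemid[0..1] = 0x3a 0x91 (random),
+ * local_systemid[2..7] = MAC of the port, e.g. 02:00:00:00:00:01.
+ */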
+
+bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport)
+{
+ return smcibdev->pattr[ibport - 1].state == IB_PORT_ACTIVE;
+}
+
+int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport)
+{
+ int rc;
+
+ memset(&smcibdev->pattr[ibport - 1], 0,
+ sizeof(smcibdev->pattr[ibport - 1]));
+ rc = ib_query_port(smcibdev->ibdev, ibport,
+ &smcibdev->pattr[ibport - 1]);
+ if (rc)
+ goto out;
+ smc_ib_fill_gid_and_mac(smcibdev, ibport);
+ if (!strncmp(local_systemid, SMC_LOCAL_SYSTEMID_RESET,
+ sizeof(local_systemid)) &&
+ smc_ib_port_active(smcibdev, ibport))
+ /* create unique system identifier */
+ smc_ib_define_local_systemid(smcibdev, ibport);
+out:
+ return rc;
+}
+
+static void smc_ib_cleanup_per_ibdev(struct smc_ib_device *smcibdev)
+{
+ ib_destroy_cq(smcibdev->roce_cq_send);
+ ib_destroy_cq(smcibdev->roce_cq_recv);
+ ib_unregister_event_handler(&smcibdev->event_handler);
+ smc_wr_remove_dev(smcibdev);
+}
+
+static struct ib_client smc_ib_client;
+
+/* callback function for ib_register_client() */
+static void smc_ib_add_dev(struct ib_device *ibdev)
+{
+ struct smc_ib_device *smcibdev;
+ int i;
+
+ if (ibdev->node_type != RDMA_NODE_IB_CA)
+ return;
+
+ smcibdev = kzalloc(sizeof(*smcibdev), GFP_KERNEL);
+ if (!smcibdev)
+ return;
+
+ smcibdev->ibdev = ibdev;
+ INIT_WORK(&smcibdev->port_event_work, smc_ib_port_event_work);
+
+ for (i = 1; i <= SMC_MAX_PORTS; i++) {
+ if (smc_pnet_exists_in_table(smcibdev, i) &&
+ !smcibdev->initialized) {
+ /* dev hotplug: ib device and port is in pnet table */
+ if (smc_ib_remember_port_attr(smcibdev, i)) {
+ kfree(smcibdev);
+ return;
+ }
+ if (smc_ib_setup_per_ibdev(smcibdev)) {
+ kfree(smcibdev);
+ return;
+ }
+ smcibdev->initialized = 1;
+ break;
+ }
+ }
+ spin_lock(&smc_ib_devices.lock);
+ list_add_tail(&smcibdev->list, &smc_ib_devices.list);
+ spin_unlock(&smc_ib_devices.lock);
+ ib_set_client_data(ibdev, &smc_ib_client, smcibdev);
+}
+
+/* callback function for ib_register_client() */
+static void smc_ib_remove_dev(struct ib_device *ibdev, void *client_data)
+{
+ struct smc_ib_device *smcibdev;
+
+ smcibdev = ib_get_client_data(ibdev, &smc_ib_client);
+ ib_set_client_data(ibdev, &smc_ib_client, NULL);
+ if (smcibdev->initialized)
+ smc_ib_cleanup_per_ibdev(smcibdev);
+ spin_lock(&smc_ib_devices.lock);
+ list_del_init(&smcibdev->list); /* remove from smc_ib_devices */
+ spin_unlock(&smc_ib_devices.lock);
+ cancel_work_sync(&smcibdev->port_event_work);
+ kfree(smcibdev);
+}
+
+static struct ib_client smc_ib_client = {
+ .name = "smc_ib",
+ .add = smc_ib_add_dev,
+ .remove = smc_ib_remove_dev,
+};
+
+int __init smc_ib_register_client(void)
+{
+ return ib_register_client(&smc_ib_client);
+}
+
+void __exit smc_ib_unregister_client(void)
+{
+ ib_unregister_client(&smc_ib_client);
+}
diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h
new file mode 100644
index 000000000000..387960520fd5
--- /dev/null
+++ b/net/smc/smc_ib.h
@@ -0,0 +1,66 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Definitions for IB environment
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ursula.braun@linux.vnet.ibm.com>
+ */
+
+#ifndef _SMC_IB_H
+#define _SMC_IB_H
+
+#include <rdma/ib_verbs.h>
+
+#define SMC_MAX_PORTS 2 /* Max # of ports */
+#define SMC_GID_SIZE sizeof(union ib_gid)
+
+#define SMC_IB_MAX_SEND_SGE 2
+
+struct smc_ib_devices { /* list of smc ib devices definition */
+ struct list_head list;
+ spinlock_t lock; /* protects list of smc ib devices */
+};
+
+extern struct smc_ib_devices smc_ib_devices; /* list of smc ib devices */
+
+struct smc_ib_device { /* ib-device infos for smc */
+ struct list_head list;
+ struct ib_device *ibdev;
+ struct ib_port_attr pattr[SMC_MAX_PORTS]; /* ib dev. port attrs */
+ struct ib_event_handler event_handler; /* global ib_event handler */
+ struct ib_cq *roce_cq_send; /* send completion queue */
+ struct ib_cq *roce_cq_recv; /* recv completion queue */
+ struct tasklet_struct send_tasklet; /* called by send cq handler */
+ struct tasklet_struct recv_tasklet; /* called by recv cq handler */
+ char mac[SMC_MAX_PORTS][6]; /* mac address per port*/
+ union ib_gid gid[SMC_MAX_PORTS]; /* gid per port */
+ u8 initialized : 1; /* ib dev CQ, evthdl done */
+ struct work_struct port_event_work;
+ unsigned long port_event_mask;
+};
+
+struct smc_sock;
+struct smc_buf_desc;
+struct smc_link;
+
+int __init smc_ib_register_client(void);
+void __exit smc_ib_unregister_client(void);
+bool smc_ib_port_active(struct smc_ib_device *, u8);
+int smc_ib_remember_port_attr(struct smc_ib_device *, u8);
+int smc_ib_buf_map(struct smc_ib_device *, int, struct smc_buf_desc *,
+ enum dma_data_direction);
+void smc_ib_buf_unmap(struct smc_ib_device *, int, struct smc_buf_desc *,
+ enum dma_data_direction);
+void smc_ib_dealloc_protection_domain(struct smc_link *);
+int smc_ib_create_protection_domain(struct smc_link *);
+void smc_ib_destroy_queue_pair(struct smc_link *);
+int smc_ib_create_queue_pair(struct smc_link *);
+void smc_ib_dereg_memory_region(struct ib_mr *);
+int smc_ib_get_memory_region(struct ib_pd *, int, struct ib_mr **);
+int smc_ib_ready_link(struct smc_link *);
+int smc_ib_modify_qp_rts(struct smc_link *);
+long smc_ib_setup_per_ibdev(struct smc_ib_device *);
+
+#endif
diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c
new file mode 100644
index 000000000000..f70f9b96e21c
--- /dev/null
+++ b/net/smc/smc_llc.c
@@ -0,0 +1,152 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Link Layer Control (LLC)
+ *
+ * For now, we only support the necessary "confirm link" functionality
+ * which happens for the first RoCE link after successful CLC handshake.
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Klaus Wacker <Klaus.Wacker@de.ibm.com>
+ * Ursula Braun <ursula.braun@de.ibm.com>
+ */
+
+#include <net/tcp.h>
+#include <rdma/ib_verbs.h>
+
+#include "smc.h"
+#include "smc_core.h"
+#include "smc_clc.h"
+#include "smc_llc.h"
+
+struct smc_llc_tx_pend {
+};
+
+/* handler for send/transmission completion of an LLC msg */
+static void smc_llc_tx_handler(struct smc_wr_tx_pend_priv *pend,
+ struct smc_link *link,
+ enum ib_wc_status wc_status)
+{
+ /* future work: handle wc_status error for recovery and failover */
+}
+
+/**
+ * smc_llc_add_pending_send() - add LLC control message to pending WQE transmits
+ * @link: Pointer to SMC link used for sending LLC control message.
+ * @wr_buf: Out variable returning pointer to work request payload buffer.
+ * @pend: Out variable returning pointer to private pending WR tracking.
+ * It's the context the transmit complete handler will get.
+ *
+ * Reserves and pre-fills an entry for a pending work request send/tx.
+ * Used by mid-level smc_llc_send_msg() to prepare for later actual send/tx.
+ * Can sleep due to smc_wr_tx_get_free_slot (if not in softirq context).
+ *
+ * Return: 0 on success, otherwise an error value.
+ */
+static int smc_llc_add_pending_send(struct smc_link *link,
+ struct smc_wr_buf **wr_buf,
+ struct smc_wr_tx_pend_priv **pend)
+{
+ int rc;
+
+ rc = smc_wr_tx_get_free_slot(link, smc_llc_tx_handler, wr_buf, pend);
+ if (rc < 0)
+ return rc;
+ BUILD_BUG_ON_MSG(
+ sizeof(union smc_llc_msg) > SMC_WR_BUF_SIZE,
+ "must increase SMC_WR_BUF_SIZE to at least sizeof(struct smc_llc_msg)");
+ BUILD_BUG_ON_MSG(
+ sizeof(union smc_llc_msg) != SMC_WR_TX_SIZE,
+ "must adapt SMC_WR_TX_SIZE to sizeof(struct smc_llc_msg); if not all smc_wr upper layer protocols use the same message size any more, must start to set link->wr_tx_sges[i].length on each individual smc_wr_tx_send()");
+ BUILD_BUG_ON_MSG(
+ sizeof(struct smc_llc_tx_pend) > SMC_WR_TX_PEND_PRIV_SIZE,
+ "must increase SMC_WR_TX_PEND_PRIV_SIZE to at least sizeof(struct smc_llc_tx_pend)");
+ return 0;
+}
+
+/* high-level API to send LLC confirm link */
+int smc_llc_send_confirm_link(struct smc_link *link, u8 mac[],
+ union ib_gid *gid,
+ enum smc_llc_reqresp reqresp)
+{
+ struct smc_link_group *lgr = container_of(link, struct smc_link_group,
+ lnk[SMC_SINGLE_LINK]);
+ struct smc_llc_msg_confirm_link *confllc;
+ struct smc_wr_tx_pend_priv *pend;
+ struct smc_wr_buf *wr_buf;
+ int rc;
+
+ rc = smc_llc_add_pending_send(link, &wr_buf, &pend);
+ if (rc)
+ return rc;
+ confllc = (struct smc_llc_msg_confirm_link *)wr_buf;
+ memset(confllc, 0, sizeof(*confllc));
+ confllc->hd.common.type = SMC_LLC_CONFIRM_LINK;
+ confllc->hd.length = sizeof(struct smc_llc_msg_confirm_link);
+ if (reqresp == SMC_LLC_RESP)
+ confllc->hd.flags |= SMC_LLC_FLAG_RESP;
+ memcpy(confllc->sender_mac, mac, ETH_ALEN);
+ memcpy(&confllc->sender_gid, gid, SMC_GID_SIZE);
+ hton24(confllc->sender_qp_num, link->roce_qp->qp_num);
+ /* confllc->link_num = SMC_SINGLE_LINK; already done by memset above */
+ confllc->link_uid = htonl(lgr->id);
+ confllc->max_links = SMC_LINKS_PER_LGR_MAX;
+ /* send llc message */
+ rc = smc_wr_tx_send(link, NULL, pend);
+ return rc;
+}
+
+static void smc_llc_rx_confirm_link(struct smc_link *link,
+ struct smc_llc_msg_confirm_link *llc)
+{
+ struct smc_link_group *lgr;
+
+ lgr = container_of(link, struct smc_link_group, lnk[SMC_SINGLE_LINK]);
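+ /* the server waits for the client's response; the client takes
+ * the link number assigned by the server from the initial
+ * confirm message
+ */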
+ if (llc->hd.flags & SMC_LLC_FLAG_RESP) {
+ if (lgr->role == SMC_SERV)
+ complete(&link->llc_confirm_resp);
+ } else {
+ if (lgr->role == SMC_CLNT) {
+ link->link_id = llc->link_num;
+ complete(&link->llc_confirm);
+ }
+ }
+}
+
+static void smc_llc_rx_handler(struct ib_wc *wc, void *buf)
+{
+ struct smc_link *link = (struct smc_link *)wc->qp->qp_context;
+ union smc_llc_msg *llc = buf;
+
+ if (wc->byte_len < sizeof(*llc))
+ return; /* short message */
+ if (llc->raw.hdr.length != sizeof(*llc))
+ return; /* invalid message */
+ if (llc->raw.hdr.common.type == SMC_LLC_CONFIRM_LINK)
+ smc_llc_rx_confirm_link(link, &llc->confirm_link);
+}
+
+static struct smc_wr_rx_handler smc_llc_rx_handlers[] = {
+ {
+ .handler = smc_llc_rx_handler,
+ .type = SMC_LLC_CONFIRM_LINK
+ },
+ {
+ .handler = NULL,
+ }
+};
+
+int __init smc_llc_init(void)
+{
+ struct smc_wr_rx_handler *handler;
+ int rc = 0;
+
+ for (handler = smc_llc_rx_handlers; handler->handler; handler++) {
+ INIT_HLIST_NODE(&handler->list);
+ rc = smc_wr_rx_register_handler(handler);
+ if (rc)
+ break;
+ }
+ return rc;
+}
diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h
new file mode 100644
index 000000000000..932d96daff3a
--- /dev/null
+++ b/net/smc/smc_llc.h
@@ -0,0 +1,63 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Definitions for LLC (link layer control) message handling
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Klaus Wacker <Klaus.Wacker@de.ibm.com>
+ * Ursula Braun <ursula.braun@de.ibm.com>
+ */
+
+#ifndef SMC_LLC_H
+#define SMC_LLC_H
+
+#include "smc_wr.h"
+
+#define SMC_LLC_FLAG_RESP 0x80
+
+#define SMC_LLC_WAIT_FIRST_TIME (5 * HZ)
+
+enum smc_llc_reqresp {
+ SMC_LLC_REQ,
+ SMC_LLC_RESP
+};
+
+enum smc_llc_msg_type {
+ SMC_LLC_CONFIRM_LINK = 0x01,
+};
+
+#define SMC_LLC_DATA_LEN 40
+
+struct smc_llc_hdr {
+ struct smc_wr_rx_hdr common;
+ u8 length; /* 44 */
+ u8 reserved;
+ u8 flags;
+} __packed;
+
+struct smc_llc_msg_confirm_link { /* type 0x01 */
+ struct smc_llc_hdr hd;
+ u8 sender_mac[ETH_ALEN];
+ union ib_gid sender_gid;
+ u8 sender_qp_num[3];
+ u8 link_num;
+ __be32 link_uid;
+ u8 max_links;
+ u8 reserved[9];
+} __packed;
+
+union smc_llc_msg {
+ struct smc_llc_msg_confirm_link confirm_link;
+ struct {
+ struct smc_llc_hdr hdr;
+ u8 data[SMC_LLC_DATA_LEN];
+ } __packed raw;
+} __packed;
+
+/* transmit */
+int smc_llc_send_confirm_link(struct smc_link *, u8 *, union ib_gid *,
+ enum smc_llc_reqresp);
+int smc_llc_init(void) __init;
+
+#endif /* SMC_LLC_H */
diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c
new file mode 100644
index 000000000000..80a3b30f9091
--- /dev/null
+++ b/net/smc/smc_pnet.c
@@ -0,0 +1,605 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Sysfs support functions
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Thomas Richter <tmricht@linux.vnet.ibm.com>
+ */
+
+#include <linux/device.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/types.h>
+#include <linux/ctype.h>
+
+#include <net/sock.h>
+
+#include <rdma/ib_verbs.h>
+
+#include "smc_ib.h"
+#include "smc_pnet.h"
+
+#define SMC_MAX_PNET_ID_LEN 16 /* Max. length of PNET id */
+
+/* Sysfs interface for the pnet table
+ *
+ * Create a directory /sys/kernel/smc/ with these files:
+ * /sys/kernel/smc/pnetid_add --> Create a PNETID
+ * /sys/kernel/smc/pnetid_del --> Delete a PNETID
+ * /sys/kernel/smc/flush --> Delete all PNETIDs
+ * /sys/kernel/smc/pnetids/xxxxx --> Created PNETIDs
+ *
+ * Create PNETID PNET1:
+ * A new file named PNET1 shows up in /sys/kernel/smc/pnetids/.
+ * echo PNET1 > /sys/kernel/smc/pnetid_add
+ *
+ * Display all created PNETIDs:
+ * ls -l /sys/kernel/smc/pnetids
+ *
+ * Delete PNETID PNET1:
+ * File PNET1 is removed from directory /sys/kernel/smc/pnetids/.
+ * echo PNET1 > /sys/kernel/smc/pnetid_del
+ *
+ * Add an ethernet interface to PNETID PNET1:
+ * A leading '+' is optional.
+ * echo "eth enccw0.0.f5f0" > /sys/kernel/smc/pnetids/PNET1
+ *
+ * Add an RDMA device to PNETID PNET1:
+ * A leading '+' is optional
+ * The 3rd field is an optional port. If not specified it defaults to 1.
+ * Currently accepted port numbers are 1 and 2. Other numbers generate an
+ * error.
+ * echo "ib mlx4_0 1" > /sys/kernel/smc/pnetids/PNET1
+ * echo "+ib mlx4_1 2" > /sys/kernel/smc/pnetids/PNET1
+ *
+ * Display all entries belonging to PNETID PNET1:
+ * cat /sys/kernel/smc/pnetids/PNET1
+ *
+ * Delete any entry from PNETID PNET1 with a leading '-':
+ * echo "-ib mlx4_1 2" > /sys/kernel/smc/pnetids/PNET1
+ *
+ * Delete all created PNETIDs at once:
+ * echo - > /sys/kernel/smc/flush
+ *
+ * No load balancing or link failover is supported.
+ * This results in a one-to-one relationship between ethernet interface and
+ * RDMA device including port number. Therefore each pnet identifier maps
+ * one ethernet interface to one RDMA device.
+ */
+
+/**
+ * struct smc_pnettable - SMC sysfs anchor
+ * @kset: SMC sysfs anchor
+ * @pnetids_kobj: Anchor to /sys/kernel/smc/pnetids
+ * @lock: Lock for list action
+ * @pnetlist: List of PNETIDs
+ */
+static struct smc_pnettable {
+ struct kset *kset;
+ struct kobject pnetids_kobj;
+ rwlock_t lock;
+ struct list_head pnetlist;
+} smc_pnettable = {
+ .pnetlist = LIST_HEAD_INIT(smc_pnettable.pnetlist),
+ .lock = __RW_LOCK_UNLOCKED(smc_pnettable.lock)
+};
+
+/**
+ * struct smc_pnetentry - pnet identifier name entry
+ * @list: List node.
+ * @attr: Embedded attribute structure
+ * @pnet_name: Pnet identifier name
+ * @if_name: Name of the ethernet interface.
+ * @ib_name: Name of the RDMA device.
+ * @ib_port: RDMA device port number.
+ */
+struct smc_pnetentry {
+ struct list_head list;
+ struct kobj_attribute attr;
+ char pnet_name[SMC_MAX_PNET_ID_LEN + 1];
+ char if_name[IFNAMSIZ];
+ char ib_name[IB_DEVICE_NAME_MAX];
+ u8 ib_port;
+};
+
+#define to_smcpnetentry(a) container_of((a), struct smc_pnetentry, attr)
+
+/* Release /sys/kernel/smc/pnetids and delete all pnetids. This function
+ * is called when the kobject anchor in smc_pnettable.pnetids_kobj is freed.
+ */
+static void smc_pnetid_release(struct kobject *kobj)
+{
+ struct smc_pnetentry *e, *tmp_e;
+
+ write_lock(&smc_pnettable.lock);
+ list_for_each_entry_safe(e, tmp_e, &smc_pnettable.pnetlist, list) {
+ list_del(&e->list);
+ kfree(e);
+ }
+ write_unlock(&smc_pnettable.lock);
+}
+
+static struct kobj_type smc_pnet_ktype = {
+ .release = smc_pnetid_release,
+ .sysfs_ops = &kobj_sysfs_ops
+};
+
+/* Remove an ethernet entry from the PNET table */
+static int smc_pnet_del_eth(struct smc_pnetentry *pnetelem, char *name)
+{
+ int rc = -ENOENT;
+
+ write_lock(&smc_pnettable.lock);
+ if (!strncmp(pnetelem->if_name, name, sizeof(pnetelem->if_name))) {
+ rc = 0;
+ pnetelem->if_name[0] = '\0';
+ }
+ write_unlock(&smc_pnettable.lock);
+ return rc;
+}
+
+/* Add an ethernet entry to the PNET table. Search the complete pnet table to
+ * make sure the same ethernet interface is not listed under different PNET ids.
+ */
+static int smc_pnet_add_eth(struct smc_pnetentry *pnetelem, char *name)
+{
+ struct smc_pnetentry *p;
+ int rc = -EEXIST;
+
+ write_lock(&smc_pnettable.lock);
+ list_for_each_entry(p, &smc_pnettable.pnetlist, list) {
+ if (!strncmp(p->if_name, name, sizeof(p->if_name)))
+ goto out;
+ }
+ if (pnetelem->if_name[0] == '\0') {
+ strncpy(pnetelem->if_name, name, sizeof(pnetelem->if_name));
+ rc = 0;
+ }
+out:
+ write_unlock(&smc_pnettable.lock);
+ return rc;
+}
+
+/* Create an ethernet interface entry. */
+static int smc_pnet_makeeth(struct smc_pnetentry *pnetelem, bool add,
+ char *name)
+{
+ name = skip_spaces(name);
+ if (!dev_valid_name(name))
+ return -EINVAL;
+ return (add) ? smc_pnet_add_eth(pnetelem, name)
+ : smc_pnet_del_eth(pnetelem, name);
+}
+
+/* Check if two RDMA device entries are identical. Use device name and port
+ * number for comparison.
+ */
+static bool smc_pnet_same_ibname(struct smc_pnetentry *a, char *name, u8 ibport)
+{
+ return a->ib_port == ibport &&
+ !strncmp(a->ib_name, name, sizeof(a->ib_name));
+}
+
+/* Add an RDMA device entry to the PNET table */
+static int smc_pnet_add_ib(struct smc_pnetentry *pnetelem, char *name,
+ u8 ibport)
+{
+ struct smc_ib_device *smcibdev = NULL;
+ struct smc_ib_device *dev;
+ struct smc_pnetentry *p;
+ int rc = -EEXIST;
+
+ write_lock(&smc_pnettable.lock);
+ list_for_each_entry(p, &smc_pnettable.pnetlist, list) {
+ if (smc_pnet_same_ibname(p, name, ibport))
+ goto out;
+ }
+ if (pnetelem->ib_name[0] == '\0') {
+ strncpy(pnetelem->ib_name, name, sizeof(pnetelem->ib_name));
+ pnetelem->ib_port = ibport;
+ spin_lock(&smc_ib_devices.lock);
+ /* using string ib_name, search smcibdev in global list */
+ list_for_each_entry(dev, &smc_ib_devices.list, list) {
+ if (!strncmp(dev->ibdev->name, pnetelem->ib_name,
+ sizeof(pnetelem->ib_name))) {
+ smcibdev = dev;
+ break;
+ }
+ }
+ spin_unlock(&smc_ib_devices.lock);
+ rc = 0;
+ }
+out:
+ write_unlock(&smc_pnettable.lock);
+ if (smcibdev && !smcibdev->initialized) {
+ /* ib dev already existed [dev coldplug].
+ * Complements: smc_ib_add_dev() [dev hotplug],
+ * smc_ib_global_event_handler() [port hotplug].
+ * Function call chain can sleep so outside of our locks.
+ */
+ rc = smc_ib_remember_port_attr(smcibdev,
+ pnetelem->ib_port);
+ if (rc)
+ return rc;
+ rc = smc_ib_setup_per_ibdev(smcibdev);
+ if (rc)
+ return rc;
+ smcibdev->initialized = 1;
+ }
+ return rc;
+}
+
+/* Remove an RDMA device entry from the PNET table */
+static int smc_pnet_del_ib(struct smc_pnetentry *pnetelem, char *name,
+ u8 ibport)
+{
+ int rc = -ENOENT;
+
+ write_lock(&smc_pnettable.lock);
+ if (smc_pnet_same_ibname(pnetelem, name, ibport)) {
+ rc = 0;
+ pnetelem->ib_name[0] = '\0';
+ pnetelem->ib_port = 0;
+ }
+ write_unlock(&smc_pnettable.lock);
+ return rc;
+}
+
+/* Create an RDMA device entry. Optional port number delimited by blank
+ * from name. Missing port number defaults to 1.
+ */
+static int smc_pnet_makeib(struct smc_pnetentry *pnetelem, bool add, char *name)
+{
+ unsigned int tmp_port = 1;
+ char *portno;
+ int rc;
+
+ name = skip_spaces(name);
+ portno = strchr(name, ' ');
+ if (portno) { /* Port number specified */
+ *portno = '\0';
+ portno = skip_spaces(portno + 1);
+ rc = kstrtouint(portno, 10, &tmp_port);
+ if (rc || tmp_port > SMC_MAX_PORTS || !tmp_port) {
+ rc = -EINVAL;
+ goto out;
+ }
+ }
+ rc = (add) ? smc_pnet_add_ib(pnetelem, name, (u8)tmp_port)
+ : smc_pnet_del_ib(pnetelem, name, (u8)tmp_port);
+out:
+ return rc;
+}
+
+static ssize_t smc_pnetidfile_attr_store(struct kobject *kobj,
+ struct kobj_attribute *ka,
+ const char *buf, size_t len)
+{
+ char *text, *buf_copy;
+ bool add = true;
+ int rc;
+
+ /* Operate on a copy of the buffer, we might modify the string */
+ buf_copy = kstrdup(buf, GFP_KERNEL);
+ if (!buf_copy)
+ return -ENOMEM;
+ text = strim(buf_copy);
+ switch (*text) {
+ case '-':
+ add = false;
+ /* Fall through intended */
+ case '+':
+ ++text;
+ break;
+ }
+ text = skip_spaces(text);
+ rc = -EINVAL;
+ if (!strncmp(text, "ib ", 3))
+ rc = smc_pnet_makeib(to_smcpnetentry(ka), add, text + 3);
+ else if (!strncmp(text, "eth ", 4))
+ rc = smc_pnet_makeeth(to_smcpnetentry(ka), add, text + 4);
+ kfree(buf_copy);
+ return rc ?: len;
+}
+
+/* List all entries of a PNETID. List ethernet entries first followed by
+ * RDMA device entries. Output limited to PAGE_SIZE bytes.
+ */
+static ssize_t smc_pnetidfile_attr_show(struct kobject *kobj,
+ struct kobj_attribute *ka,
+ char *buf)
+{
+ struct smc_pnetentry *pnetelem = to_smcpnetentry(ka);
+
+ read_lock(&smc_pnettable.lock);
+ snprintf(buf, PAGE_SIZE, "eth %s\nib %s %u\n", pnetelem->if_name,
+ pnetelem->ib_name, pnetelem->ib_port);
+ read_unlock(&smc_pnettable.lock);
+ return strlen(buf);
+}
+
+/* Delete a PNETID attribute file in /sys/kernel/smc/pnetids.
+ * Remove the sysfs file first and then remove the node from the list and
+ * release memory.
+ */
+static int smc_pnetid_del_file(char *pnetid)
+{
+ struct smc_pnetentry *e, *tmp_e, *found = NULL;
+
+ write_lock(&smc_pnettable.lock);
+ list_for_each_entry_safe(e, tmp_e, &smc_pnettable.pnetlist, list) {
+ if (!strncmp(e->pnet_name, pnetid, sizeof(e->pnet_name))) {
+ list_del(&e->list);
+ found = e;
+ break;
+ }
+ }
+ write_unlock(&smc_pnettable.lock);
+ if (!found)
+ return -ENOENT;
+ sysfs_remove_file(&smc_pnettable.pnetids_kobj, &found->attr.attr);
+ kfree(found);
+ return 0;
+}
+
+/* Append a PNETID to the end of the list if not already on this list. */
+static int smc_pnet_append_pnetentry(struct smc_pnetentry *new)
+{
+ struct smc_pnetentry *pnetelem;
+ int rc = 0;
+
+ write_lock(&smc_pnettable.lock);
+ list_for_each_entry(pnetelem, &smc_pnettable.pnetlist, list) {
+ if (!strncmp(pnetelem->pnet_name, new->pnet_name,
+ sizeof(new->pnet_name))) {
+ rc = -EEXIST;
+ goto found;
+ }
+ }
+ list_add_tail(&new->list, &smc_pnettable.pnetlist);
+found:
+ write_unlock(&smc_pnettable.lock);
+ return rc;
+}
+
+/* Add a PNETID attribute file in /sys/kernel/smc/pnetids. */
+static int smc_pnetid_add_file(char *pnetname)
+{
+ struct smc_pnetentry *pnetelem = kzalloc(sizeof(*pnetelem), GFP_KERNEL);
+ struct kobj_attribute *ka;
+ int rc;
+
+ if (!pnetelem)
+ return -ENOMEM;
+ ka = &pnetelem->attr;
+ sysfs_attr_init(&ka->attr);
+ strncpy(pnetelem->pnet_name, pnetname, sizeof(pnetelem->pnet_name));
+ ka->attr.name = pnetelem->pnet_name;
+ ka->attr.mode = S_IWUSR | S_IRUGO;
+ ka->show = smc_pnetidfile_attr_show;
+ ka->store = smc_pnetidfile_attr_store;
+ rc = smc_pnet_append_pnetentry(pnetelem);
+ if (rc)
+ goto outfree;
+ rc = sysfs_create_file_ns(&smc_pnettable.pnetids_kobj, &ka->attr, NULL);
+ if (!rc)
+ return rc;
+ /* sysfs failure, remove node from list */
+ write_lock(&smc_pnettable.lock);
+ list_del(&pnetelem->list);
+ write_unlock(&smc_pnettable.lock);
+outfree:
+ kfree(pnetelem);
+ return rc;
+}
+
+/* The limit for PNETID is 16 characters.
+ * Valid characters should be (single-byte character set) a-z, A-Z, 0-9.
+ * Lower case letters are converted to upper case.
+ * Interior blanks should not be used.
+ */
+static bool smc_pnetid_valid(const char *buf, char *pnetid)
+{
+ char *bf = skip_spaces(buf);
+ size_t len = strlen(bf);
+ char *end = bf + len;
+
+ if (!len)
+ return false;
+ while (--end >= bf && isspace(*end))
+ ;
+ if (end - bf >= SMC_MAX_PNET_ID_LEN)
+ return false;
+ while (bf <= end) {
+ if (!isalnum(*bf))
+ return false;
+ *pnetid++ = islower(*bf) ? toupper(*bf) : *bf;
+ bf++;
+ }
+ *pnetid = '\0';
+ return true;
+}
+
+static ssize_t smc_pnetid_store(bool add, const char *buf)
+{
+ char pnetid[SMC_MAX_PNET_ID_LEN + 1];
+
+ if (!smc_pnetid_valid(buf, pnetid))
+ return -EINVAL;
+ return add ? smc_pnetid_add_file(pnetid) : smc_pnetid_del_file(pnetid);
+}
+
+#define SMC_ATTR_WO(_name) \
+ struct kobj_attribute smc_attr_##_name = __ATTR(_name, S_IWUSR, NULL, \
+ smc_##_name##_store)
+
+static ssize_t smc_pnetid_del_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ ssize_t rc = smc_pnetid_store(false, buf);
+
+ return rc ?: count;
+}
+static SMC_ATTR_WO(pnetid_del);
+
+static ssize_t smc_pnetid_add_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ ssize_t rc = smc_pnetid_store(true, buf);
+
+ return rc ?: count;
+}
+static SMC_ATTR_WO(pnetid_add);
+
+/* Delete all PNETIDs. Any string with a leading '-' will do.
+ * smc_pnetid_del_file() cannot be called while holding the table lock,
+ * because sysfs_remove_file() must not be called under that lock. Instead
+ * get the first entry of the list and remove it. smc_pnetid_del_file()
+ * handles the case where a PNETID has already been deleted in the meantime.
+ */
+static ssize_t smc_flush_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct smc_pnettable *ptr = &smc_pnettable;
+ char pnetname[SMC_MAX_PNET_ID_LEN + 1];
+ struct smc_pnetentry *pnetelem;
+ char *bf = skip_spaces(buf);
+
+ if (*bf != '-')
+ return -EINVAL;
+ do {
+ read_lock(&ptr->lock);
+ pnetelem = list_first_entry_or_null(&ptr->pnetlist,
+ struct smc_pnetentry, list);
+ if (pnetelem)
+ strncpy(pnetname, pnetelem->pnet_name,
+ sizeof(pnetname));
+ read_unlock(&ptr->lock);
+ if (pnetelem)
+ smc_pnetid_del_file(pnetname);
+ } while (pnetelem);
+ return count;
+}
+static SMC_ATTR_WO(flush);
+
+static struct attribute *smc_pnetid_attrs[] = { /* Default SMC attributes */
+ &smc_attr_pnetid_add.attr,
+ &smc_attr_pnetid_del.attr,
+ &smc_attr_flush.attr,
+ NULL
+};
+
+static struct attribute_group smc_attr_group = {
+ .attrs = smc_pnetid_attrs
+};
+
+/* Remove directory tree created under /sys/kernel/smc/. */
+void smc_pnet_exit(void)
+{
+ kobject_put(&smc_pnettable.pnetids_kobj);
+ sysfs_remove_group(&smc_pnettable.kset->kobj, &smc_attr_group);
+ kset_unregister(smc_pnettable.kset);
+}
+
+/* Create directory tree for SMC under /sys/kernel/smc/. */
+int __init smc_pnet_init(void)
+{
+ int rc = -ENOMEM;
+
+ smc_pnettable.kset = kset_create_and_add("smc", NULL, kernel_kobj);
+ if (!smc_pnettable.kset)
+ goto bad0;
+ rc = sysfs_create_group(&smc_pnettable.kset->kobj, &smc_attr_group);
+ if (rc)
+ goto bad1;
+ rc = kobject_init_and_add(&smc_pnettable.pnetids_kobj, &smc_pnet_ktype,
+ &smc_pnettable.kset->kobj, "pnetids");
+ if (rc)
+ goto bad2;
+ return rc;
+
+bad2:
+ sysfs_remove_group(&smc_pnettable.kset->kobj, &smc_attr_group);
+bad1:
+ kset_unregister(smc_pnettable.kset);
+bad0:
+ return rc;
+}
+
+/* Scan the pnet table and find an IB device given the pnetid entry.
+ * Return infiniband device and port number if an active port is found.
+ * This function is called under smc_pnettable.lock.
+ */
+static void smc_pnet_ib_dev_by_pnet(struct smc_pnetentry *pnetelem,
+ struct smc_ib_device **smcibdev, u8 *ibport)
+{
+ struct smc_ib_device *dev;
+
+ *smcibdev = NULL;
+ *ibport = 0;
+ spin_lock(&smc_ib_devices.lock);
+ /* using string pnetelem->ib_name, search ibdev in global list */
+ list_for_each_entry(dev, &smc_ib_devices.list, list) {
+ if (!strncmp(dev->ibdev->name, pnetelem->ib_name,
+ sizeof(pnetelem->ib_name)) &&
+ smc_ib_port_active(dev, pnetelem->ib_port)) {
+ *smcibdev = dev;
+ *ibport = pnetelem->ib_port;
+ break;
+ }
+ }
+ spin_unlock(&smc_ib_devices.lock);
+}
+
+/* PNET table analysis for a given sock:
+ * determine ib_device and port belonging to used internal TCP socket
+ * ethernet interface.
+ */
+void smc_pnet_find_roce_resource(struct sock *sk,
+ struct smc_ib_device **smcibdev, u8 *ibport)
+{
+ struct dst_entry *dst = sk_dst_get(sk);
+ struct smc_pnetentry *pnetelem;
+
+ *smcibdev = NULL;
+ *ibport = 0;
+
+ if (!dst)
+ return;
+ if (!dst->dev)
+ goto out_rel;
+ read_lock(&smc_pnettable.lock);
+ list_for_each_entry(pnetelem, &smc_pnettable.pnetlist, list) {
+ if (!strncmp(dst->dev->name, pnetelem->if_name, IFNAMSIZ)) {
+ smc_pnet_ib_dev_by_pnet(pnetelem, smcibdev, ibport);
+ break;
+ }
+ }
+ read_unlock(&smc_pnettable.lock);
+out_rel:
+ dst_release(dst);
+}
+
+/* Returns true if a specific ib_device and port is in the PNET table. */
+bool smc_pnet_exists_in_table(struct smc_ib_device *smcibdev, u8 ibport)
+{
+ struct smc_pnetentry *pnetelem;
+ bool rc = false;
+
+ read_lock(&smc_pnettable.lock);
+ list_for_each_entry(pnetelem, &smc_pnettable.pnetlist, list) {
+ if (!strncmp(smcibdev->ibdev->name, pnetelem->ib_name,
+ IB_DEVICE_NAME_MAX) &&
+ ibport == pnetelem->ib_port) {
+ rc = true;
+ break;
+ }
+ }
+ read_unlock(&smc_pnettable.lock);
+ return rc;
+}
diff --git a/net/smc/smc_pnet.h b/net/smc/smc_pnet.h
new file mode 100644
index 000000000000..1ff35df954b1
--- /dev/null
+++ b/net/smc/smc_pnet.h
@@ -0,0 +1,26 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * PNET table queries
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Thomas Richter <tmricht@linux.vnet.ibm.com>
+ */
+
+#ifndef _SMC_PNET_H
+#define _SMC_PNET_H
+
+#define SMC_MAX_PORTS 2 /* Max # of ports */
+
+#include <net/sock.h>
+
+struct smc_ib_device;
+
+bool smc_pnet_exists_in_table(struct smc_ib_device *, u8);
+void smc_pnet_find_roce_resource(struct sock *, struct smc_ib_device **, u8 *);
+
+int smc_pnet_init(void) __init;
+void smc_pnet_exit(void);
+
+#endif
diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c
new file mode 100644
index 000000000000..bf6d6eda7ecb
--- /dev/null
+++ b/net/smc/smc_rx.c
@@ -0,0 +1,189 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Manage RMBE
+ * copy new RMBE data into user space
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ursula.braun@de.ibm.com>
+ */
+
+#include <linux/net.h>
+#include <linux/rcupdate.h>
+#include <net/sock.h>
+
+#include "smc.h"
+#include "smc_core.h"
+#include "smc_cdc.h"
+#include "smc_tx.h" /* smc_tx_consumer_update() */
+#include "smc_rx.h"
+
+/* callback implementation for sk.sk_data_ready()
+ * to wakeup rcvbuf consumers that blocked with smc_rx_wait_data().
+ * indirectly called by smc_cdc_msg_recv_action().
+ */
+static void smc_rx_data_ready(struct sock *sk)
+{
+ struct socket_wq *wq;
+
+ /* derived from sock_def_readable() */
+ /* called already in smc_listen_worker() */
+ rcu_read_lock();
+ wq = rcu_dereference(sk->sk_wq);
+ if (wq_has_sleeper(wq))
+ wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
+ POLLRDNORM | POLLRDBAND);
+ if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
+ (sk->sk_state == SMC_CLOSED))
+ sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
+ else
+ sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
+ rcu_read_unlock();
+}
+
+/* blocks rcvbuf consumer until >=len bytes available or timeout or interrupted
+ * @smc smc socket
+ * @len num bytes to wait for
+ * @timeo max seconds to wait, 0 for no timeout
+ * Returns:
+ * 1 if at least len bytes available in rcvbuf.
+ * -EAGAIN in case timeout expired.
+ * 0 otherwise (neither enough bytes in rcvbuf nor timeout, e.g. interrupted).
+ */
+static int smc_rx_wait_data(struct smc_sock *smc, int len, long timeo)
+{
+ struct smc_connection *conn = &smc->conn;
+ struct sock *sk = &smc->sk;
+ DEFINE_WAIT(wait);
+ int rc;
+
+ if (atomic_read(&conn->bytes_to_rcv) >= len)
+ return 1;
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+ sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+ rc = sk_wait_event(sk, &timeo,
+ sk->sk_err ||
+ sk->sk_shutdown & RCV_SHUTDOWN ||
+ sock_flag(sk, SOCK_DONE) ||
+ (atomic_read(&conn->bytes_to_rcv) >= len) ||
+ smc_stop_received(conn));
+ sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+ finish_wait(sk_sleep(sk), &wait);
+ return (rc || timeo) ? rc : -EAGAIN;
+}
+
+/* rcvbuf consumer: main API called by socket layer */
+int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, size_t len,
+ int flags)
+{
+ size_t read_done = 0, read_remaining = len;
+ struct smc_connection *conn = &smc->conn;
+ union smc_host_cursor_ovl prod, cons;
+ size_t readable, chunk;
+ char *rcvbuf_base;
+ int to_read;
+ long timeo;
+ int target; /* Read at least these many bytes */
+ int rc;
+
+ msg->msg_namelen = 0;
+ rcvbuf_base = conn->rmb_desc->cpu_addr;
+ read_remaining = min_t(size_t, len, conn->rmbe_size); /* cap rx len */
+
+again:
+ target = sock_rcvlowat(&smc->sk, flags & MSG_WAITALL, read_remaining);
+ timeo = sock_rcvtimeo(&smc->sk, flags & MSG_DONTWAIT);
+ if (signal_pending(current))
+ return timeo ? -EINTR : -EAGAIN;
+ rc = smc_rx_wait_data(smc, target, timeo);
+ if ((rc == -EAGAIN) || (rc == -EINTR))
+ return rc;
+ if (!rc)
+ goto again;
+ to_read = atomic_read(&conn->bytes_to_rcv);
+
+ if ((to_read <= 0) &&
+ (smc->sk.sk_err ||
+ smc->sk.sk_shutdown & RCV_SHUTDOWN ||
+ sock_flag(&smc->sk, SOCK_DONE) ||
+ smc_stop_received(conn)))
+ return sock_error(&smc->sk);
+
+ if (to_read <= 0)
+ goto check_repeat;
+
+ if ((to_read < target) && !smc_stop_received(conn))
+ goto check_repeat;
+
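+ /* prod: peer's producer cursor in our RMBE (from the last received
+ * CDC msg); cons: our consumer cursor, advertised back to the peer
+ */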
+ prod.acurs = smc_curs_read(conn->local_rx_ctrl.prod.acurs);
+ cons.acurs = smc_curs_read(conn->local_tx_ctrl.cons.acurs);
+ if (prod.curs.wrap == cons.curs.wrap) {
+ /* unwrapped case: copy 1 single chunk */
+ readable = prod.curs.count - cons.curs.count;
+ chunk = min(read_remaining, readable);
+ if (!(flags & MSG_TRUNC)) {
+ if (memcpy_to_msg(msg, rcvbuf_base + cons.curs.count,
+ chunk))
+ return -EFAULT;
+ }
+ read_remaining -= chunk;
+ read_done += chunk;
+ } else {
+ /* wrapped case: top chunk */
+ readable = conn->rmbe_size - cons.curs.count;
+ if (readable) {
+ chunk = min(read_remaining, readable);
+ if (!(flags & MSG_TRUNC)) {
+ if (memcpy_to_msg(msg,
+ rcvbuf_base + cons.curs.count,
+ chunk))
+ return -EFAULT;
+ }
+ read_remaining -= chunk;
+ read_done += chunk;
+ }
+ /* wrapped case: bottom chunk (if any) */
+ if (read_remaining) {
+ readable = prod.curs.count;
+ chunk = min(read_remaining, readable);
+ if (!(flags & MSG_TRUNC)) {
+ if (memcpy_to_msg(msg, rcvbuf_base, chunk))
+ return -EFAULT;
+ }
+ read_remaining -= chunk;
+ read_done += chunk;
+ }
+ }
+
+ /* update cursors */
+ if (!(flags & MSG_PEEK)) {
+ smc_curs_add(conn->rmbe_size, &cons.curs, read_done);
+ smp_mb__before_atomic();
+ atomic_sub(read_done, &conn->bytes_to_rcv);
+ smp_mb__after_atomic();
+ xchg(&conn->local_tx_ctrl.cons.acurs, cons.acurs);
+ /* send consumer cursor update if required */
+ /* analogon to advertising a new TCP rcv_wnd if required */
+ smc_tx_consumer_update(conn);
+ }
+check_repeat:
+ if ((to_read < target) &&
+ !smc_stop_received(conn) &&
+ !conn->local_tx_ctrl.conn_state_flags.abnormal_close) {
+ goto again;
+ }
+
+ return read_done;
+}
+
+void smc_rx_handler(struct smc_sock *smc)
+{
+ smc->sk.sk_data_ready(&smc->sk);
+}
+
+/* Initialize receive properties on connection establishment. NB: not __init! */
+void smc_rx_init(struct smc_sock *smc)
+{
+ smc->sk.sk_data_ready = smc_rx_data_ready;
+}
diff --git a/net/smc/smc_rx.h b/net/smc/smc_rx.h
new file mode 100644
index 000000000000..953c7bec822e
--- /dev/null
+++ b/net/smc/smc_rx.h
@@ -0,0 +1,24 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Manage RMBE
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ursula.braun@de.ibm.com>
+ */
+
+#ifndef SMC_RX_H
+#define SMC_RX_H
+
+#include <linux/socket.h>
+#include <linux/types.h>
+
+#include "smc.h"
+
+void smc_rx_init(struct smc_sock *);
+int smc_rx_to_read(struct smc_connection *);
+int smc_rx_recvmsg(struct smc_sock *, struct msghdr *, size_t, int);
+void smc_rx_handler(struct smc_sock *);
+
+#endif /* SMC_RX_H */
diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c
new file mode 100644
index 000000000000..348a6d4ab94a
--- /dev/null
+++ b/net/smc/smc_tx.c
@@ -0,0 +1,516 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Manage send buffer.
+ * Producer:
+ * Copy user space data into send buffer, if send buffer space available.
+ * Consumer:
+ * Trigger RDMA write into RMBE of peer and send CDC, if RMBE space available.
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ursula.braun@de.ibm.com>
+ */
+
+#include <linux/net.h>
+#include <linux/rcupdate.h>
+#include <net/sock.h>
+
+#include "smc.h"
+#include "smc_wr.h"
+#include "smc_cdc.h"
+#include "smc_tx.h"
+
+/***************************** sndbuf producer *******************************/
+
+/* callback implementation for sk.sk_write_space()
+ * to wakeup sndbuf producers that blocked with smc_tx_wait_memory()
+ */
+static void smc_tx_write_space(struct sock *sk)
+{
+ struct socket *sock = sk->sk_socket;
+ struct smc_sock *smc = smc_sk(sk);
+ struct socket_wq *wq;
+
+ /* similar to sk_stream_write_space */
+ if (atomic_read(&smc->conn.sndbuf_space) && sock) {
+ clear_bit(SOCK_NOSPACE, &sock->flags);
+ rcu_read_lock();
+ wq = rcu_dereference(sk->sk_wq);
+ if (wq_has_sleeper(wq))
+ wake_up_interruptible_poll(&wq->wait,
+ POLLOUT | POLLWRNORM |
+ POLLWRBAND);
+ if (wq && wq->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN))
+ sock_wake_async(wq, SOCK_WAKE_SPACE, POLL_OUT);
+ rcu_read_unlock();
+ }
+}
+
+/* Wakeup sndbuf producers that blocked with smc_tx_wait_memory().
+ * Cf. tcp_data_snd_check()=>tcp_check_space()=>tcp_new_space().
+ */
+void smc_tx_sndbuf_nonfull(struct smc_sock *smc)
+{
+ if (smc->sk.sk_socket &&
+ atomic_read(&smc->conn.sndbuf_space) &&
+ test_bit(SOCK_NOSPACE, &smc->sk.sk_socket->flags))
+ smc->sk.sk_write_space(&smc->sk);
+}
+
+/* sndbuf producer */
+static inline int smc_tx_give_up_send(struct smc_sock *smc, int copied)
+{
+ struct smc_connection *conn = &smc->conn;
+
+ if (smc->sk.sk_shutdown & SEND_SHUTDOWN ||
+ conn->local_tx_ctrl.conn_state_flags.abnormal_close)
+ return -EPIPE;
+ if (conn->local_rx_ctrl.conn_state_flags.abnormal_close ||
+ conn->local_rx_ctrl.conn_state_flags.closed_conn)
+ return copied ? copied : -ECONNRESET;
+ return 0;
+}
+
+/* blocks sndbuf producer until at least one byte of free space available */
+static int smc_tx_wait_memory(struct smc_sock *smc, int flags)
+{
+ struct smc_connection *conn = &smc->conn;
+ struct sock *sk = &smc->sk;
+ DEFINE_WAIT(wait);
+ long timeo;
+ int rc = 0;
+
+ /* similar to sk_stream_wait_memory */
+ timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
+ sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+ if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN) ||
+ conn->local_tx_ctrl.conn_state_flags.sending_done) {
+ rc = -EPIPE;
+ goto out;
+ }
+ if (conn->local_rx_ctrl.conn_state_flags.abnormal_close) {
+ rc = -ECONNRESET;
+ goto out;
+ }
+ if (!timeo) {
+ rc = -EAGAIN;
+ goto out;
+ }
+ if (signal_pending(current)) {
+ rc = -EINTR;
+ goto out;
+ }
+ sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
+ if (atomic_read(&conn->sndbuf_space))
+ goto out;
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+ rc = sk_wait_event(sk, &timeo,
+ sk->sk_err ||
+ (sk->sk_shutdown & SEND_SHUTDOWN) ||
+ smc_stop_received(conn) ||
+ atomic_read(&conn->sndbuf_space));
+ clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+ if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN) ||
+ conn->local_tx_ctrl.conn_state_flags.sending_done) {
+ rc = -EPIPE;
+ }
+ if (conn->local_rx_ctrl.conn_state_flags.abnormal_close)
+ rc = -ECONNRESET;
+out:
+ finish_wait(sk_sleep(sk), &wait);
+ return rc;
+}
+
+/* sndbuf producer: main API called by socket layer */
+int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len)
+{
+ size_t chunk_len, send_done = 0, send_remaining = len;
+ struct smc_connection *conn = &smc->conn;
+ union smc_host_cursor_ovl prep;
+ int tx_top, tx_bot;
+ char *sndbuf_base;
+ int tx_cnt_prep;
+ int writespace;
+ int rc;
+
+again:
+ if (smc->sk.sk_state == SMC_INIT)
+ return -ENOTCONN;
+ if (smc->sk.sk_state != SMC_ACTIVE)
+ return -ECONNRESET;
+ rc = smc_tx_give_up_send(smc, send_done);
+ if (rc)
+ return rc;
+ /* what to do in case of smc->sk.sk_err ??? */
+
+ writespace = atomic_read(&conn->sndbuf_space);
+ if (!writespace) {
+ int wait_rc;
+
+ wait_rc = smc_tx_wait_memory(smc, msg->msg_flags);
+ if (wait_rc < 0)
+ return ((send_done && (wait_rc == -EAGAIN))
+ ? send_done : wait_rc);
+ if (!wait_rc) {
+ rc = smc_tx_give_up_send(smc, send_done);
+ if (rc)
+ return rc;
+ if (msg->msg_flags & MSG_DONTWAIT)
+ return send_done ? send_done : -EAGAIN;
+ if (smc->sk.sk_err)
+ return send_done ? send_done : -EPIPE;
+ goto again;
+ }
+ }
+ if (smc->sk.sk_err)
+ return -EPIPE;
+ rc = smc_tx_give_up_send(smc, send_done);
+ if (rc)
+ return rc;
+
+ /* re-calc, could be just 1 byte after smc_tx_wait_memory above */
+ writespace = atomic_read(&conn->sndbuf_space);
+ chunk_len = min_t(size_t, send_remaining, writespace);
+ /* determine start of sndbuf */
+ prep.acurs = smc_curs_read(conn->tx_curs_prep.acurs);
+ tx_cnt_prep = prep.curs.count;
+ sndbuf_base = conn->sndbuf_desc->cpu_addr;
+ /* determine sndbuf chunks - top and bottom of sndbuf */
+ if (tx_cnt_prep + chunk_len <= conn->sndbuf_size) {
+ tx_top = 0;
+ tx_bot = chunk_len;
+ if (memcpy_from_msg(sndbuf_base + tx_cnt_prep, msg, chunk_len))
+ return -EFAULT;
+ } else {
+ tx_bot = conn->sndbuf_size - tx_cnt_prep;
+ tx_top = chunk_len - tx_bot;
+ if (memcpy_from_msg(sndbuf_base + tx_cnt_prep, msg, tx_bot))
+ return -EFAULT;
+ if (memcpy_from_msg(sndbuf_base, msg, tx_top))
+ return -EFAULT;
+ }
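+ /* advance the sndbuf producer cursor and reduce the free sndbuf
+ * space accordingly
+ */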
+ smc_curs_add(conn->sndbuf_size, &prep.curs, chunk_len);
+ xchg(&conn->tx_curs_prep.acurs, prep.acurs);
+ smp_mb__before_atomic();
+ atomic_sub(chunk_len, &conn->sndbuf_space);
+ smp_mb__after_atomic();
+
+ /* since we just produced more new data into sndbuf,
+ * trigger sndbuf consumer: RDMA write into peer RMBE and CDC
+ */
+ rc = smc_tx_sndbuf_nonempty(conn);
+ if (rc)
+ return rc;
+
+ send_done += chunk_len;
+ send_remaining -= chunk_len;
+ if (send_done < len)
+ goto again;
+
+ return send_done;
+}
+
+/***************************** sndbuf consumer *******************************/
+
+/* sndbuf consumer: actual data transfer of one target chunk with RDMA write */
+static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset,
+ int num_sges, struct ib_sge sges[])
+{
+ struct smc_link_group *lgr = conn->lgr;
+ struct ib_send_wr *failed_wr = NULL;
+ struct ib_rdma_wr rdma_wr;
+ struct smc_link *link;
+ int i, rc;
+
+ memset(&rdma_wr, 0, sizeof(rdma_wr));
+ link = &lgr->lnk[SMC_SINGLE_LINK];
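+ /* the caller filled sges[].addr with sndbuf offsets; convert them
+ * to DMA addresses and set the local memory key
+ */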
+ for (i = 0; i < num_sges; i++) {
+ sges[i].addr =
+ conn->sndbuf_desc->dma_addr[SMC_SINGLE_LINK] +
+ sges[i].addr;
+ sges[i].lkey = link->mr_tx->lkey;
+ }
+ rdma_wr.wr.wr_id = smc_wr_tx_get_next_wr_id(link);
+ rdma_wr.wr.sg_list = sges;
+ rdma_wr.wr.num_sge = num_sges;
+ rdma_wr.wr.opcode = IB_WR_RDMA_WRITE;
+ rdma_wr.remote_addr =
+ lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].dma_addr +
+ peer_rmbe_offset +
+ ((conn->peer_conn_idx - 1) * (conn->peer_rmbe_len));
+ rdma_wr.rkey = lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].rkey;
+ rc = ib_post_send(link->roce_qp, &rdma_wr.wr, &failed_wr);
+ if (rc)
+ conn->local_tx_ctrl.conn_state_flags.abnormal_close = 1;
+ return rc;
+}
+
+/* sndbuf consumer */
+static inline void smc_tx_fill_sges(int *num_sges, struct ib_sge sges[],
+ u64 sge_offset1, u32 sge_len1,
+ u64 sge_offset2, u32 sge_len2)
+{
+ memset(sges, 0, SMC_IB_MAX_SEND_SGE * sizeof(sges[0]));
+ sges[0].addr = sge_offset1;
+ sges[0].length = sge_len1;
+ if (sge_len2) {
+ *num_sges = 2;
+ sges[1].addr = sge_offset2;
+ sges[1].length = sge_len2;
+ } else {
+ *num_sges = 1;
+ }
+}
+
+/* sndbuf consumer */
+static inline void smc_tx_advance_cursors(struct smc_connection *conn,
+ union smc_host_cursor_ovl *prod,
+ union smc_host_cursor_ovl *sent,
+ size_t len)
+{
+ smc_curs_add(conn->peer_rmbe_len, &prod->curs, len);
+ smp_mb__before_atomic();
+ /* data in flight reduces usable snd_wnd */
+ atomic_sub(len, &conn->peer_rmbe_space);
+ smp_mb__after_atomic();
+ smc_curs_add(conn->sndbuf_size, &sent->curs, len);
+}
+
+/* sndbuf consumer: prepare all necessary (src&dst) chunks of data transmit;
+ * usable snd_wnd as max transmit
+ */
+static int smc_tx_rdma_writes(struct smc_connection *conn)
+{
+ union smc_host_cursor_ovl sent, prep, prod, cons;
+ size_t to_copy, space1, space2, send_len;
+ struct ib_sge sges[SMC_IB_MAX_SEND_SGE];
+ size_t tx_top1, tx_top2;
+ size_t tx_bot1, tx_bot2;
+ size_t tx_top, tx_bot;
+ int to_send, rmbespace;
+ int num_sges;
+ int rc;
+
+ sent.acurs = smc_curs_read(conn->tx_curs_sent.acurs);
+ prep.acurs = smc_curs_read(conn->tx_curs_prep.acurs);
+
+ /* cf. wmem_alloc - (snd_max - snd_una) */
+ to_send = smc_curs_diff(conn->sndbuf_size, &sent, &prep);
+ if (to_send <= 0)
+ return 0;
+
+ /* cf. snd_wnd */
+ rmbespace = atomic_read(&conn->peer_rmbe_space);
+ if (rmbespace <= 0)
+ return 0;
+
+ if (to_send >= rmbespace)
+ conn->local_tx_ctrl.prod_flags.write_blocked = 1;
+ else
+ conn->local_tx_ctrl.prod_flags.write_blocked = 0;
+
+ /* cf. usable snd_wnd */
+ to_copy = min(to_send, rmbespace);
+
+ if (sent.curs.count + to_copy <= conn->sndbuf_size) {
+ tx_top = 0;
+ tx_bot = to_copy;
+ } else {
+ tx_bot = conn->sndbuf_size - sent.curs.count;
+ tx_top = to_copy - tx_bot;
+ }
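+ /* tx_bot: chunk from the sent cursor up to the sndbuf end,
+ * tx_top: wrapped remainder at the sndbuf start (0 if no wrap)
+ */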
+ prod.acurs = smc_curs_read(conn->local_tx_ctrl.prod.acurs);
+ cons.acurs = smc_curs_read(conn->local_rx_ctrl.cons.acurs);
+ if (prod.curs.wrap == cons.curs.wrap) {
+ space1 = conn->peer_rmbe_len - prod.curs.count;
+ space2 = cons.curs.count;
+
+ send_len = min(to_copy, space1);
+ if (send_len <= tx_bot) {
+ tx_bot1 = send_len;
+ tx_bot2 = tx_bot - tx_bot1;
+ tx_top1 = 0;
+ tx_top2 = tx_top;
+ } else {
+ tx_bot1 = tx_bot;
+ tx_bot2 = 0;
+ tx_top1 = send_len - tx_bot;
+ tx_top2 = tx_top - tx_top1;
+ }
+ smc_tx_fill_sges(&num_sges, sges, sent.curs.count, tx_bot1, 0,
+ tx_top1);
+ rc = smc_tx_rdma_write(conn, prod.curs.count, num_sges, sges);
+ if (rc)
+ return rc;
+ to_copy -= send_len;
+ smc_tx_advance_cursors(conn, &prod, &sent, send_len);
+
+ if (to_copy && space2 && (tx_bot2 + tx_top2 > 0)) {
+ send_len = min(to_copy, space2);
+ if (tx_bot2 > send_len) {
+ tx_bot2 = send_len;
+ tx_top2 = 0;
+ } else {
+ if (tx_bot2 + tx_top2 > send_len)
+ tx_top2 = send_len - tx_bot2;
+ }
+ if (tx_bot2)
+ smc_tx_fill_sges(&num_sges, sges,
+ sent.curs.count,
+ tx_bot2, tx_top1, tx_top2);
+ else if (tx_top2)
+ smc_tx_fill_sges(&num_sges, sges, tx_top1,
+ tx_top2, 0, 0);
+ rc = smc_tx_rdma_write(conn, 0, num_sges, sges);
+ if (rc)
+ return rc;
+ smc_tx_advance_cursors(conn, &prod, &sent,
+ tx_bot2 + tx_top2);
+ }
+ } else {
+ space1 = cons.curs.count - prod.curs.count;
+ send_len = min(to_copy, space1);
+ if (send_len <= tx_bot) {
+ tx_bot = send_len;
+ tx_top = 0;
+ } else {
+ if ((send_len - tx_bot) <= tx_top)
+ tx_top = send_len - tx_bot;
+ }
+ smc_tx_fill_sges(&num_sges, sges, sent.curs.count, tx_bot, 0,
+ tx_top);
+ rc = smc_tx_rdma_write(conn, prod.curs.count, num_sges, sges);
+ if (rc)
+ return rc;
+ smc_tx_advance_cursors(conn, &prod, &sent, send_len);
+ }
+ xchg(&conn->local_tx_ctrl.prod.acurs, prod.acurs);
+ xchg(&conn->tx_curs_sent.acurs, sent.acurs);
+
+ return 0;
+}
+
+/* Wakeup sndbuf consumers from any context (IRQ or process)
+ * since there is more data to transmit; usable snd_wnd as max transmit
+ */
+int smc_tx_sndbuf_nonempty(struct smc_connection *conn)
+{
+ struct smc_cdc_tx_pend *pend;
+ struct smc_wr_buf *wr_buf;
+ int rc;
+
+ spin_lock_bh(&conn->send_lock);
+ rc = smc_cdc_get_free_slot(&conn->lgr->lnk[SMC_SINGLE_LINK], &wr_buf,
+ &pend);
+ if (rc < 0) {
+ schedule_delayed_work(&conn->tx_work, HZ / 10);
+ goto out_unlock;
+ }
+
+ rc = smc_tx_rdma_writes(conn);
+ if (rc) {
+ smc_wr_tx_put_slot(&conn->lgr->lnk[SMC_SINGLE_LINK],
+ (struct smc_wr_tx_pend_priv *)pend);
+ goto out_unlock;
+ }
+
+ rc = smc_cdc_msg_send(conn, wr_buf, pend);
+
+out_unlock:
+ spin_unlock_bh(&conn->send_lock);
+ return rc;
+}
+
+int smc_tx_close_wr(struct smc_connection *conn)
+{
+ struct smc_cdc_tx_pend *pend;
+ struct smc_wr_buf *wr_buf;
+ int rc;
+
+ conn->local_tx_ctrl.conn_state_flags.sending_done = 1;
+
+ rc = smc_cdc_get_free_slot(&conn->lgr->lnk[SMC_SINGLE_LINK], &wr_buf,
+ &pend);
+ if (rc)
+ return rc;
+
+ return smc_cdc_msg_send(conn, wr_buf, pend);
+}
+
+int smc_tx_close(struct smc_connection *conn)
+{
+ struct smc_cdc_tx_pend *pend;
+ struct smc_wr_buf *wr_buf;
+ int rc;
+
+ if (atomic_read(&conn->bytes_to_rcv))
+ conn->local_tx_ctrl.conn_state_flags.abnormal_close = 1;
+ else
+ conn->local_tx_ctrl.conn_state_flags.closed_conn = 1;
+
+ rc = smc_cdc_get_free_slot(&conn->lgr->lnk[SMC_SINGLE_LINK], &wr_buf,
+ &pend);
+ if (rc)
+ return rc;
+
+ return smc_cdc_msg_send(conn, wr_buf, pend);
+}
+
+/* Wakeup sndbuf consumers from process context
+ * since there is more data to transmit
+ */
+static void smc_tx_worker(struct work_struct *work)
+{
+ struct smc_connection *conn = container_of(to_delayed_work(work),
+ struct smc_connection,
+ tx_work);
+ struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
+
+ lock_sock(&smc->sk);
+ smc_tx_sndbuf_nonempty(conn);
+ release_sock(&smc->sk);
+}
+
+void smc_tx_consumer_update(struct smc_connection *conn)
+{
+ union smc_host_cursor_ovl cfed, cons;
+ struct smc_cdc_tx_pend *pend;
+ struct smc_wr_buf *wr_buf;
+ int to_confirm, rc;
+
+ cons.acurs = smc_curs_read(conn->local_tx_ctrl.cons.acurs);
+ cfed.acurs = smc_curs_read(conn->rx_curs_confirmed.acurs);
+ to_confirm = smc_curs_diff(conn->rmbe_size, &cfed, &cons);
+
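+ /* send a consumer cursor update if the peer requested one, or if
+ * more than rmbe_update_limit bytes are unconfirmed and either at
+ * least half the RMBE size has been consumed since the last update
+ * or the peer reported write_blocked
+ */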
+ if (conn->local_rx_ctrl.prod_flags.cons_curs_upd_req ||
+ ((to_confirm > conn->rmbe_update_limit) &&
+ ((to_confirm / (conn->rmbe_size / 2) > 0) ||
+ conn->local_rx_ctrl.prod_flags.write_blocked))) {
+ rc = smc_cdc_get_free_slot(&conn->lgr->lnk[SMC_SINGLE_LINK],
+ &wr_buf, &pend);
+ if (!rc)
+ rc = smc_cdc_msg_send(conn, wr_buf, pend);
+ if (rc < 0) {
+ schedule_delayed_work(&conn->tx_work, HZ / 10);
+ return;
+ }
+ xchg(&conn->rx_curs_confirmed.acurs,
+ smc_curs_read(conn->local_tx_ctrl.cons.acurs));
+ conn->local_rx_ctrl.prod_flags.cons_curs_upd_req = 0;
+ }
+ if (conn->local_rx_ctrl.prod_flags.write_blocked &&
+ !atomic_read(&conn->bytes_to_rcv))
+ conn->local_rx_ctrl.prod_flags.write_blocked = 0;
+}
+
+/***************************** send initialize *******************************/
+
+/* Initialize send properties on connection establishment. NB: not __init! */
+void smc_tx_init(struct smc_sock *smc)
+{
+ smc->sk.sk_write_space = smc_tx_write_space;
+ INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_worker);
+ spin_lock_init(&smc->conn.send_lock);
+}
diff --git a/net/smc/smc_tx.h b/net/smc/smc_tx.h
new file mode 100644
index 000000000000..c83a0914e24d
--- /dev/null
+++ b/net/smc/smc_tx.h
@@ -0,0 +1,36 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Manage send buffer
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ursula.braun@de.ibm.com>
+ */
+
+#ifndef SMC_TX_H
+#define SMC_TX_H
+
+#include <linux/socket.h>
+#include <linux/types.h>
+
+#include "smc.h"
+
+static inline int smc_tx_prepared_sends(struct smc_connection *conn)
+{
+ union smc_host_cursor_ovl sent, prep;
+
+ sent.acurs = smc_curs_read(conn->tx_curs_sent.acurs);
+ prep.acurs = smc_curs_read(conn->tx_curs_prep.acurs);
+ return smc_curs_diff(conn->sndbuf_size, &sent, &prep);
+}
+
+void smc_tx_init(struct smc_sock *);
+int smc_tx_sendmsg(struct smc_sock *, struct msghdr *, size_t);
+int smc_tx_sndbuf_nonempty(struct smc_connection *);
+void smc_tx_sndbuf_nonfull(struct smc_sock *);
+void smc_tx_consumer_update(struct smc_connection *);
+int smc_tx_close(struct smc_connection *);
+int smc_tx_close_wr(struct smc_connection *);
+
+#endif /* SMC_TX_H */
diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
new file mode 100644
index 000000000000..52df04f2b0e1
--- /dev/null
+++ b/net/smc/smc_wr.c
@@ -0,0 +1,557 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Work Requests exploiting Infiniband API
+ *
+ * Work requests (WR) of type ib_post_send or ib_post_recv respectively
+ * are submitted to either RC SQ or RC RQ respectively
+ * (reliably connected send/receive queue)
+ * and become work queue entries (WQEs).
+ * While an SQ WR/WQE is pending, we track it until transmission completion.
+ * Through a send or receive completion queue (CQ) respectively,
+ * we get completion queue entries (CQEs) [aka work completions (WCs)].
+ * Since the CQ callback is called from IRQ context, we split work by using
+ * bottom halves implemented by tasklets.
+ *
+ * SMC uses this to exchange LLC (link layer control)
+ * and CDC (connection data control) messages.
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Steffen Maier <maier@linux.vnet.ibm.com>
+ */
+
+#include <linux/hashtable.h>
+
+#include "smc.h"
+#include "smc_wr.h"
+
+#define SMC_WR_MAX_POLL_CQE 10 /* max. # of compl. queue elements in 1 poll */
+
+#define SMC_WR_RX_HASH_BITS 4
+static DEFINE_HASHTABLE(smc_wr_rx_hash, SMC_WR_RX_HASH_BITS);
+static DEFINE_SPINLOCK(smc_wr_rx_hash_lock);
+
+static bool smc_wr_tx_pending_on_link(struct smc_link *link)
+{
+ return find_first_bit(link->wr_tx_mask, link->wr_tx_cnt)
+ != link->wr_tx_cnt;
+}
+
+int smc_wr_tx_wait_no_pending_on_link(struct smc_link *link)
+{
+ int rc = 1;
+
+ if (smc_wr_tx_pending_on_link(link)) {
+ rc = wait_event_interruptible_timeout(
+ link->wr_tx_wait,
+ !smc_wr_tx_pending_on_link(link),
+ SMC_WR_TX_WAIT_PENDING_TIME);
+ }
+ return rc;
+}
+
+static inline int smc_wr_tx_find_pending_index(struct smc_link *link, u64 wr_id)
+{
+ u32 i;
+
+ for (i = 0; i < link->wr_tx_cnt; i++) {
+ if (link->wr_tx_pends[i].wr_id == wr_id)
+ return i;
+ }
+ return link->wr_tx_cnt;
+}
+
+static inline void smc_wr_tx_process_cqe(struct ib_wc *wc)
+{
+ struct smc_wr_tx_pend pnd_snd;
+ struct smc_link *link;
+ u32 pnd_snd_idx;
+ int i;
+
+ link = wc->qp->qp_context;
+ pnd_snd_idx = smc_wr_tx_find_pending_index(link, wc->wr_id);
+ if (pnd_snd_idx == link->wr_tx_cnt)
+ return;
+ link->wr_tx_pends[pnd_snd_idx].wc_status = wc->status;
+ memcpy(&pnd_snd, &link->wr_tx_pends[pnd_snd_idx], sizeof(pnd_snd));
+ if (!test_and_clear_bit(pnd_snd_idx, link->wr_tx_mask))
+ return;
+ if (wc->status) {
+ if ((wc->status == IB_WC_RETRY_EXC_ERR) ||
+ (wc->status == IB_WC_RNR_RETRY_EXC_ERR)) {
+ pr_warn("smc overload on link %p - reason %s\n",
+ link,
+ (wc->status == IB_WC_RETRY_EXC_ERR) ?
+ "retry" : "rnr");
+ }
+ for_each_set_bit(i, link->wr_tx_mask, link->wr_tx_cnt)
+ clear_bit(i, link->wr_tx_mask);
+ /* tbd in future patch: terminate connections of this link
+ * group abnormally
+ */
+ }
+ if (pnd_snd.handler)
+ pnd_snd.handler(&pnd_snd.priv, link, wc->status);
+ wake_up(&link->wr_tx_wait);
+}
+
+void smc_wr_tx_tasklet_fn(unsigned long data)
+{
+ struct smc_ib_device *dev = (struct smc_ib_device *)data;
+ struct ib_wc wc[SMC_WR_MAX_POLL_CQE];
+ int i = 0, rc;
+ int polled = 0;
+
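+ /* drain the send CQ; after the first pass re-arm the completion
+ * notification and poll once more to catch CQEs that raced with
+ * the re-arm
+ */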
+again:
+ polled++;
+ do {
+ rc = ib_poll_cq(dev->roce_cq_send, SMC_WR_MAX_POLL_CQE, wc);
+ if (polled == 1) {
+ ib_req_notify_cq(dev->roce_cq_send,
+ IB_CQ_NEXT_COMP |
+ IB_CQ_REPORT_MISSED_EVENTS);
+ }
+ if (!rc)
+ break;
+ for (i = 0; i < rc; i++)
+ smc_wr_tx_process_cqe(&wc[i]);
+ } while (rc > 0);
+ if (polled == 1)
+ goto again;
+}
+
+void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context)
+{
+ struct smc_ib_device *dev = (struct smc_ib_device *)cq_context;
+
+ tasklet_schedule(&dev->send_tasklet);
+}
+
+static inline int smc_wr_tx_get_free_slot_index(struct smc_link *link, u32 *idx)
+{
+ *idx = link->wr_tx_cnt;
+ for_each_clear_bit(*idx, link->wr_tx_mask, link->wr_tx_cnt) {
+ if (!test_and_set_bit(*idx, link->wr_tx_mask))
+ return 0;
+ }
+ *idx = link->wr_tx_cnt;
+ return -EBUSY;
+}
+
+/**
+ * smc_wr_tx_get_free_slot() - returns buffer for message assembly,
+ * and sets info for pending transmit tracking
+ * @link: Pointer to smc_link used to later send the message.
+ * @handler: Send completion handler function pointer.
+ * @wr_buf: Out value returns pointer to message buffer.
+ * @wr_pend_priv: Out value returns pointer serving as handler context.
+ *
+ * Return: 0 on success, or -errno on error.
+ */
+int smc_wr_tx_get_free_slot(struct smc_link *link,
+ smc_wr_tx_handler handler,
+ struct smc_wr_buf **wr_buf,
+ struct smc_wr_tx_pend_priv **wr_pend_priv)
+{
+ struct smc_wr_tx_pend *wr_pend;
+ struct ib_send_wr *wr_ib;
+ u64 wr_id;
+ u32 idx;
+ int rc;
+
+ *wr_buf = NULL;
+ *wr_pend_priv = NULL;
+ if (in_softirq()) {
+ rc = smc_wr_tx_get_free_slot_index(link, &idx);
+ if (rc)
+ return rc;
+ } else {
+ rc = wait_event_interruptible_timeout(
+ link->wr_tx_wait,
+ (smc_wr_tx_get_free_slot_index(link, &idx) != -EBUSY),
+ SMC_WR_TX_WAIT_FREE_SLOT_TIME);
+ if (!rc) {
+ /* tbd in future patch: timeout - terminate connections
+ * of this link group abnormally
+ */
+ return -EPIPE;
+ }
+ if (rc == -ERESTARTSYS)
+ return -EINTR;
+ if (idx == link->wr_tx_cnt)
+ return -EPIPE;
+ }
+ wr_id = smc_wr_tx_get_next_wr_id(link);
+ wr_pend = &link->wr_tx_pends[idx];
+ /* clear the full struct smc_wr_tx_pend including .priv */
+ memset(wr_pend, 0, sizeof(*wr_pend));
+ wr_pend->wr_id = wr_id;
+ wr_pend->handler = handler;
+ wr_pend->link = link;
+ wr_pend->idx = idx;
+ wr_ib = &link->wr_tx_ibs[idx];
+ wr_ib->wr_id = wr_id;
+ *wr_buf = &link->wr_tx_bufs[idx];
+ *wr_pend_priv = &wr_pend->priv;
+ return 0;
+}
+
+int smc_wr_tx_put_slot(struct smc_link *link,
+ struct smc_wr_tx_pend_priv *wr_pend_priv)
+{
+ struct smc_wr_tx_pend *pend;
+
+ pend = container_of(wr_pend_priv, struct smc_wr_tx_pend, priv);
+ if (pend->idx < link->wr_tx_cnt) {
+ test_and_clear_bit(pend->idx, link->wr_tx_mask);
+ memset(pend, 0, sizeof(*pend));
+ return 1;
+ }
+
+ return 0;
+}
+
+/* Send prepared WR slot via ib_post_send.
+ * Requires conn->send_lock to be held when called with an smc_connection
+ * @priv: pointer to smc_wr_tx_pend_priv identifying prepared message buffer
+ */
+int smc_wr_tx_send(struct smc_link *link, struct smc_connection *conn,
+ struct smc_wr_tx_pend_priv *priv)
+{
+ struct ib_send_wr *failed_wr = NULL;
+ struct smc_wr_tx_pend *pend;
+ int rc;
+
+ ib_req_notify_cq(link->smcibdev->roce_cq_send,
+ IB_CQ_SOLICITED_MASK | IB_CQ_REPORT_MISSED_EVENTS);
+ pend = container_of(priv, struct smc_wr_tx_pend, priv);
+ rc = ib_post_send(link->roce_qp, &link->wr_tx_ibs[pend->idx],
+ &failed_wr);
+ if (rc)
+ smc_wr_tx_put_slot(link, priv);
+ return rc;
+}
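+
+/* Typical send path for users of this API (illustrative sketch only; the
+ * handler name is hypothetical, real callers are the CDC/LLC layers):
+ *
+ *	struct smc_wr_tx_pend_priv *pend;
+ *	struct smc_wr_buf *wr_buf;
+ *	int rc;
+ *
+ *	rc = smc_wr_tx_get_free_slot(link, my_tx_done_handler, &wr_buf, &pend);
+ *	if (rc)
+ *		return rc;
+ *	... assemble the message in *wr_buf ...
+ *	rc = smc_wr_tx_send(link, conn, pend);
+ *
+ * If ib_post_send() fails, smc_wr_tx_send() already returns the slot via
+ * smc_wr_tx_put_slot(), so the caller only needs to handle rc.
+ */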
+
+int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler)
+{
+ struct smc_wr_rx_handler *h_iter;
+ int rc = 0;
+
+ spin_lock(&smc_wr_rx_hash_lock);
+ hash_for_each_possible(smc_wr_rx_hash, h_iter, list, handler->type) {
+ if (h_iter->type == handler->type) {
+ rc = -EEXIST;
+ goto out_unlock;
+ }
+ }
+ hash_add(smc_wr_rx_hash, &handler->list, handler->type);
+out_unlock:
+ spin_unlock(&smc_wr_rx_hash_lock);
+ return rc;
+}
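+
+/* Receive handlers are registered once, before any IB WRs are posted
+ * (illustrative sketch only; all names below are hypothetical):
+ *
+ *	static void my_rx_handler(struct ib_wc *wc, void *buf)
+ *	{
+ *		struct smc_wr_rx_hdr *hdr = buf;
+ *
+ *		... process the received message ...
+ *	}
+ *
+ *	static struct smc_wr_rx_handler my_handler = {
+ *		.handler	= my_rx_handler,
+ *		.type		= MY_MSG_TYPE,
+ *	};
+ *
+ *	rc = smc_wr_rx_register_handler(&my_handler);
+ *
+ * Registration fails with -EEXIST if a handler for the same type exists.
+ */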
+
+/* Demultiplex a received message to its handler, based on the message type.
+ * Relies on smc_wr_rx_hash being completely filled before any IB WRs are
+ * posted, and on it not being modified afterwards, so no locking is needed.
+ */
+static inline void smc_wr_rx_demultiplex(struct ib_wc *wc)
+{
+ struct smc_link *link = (struct smc_link *)wc->qp->qp_context;
+ struct smc_wr_rx_handler *handler;
+ struct smc_wr_rx_hdr *wr_rx;
+ u32 index;
+
+ if (wc->byte_len < sizeof(*wr_rx))
+ return; /* short message */
+ index = wc->wr_id % link->wr_rx_cnt;
+ wr_rx = (struct smc_wr_rx_hdr *)&link->wr_rx_bufs[index];
+ hash_for_each_possible(smc_wr_rx_hash, handler, list, wr_rx->type) {
+ if (handler->type == wr_rx->type)
+ handler->handler(wc, wr_rx);
+ }
+}
+
+static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
+{
+ struct smc_link *link;
+ int i;
+
+ for (i = 0; i < num; i++) {
+ link = wc[i].qp->qp_context;
+ if (wc[i].status == IB_WC_SUCCESS) {
+ smc_wr_rx_demultiplex(&wc[i]);
+ smc_wr_rx_post(link); /* refill WR RX */
+ } else {
+ /* handle status errors */
+ switch (wc[i].status) {
+ case IB_WC_RETRY_EXC_ERR:
+ case IB_WC_RNR_RETRY_EXC_ERR:
+ case IB_WC_WR_FLUSH_ERR:
+ /* tbd in future patch: terminate connections of this
+ * link group abnormally
+ */
+ break;
+ default:
+ smc_wr_rx_post(link); /* refill WR RX */
+ break;
+ }
+ continue;
+ }
+ }
+}
+
+void smc_wr_rx_tasklet_fn(unsigned long data)
+{
+ struct smc_ib_device *dev = (struct smc_ib_device *)data;
+ struct ib_wc wc[SMC_WR_MAX_POLL_CQE];
+ int polled = 0;
+ int rc;
+
+again:
+ polled++;
+ do {
+ memset(&wc, 0, sizeof(wc));
+ rc = ib_poll_cq(dev->roce_cq_recv, SMC_WR_MAX_POLL_CQE, wc);
+ if (polled == 1) {
+ ib_req_notify_cq(dev->roce_cq_recv,
+ IB_CQ_SOLICITED_MASK
+ | IB_CQ_REPORT_MISSED_EVENTS);
+ }
+ if (!rc)
+ break;
+ smc_wr_rx_process_cqes(&wc[0], rc);
+ } while (rc > 0);
+ if (polled == 1)
+ goto again;
+}
+
+void smc_wr_rx_cq_handler(struct ib_cq *ib_cq, void *cq_context)
+{
+ struct smc_ib_device *dev = (struct smc_ib_device *)cq_context;
+
+ tasklet_schedule(&dev->recv_tasklet);
+}
+
+int smc_wr_rx_post_init(struct smc_link *link)
+{
+ u32 i;
+ int rc = 0;
+
+ for (i = 0; i < link->wr_rx_cnt; i++)
+ rc = smc_wr_rx_post(link);
+ return rc;
+}
+
+void smc_wr_remember_qp_attr(struct smc_link *lnk)
+{
+ struct ib_qp_attr *attr = &lnk->qp_attr;
+ struct ib_qp_init_attr init_attr;
+
+ memset(attr, 0, sizeof(*attr));
+ memset(&init_attr, 0, sizeof(init_attr));
+ ib_query_qp(lnk->roce_qp, attr,
+ IB_QP_STATE |
+ IB_QP_CUR_STATE |
+ IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_QKEY |
+ IB_QP_AV |
+ IB_QP_PATH_MTU |
+ IB_QP_TIMEOUT |
+ IB_QP_RETRY_CNT |
+ IB_QP_RNR_RETRY |
+ IB_QP_RQ_PSN |
+ IB_QP_ALT_PATH |
+ IB_QP_MIN_RNR_TIMER |
+ IB_QP_SQ_PSN |
+ IB_QP_PATH_MIG_STATE |
+ IB_QP_CAP |
+ IB_QP_DEST_QPN,
+ &init_attr);
+
+ lnk->wr_tx_cnt = min_t(size_t, SMC_WR_BUF_CNT,
+ lnk->qp_attr.cap.max_send_wr);
+ lnk->wr_rx_cnt = min_t(size_t, SMC_WR_BUF_CNT * 3,
+ lnk->qp_attr.cap.max_recv_wr);
+}
+
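+/* Each send/receive buffer is a fixed SMC_WR_BUF_SIZE slice of one contiguous
+ * array that is DMA-mapped as a whole in smc_wr_create_lgr(), so every sge
+ * address is simply an offset into that single mapping.
+ */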
+static void smc_wr_init_sge(struct smc_link *lnk)
+{
+ u32 i;
+
+ for (i = 0; i < lnk->wr_tx_cnt; i++) {
+ lnk->wr_tx_sges[i].addr =
+ lnk->wr_tx_dma_addr + i * SMC_WR_BUF_SIZE;
+ lnk->wr_tx_sges[i].length = SMC_WR_TX_SIZE;
+ lnk->wr_tx_sges[i].lkey = lnk->mr_tx->lkey;
+ lnk->wr_tx_ibs[i].next = NULL;
+ lnk->wr_tx_ibs[i].sg_list = &lnk->wr_tx_sges[i];
+ lnk->wr_tx_ibs[i].num_sge = 1;
+ lnk->wr_tx_ibs[i].opcode = IB_WR_SEND;
+ lnk->wr_tx_ibs[i].send_flags =
+ IB_SEND_SIGNALED | IB_SEND_SOLICITED | IB_SEND_INLINE;
+ }
+ for (i = 0; i < lnk->wr_rx_cnt; i++) {
+ lnk->wr_rx_sges[i].addr =
+ lnk->wr_rx_dma_addr + i * SMC_WR_BUF_SIZE;
+ lnk->wr_rx_sges[i].length = SMC_WR_BUF_SIZE;
+ lnk->wr_rx_sges[i].lkey = lnk->mr_tx->lkey;
+ lnk->wr_rx_ibs[i].next = NULL;
+ lnk->wr_rx_ibs[i].sg_list = &lnk->wr_rx_sges[i];
+ lnk->wr_rx_ibs[i].num_sge = 1;
+ }
+}
+
+void smc_wr_free_link(struct smc_link *lnk)
+{
+ struct ib_device *ibdev;
+
+ memset(lnk->wr_tx_mask, 0,
+ BITS_TO_LONGS(SMC_WR_BUF_CNT) * sizeof(*lnk->wr_tx_mask));
+
+ if (!lnk->smcibdev)
+ return;
+ ibdev = lnk->smcibdev->ibdev;
+
+ if (lnk->wr_rx_dma_addr) {
+ ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr,
+ SMC_WR_BUF_SIZE * lnk->wr_rx_cnt,
+ DMA_FROM_DEVICE);
+ lnk->wr_rx_dma_addr = 0;
+ }
+ if (lnk->wr_tx_dma_addr) {
+ ib_dma_unmap_single(ibdev, lnk->wr_tx_dma_addr,
+ SMC_WR_BUF_SIZE * lnk->wr_tx_cnt,
+ DMA_TO_DEVICE);
+ lnk->wr_tx_dma_addr = 0;
+ }
+}
+
+void smc_wr_free_link_mem(struct smc_link *lnk)
+{
+ kfree(lnk->wr_tx_pends);
+ lnk->wr_tx_pends = NULL;
+ kfree(lnk->wr_tx_mask);
+ lnk->wr_tx_mask = NULL;
+ kfree(lnk->wr_tx_sges);
+ lnk->wr_tx_sges = NULL;
+ kfree(lnk->wr_rx_sges);
+ lnk->wr_rx_sges = NULL;
+ kfree(lnk->wr_rx_ibs);
+ lnk->wr_rx_ibs = NULL;
+ kfree(lnk->wr_tx_ibs);
+ lnk->wr_tx_ibs = NULL;
+ kfree(lnk->wr_tx_bufs);
+ lnk->wr_tx_bufs = NULL;
+ kfree(lnk->wr_rx_bufs);
+ lnk->wr_rx_bufs = NULL;
+}
+
+int smc_wr_alloc_link_mem(struct smc_link *link)
+{
+ /* allocate link related memory */
+ link->wr_tx_bufs = kcalloc(SMC_WR_BUF_CNT, SMC_WR_BUF_SIZE, GFP_KERNEL);
+ if (!link->wr_tx_bufs)
+ goto no_mem;
+ link->wr_rx_bufs = kcalloc(SMC_WR_BUF_CNT * 3, SMC_WR_BUF_SIZE,
+ GFP_KERNEL);
+ if (!link->wr_rx_bufs)
+ goto no_mem_wr_tx_bufs;
+ link->wr_tx_ibs = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_tx_ibs[0]),
+ GFP_KERNEL);
+ if (!link->wr_tx_ibs)
+ goto no_mem_wr_rx_bufs;
+ link->wr_rx_ibs = kcalloc(SMC_WR_BUF_CNT * 3,
+ sizeof(link->wr_rx_ibs[0]),
+ GFP_KERNEL);
+ if (!link->wr_rx_ibs)
+ goto no_mem_wr_tx_ibs;
+ link->wr_tx_sges = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_tx_sges[0]),
+ GFP_KERNEL);
+ if (!link->wr_tx_sges)
+ goto no_mem_wr_rx_ibs;
+ link->wr_rx_sges = kcalloc(SMC_WR_BUF_CNT * 3,
+ sizeof(link->wr_rx_sges[0]),
+ GFP_KERNEL);
+ if (!link->wr_rx_sges)
+ goto no_mem_wr_tx_sges;
+ link->wr_tx_mask = kzalloc(
+ BITS_TO_LONGS(SMC_WR_BUF_CNT) * sizeof(*link->wr_tx_mask),
+ GFP_KERNEL);
+ if (!link->wr_tx_mask)
+ goto no_mem_wr_rx_sges;
+ link->wr_tx_pends = kcalloc(SMC_WR_BUF_CNT,
+ sizeof(link->wr_tx_pends[0]),
+ GFP_KERNEL);
+ if (!link->wr_tx_pends)
+ goto no_mem_wr_tx_mask;
+ return 0;
+
+no_mem_wr_tx_mask:
+ kfree(link->wr_tx_mask);
+no_mem_wr_rx_sges:
+ kfree(link->wr_rx_sges);
+no_mem_wr_tx_sges:
+ kfree(link->wr_tx_sges);
+no_mem_wr_rx_ibs:
+ kfree(link->wr_rx_ibs);
+no_mem_wr_tx_ibs:
+ kfree(link->wr_tx_ibs);
+no_mem_wr_rx_bufs:
+ kfree(link->wr_rx_bufs);
+no_mem_wr_tx_bufs:
+ kfree(link->wr_tx_bufs);
+no_mem:
+ return -ENOMEM;
+}
+
+void smc_wr_remove_dev(struct smc_ib_device *smcibdev)
+{
+ tasklet_kill(&smcibdev->recv_tasklet);
+ tasklet_kill(&smcibdev->send_tasklet);
+}
+
+void smc_wr_add_dev(struct smc_ib_device *smcibdev)
+{
+ tasklet_init(&smcibdev->recv_tasklet, smc_wr_rx_tasklet_fn,
+ (unsigned long)smcibdev);
+ tasklet_init(&smcibdev->send_tasklet, smc_wr_tx_tasklet_fn,
+ (unsigned long)smcibdev);
+}
+
+int smc_wr_create_lgr(struct smc_link *lnk)
+{
+ struct ib_device *ibdev = lnk->smcibdev->ibdev;
+ int rc = 0;
+
+ atomic64_set(&lnk->wr_tx_id, 0);
+ lnk->wr_rx_id = 0;
+ lnk->wr_rx_dma_addr = ib_dma_map_single(
+ ibdev, lnk->wr_rx_bufs, SMC_WR_BUF_SIZE * lnk->wr_rx_cnt,
+ DMA_FROM_DEVICE);
+ if (ib_dma_mapping_error(ibdev, lnk->wr_rx_dma_addr)) {
+ lnk->wr_rx_dma_addr = 0;
+ rc = -EIO;
+ goto out;
+ }
+ lnk->wr_tx_dma_addr = ib_dma_map_single(
+ ibdev, lnk->wr_tx_bufs, SMC_WR_BUF_SIZE * lnk->wr_tx_cnt,
+ DMA_TO_DEVICE);
+ if (ib_dma_mapping_error(ibdev, lnk->wr_tx_dma_addr)) {
+ rc = -EIO;
+ goto dma_unmap;
+ }
+ smc_wr_init_sge(lnk);
+ memset(lnk->wr_tx_mask, 0,
+ BITS_TO_LONGS(SMC_WR_BUF_CNT) * sizeof(*lnk->wr_tx_mask));
+ return rc;
+
+dma_unmap:
+ ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr,
+ SMC_WR_BUF_SIZE * lnk->wr_rx_cnt,
+ DMA_FROM_DEVICE);
+ lnk->wr_rx_dma_addr = 0;
+out:
+ return rc;
+}
diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h
new file mode 100644
index 000000000000..a1f50c284d90
--- /dev/null
+++ b/net/smc/smc_wr.h
@@ -0,0 +1,101 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Work Requests exploiting the InfiniBand API
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Steffen Maier <maier@linux.vnet.ibm.com>
+ */
+
+#ifndef SMC_WR_H
+#define SMC_WR_H
+
+#include <rdma/ib_verbs.h>
+
+#include "smc.h"
+#include "smc_core.h"
+
+#define SMC_WR_MAX_CQE 32768 /* max. # of completion queue elements */
+#define SMC_WR_BUF_CNT 16 /* # of ctrl buffers per link */
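+/* receive buffers/WRs are provisioned at 3 * SMC_WR_BUF_CNT per link,
+ * see smc_wr_alloc_link_mem() and smc_wr_remember_qp_attr()
+ */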
+
+#define SMC_WR_TX_WAIT_FREE_SLOT_TIME HZ
+#define SMC_WR_TX_WAIT_PENDING_TIME (5 * HZ)
+
+#define SMC_WR_TX_SIZE 44 /* actual size of wr_send data (<=SMC_WR_BUF_SIZE) */
+
+#define SMC_WR_TX_PEND_PRIV_SIZE 32
+
+struct smc_wr_tx_pend_priv {
+ u8 priv[SMC_WR_TX_PEND_PRIV_SIZE];
+};
+
+typedef void (*smc_wr_tx_handler)(struct smc_wr_tx_pend_priv *,
+ struct smc_link *,
+ enum ib_wc_status);
+
+struct smc_wr_rx_handler {
+ struct hlist_node list; /* hash table collision resolution */
+ void (*handler)(struct ib_wc *, void *);
+ u8 type;
+};
+
+struct smc_wr_tx_pend { /* control data for a pending send request */
+ u64 wr_id; /* work request id sent */
+ smc_wr_tx_handler handler;
+ enum ib_wc_status wc_status; /* CQE status */
+ struct smc_link *link;
+ u32 idx;
+ struct smc_wr_tx_pend_priv priv;
+};
+
+/* Only used by RDMA write WRs.
+ * All other WRs (CDC/LLC) use smc_wr_tx_send, which handles WR_ID implicitly
+ */
+static inline u64 smc_wr_tx_get_next_wr_id(struct smc_link *link)
+{
+ return atomic64_inc_return(&link->wr_tx_id);
+}
+
+/* post a receive work request to refill the slot of a completed receive WR */
+static inline int smc_wr_rx_post(struct smc_link *link)
+{
+ struct ib_recv_wr *bad_recv_wr = NULL;
+ int rc = 0;
+ u64 wr_id;
+ u32 index;
+
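+	/* wr_rx_id grows monotonically; the modulo maps it onto the ring of
+	 * receive buffers, matching the lookup in smc_wr_rx_demultiplex()
+	 */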
+ wr_id = ++link->wr_rx_id; /* tasklet context, thus not atomic */
+ index = wr_id % link->wr_rx_cnt;
+ link->wr_rx_ibs[index].wr_id = wr_id;
+ rc = ib_post_recv(link->roce_qp, &link->wr_rx_ibs[index], &bad_recv_wr);
+ return rc;
+}
+
+struct smc_connection;
+
+int smc_wr_create_lgr(struct smc_link *);
+int smc_wr_alloc_link_mem(struct smc_link *);
+void smc_wr_free_link(struct smc_link *);
+void smc_wr_free_link_mem(struct smc_link *);
+void smc_wr_remember_qp_attr(struct smc_link *);
+void smc_wr_cq_event_handler(struct ib_event *, void *);
+void smc_wr_remove_dev(struct smc_ib_device *);
+void smc_wr_add_dev(struct smc_ib_device *);
+
+int smc_wr_tx_wait_no_pending_on_link(struct smc_link *);
+int smc_wr_tx_get_free_slot(struct smc_link *, smc_wr_tx_handler,
+ struct smc_wr_buf **,
+ struct smc_wr_tx_pend_priv **);
+int smc_wr_tx_put_slot(struct smc_link *, struct smc_wr_tx_pend_priv *);
+int smc_wr_tx_send(struct smc_link *, struct smc_connection *,
+ struct smc_wr_tx_pend_priv *);
+void smc_wr_tx_tasklet_fn(unsigned long);
+void smc_wr_tx_cq_handler(struct ib_cq *, void *);
+
+int smc_wr_rx_register_handler(struct smc_wr_rx_handler *);
+int smc_wr_rx_post_init(struct smc_link *);
+void smc_wr_rx_tasklet_fn(unsigned long);
+void smc_wr_rx_cq_handler(struct ib_cq *, void *);
+
+#endif /* SMC_WR_H */