author    Eiji Ota <Eiji.Ota@Sun.COM>    2010-04-21 07:25:52 -0700
committer Eiji Ota <Eiji.Ota@Sun.COM>    2010-04-21 07:25:52 -0700
commit    c0dd49bdd68c0d758a67d56f07826f3b45cfc664 (patch)
tree      bd39a182ec430367040aaee8df5c188a3a05399d
parent    b4756084ba1ef238a1e15b1585b853a7c1f85582 (diff)
download  illumos-gate-c0dd49bdd68c0d758a67d56f07826f3b45cfc664.tar.gz
PSARC/2010/043 Reliable Datagram Service v3
6850013 RDS driver upgrade to version 3
6902396 su_recv does not call pollwakeup() for zero-len datagrams when protocol uses uio recv
-rw-r--r--  usr/src/cmd/cmd-inet/etc/sock2path  5
-rw-r--r--  usr/src/cmd/rcm_daemon/Makefile.com  3
-rw-r--r--  usr/src/cmd/rcm_daemon/common/SUNW,rdsv3u.sh  109
-rw-r--r--  usr/src/pkg/manifests/driver-network-rdsv3.mf  57
-rw-r--r--  usr/src/pkg/manifests/system-header.mf  2
-rw-r--r--  usr/src/pkg/manifests/system-network.mf  5
-rw-r--r--  usr/src/tools/opensolaris/license-list  1
-rw-r--r--  usr/src/uts/common/Makefile.files  11
-rw-r--r--  usr/src/uts/common/Makefile.rules  7
-rw-r--r--  usr/src/uts/common/fs/sockfs/sockcommon_sops.c  7
-rw-r--r--  usr/src/uts/common/inet/sockmods/sockmod_rds.c  106
-rw-r--r--  usr/src/uts/common/io/ib/clients/of/sol_ofs/sol_cma.c  487
-rw-r--r--  usr/src/uts/common/io/ib/clients/of/sol_ofs/sol_ib_cma.c  189
-rw-r--r--  usr/src/uts/common/io/ib/clients/of/sol_ofs/sol_kverbs.c  2323
-rw-r--r--  usr/src/uts/common/io/ib/clients/rdsv3/LICENSE  40
-rw-r--r--  usr/src/uts/common/io/ib/clients/rdsv3/LICENSE.descrip  1
-rw-r--r--  usr/src/uts/common/io/ib/clients/rdsv3/af_rds.c  1009
-rw-r--r--  usr/src/uts/common/io/ib/clients/rdsv3/bind.c  202
-rw-r--r--  usr/src/uts/common/io/ib/clients/rdsv3/cong.c  523
-rw-r--r--  usr/src/uts/common/io/ib/clients/rdsv3/connection.c  546
-rw-r--r--  usr/src/uts/common/io/ib/clients/rdsv3/ib.c  410
-rw-r--r--  usr/src/uts/common/io/ib/clients/rdsv3/ib_cm.c  978
-rw-r--r--  usr/src/uts/common/io/ib/clients/rdsv3/ib_rdma.c  551
-rw-r--r--  usr/src/uts/common/io/ib/clients/rdsv3/ib_recv.c  1129
-rw-r--r--  usr/src/uts/common/io/ib/clients/rdsv3/ib_ring.c  208
-rw-r--r--  usr/src/uts/common/io/ib/clients/rdsv3/ib_send.c  1148
-rw-r--r--  usr/src/uts/common/io/ib/clients/rdsv3/ib_stats.c  125
-rw-r--r--  usr/src/uts/common/io/ib/clients/rdsv3/ib_sysctl.c  90
-rw-r--r--  usr/src/uts/common/io/ib/clients/rdsv3/info.c  155
-rw-r--r--  usr/src/uts/common/io/ib/clients/rdsv3/loop.c  242
-rw-r--r--  usr/src/uts/common/io/ib/clients/rdsv3/message.c  473
-rw-r--r--  usr/src/uts/common/io/ib/clients/rdsv3/page.c  102
-rw-r--r--  usr/src/uts/common/io/ib/clients/rdsv3/rdma.c  672
-rw-r--r--  usr/src/uts/common/io/ib/clients/rdsv3/rdma_transport.c  292
-rw-r--r--  usr/src/uts/common/io/ib/clients/rdsv3/rds_recv.c  677
-rw-r--r--  usr/src/uts/common/io/ib/clients/rdsv3/rdsv3.conf  25
-rw-r--r--  usr/src/uts/common/io/ib/clients/rdsv3/rdsv3_ddi.c  303
-rw-r--r--  usr/src/uts/common/io/ib/clients/rdsv3/rdsv3_debug.c  348
-rw-r--r--  usr/src/uts/common/io/ib/clients/rdsv3/rdsv3_impl.c  1294
-rw-r--r--  usr/src/uts/common/io/ib/clients/rdsv3/rdsv3_sc.c  395
-rw-r--r--  usr/src/uts/common/io/ib/clients/rdsv3/send.c  1178
-rw-r--r--  usr/src/uts/common/io/ib/clients/rdsv3/stats.c  174
-rw-r--r--  usr/src/uts/common/io/ib/clients/rdsv3/sysctl.c  86
-rw-r--r--  usr/src/uts/common/io/ib/clients/rdsv3/threads.c  356
-rw-r--r--  usr/src/uts/common/io/ib/clients/rdsv3/transport.c  142
-rw-r--r--  usr/src/uts/common/io/warlock/rdsv3.wlcmd  365
-rw-r--r--  usr/src/uts/common/sys/Makefile  4
-rw-r--r--  usr/src/uts/common/sys/ib/clients/of/rdma/ib_verbs.h  129
-rw-r--r--  usr/src/uts/common/sys/ib/clients/of/sol_ofs/sol_cma.h  16
-rw-r--r--  usr/src/uts/common/sys/ib/clients/of/sol_ofs/sol_kverb_impl.h  89
-rw-r--r--  usr/src/uts/common/sys/ib/clients/rdsv3/ib.h  359
-rw-r--r--  usr/src/uts/common/sys/ib/clients/rdsv3/info.h  59
-rw-r--r--  usr/src/uts/common/sys/ib/clients/rdsv3/loop.h  33
-rw-r--r--  usr/src/uts/common/sys/ib/clients/rdsv3/rdma.h  120
-rw-r--r--  usr/src/uts/common/sys/ib/clients/rdsv3/rdma_transport.h  44
-rw-r--r--  usr/src/uts/common/sys/ib/clients/rdsv3/rdsv3.h  790
-rw-r--r--  usr/src/uts/common/sys/ib/clients/rdsv3/rdsv3_debug.h  139
-rw-r--r--  usr/src/uts/common/sys/ib/clients/rdsv3/rdsv3_impl.h  402
-rw-r--r--  usr/src/uts/common/sys/ib/clients/rdsv3/rdsv3_sc.h  55
-rw-r--r--  usr/src/uts/common/sys/rds.h  375
-rw-r--r--  usr/src/uts/intel/Makefile.intel.shared  2
-rw-r--r--  usr/src/uts/intel/rdsv3/Makefile  94
-rw-r--r--  usr/src/uts/intel/sockrds/Makefile  86
-rw-r--r--  usr/src/uts/sparc/Makefile.sparc.shared  2
-rw-r--r--  usr/src/uts/sparc/rdsv3/Makefile  143
-rw-r--r--  usr/src/uts/sparc/sockrds/Makefile  88
66 files changed, 20306 insertions(+), 274 deletions(-)
diff --git a/usr/src/cmd/cmd-inet/etc/sock2path b/usr/src/cmd/cmd-inet/etc/sock2path
index a56b540b1b..555d5ec340 100644
--- a/usr/src/cmd/cmd-inet/etc/sock2path
+++ b/usr/src/cmd/cmd-inet/etc/sock2path
@@ -17,8 +17,7 @@
#
# CDDL HEADER END
#
-# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
-# Use is subject to license terms.
+# Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved.
#
# socket configuration information
#
@@ -53,6 +52,8 @@
28 2 0 /dev/nca
29 4 1 /dev/spdsock
+ 30 6 0 sockrds
+
31 1 0 trill
32 1 0 sockpfp
32 4 0 sockpfp
diff --git a/usr/src/cmd/rcm_daemon/Makefile.com b/usr/src/cmd/rcm_daemon/Makefile.com
index 6940bbc87e..4fec58013e 100644
--- a/usr/src/cmd/rcm_daemon/Makefile.com
+++ b/usr/src/cmd/rcm_daemon/Makefile.com
@@ -65,7 +65,8 @@ COMMON_PERL_SCRIPT_SRC =
sparc_PERL_SCRIPT_SRC = SUNW,vdevices.pl
-COMMON_SHELL_SCRIPT_SRC = SUNW,ibsdpu.sh
+COMMON_SHELL_SCRIPT_SRC = SUNW,ibsdpu.sh \
+ SUNW,rdsv3u.sh
COMMON_MOD_OBJ = \
filesys_rcm.o \
diff --git a/usr/src/cmd/rcm_daemon/common/SUNW,rdsv3u.sh b/usr/src/cmd/rcm_daemon/common/SUNW,rdsv3u.sh
new file mode 100644
index 0000000000..c54565f860
--- /dev/null
+++ b/usr/src/cmd/rcm_daemon/common/SUNW,rdsv3u.sh
@@ -0,0 +1,109 @@
+#!/sbin/sh
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+#
+
+#
+# RCM script to report whether RDSv3 is currently in use
+#
+rcm_script_version=1
+rcm_script_func_info="RDSv3 (un)configuration rcm script"
+rcm_cmd_timeout=10
+rcm_resource_name=/devices/ib/rdsv3@0:rdsv3
+
+do_scriptinfo()
+{
+ printf "rcm_script_version=%d\n" $rcm_script_version;
+ printf "rcm_script_func_info=$rcm_script_func_info\n";
+ printf "rcm_cmd_timeout=%d\n" $rcm_cmd_timeout;
+ exit 0;
+}
+
+do_register()
+{
+ printf "rcm_resource_name=%s\n" $rcm_resource_name;
+ exit 0;
+}
+
+do_resourceinfo()
+{
+ if [ x"$1" = x"/devices/ib/rdsv3@0:rdsv3" ]
+ then
+ printf "rcm_resource_usage_info=RDSv3 IB device 0\n";
+ exit 0;
+ else
+ printf "rcm_failure_reason=Unknown RDSv3 device\n";
+ exit 3;
+ fi
+}
+
+do_queryremove()
+{
+ output=`/usr/sbin/fuser $rcm_resource_name 2>&1`
+ ret=$?
+
+ sockrds=`echo "$output" | grep 'sockrds'`
+
+ if [ $ret -eq 0 ] && [ ! -z "$sockrds" ]
+ then
+ printf "rcm_log_warn=RDSv3 is being used currently. "
+ printf "Please stop processes currently running on it "
+ printf "before un-configuring IB HCA/RDSv3.\n";
+ printf "rcm_failure_reason=RDSv3 is being used on this system\n";
+ exit 3;
+ elif [ $ret -ne 0 ]
+ then
+ printf "rcm_log_warn='fuser $rcm_resource_name' command failed."
+ printf "rcm_failure_reason='fuser $rcm_resource_name' command "
+ printf "failed.\n";
+ exit 1;
+ fi
+ exit 0;
+}
+
+do_preremove()
+{
+ exit 0;
+}
+
+do_undoremove()
+{
+ exit 0;
+}
+
+do_postremove()
+{
+ exit 0;
+}
+
+case "$1" in
+ scriptinfo) do_scriptinfo;;
+ register) do_register;;
+ resourceinfo) do_resourceinfo $2;;
+ queryremove) do_queryremove $2;;
+ preremove) do_preremove $2;;
+ undoremove) do_undoremove $2;;
+ postremove) do_postremove $2;;
+ *) echo Unknown option $1;;
+esac
diff --git a/usr/src/pkg/manifests/driver-network-rdsv3.mf b/usr/src/pkg/manifests/driver-network-rdsv3.mf
new file mode 100644
index 0000000000..802015fae3
--- /dev/null
+++ b/usr/src/pkg/manifests/driver-network-rdsv3.mf
@@ -0,0 +1,57 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+#
+
+#
+# This package will install successfully into any zone, global or
+# non-global. The files, directories, links, and hardlinks, however,
+# will only be installed into the global zone.
+#
+<include hollow_zone_pkg>
+set name=pkg.fmri value=pkg:/driver/network/rdsv3@$(PKGVERS)
+set name=pkg.description value="The RDS driver is an implementation of the Reliable Datagram Sockets API. It provides reliable, in-order datagram and RDMA data delivery between sockets."
+set name=pkg.summary value="Solaris Reliable Datagram Sockets"
+set name=info.classification value=org.opensolaris.category.2008:System/Core
+set name=variant.arch value=$(ARCH)
+set name=variant.opensolaris.zone value=global value=nonglobal
+dir path=usr group=sys
+dir path=usr/lib group=bin
+dir path=usr/lib/rcm group=bin
+dir path=usr/lib/rcm/scripts group=bin
+file path=/usr/lib/rcm/scripts/SUNW,rdsv3u.sh group=bin mode=0555
+dir path=kernel group=sys
+dir path=kernel/drv group=sys
+dir path=kernel/drv/$(ARCH64) group=sys
+driver name=rdsv3 perms="* 0644 root sys"
+$(i386_ONLY)file path=kernel/drv/rdsv3 group=sys
+file path=kernel/drv/$(ARCH64)/rdsv3 group=sys
+file path=kernel/drv/rdsv3.conf group=sys preserve=renamenew
+dir path=kernel/socketmod group=sys
+dir path=kernel/socketmod/$(ARCH64) group=sys
+$(i386_ONLY)file path=kernel/socketmod/sockrds mode=0755 group=sys
+file path=kernel/socketmod/$(ARCH64)/sockrds mode=0755 group=sys
+license cr_Sun license=cr_Sun
+license lic_CDDL license=lic_CDDL
+license uts/common/io/ib/clients/rdsv3/LICENSE \
+ license=uts/common/io/ib/clients/rdsv3/LICENSE
diff --git a/usr/src/pkg/manifests/system-header.mf b/usr/src/pkg/manifests/system-header.mf
index 5a11fda9e8..e679fb1744 100644
--- a/usr/src/pkg/manifests/system-header.mf
+++ b/usr/src/pkg/manifests/system-header.mf
@@ -1085,6 +1085,7 @@ file path=usr/include/sys/ib/clients/of/rdma/rdma_user_cm.h
file path=usr/include/sys/ib/clients/of/sol_ofs/sol_cma.h
file path=usr/include/sys/ib/clients/of/sol_ofs/sol_ib_cma.h
file path=usr/include/sys/ib/clients/of/sol_ofs/sol_ofs_common.h
+file path=usr/include/sys/ib/clients/of/sol_ofs/sol_kverb_impl.h
file path=usr/include/sys/ib/clients/of/sol_ucma/sol_rdma_user_cm.h
file path=usr/include/sys/ib/clients/of/sol_ucma/sol_ucma.h
file path=usr/include/sys/ib/clients/of/sol_uverbs/sol_uverbs.h
@@ -1325,6 +1326,7 @@ file path=usr/include/sys/ramdisk.h
file path=usr/include/sys/random.h
file path=usr/include/sys/rctl.h
file path=usr/include/sys/rctl_impl.h
+file path=usr/include/sys/rds.h
file path=usr/include/sys/reboot.h
file path=usr/include/sys/refstr.h
file path=usr/include/sys/refstr_impl.h
diff --git a/usr/src/pkg/manifests/system-network.mf b/usr/src/pkg/manifests/system-network.mf
index fe17448d86..cdabc70544 100644
--- a/usr/src/pkg/manifests/system-network.mf
+++ b/usr/src/pkg/manifests/system-network.mf
@@ -20,8 +20,7 @@
#
#
-# Copyright 2010 Sun Microsystems, Inc. All rights reserved.
-# Use is subject to license terms.
+# Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
#
set name=pkg.fmri value=pkg:/system/network@$(PKGVERS)
@@ -70,7 +69,7 @@ file path=etc/inet/secret/ike.preshared group=sys mode=0600 \
original_name=SUNWcnetr:etc/inet/secret/ike.preshared preserve=true
file path=etc/inet/secret/ipseckeys.sample group=sys mode=0600
file path=etc/inet/sock2path group=sys \
- original_name=SUNWcnetr:etc/inet/sock2path preserve=true
+ original_name=SUNWcnetr:etc/inet/sock2path preserve=renameold
file path=etc/ipadm/ipadm.conf group=netadm owner=netadm preserve=true
file path=etc/nwam/loc/NoNet/ipf.conf.dfl group=netadm owner=netadm \
preserve=true
diff --git a/usr/src/tools/opensolaris/license-list b/usr/src/tools/opensolaris/license-list
index e48abfd7fe..0c87b1927d 100644
--- a/usr/src/tools/opensolaris/license-list
+++ b/usr/src/tools/opensolaris/license-list
@@ -148,6 +148,7 @@ usr/src/uts/common/io/drm/THIRDPARTYLICENSE
usr/src/uts/common/io/elxl/THIRDPARTYLICENSE
usr/src/uts/common/io/ib/clients/of/lic_of
usr/src/uts/common/io/ib/clients/rds/THIRDPARTYLICENSE
+usr/src/uts/common/io/ib/clients/rdsv3/LICENSE
usr/src/uts/common/io/ipw/THIRDPARTYLICENSE
usr/src/uts/common/io/ipw/fw-ipw2100/LICENSE
usr/src/uts/common/io/iwh/THIRDPARTYLICENSE
diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files
index 09560f032d..b514153403 100644
--- a/usr/src/uts/common/Makefile.files
+++ b/usr/src/uts/common/Makefile.files
@@ -593,11 +593,19 @@ SCTP_SOCK_MOD_OBJS += sockmod_sctp.o socksctp.o socksctpsubr.o
PFP_SOCK_MOD_OBJS += sockmod_pfp.o
+RDS_SOCK_MOD_OBJS += sockmod_rds.o
+
RDS_OBJS += rdsddi.o rdssubr.o rds_opt.o rds_ioctl.o
RDSIB_OBJS += rdsib.o rdsib_ib.o rdsib_cm.o rdsib_ep.o rdsib_buf.o \
rdsib_debug.o rdsib_sc.o
+RDSV3_OBJS += af_rds.o rdsv3_ddi.o bind.o loop.o threads.o connection.o \
+ transport.o cong.o sysctl.o message.o rds_recv.o send.o \
+ stats.o info.o page.o rdma_transport.o ib_ring.o ib_rdma.o \
+ ib_recv.o ib.o ib_send.o ib_sysctl.o ib_stats.o ib_cm.o \
+ rdsv3_sc.o rdsv3_debug.o rdsv3_impl.o rdma.o
+
ISER_OBJS += iser.o iser_cm.o iser_cq.o iser_ib.o iser_idm.o \
iser_resource.o iser_xfer.o
@@ -695,7 +703,8 @@ HERMON_OBJS += hermon.o hermon_agents.o hermon_cfg.o hermon_ci.o hermon_cmd.o \
DAPLT_OBJS += daplt.o
SOL_OFS_OBJS += sol_cma.o sol_ib_cma.o sol_uobj.o \
- sol_ofs_debug_util.o sol_ofs_gen_util.o
+ sol_ofs_debug_util.o sol_ofs_gen_util.o \
+ sol_kverbs.o
SOL_UCMA_OBJS += sol_ucma.o
diff --git a/usr/src/uts/common/Makefile.rules b/usr/src/uts/common/Makefile.rules
index 8db7e6ef44..fa12fcd9c7 100644
--- a/usr/src/uts/common/Makefile.rules
+++ b/usr/src/uts/common/Makefile.rules
@@ -738,6 +738,10 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/ib/clients/rds/%.c
$(COMPILE.c) -o $@ $<
$(CTFCONVERT_O)
+$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/ib/clients/rdsv3/%.c
+ $(COMPILE.c) -o $@ $<
+ $(CTFCONVERT_O)
+
$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/ib/clients/iser/%.c
$(COMPILE.c) -o $@ $<
$(CTFCONVERT_O)
@@ -2041,6 +2045,9 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/hotplug/pcihp/%.c
$(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/ib/clients/rds/%.c
@($(LHEAD) $(LINT.c) $< $(LTAIL))
+$(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/ib/clients/rdsv3/%.c
+ @($(LHEAD) $(LINT.c) $< $(LTAIL))
+
$(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/ib/clients/iser/%.c
@($(LHEAD) $(LINT.c) $< $(LTAIL))
diff --git a/usr/src/uts/common/fs/sockfs/sockcommon_sops.c b/usr/src/uts/common/fs/sockfs/sockcommon_sops.c
index 4521fdd352..64ea59c4b5 100644
--- a/usr/src/uts/common/fs/sockfs/sockcommon_sops.c
+++ b/usr/src/uts/common/fs/sockfs/sockcommon_sops.c
@@ -20,8 +20,7 @@
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/types.h>
@@ -1156,8 +1155,7 @@ so_queue_msg(sock_upper_handle_t sock_handle, mblk_t *mp,
ASSERT(errorp != NULL);
*errorp = 0;
if (mp == NULL) {
- if (msg_size > 0) {
- ASSERT(so->so_downcalls->sd_recv_uio != NULL);
+ if (so->so_downcalls->sd_recv_uio != NULL) {
mutex_enter(&so->so_lock);
/* the notify functions will drop the lock */
if (flags & MSG_OOB)
@@ -1166,6 +1164,7 @@ so_queue_msg(sock_upper_handle_t sock_handle, mblk_t *mp,
so_notify_data(so, msg_size);
return (0);
}
+ ASSERT(msg_size == 0);
/*
* recv space check
*/
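
The hunk above is the fix for 6902396: with the old msg_size > 0 test, a zero-length datagram arriving for a protocol that consumes data via uio recv never reached so_notify_data(), so no pollwakeup() was posted. A condensed, illustrative sketch of the fixed control flow, with the MSG_OOB branch elided (names are taken from the hunk; this is not the complete sockfs function):

	/* so_queue_msg() upcall with mp == NULL: data already consumed via uio */
	if (mp == NULL) {
		if (so->so_downcalls->sd_recv_uio != NULL) {
			mutex_enter(&so->so_lock);
			/* the notify function drops so_lock */
			so_notify_data(so, msg_size);	/* posts pollwakeup() */
			return (0);
		}
		ASSERT(msg_size == 0);
		/* otherwise fall through to the recv-space check */
	}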
diff --git a/usr/src/uts/common/inet/sockmods/sockmod_rds.c b/usr/src/uts/common/inet/sockmods/sockmod_rds.c
new file mode 100644
index 0000000000..f8fc2e42d0
--- /dev/null
+++ b/usr/src/uts/common/inet/sockmods/sockmod_rds.c
@@ -0,0 +1,106 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <sys/types.h>
+#include <sys/conf.h>
+#include <sys/modctl.h>
+#include <sys/sunldi.h>
+#include <inet/common.h>
+#include <sys/strsubr.h>
+#include <sys/socketvar.h>
+
+extern sock_lower_handle_t rdsv3_create(int, int, int, sock_downcalls_t **,
+ uint_t *, int *, int, cred_t *);
+
+#define INET_NAME "sockrds"
+#define INET_DEVMINOR 0
+#define INET_MODMTFLAGS D_MP
+#define INET_SOCKDESC "RDSv3 socket module"
+#define INET_SOCK_PROTO_CREATE_FUNC (*rdsv3_create)
+
+#include "../inetddi.c"
+
+ldi_ident_t sockrds_li;
+ldi_handle_t rdsv3_transport_handle = NULL;
+
+#define RDSV3_DEVICE_NAME "/devices/ib/rdsv3@0:rdsv3"
+
+int
+_init(void)
+{
+ int ret;
+
+ ret = ldi_ident_from_mod(&modlinkage, &sockrds_li);
+ if (ret != 0) {
+ sockrds_li = NULL;
+ goto done;
+ }
+
+ ret = ldi_open_by_name(RDSV3_DEVICE_NAME, FREAD | FWRITE, kcred,
+ &rdsv3_transport_handle, sockrds_li);
+ if (ret != 0) {
+ ldi_ident_release(sockrds_li);
+ sockrds_li = NULL;
+ rdsv3_transport_handle = NULL;
+ goto done;
+ }
+
+ ret = mod_install(&modlinkage);
+ if (ret != 0) {
+ (void) ldi_close(rdsv3_transport_handle, FNDELAY, kcred);
+ ldi_ident_release(sockrds_li);
+ sockrds_li = NULL;
+ rdsv3_transport_handle = NULL;
+ }
+
+done:
+ return (ret);
+}
+
+int
+_fini(void)
+{
+ int ret;
+
+ ret = mod_remove(&modlinkage);
+ if (ret != 0) {
+ return (ret);
+ }
+
+ if (rdsv3_transport_handle != NULL) {
+ (void) ldi_close(rdsv3_transport_handle, FNDELAY, kcred);
+ rdsv3_transport_handle = NULL;
+ }
+
+ if (sockrds_li != NULL)
+ ldi_ident_release(sockrds_li);
+
+ return (0);
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+ return (mod_info(&modlinkage, modinfop));
+}
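
sockmod_rds.c ties the socket module to the transport driver with LDI: _init() opens /devices/ib/rdsv3@0:rdsv3 before mod_install(), so the module can only load while the rdsv3 driver is present, and _fini() undoes the steps in reverse order. Combined with the sock2path entry added above (family 30, type 6, protocol 0), an application reaches rdsv3_create() roughly as in this hypothetical user-level sketch (the AF_RDS name for family 30 is an assumption for illustration; type 6 is SOCK_SEQPACKET on Solaris):

	#include <sys/socket.h>

	int
	open_rds_socket(void)
	{
		/* family 30 / type 6 / protocol 0, per the sock2path entry */
		return (socket(30 /* AF_RDS, assumed */, SOCK_SEQPACKET, 0));
	}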
diff --git a/usr/src/uts/common/io/ib/clients/of/sol_ofs/sol_cma.c b/usr/src/uts/common/io/ib/clients/of/sol_ofs/sol_cma.c
index 81b1edcb50..2405af462d 100644
--- a/usr/src/uts/common/io/ib/clients/of/sol_ofs/sol_cma.c
+++ b/usr/src/uts/common/io/ib/clients/of/sol_ofs/sol_cma.c
@@ -49,6 +49,7 @@
#include <sys/ib/clients/of/rdma/rdma_cm.h>
#include <sys/ib/clients/of/sol_ofs/sol_cma.h>
+#include <sys/ib/clients/of/sol_ofs/sol_kverb_impl.h>
/* Modload support */
static struct modlmisc sol_ofs_modmisc = {
@@ -62,18 +63,7 @@ struct modlinkage sol_ofs_modlinkage = {
NULL
};
-static void sol_ofs_ibt_async_hdlr(void *clnt, ibt_hca_hdl_t hdl,
- ibt_async_code_t code, ibt_async_event_t *event);
-
-static ibt_clnt_modinfo_t sol_ofs_ibt_modinfo = {
- IBTI_V_CURR,
- IBT_GENERIC_MISC,
- sol_ofs_ibt_async_hdlr,
- NULL,
- "sol_ofs"
-};
-
-ibt_clnt_hdl_t sol_ofs_ibt_hdl;
+static ib_client_t *sol_cma_ib_client;
sol_cma_glbl_listen_t sol_cma_glbl_listen;
avl_tree_t sol_cma_glbl_listen_tree;
@@ -106,7 +96,6 @@ static void cma_handle_nomore_events(sol_cma_chan_t *);
extern void sol_ofs_dprintf_init();
extern void sol_ofs_dprintf_fini();
-static void ibcma_init_rdma_devs();
cma_chan_state_t cma_get_chan_state(sol_cma_chan_t *);
extern int ibcma_init_root_chan(sol_cma_chan_t *, sol_cma_glbl_listen_t *);
extern int ibcma_fini_root_chan(sol_cma_chan_t *);
@@ -133,7 +122,6 @@ int
_init(void)
{
int err;
- ibt_status_t status;
sol_ofs_dprintf_init();
SOL_OFS_DPRINTF_L5(sol_ofs_dbg_str, "_init()");
@@ -144,23 +132,39 @@ _init(void)
sol_cma_svc_cmp, sizeof (sol_cma_glbl_listen_t),
offsetof(sol_cma_glbl_listen_t, cma_listen_node));
+ sol_cma_ib_client = kmem_zalloc(sizeof (ib_client_t), KM_NOSLEEP);
+ if (!sol_cma_ib_client) {
+ SOL_OFS_DPRINTF_L2(sol_ofs_dbg_str,
+ "_init() - mem alloc failed");
+ avl_destroy(&sol_cma_glbl_listen_tree);
+ mutex_destroy(&sol_cma_dev_mutex);
+ mutex_destroy(&sol_cma_glob_mutex);
+ sol_ofs_dprintf_fini();
+ return (ENOMEM);
+ }
+
+ sol_cma_ib_client->name = "sol_ofs";
+ sol_cma_ib_client->add = sol_cma_add_dev;
+ sol_cma_ib_client->remove = sol_cma_rem_dev;
+ sol_cma_ib_client->dip = NULL;
- if ((status = ibt_attach(&sol_ofs_ibt_modinfo, NULL, NULL,
- &sol_ofs_ibt_hdl)) != IBT_SUCCESS) {
- cmn_err(CE_WARN, "_init: ibt_attach failed");
+ if ((err = ib_register_client(sol_cma_ib_client)) != 0) {
SOL_OFS_DPRINTF_L2(sol_ofs_dbg_str,
- "_init() ibt_attach() failed with status %d",
- status);
+ "_init() ib_register_client() failed with err %d",
+ err);
+ kmem_free(sol_cma_ib_client, sizeof (ib_client_t));
avl_destroy(&sol_cma_glbl_listen_tree);
mutex_destroy(&sol_cma_dev_mutex);
mutex_destroy(&sol_cma_glob_mutex);
sol_ofs_dprintf_fini();
- return (ENODEV);
+ return (err);
}
if ((err = mod_install(&sol_ofs_modlinkage)) != 0) {
- SOL_OFS_DPRINTF_L2(sol_ofs_dbg_str, "_init() failed");
- (void) ibt_detach(sol_ofs_ibt_hdl);
+ SOL_OFS_DPRINTF_L2(sol_ofs_dbg_str,
+ "_init() - mod_install() failed");
+ ib_unregister_client(sol_cma_ib_client);
+ kmem_free(sol_cma_ib_client, sizeof (ib_client_t));
avl_destroy(&sol_cma_glbl_listen_tree);
mutex_destroy(&sol_cma_dev_mutex);
mutex_destroy(&sol_cma_glob_mutex);
@@ -168,8 +172,6 @@ _init(void)
return (err);
}
- ibcma_init_rdma_devs();
-
SOL_OFS_DPRINTF_L5(sol_ofs_dbg_str, "_init() - ret");
return (err);
}
@@ -191,7 +193,9 @@ _fini(void)
"_fini: mod_remove failed");
return (err);
}
- (void) ibt_detach(sol_ofs_ibt_hdl);
+
+ ib_unregister_client(sol_cma_ib_client);
+ kmem_free(sol_cma_ib_client, sizeof (ib_client_t));
avl_destroy(&sol_cma_glbl_listen_tree);
mutex_destroy(&sol_cma_dev_mutex);
mutex_destroy(&sol_cma_glob_mutex);
@@ -234,7 +238,7 @@ sol_cma_add_dev(struct ib_device *dev)
init_genlist(&new_device->cma_epchan_list);
new_device->cma_device = dev;
- dev->data = new_device;
+ ib_set_client_data(dev, sol_cma_ib_client, new_device);
mutex_enter(&sol_cma_dev_mutex);
llist_add_tail(&new_device->cma_list, &sol_cma_dev_list);
@@ -247,7 +251,9 @@ sol_cma_rem_dev(struct ib_device *dev)
cma_device_t *rem_device;
genlist_entry_t *entry;
- rem_device = (cma_device_t *)dev->data;
+ SOL_OFS_DPRINTF_L5(sol_ofs_dbg_str, "sol_rem_dev(%p)", dev);
+
+ rem_device = (cma_device_t *)ib_get_client_data(dev, sol_cma_ib_client);
if (!rem_device) {
SOL_OFS_DPRINTF_L2(sol_ofs_dbg_str, "sol_cma_rem_dev() "
"NULL cma_dev!!");
@@ -384,45 +390,6 @@ sol_cma_add_hca_list(sol_cma_chan_t *ep_chanp, ib_guid_t hca_guid)
"No matching HCA in list!!", ep_chanp, hca_guid);
}
-/*ARGSUSED*/
-static void
-sol_ofs_ibt_async_hdlr(void *clnt, ibt_hca_hdl_t hdl,
- ibt_async_code_t code, ibt_async_event_t *event)
-{
- struct ib_device *device;
- llist_head_t *entry;
- cma_device_t *cma_devp;
-
- SOL_OFS_DPRINTF_L3(sol_ofs_dbg_str,
- "ibt_async_hdlr(%p, %p, %x, %p)",
- clnt, hdl, code, event);
-
- switch (code) {
- case IBT_HCA_ATTACH_EVENT:
- device = kmem_zalloc(sizeof (struct ib_device),
- KM_SLEEP);
- device->node_guid = htonll(event->ev_hca_guid);
- sol_cma_add_dev(device);
- break;
- case IBT_HCA_DETACH_EVENT:
- mutex_enter(&sol_cma_dev_mutex);
- list_for_each(entry, &sol_cma_dev_list) {
- cma_devp = (cma_device_t *)entry->ptr;
-
- if (cma_devp->cma_device->node_guid ==
- htonll(event->ev_hca_guid)) {
- mutex_exit(&sol_cma_dev_mutex);
- sol_cma_rem_dev(cma_devp->cma_device);
- mutex_enter(&sol_cma_dev_mutex);
- break;
- }
- }
- mutex_exit(&sol_cma_dev_mutex);
-
- break;
- }
-}
-
/*
* rdma_cm.h API functions.
*/
@@ -474,6 +441,7 @@ rdma_map_id2qphdl(struct rdma_cm_id *rdma_idp, void *qp_hdl)
chanp->chan_qp_hdl = qp_hdl;
}
+
void
rdma_destroy_id(struct rdma_cm_id *rdma_idp)
{
@@ -494,7 +462,15 @@ rdma_destroy_id(struct rdma_cm_id *rdma_idp)
rdma_idp, root_chanp);
mutex_enter(&chanp->chan_mutex);
- chanp->chan_cmid_destroy_state = SOL_CMA_CALLER_CMID_DESTROYED;
+ chanp->chan_cmid_destroy_state |= SOL_CMA_CALLER_CMID_DESTROYED;
+
+ /*
+ * Wait in destroy of the CMID while an rdma_resolve_addr() /
+ * rdma_listen() / rdma_resolve_route() call is in progress.
+ */
+ while (chanp->chan_cmid_destroy_state & SOL_CMA_CALLER_API_PROGRESS)
+ cv_wait(&chanp->chan_destroy_cv, &chanp->chan_mutex);
+
/* Wait if an event is being notified to the consumer */
while (chanp->chan_cmid_destroy_state & SOL_CMA_CALLER_EVENT_PROGRESS)
cv_wait(&chanp->chan_destroy_cv, &chanp->chan_mutex);
@@ -541,6 +517,10 @@ rdma_destroy_id(struct rdma_cm_id *rdma_idp)
chanp->chan_req_cnt--;
chanp->chan_req_total_cnt--;
mutex_exit(&chanp->chan_mutex);
+ mutex_enter(&req_cmid_chan->chan_mutex);
+ req_cmid_chan->chan_req_state =
+ REQ_CMID_NONE;
+ mutex_exit(&req_cmid_chan->chan_mutex);
(void) rdma_disconnect(
(struct rdma_cm_id *)req_cmid_chan);
mutex_enter(&chanp->chan_mutex);
@@ -578,14 +558,20 @@ rdma_destroy_id(struct rdma_cm_id *rdma_idp)
cv_wait(&chanp->chan_destroy_cv, &chanp->chan_mutex);
}
+ if (root_chanp)
+ mutex_enter(&root_chanp->chan_mutex);
+#ifdef DEBUG
SOL_OFS_DPRINTF_L5(sol_rdmacm_dbg_str, "rdma_destroy_id: "
"root_idp %p, cnt %x, state %x", root_chanp,
root_chanp ? root_chanp->chan_req_total_cnt : 0,
root_chanp ? cma_get_chan_state(root_chanp) : 0);
+#endif
if (root_chanp && root_chanp->chan_req_total_cnt == 1 &&
cma_get_chan_state(root_chanp) == SOL_CMA_CHAN_DESTROY_PENDING)
do_wait = 1;
+ if (root_chanp)
+ mutex_exit(&root_chanp->chan_mutex);
skip_passive_handling :
state = cma_get_chan_state(chanp);
@@ -697,7 +683,10 @@ rdma_bind_addr(struct rdma_cm_id *idp, struct sockaddr *addr)
* iWARP.
*/
if (chanp->chan_ib_client_hdl == NULL) {
- chanp->chan_ib_client_hdl = sol_ofs_ibt_hdl;
+ ofs_client_t *ofs_clnt;
+
+ ofs_clnt = (ofs_client_t *)sol_cma_ib_client->clnt_hdl;
+ chanp->chan_ib_client_hdl = ofs_clnt->ibt_hdl;
}
if (chanp->chan_ib_client_hdl && rdma_ib_bind_addr(idp, addr) == 0) {
SOL_OFS_DPRINTF_L5(sol_rdmacm_dbg_str,
@@ -726,8 +715,6 @@ rdma_resolve_addr(struct rdma_cm_id *idp, struct sockaddr *src_addr,
sol_cma_chan_t *chanp;
struct rdma_addr *addrp;
cma_chan_state_t state;
- enum rdma_cm_event_type event;
- int rc = 0;
ASSERT(idp);
chanp = (sol_cma_chan_t *)idp;
@@ -740,10 +727,18 @@ rdma_resolve_addr(struct rdma_cm_id *idp, struct sockaddr *src_addr,
if (state != SOL_CMA_CHAN_IDLE && state != SOL_CMA_CHAN_BOUND) {
SOL_OFS_DPRINTF_L2(sol_rdmacm_dbg_str,
"rdma_resolve_addr : invalid chan state %x", state);
- rc = EINVAL;
mutex_exit(&chanp->chan_mutex);
- goto resolve_exit;
+ return (EINVAL);
}
+ if (chanp->chan_cmid_destroy_state &
+ SOL_CMA_CALLER_CMID_DESTROYED) {
+ SOL_OFS_DPRINTF_L3(sol_rdmacm_dbg_str,
+ "rdma_resolve_addr : CMID %p, destroy called", chanp);
+ mutex_exit(&chanp->chan_mutex);
+ return (EINVAL);
+ }
+ chanp->chan_cmid_destroy_state |= SOL_CMA_CALLER_API_PROGRESS;
+
if (chanp->chan_xport_type == SOL_CMA_XPORT_NONE) {
bcopy((void *)src_addr, (void *)&(addrp->src_addr),
sizeof (struct sockaddr));
@@ -757,31 +752,52 @@ rdma_resolve_addr(struct rdma_cm_id *idp, struct sockaddr *src_addr,
* if this fails, resolve this as an @ corresponding to iWARP
*/
if (chanp->chan_ib_client_hdl == NULL) {
- chanp->chan_ib_client_hdl = sol_ofs_ibt_hdl;
+ ofs_client_t *ofs_clnt;
+
+ ofs_clnt = (ofs_client_t *)sol_cma_ib_client->clnt_hdl;
+ chanp->chan_ib_client_hdl = ofs_clnt->ibt_hdl;
}
if (chanp->chan_ib_client_hdl && rdma_ib_resolve_addr(idp, src_addr,
dst_addr, timeout_ms) == 0) {
SOL_OFS_DPRINTF_L4(sol_rdmacm_dbg_str,
"rdma_resolve_addr: ret IB @");
- goto resolve_exit;
#ifdef IWARP_SUPPORT
} else if (chanp->chan_iw_client_hdl && rdma_iw_resolve_addr(idp,
src_addr, dst_addr, timeout_ms) == 0) {
SOL_OFS_DPRINTF_L2(sol_rdmacm_dbg_str,
"rdma_resolve_addr: ret iWARP @");
- goto resolve_exit;
#endif /* IWARP_SUPPORT */
} else {
SOL_OFS_DPRINTF_L2(sol_rdmacm_dbg_str,
"rdma_resolve_addr: Invalid @");
- rc = EINVAL;
+ return (EINVAL);
}
+ SOL_OFS_DPRINTF_L5(sol_rdmacm_dbg_str, "rdma_resolve_addr: ret 0");
+ return (0);
+}
+
+static void cma_generate_event_sync(struct rdma_cm_id *,
+ enum rdma_cm_event_type, int, struct rdma_conn_param *,
+ struct rdma_ud_param *);
+
+void
+cma_resolve_addr_callback(sol_cma_chan_t *chanp, int rc)
+{
+ enum rdma_cm_event_type event;
-resolve_exit:
+ mutex_enter(&chanp->chan_mutex);
+ if (chanp->chan_cmid_destroy_state &
+ SOL_CMA_CALLER_CMID_DESTROYED) {
+ SOL_OFS_DPRINTF_L3(sol_rdmacm_dbg_str,
+ "cma_resolve_addr : CMID %p, destroy called", chanp);
+ chanp->chan_cmid_destroy_state &=
+ ~SOL_CMA_CALLER_API_PROGRESS;
+ cv_broadcast(&chanp->chan_destroy_cv);
+ mutex_exit(&chanp->chan_mutex);
+ return;
+ }
if (rc == 0) {
- mutex_enter(&chanp->chan_mutex);
cma_set_chan_state(chanp, SOL_CMA_CHAN_ADDR_RESLVD);
- mutex_exit(&chanp->chan_mutex);
event = RDMA_CM_EVENT_ADDR_RESOLVED;
} else
event = RDMA_CM_EVENT_ADDR_ERROR;
@@ -791,9 +807,16 @@ resolve_exit:
* This will result in RDMA_USER_CM_CMD_RESOLVE_ROUTE in
* userland.
*/
- cma_generate_event(idp, event, 0, NULL, NULL);
- SOL_OFS_DPRINTF_L5(sol_rdmacm_dbg_str, "rdma_resolve_addr: ret 0");
- return (0);
+ chanp->chan_cmid_destroy_state |= SOL_CMA_CALLER_EVENT_PROGRESS;
+ mutex_exit(&chanp->chan_mutex);
+ cma_generate_event_sync((struct rdma_cm_id *)chanp, event, 0,
+ NULL, NULL);
+
+ mutex_enter(&chanp->chan_mutex);
+ chanp->chan_cmid_destroy_state &= ~SOL_CMA_CALLER_API_PROGRESS;
+ if (chanp->chan_cmid_destroy_state & SOL_CMA_CALLER_CMID_DESTROYED)
+ cv_broadcast(&chanp->chan_destroy_cv);
+ mutex_exit(&chanp->chan_mutex);
}
int
@@ -814,6 +837,14 @@ rdma_resolve_route(struct rdma_cm_id *idp, int timeout_ms)
"resolve_route: Invalid state");
return (EINVAL);
}
+ if (chanp->chan_cmid_destroy_state &
+ SOL_CMA_CALLER_CMID_DESTROYED) {
+ SOL_OFS_DPRINTF_L3(sol_rdmacm_dbg_str,
+ "rdma_resolve_route : CMID %p, destroy called", chanp);
+ mutex_exit(&chanp->chan_mutex);
+ return (EINVAL);
+ }
+ chanp->chan_cmid_destroy_state |= SOL_CMA_CALLER_API_PROGRESS;
mutex_exit(&chanp->chan_mutex);
/*
@@ -823,6 +854,13 @@ rdma_resolve_route(struct rdma_cm_id *idp, int timeout_ms)
*/
cma_generate_event(idp, RDMA_CM_EVENT_ROUTE_RESOLVED, 0,
NULL, NULL);
+
+ mutex_enter(&chanp->chan_mutex);
+ chanp->chan_cmid_destroy_state &= ~SOL_CMA_CALLER_API_PROGRESS;
+ if (chanp->chan_cmid_destroy_state & SOL_CMA_CALLER_CMID_DESTROYED)
+ cv_broadcast(&chanp->chan_destroy_cv);
+ mutex_exit(&chanp->chan_mutex);
+
SOL_OFS_DPRINTF_L5(sol_rdmacm_dbg_str, "resolve_route: ret 0");
return (0);
}
@@ -905,11 +943,16 @@ rdma_listen(struct rdma_cm_id *idp, int bklog)
}
cma_set_chan_state(chanp, SOL_CMA_CHAN_LISTEN);
- if (chanp->chan_listenp) {
- SOL_OFS_DPRINTF_L4(sol_rdmacm_dbg_str, "rdma_listen: "
- "NON NULL listen_list");
- goto listen_from_list;
+ if (chanp->chan_cmid_destroy_state &
+ SOL_CMA_CALLER_CMID_DESTROYED) {
+ SOL_OFS_DPRINTF_L3(sol_rdmacm_dbg_str,
+ "rdma_listen : CMID %p, destroy called", chanp);
+ mutex_exit(&chanp->chan_mutex);
+ return (EINVAL);
}
+ chanp->chan_cmid_destroy_state |= SOL_CMA_CALLER_API_PROGRESS;
+
+ ASSERT(chanp->chan_listenp == NULL);
chanp->chan_listenp = kmem_zalloc(sizeof (sol_cma_listen_info_t),
KM_SLEEP);
@@ -917,12 +960,12 @@ rdma_listen(struct rdma_cm_id *idp, int bklog)
(chanp->chan_listenp)->listen_is_root = 1;
ret = cma_init_listen_root(chanp);
if (ret) {
+ chanp->chan_listenp = NULL;
+ mutex_exit(&chanp->chan_mutex);
SOL_OFS_DPRINTF_L2(sol_rdmacm_dbg_str, "rdma_listen: "
"cma_init_listen_root: failed");
kmem_free(chanp->chan_listenp,
sizeof (sol_cma_listen_info_t));
- chanp->chan_listenp = NULL;
- mutex_exit(&chanp->chan_mutex);
return (EINVAL);
}
@@ -949,7 +992,13 @@ rdma_listen(struct rdma_cm_id *idp, int bklog)
mutex_exit(&chanp->chan_mutex);
return (0);
}
-listen_from_list:
+
+ if (chanp->chan_cmid_destroy_state & SOL_CMA_CALLER_CMID_DESTROYED) {
+ chanp->chan_cmid_destroy_state &=
+ ~SOL_CMA_CALLER_API_PROGRESS;
+ cv_broadcast(&chanp->chan_destroy_cv);
+ }
+
genlist_for_each(entry, &(CHAN_LISTEN_LIST(chanp))) {
struct rdma_cm_id *ep_idp;
sol_cma_chan_t *ep_chanp;
@@ -965,6 +1014,10 @@ listen_from_list:
if (ret)
break;
}
+
+ chanp->chan_cmid_destroy_state &= ~SOL_CMA_CALLER_API_PROGRESS;
+ if (chanp->chan_cmid_destroy_state & SOL_CMA_CALLER_CMID_DESTROYED)
+ cv_broadcast(&chanp->chan_destroy_cv);
mutex_exit(&chanp->chan_mutex);
SOL_OFS_DPRINTF_L5(sol_rdmacm_dbg_str, "rdma_listen: ret %x", ret);
@@ -1005,6 +1058,28 @@ rdma_accept(struct rdma_cm_id *idp, struct rdma_conn_param *conn_param)
"REQ AVL remove %p", root_chanp, idp);
mutex_enter(&root_chanp->chan_mutex);
avl_remove(&root_chanp->chan_req_avl_tree, idp);
+
+ /* For TCP, insert into ACPT_AVL_TREE */
+ if (idp->ps == RDMA_PS_TCP) {
+ void *find_ret;
+ avl_index_t where;
+
+ SOL_OFS_DPRINTF_L5(sol_rdmacm_dbg_str,
+ "Add to ACPT AVL of %p IDP, idp %p, qp_hdl %p",
+ root_idp, idp, chanp->chan_qp_hdl);
+ find_ret = avl_find(&root_chanp->chan_acpt_avl_tree,
+ (void *)chanp->chan_qp_hdl, &where);
+ if (find_ret) {
+ mutex_exit(&root_chanp->chan_mutex);
+ SOL_OFS_DPRINTF_L2(sol_rdmacm_dbg_str,
+ "DUPLICATE ENTRY in ACPT AVL : root %p, "
+ "idp %p, qp_hdl %p",
+ root_idp, idp, chanp->chan_qp_hdl);
+ return (EINVAL);
+ }
+ avl_insert(&root_chanp->chan_acpt_avl_tree,
+ (void *)idp, where);
+ }
mutex_exit(&root_chanp->chan_mutex);
mutex_enter(&chanp->chan_mutex);
@@ -1013,26 +1088,7 @@ rdma_accept(struct rdma_cm_id *idp, struct rdma_conn_param *conn_param)
mutex_exit(&chanp->chan_mutex);
}
- /* For TCP, insert into ACPT_AVL_TREE */
- if (root_idp && idp->ps == RDMA_PS_TCP) {
- void *find_ret;
- avl_index_t where;
-
- SOL_OFS_DPRINTF_L5(sol_rdmacm_dbg_str,
- "Add to ACPT AVL of %p IDP, idp %p, qp_hdl %p",
- root_idp, idp, chanp->chan_qp_hdl);
- mutex_enter(&root_chanp->chan_mutex);
- find_ret = avl_find(&root_chanp->chan_acpt_avl_tree,
- (void *)chanp->chan_qp_hdl, &where);
- if (find_ret)
- SOL_OFS_DPRINTF_L2(sol_rdmacm_dbg_str,
- "DUPLICATE ENTRY in ACPT AVL : root %p, "
- "idp %p, qp_hdl %p",
- root_idp, idp, chanp->chan_qp_hdl);
- avl_insert(&root_chanp->chan_acpt_avl_tree,
- (void *)idp, where);
- mutex_exit(&root_chanp->chan_mutex);
- } else if (root_idp && IS_UDP_CMID(root_idp)) {
+ if (root_idp && IS_UDP_CMID(root_idp)) {
cma_chan_state_t chan_state;
/*
@@ -1062,6 +1118,9 @@ rdma_accept(struct rdma_cm_id *idp, struct rdma_conn_param *conn_param)
mutex_enter(&root_chanp->chan_mutex);
avl_remove(&root_chanp->chan_acpt_avl_tree, idp);
mutex_exit(&root_chanp->chan_mutex);
+ mutex_enter(&chanp->chan_mutex);
+ chanp->chan_req_state = REQ_CMID_NONE;
+ mutex_exit(&chanp->chan_mutex);
}
SOL_OFS_DPRINTF_L5(sol_rdmacm_dbg_str, "rdma_accept: ret %x", ret);
@@ -1123,10 +1182,6 @@ rdma_reject(struct rdma_cm_id *idp, const void *priv_data,
ret = rdma_iw_reject(idp, priv_data, priv_data_len);
#endif /* IWARP_SUPPORT */
- mutex_enter(&chanp->chan_mutex);
- if (!ret)
- chanp->chan_connect_flag = SOL_CMA_CONNECT_NONE;
- mutex_exit(&chanp->chan_mutex);
if (!ret && root_idp) {
cma_chan_state_t chan_state;
@@ -1303,41 +1358,6 @@ rdma_leave_multicast(struct rdma_cm_id *idp, struct sockaddr *addr)
SOL_OFS_DPRINTF_L5(sol_rdmacm_dbg_str, "rdma_join_multicast: ret");
}
-/*ARGSUSED*/
-int
-rdma_create_qp(struct rdma_cm_id *idp, struct ib_pd *pd,
- struct ib_qp_init_attr *qp_init_attr)
-{
- return (-EINVAL);
-}
-
-/*ARGSUSED*/
-void
-rdma_destroy_qp(struct rdma_cm_id *idp)
-{
-}
-
-void
-ibcma_init_rdma_devs()
-{
- uint_t i, nhcas;
- ib_guid_t *guidp;
- struct ib_device *device;
-
- if ((nhcas = ibt_get_hca_list(&guidp)) == 0) {
- SOL_OFS_DPRINTF_L3(sol_rdmacm_dbg_str,
- "ibcma_init_rdma_devs() - NO HCAs");
- return;
- }
-
- for (i = 0; i < nhcas; i++) {
- device = kmem_zalloc(sizeof (struct ib_device), KM_SLEEP);
- device->node_guid = htonll(guidp[i]);
- sol_cma_add_dev(device);
- }
- ibt_free_hca_list(guidp, nhcas);
-}
-
/*
 * Functions to compare two rdma_cm_id *, used by AVL tree
* routines.
@@ -1643,6 +1663,27 @@ cma_generate_event_sync(struct rdma_cm_id *idp, enum rdma_cm_event_type event,
}
mutex_exit(&chanp->chan_mutex);
+ root_idp = CHAN_LISTEN_ROOT(chanp);
+ root_chanp = (sol_cma_chan_t *)root_idp;
+ SOL_OFS_DPRINTF_L5(sol_rdmacm_dbg_str, "gen_event: root_idp %p",
+ root_idp);
+
+ if (event == RDMA_CM_EVENT_CONNECT_REQUEST) {
+ /*
+ * Update chan_req_state for the REQ CMID. Decrement
+ * count of REQ CMIDs not notifed to consumer.
+ */
+ ASSERT(root_idp);
+ mutex_enter(&root_chanp->chan_mutex);
+ root_chanp->chan_req_cnt--;
+#ifdef DEBUG
+ SOL_OFS_DPRINTF_L5(sol_rdmacm_dbg_str,
+ "Dec req_cnt of %p IDP, idp %p, req_cnt %x",
+ root_idp, idp, root_chanp->chan_req_cnt);
+#endif
+ mutex_exit(&root_chanp->chan_mutex);
+ }
+
/* Pass the event to the client */
ret = (idp->event_handler) (idp, &cm_event);
@@ -1666,6 +1707,7 @@ cma_generate_event_sync(struct rdma_cm_id *idp, enum rdma_cm_event_type event,
event);
mutex_enter(&chanp->chan_mutex);
+ chanp->chan_req_state = REQ_CMID_NONE;
chanp->chan_connect_flag = SOL_CMA_CONNECT_NONE;
chanp->chan_cmid_destroy_state &=
~SOL_CMA_CALLER_EVENT_PROGRESS;
@@ -1680,42 +1722,17 @@ cma_generate_event_sync(struct rdma_cm_id *idp, enum rdma_cm_event_type event,
return;
}
ofs_consume_event:
- root_idp = CHAN_LISTEN_ROOT(chanp);
- root_chanp = (sol_cma_chan_t *)root_idp;
- SOL_OFS_DPRINTF_L5(sol_rdmacm_dbg_str, "gen_event: root_idp %p",
- root_idp);
- if (event == RDMA_CM_EVENT_CONNECT_REQUEST) {
- /*
- * Update chan_req_state for the REQ CMID. Decrement
- * count of REQ CMIDs not notifed to consumer.
- */
- if (!root_idp) {
- mutex_enter(&chanp->chan_mutex);
- chanp->chan_cmid_destroy_state &=
- ~SOL_CMA_CALLER_EVENT_PROGRESS;
- if (chanp->chan_cmid_destroy_state &
- SOL_CMA_CALLER_CMID_DESTROYED)
- cv_broadcast(&chanp->chan_destroy_cv);
- mutex_exit(&chanp->chan_mutex);
- return;
- }
-
+ if (event == RDMA_CM_EVENT_DISCONNECTED || event ==
+ RDMA_CM_EVENT_REJECTED) {
mutex_enter(&chanp->chan_mutex);
- chanp->chan_req_state = REQ_CMID_NOTIFIED;
+ chanp->chan_connect_flag = SOL_CMA_CONNECT_NONE;
+ chanp->chan_qp_hdl = NULL;
mutex_exit(&chanp->chan_mutex);
- mutex_enter(&root_chanp->chan_mutex);
- root_chanp->chan_req_cnt--;
-#ifdef DEBUG
- SOL_OFS_DPRINTF_L5(sol_rdmacm_dbg_str,
- "Dec req_cnt of %p IDP, idp %p, req_cnt %x",
- root_idp, idp, root_chanp->chan_req_cnt);
-#endif
- mutex_exit(&root_chanp->chan_mutex);
- } else if (event == RDMA_CM_EVENT_DISCONNECTED && root_idp) {
+ }
+ if (event == RDMA_CM_EVENT_DISCONNECTED && root_idp) {
cma_chan_state_t chan_state;
mutex_enter(&chanp->chan_mutex);
- chanp->chan_qp_hdl = NULL;
cma_handle_nomore_events(chanp);
chan_state = cma_get_chan_state(chanp);
chanp->chan_cmid_destroy_state &=
@@ -2054,13 +2071,16 @@ cma_handle_nomore_events(sol_cma_chan_t *chanp)
root_chanp->chan_req_total_cnt--;
if (!root_chanp->chan_req_total_cnt)
root_chanp->chan_req_state = REQ_CMID_NONE;
- if (root_idp->ps == RDMA_PS_TCP && (chanp->chan_req_state ==
- REQ_CMID_ACCEPTED || chanp->chan_req_state ==
- REQ_CMID_DISCONNECTED))
+ if (root_idp->ps == RDMA_PS_TCP && chanp->chan_req_state ==
+ REQ_CMID_ACCEPTED) {
avl_remove(&root_chanp->chan_acpt_avl_tree, idp);
+ chanp->chan_req_state = REQ_CMID_NONE;
+ }
if (chanp->chan_req_state == REQ_CMID_CREATED ||
- chanp->chan_req_state == REQ_CMID_NOTIFIED)
+ chanp->chan_req_state == REQ_CMID_NOTIFIED) {
avl_remove(&root_chanp->chan_req_avl_tree, idp);
+ chanp->chan_req_state = REQ_CMID_NONE;
+ }
state = cma_get_chan_state(root_chanp);
req_nodes = avl_numnodes(&root_chanp->chan_req_avl_tree);
acpt_nodes = avl_numnodes(&root_chanp->chan_acpt_avl_tree);
@@ -2069,3 +2089,104 @@ cma_handle_nomore_events(sol_cma_chan_t *chanp)
acpt_nodes == 0UL)
cma_destroy_id(root_idp);
}
+
+extern int ib_modify_qp(struct ib_qp *, struct ib_qp_attr *, int);
+extern int rdma_init_qp_attr(struct rdma_cm_id *, struct ib_qp_attr *,
+ int *);
+
+static int
+cma_init_ud_qp(sol_cma_chan_t *chanp, struct ib_qp *qp)
+{
+ struct ib_qp_attr qp_attr;
+ int qp_attr_mask, ret;
+
+ qp_attr.qp_state = IB_QPS_INIT;
+ ret = rdma_init_qp_attr(&chanp->chan_rdma_cm, &qp_attr, &qp_attr_mask);
+ if (ret)
+ return (ret);
+
+ ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
+ if (ret)
+ return (ret);
+
+ qp_attr.qp_state = IB_QPS_RTR;
+ ret = ib_modify_qp(qp, &qp_attr, IB_QP_STATE);
+ if (ret)
+ return (ret);
+
+ qp_attr.qp_state = IB_QPS_RTS;
+ qp_attr.sq_psn = 0;
+ ret = ib_modify_qp(qp, &qp_attr, IB_QP_STATE | IB_QP_SQ_PSN);
+
+ return (ret);
+}
+
+static int
+cma_init_conn_qp(sol_cma_chan_t *chanp, struct ib_qp *qp)
+{
+ struct ib_qp_attr qp_attr;
+ int qp_attr_mask, ret;
+
+ qp_attr.qp_state = IB_QPS_INIT;
+ ret = rdma_init_qp_attr(&chanp->chan_rdma_cm, &qp_attr, &qp_attr_mask);
+ if (ret)
+ return (ret);
+
+ return (ib_modify_qp(qp, &qp_attr, qp_attr_mask));
+}
+
+static inline int
+cma_is_ud_ps(enum rdma_port_space ps)
+{
+ return (ps == RDMA_PS_UDP || ps == RDMA_PS_IPOIB);
+}
+
+int
+rdma_create_qp(struct rdma_cm_id *idp, struct ib_pd *pd,
+ struct ib_qp_init_attr *qp_init_attr)
+{
+ sol_cma_chan_t *chanp;
+ struct ib_qp *qp;
+ int ret;
+ ofs_client_t *dev_ofs_client;
+
+ ASSERT(idp);
+ chanp = (sol_cma_chan_t *)idp;
+ if (idp->device->node_guid != pd->device->node_guid)
+ return (-EINVAL);
+
+ dev_ofs_client = (ofs_client_t *)pd->device->clnt_hdl;
+ rdma_map_id2clnthdl(idp, dev_ofs_client->ibt_hdl, NULL);
+
+ qp = ib_create_qp(pd, qp_init_attr);
+ if ((uintptr_t)qp >= (uintptr_t)-0xFFF) {
+ return ((intptr_t)qp);
+ }
+ rdma_map_id2qphdl(idp, (void *)qp->ibt_qp);
+
+ if (cma_is_ud_ps(idp->ps)) {
+ ret = cma_init_ud_qp(chanp, qp);
+ } else {
+ ret = cma_init_conn_qp(chanp, qp);
+ }
+
+ if (ret) {
+ goto err;
+ }
+
+ idp->qp = qp;
+ chanp->chan_qp_num = qp->qp_num;
+ chanp->chan_is_srq = (qp->srq != NULL);
+ return (0);
+err:
+ (void) ib_destroy_qp(qp);
+ return (ret);
+}
+
+void
+rdma_destroy_qp(struct rdma_cm_id *idp)
+{
+ ASSERT(idp);
+ (void) ib_destroy_qp(idp->qp);
+ idp->qp = NULL;
+}
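
Two idioms in the new rdma_create_qp() deserve a note. First, ib_create_qp() follows the Linux ERR_PTR convention of encoding a negative errno in the returned pointer, which is what the (uintptr_t)-0xFFF comparison decodes. Second, cma_init_ud_qp() drives a UD QP straight through the INIT -> RTR -> RTS state sequence, while connected QPs stop at INIT and let connection establishment drive the remaining transitions. An illustrative expansion of the error-pointer check (the helper macros are assumptions for illustration, not part of the patch):

	#define	MAX_ERRNO	0xFFF
	#define	IS_ERR(p)	((uintptr_t)(p) >= (uintptr_t)(-MAX_ERRNO))
	#define	PTR_ERR(p)	((intptr_t)(p))		/* the negative errno */

	qp = ib_create_qp(pd, qp_init_attr);
	if (IS_ERR(qp))
		return ((intptr_t)PTR_ERR(qp));		/* e.g. -ENOMEM */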
diff --git a/usr/src/uts/common/io/ib/clients/of/sol_ofs/sol_ib_cma.c b/usr/src/uts/common/io/ib/clients/of/sol_ofs/sol_ib_cma.c
index 2fdc1e266e..31749ebb96 100644
--- a/usr/src/uts/common/io/ib/clients/of/sol_ofs/sol_ib_cma.c
+++ b/usr/src/uts/common/io/ib/clients/of/sol_ofs/sol_ib_cma.c
@@ -748,9 +748,6 @@ rdma_ib_reject(struct rdma_cm_id *idp, const void *private_data,
kmem_free(privp, SOL_REP_PRIV_DATA_SZ);
return (EINVAL);
}
- mutex_enter(&chanp->chan_mutex);
- chanp->chan_connect_flag = SOL_CMA_CONNECT_SERVER_DONE;
- mutex_exit(&chanp->chan_mutex);
} else {
SOL_OFS_DPRINTF_L5(sol_rdmacm_dbg_str, "rdma_ib_reject :"
"calling ibt_cm_ud_proceed");
@@ -763,9 +760,6 @@ rdma_ib_reject(struct rdma_cm_id *idp, const void *private_data,
kmem_free(privp, SOL_REP_PRIV_DATA_SZ);
return (EINVAL);
}
- mutex_enter(&chanp->chan_mutex);
- chanp->chan_connect_flag = SOL_CMA_CONNECT_SERVER_DONE;
- mutex_exit(&chanp->chan_mutex);
}
if (privp)
@@ -804,8 +798,8 @@ rdma_ib_disconnect(struct rdma_cm_id *idp)
mutex_enter(&root_chanp->chan_mutex);
avl_remove(&root_chanp->chan_req_avl_tree, idp);
mutex_exit(&root_chanp->chan_mutex);
+ chanp->chan_req_state = REQ_CMID_NONE;
}
- chanp->chan_connect_flag = SOL_CMA_CONNECT_NONE;
}
if (idp->ps == RDMA_PS_TCP && chanp->chan_connect_flag ==
SOL_CMA_CONNECT_SERVER_RCVD && chanp->chan_session_id) {
@@ -828,8 +822,8 @@ rdma_ib_disconnect(struct rdma_cm_id *idp)
mutex_enter(&root_chanp->chan_mutex);
avl_remove(&root_chanp->chan_req_avl_tree, idp);
mutex_exit(&root_chanp->chan_mutex);
+ chanp->chan_req_state = REQ_CMID_NONE;
}
- chanp->chan_connect_flag = SOL_CMA_CONNECT_NONE;
}
/*
@@ -1223,32 +1217,74 @@ ibcma_query_local_ip(struct rdma_cm_id *idp, sol_cma_chan_t *chanp,
return (0);
}
+extern void cma_resolve_addr_callback(sol_cma_chan_t *, int);
+
+static void
+ibcma_path_hdlr(void *arg, ibt_status_t retval, ibt_path_info_t *pathp,
+ uint8_t num_paths, ibt_path_ip_src_t *src_ip_p)
+{
+ struct rdma_cm_id *idp = (struct rdma_cm_id *)arg;
+ sol_cma_chan_t *chanp = (sol_cma_chan_t *)arg;
+ ibcma_chan_t *ibchanp = &(chanp->chan_ib);
+ int i;
+ ibcma_dev_t *devp;
+ ib_lid_t base_lid;
+
+ if (retval != IBT_SUCCESS && retval != IBT_INSUFF_DATA) {
+ cma_resolve_addr_callback(chanp, 1);
+ return;
+ }
+
+ ibchanp->chan_path_size = 2 * sizeof (ibt_path_info_t);
+ ibchanp->chan_pathp = kmem_zalloc(ibchanp->chan_path_size, KM_SLEEP);
+ bcopy(pathp, ibchanp->chan_pathp, num_paths *
+ sizeof (ibt_path_info_t));
+ ibchanp->chan_numpaths = num_paths;
+
+ if (ibchanp->chan_devp == NULL && src_ip_p) {
+ ipaddr2sockaddr(&(src_ip_p[0].ip_primary),
+ &(idp->route.addr.src_addr), NULL);
+ bcopy(&(src_ip_p[0].ip_primary), &ibchanp->chan_local_addr,
+ sizeof (ibt_ip_addr_t));
+ if (ibcma_init_devinfo((struct rdma_cm_id *)chanp,
+ ibchanp, pathp)) {
+ kmem_free(ibchanp->chan_pathp,
+ ibchanp->chan_path_size);
+ cma_resolve_addr_callback(chanp, 1);
+ return;
+ }
+ }
+
+ if (ibchanp->chan_devp == NULL) {
+ cma_resolve_addr_callback(chanp, 1);
+ return;
+ }
+
+ devp = ibchanp->chan_devp;
+ (idp->route).num_paths = ibchanp->chan_numpaths;
+ idp->route.path_rec = kmem_zalloc(sizeof (struct ib_sa_path_rec) *
+ ibchanp->chan_numpaths, KM_SLEEP);
+ base_lid = ibt_get_port_state_byguid(devp->dev_node_guid,
+ devp->dev_port_num, NULL, &base_lid);
+ for (i = 0; i < ibchanp->chan_numpaths; i++)
+ ibt_path2sa_path(&((ibchanp->chan_pathp)[i]),
+ &((idp->route.path_rec)[i]), base_lid);
+
+ cma_resolve_addr_callback(chanp, 0);
+}
+
static int
ibcma_get_paths(struct rdma_cm_id *idp, sol_cma_chan_t *chanp,
ibcma_chan_t *ibchanp)
{
ibt_ip_path_attr_t path_attr;
ibt_status_t status;
- ibt_path_ip_src_t *src_ip_p = NULL;
- uint8_t max_paths;
- ibcma_dev_t *devp;
ibt_ip_addr_t *dst_addrp;
- ib_lid_t base_lid;
- int i;
ASSERT(ibchanp);
SOL_OFS_DPRINTF_L5(sol_rdmacm_dbg_str, "ibcma_get_paths(%p, %p)", idp,
ibchanp);
- max_paths = 2;
- ibchanp->chan_path_size = max_paths * sizeof (ibt_path_info_t);
- ibchanp->chan_pathp = kmem_zalloc(ibchanp->chan_path_size, KM_SLEEP);
-
- devp = ibchanp->chan_devp;
- if (devp == NULL) {
- src_ip_p = kmem_zalloc(sizeof (ibt_path_ip_src_t) * max_paths,
- KM_SLEEP);
- }
bzero(&path_attr, sizeof (ibt_ip_path_attr_t));
dst_addrp = kmem_zalloc(sizeof (ibt_ip_addr_t), KM_SLEEP);
bcopy(&ibchanp->chan_remote_addr, dst_addrp, sizeof (ibt_ip_addr_t));
@@ -1256,56 +1292,19 @@ ibcma_get_paths(struct rdma_cm_id *idp, sol_cma_chan_t *chanp,
bcopy(&ibchanp->chan_local_addr, &path_attr.ipa_src_ip,
sizeof (ibt_ip_addr_t));
path_attr.ipa_ndst = 1;
- path_attr.ipa_max_paths = max_paths;
+ path_attr.ipa_max_paths = 2;
if (ibcma_any_addr(&path_attr.ipa_src_ip))
path_attr.ipa_src_ip.family = AF_UNSPEC;
- status = ibt_get_ip_paths(chanp->chan_ib_client_hdl, IBT_PATH_NO_FLAGS,
- &path_attr, ibchanp->chan_pathp, &ibchanp->chan_numpaths,
- src_ip_p);
- if (status != IBT_SUCCESS && status != IBT_INSUFF_DATA) {
+ status = ibt_aget_ip_paths(chanp->chan_ib_client_hdl, IBT_PATH_NO_FLAGS,
+ &path_attr, ibcma_path_hdlr, idp);
+ if (status != IBT_SUCCESS) {
SOL_OFS_DPRINTF_L2(sol_rdmacm_dbg_str,
- "cma_get_paths : failed %d", status);
+ "cma_get_paths : ibt_aget_paths() failed %d", status);
kmem_free(dst_addrp, sizeof (ibt_ip_addr_t));
- if (src_ip_p)
- kmem_free(src_ip_p,
- sizeof (ibt_path_ip_src_t) * max_paths);
- kmem_free(ibchanp->chan_pathp, ibchanp->chan_path_size);
- ibchanp->chan_pathp = NULL;
return (EINVAL);
}
- if (src_ip_p) {
- ipaddr2sockaddr(&(src_ip_p[0].ip_primary),
- &(idp->route.addr.src_addr), NULL);
- bcopy(&(src_ip_p[0].ip_primary), &ibchanp->chan_local_addr,
- sizeof (ibt_ip_addr_t));
- if (ibcma_init_devinfo(idp, ibchanp, ibchanp->chan_pathp)) {
- kmem_free(src_ip_p, sizeof (ibt_path_ip_src_t) *
- max_paths);
- kmem_free(dst_addrp, sizeof (ibt_ip_addr_t));
- kmem_free(ibchanp->chan_pathp,
- ibchanp->chan_path_size);
- return (EINVAL);
- }
- kmem_free(src_ip_p, sizeof (ibt_path_ip_src_t) * max_paths);
- }
- if (!ibchanp->chan_devp) {
- SOL_OFS_DPRINTF_L2(sol_rdmacm_dbg_str,
- "cma_get_paths : devp ERROR");
- kmem_free(dst_addrp, sizeof (ibt_ip_addr_t));
- return (EINVAL);
- }
- devp = ibchanp->chan_devp;
- (idp->route).num_paths = ibchanp->chan_numpaths;
- idp->route.path_rec = kmem_zalloc(sizeof (struct ib_sa_path_rec) *
- ibchanp->chan_numpaths, KM_SLEEP);
- base_lid = ibt_get_port_state_byguid(devp->dev_node_guid,
- devp->dev_port_num, NULL, &base_lid);
- for (i = 0; i < ibchanp->chan_numpaths; i++)
- ibt_path2sa_path(&((ibchanp->chan_pathp)[i]),
- &((idp->route.path_rec)[i]), base_lid);
-
kmem_free(dst_addrp, sizeof (ibt_ip_addr_t));
return (0);
}
@@ -1447,6 +1446,7 @@ ibcma_ud_hdlr(void *inp, ibt_cm_ud_event_t *eventp,
ASSERT(chanp->chan_connect_flag == SOL_CMA_CONNECT_INITIATED);
mutex_enter(&chanp->chan_mutex);
chanp->chan_connect_flag = SOL_CMA_CONNECT_NONE;
+ chanp->chan_cmid_destroy_state |= SOL_CMA_CALLER_EVENT_PROGRESS;
mutex_exit(&chanp->chan_mutex);
sidr_rep = &((eventp->cm_event).sidr_rep);
if (sidr_rep->srep_status == IBT_CM_SREP_CHAN_VALID) {
@@ -1666,6 +1666,7 @@ ibcma_handle_req(struct rdma_cm_id *idp, struct rdma_cm_id **event_id_ptr,
root_chanp->chan_req_total_cnt++;
avl_insert(&root_chanp->chan_req_avl_tree, (void *)event_idp, where);
mutex_exit(&root_chanp->chan_mutex);
+ event_chanp->chan_req_state = REQ_CMID_NOTIFIED;
return (IBT_CM_DEFER);
}
@@ -1714,6 +1715,7 @@ ibcma_handle_est(struct rdma_cm_id *idp, struct rdma_cm_id **event_id_ptr,
if (chanp->chan_listenp == NULL) {
ASSERT(chanp->chan_connect_flag == SOL_CMA_CONNECT_INITIATED);
chanp->chan_connect_flag = SOL_CMA_CONNECT_CLIENT_DONE;
+ *event_id_ptr = idp;
bcopy(&chanp->chan_param, paramp,
sizeof (struct rdma_conn_param));
if (paramp->private_data_len) {
@@ -1726,6 +1728,9 @@ ibcma_handle_est(struct rdma_cm_id *idp, struct rdma_cm_id **event_id_ptr,
paramp->private_data_len);
}
event_chanp = chanp;
+ mutex_enter(&chanp->chan_mutex);
+ chanp->chan_cmid_destroy_state |= SOL_CMA_CALLER_EVENT_PROGRESS;
+ mutex_exit(&chanp->chan_mutex);
goto est_common;
}
@@ -1734,7 +1739,9 @@ ibcma_handle_est(struct rdma_cm_id *idp, struct rdma_cm_id **event_id_ptr,
root_chanp = (sol_cma_chan_t *)root_idp;
event_chanp = NULL;
+ mutex_enter(&root_chanp->chan_mutex);
event_idp = cma_get_acpt_idp(root_idp, eventp->cm_channel);
+ mutex_exit(&root_chanp->chan_mutex);
if (event_idp == NULL) {
SOL_OFS_DPRINTF_L2(sol_rdmacm_dbg_str, "ibcma_handle_est: "
"No matching CMID for qp_hdl %p in ACPT AVL of CMID %p",
@@ -1743,7 +1750,11 @@ ibcma_handle_est(struct rdma_cm_id *idp, struct rdma_cm_id **event_id_ptr,
}
*event_id_ptr = event_idp;
event_chanp = (sol_cma_chan_t *)event_idp;
+ mutex_enter(&event_chanp->chan_mutex);
event_chanp->chan_connect_flag = SOL_CMA_CONNECT_SERVER_DONE;
+ event_chanp->chan_cmid_destroy_state |=
+ SOL_CMA_CALLER_EVENT_PROGRESS;
+ mutex_exit(&event_chanp->chan_mutex);
est_common:
#ifdef QP_DEBUG
@@ -1766,34 +1777,44 @@ ibcma_handle_closed(struct rdma_cm_id *idp, struct rdma_cm_id **event_id_ptr,
ibt_cm_event_t *eventp, enum rdma_cm_event_type *event, int *evt_status)
{
struct rdma_cm_id *root_idp, *event_idp;
- sol_cma_chan_t *chanp, *event_chanp;
+ sol_cma_chan_t *chanp, *root_chanp, *event_chanp;
*event = RDMA_CM_EVENT_DISCONNECTED;
*evt_status = 0;
chanp = (sol_cma_chan_t *)idp;
mutex_enter(&chanp->chan_mutex);
root_idp = CHAN_LISTEN_ROOT((chanp));
+ root_chanp = (sol_cma_chan_t *)root_idp;
chanp->chan_qp_hdl = NULL;
if (!root_idp) {
- chanp->chan_connect_flag = 0;
+ chanp->chan_cmid_destroy_state |=
+ SOL_CMA_CALLER_EVENT_PROGRESS;
mutex_exit(&chanp->chan_mutex);
+ *event_id_ptr = idp;
return (IBT_CM_DEFAULT);
}
mutex_exit(&chanp->chan_mutex);
/* On the passive side, search ACPT AVL Tree */
+ mutex_enter(&root_chanp->chan_mutex);
event_idp = cma_get_acpt_idp(root_idp, eventp->cm_channel);
+ event_chanp = (sol_cma_chan_t *)event_idp;
if (event_idp == NULL) {
+ mutex_exit(&root_chanp->chan_mutex);
SOL_OFS_DPRINTF_L5(sol_rdmacm_dbg_str,
"ibcma_handle_closed: "
"No matching CMID for qp hdl %p in EST AVL of CMID %p",
eventp->cm_channel, root_idp);
return (IBT_CM_DEFAULT);
}
- event_chanp = (sol_cma_chan_t *)event_idp;
+ avl_remove(&root_chanp->chan_acpt_avl_tree, event_idp);
+ mutex_exit(&root_chanp->chan_mutex);
mutex_enter(&event_chanp->chan_mutex);
- event_chanp->chan_connect_flag = SOL_CMA_CONNECT_NONE;
+ event_chanp->chan_req_state = REQ_CMID_NONE;
+ event_chanp->chan_cmid_destroy_state |=
+ SOL_CMA_CALLER_EVENT_PROGRESS;
mutex_exit(&event_chanp->chan_mutex);
+
*event_id_ptr = event_idp;
return (IBT_CM_DEFAULT);
}
@@ -1843,9 +1864,14 @@ ibcma_handle_failed(struct rdma_cm_id *idp, struct rdma_cm_id **event_id_ptr,
* event to accepted CMID.
*/
if (root_idp) {
+ sol_cma_chan_t *root_chanp;
ASSERT(eventp->cm_channel);
+
+ root_chanp = (sol_cma_chan_t *)root_idp;
+ mutex_enter(&root_chanp->chan_mutex);
event_idp = cma_get_acpt_idp(root_idp,
eventp->cm_channel);
+ mutex_exit(&root_chanp->chan_mutex);
if (event_idp == NULL) {
SOL_OFS_DPRINTF_L2(sol_rdmacm_dbg_str,
"ibcma_handle_failed: No matching CMID "
@@ -1856,8 +1882,9 @@ ibcma_handle_failed(struct rdma_cm_id *idp, struct rdma_cm_id **event_id_ptr,
event_chanp = (sol_cma_chan_t *)event_idp;
mutex_enter(&event_chanp->chan_mutex);
- event_chanp->chan_connect_flag =
- SOL_CMA_CONNECT_NONE;
+ event_chanp->chan_req_state = REQ_CMID_NONE;
+ event_chanp->chan_cmid_destroy_state |=
+ SOL_CMA_CALLER_EVENT_PROGRESS;
event_chanp->chan_qp_hdl = NULL;
mutex_exit(&event_chanp->chan_mutex);
*event_id_ptr = event_idp;
@@ -1865,8 +1892,14 @@ ibcma_handle_failed(struct rdma_cm_id *idp, struct rdma_cm_id **event_id_ptr,
avl_remove(&root_chanp->chan_acpt_avl_tree,
event_idp);
mutex_exit(&root_chanp->chan_mutex);
- } else
- chanp->chan_connect_flag = SOL_CMA_CONNECT_NONE;
+ } else {
+ mutex_enter(&chanp->chan_mutex);
+ chanp->chan_cmid_destroy_state |=
+ SOL_CMA_CALLER_EVENT_PROGRESS;
+ chanp->chan_qp_hdl = NULL;
+ mutex_exit(&chanp->chan_mutex);
+ *event_id_ptr = idp;
+ }
*evt_status = failedp->cf_reason;
*event = RDMA_CM_EVENT_REJECTED;
break;
@@ -1889,8 +1922,7 @@ ibcma_handle_failed(struct rdma_cm_id *idp, struct rdma_cm_id **event_id_ptr,
event_chanp = (sol_cma_chan_t *)event_idp;
mutex_enter(&event_chanp->chan_mutex);
- event_chanp->chan_connect_flag =
- SOL_CMA_CONNECT_NONE;
+ event_chanp->chan_req_state = REQ_CMID_NONE;
event_chanp->chan_qp_hdl = NULL;
mutex_exit(&event_chanp->chan_mutex);
*event_id_ptr = event_idp;
@@ -1909,12 +1941,15 @@ ibcma_handle_failed(struct rdma_cm_id *idp, struct rdma_cm_id **event_id_ptr,
"session_id NULL");
}
if (!root_idp) {
- chanp->chan_connect_flag = SOL_CMA_CONNECT_NONE;
+ *event_id_ptr = idp;
+ mutex_enter(&chanp->chan_mutex);
+ chanp->chan_cmid_destroy_state |=
+ SOL_CMA_CALLER_EVENT_PROGRESS;
+ chanp->chan_qp_hdl = NULL;
+ mutex_exit(&chanp->chan_mutex);
*evt_status = IBT_CM_TIMEOUT;
*event = RDMA_CM_EVENT_REJECTED;
}
- chanp->chan_connect_flag = SOL_CMA_CONNECT_NONE;
- chanp->chan_qp_hdl = NULL;
break;
case IBT_CM_FAILURE_STALE :
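
The sol_ib_cma.c side of the change replaces the blocking ibt_get_ip_paths() with ibt_aget_ip_paths(), so path-record lookup now completes asynchronously in ibcma_path_hdlr() and is reported through cma_resolve_addr_callback(). That is also why the CMA entry points now set SOL_CMA_CALLER_API_PROGRESS: rdma_destroy_id() must not tear a channel down while a callback may still fire. Condensed from the patch, the handshake is:

	/* API entry: fail if destroy already ran, else mark in-progress */
	mutex_enter(&chanp->chan_mutex);
	if (chanp->chan_cmid_destroy_state & SOL_CMA_CALLER_CMID_DESTROYED) {
		mutex_exit(&chanp->chan_mutex);
		return (EINVAL);
	}
	chanp->chan_cmid_destroy_state |= SOL_CMA_CALLER_API_PROGRESS;
	mutex_exit(&chanp->chan_mutex);

	/* completion: clear the flag and wake a waiting rdma_destroy_id() */
	mutex_enter(&chanp->chan_mutex);
	chanp->chan_cmid_destroy_state &= ~SOL_CMA_CALLER_API_PROGRESS;
	if (chanp->chan_cmid_destroy_state & SOL_CMA_CALLER_CMID_DESTROYED)
		cv_broadcast(&chanp->chan_destroy_cv);
	mutex_exit(&chanp->chan_mutex);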
diff --git a/usr/src/uts/common/io/ib/clients/of/sol_ofs/sol_kverbs.c b/usr/src/uts/common/io/ib/clients/of/sol_ofs/sol_kverbs.c
new file mode 100644
index 0000000000..92a7fabc14
--- /dev/null
+++ b/usr/src/uts/common/io/ib/clients/of/sol_ofs/sol_kverbs.c
@@ -0,0 +1,2323 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+/* Solaris Open Fabric kernel verbs */
+
+#include <sys/types.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/modctl.h>
+#include <sys/ib/clients/of/rdma/ib_verbs.h>
+#include <sys/ib/clients/of/rdma/ib_addr.h>
+#include <sys/ib/clients/of/rdma/rdma_cm.h>
+#include <sys/ib/clients/of/sol_ofs/sol_kverb_impl.h>
+
+static void *statep;
+char *sol_kverbs_dbg_str = "sol_kverbs";
+
+static llist_head_t client_list = LLIST_HEAD_INIT(client_list);
+kmutex_t clist_lock; /* mutex for client_list */
+
+static void ofs_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t,
+ ibt_async_event_t *);
+
+/*
+ * Set the ibt_client_t members. clnt->ib_client must be set before
+ * this function is called.
+ */
+static int
+alloc_ibt_client(ofs_client_t *clnt)
+{
+ int namelen;
+ ASSERT(clnt->ib_client != NULL);
+
+ SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str,
+ "alloc_ibt_client: client: 0x%p", clnt);
+
+ /*
+	 * Double-check the name string: if it is MAXNAMELEN or longer
+	 * including the string terminator, consider the name invalid
+	 * and return EINVAL.
+ */
+ namelen = strlen(clnt->ib_client->name);
+ if (namelen >= MAXNAMELEN) {
+ SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+ "alloc_ibt_client: client: 0x%p => "
+ "namelen(%d) is larger than MAXNAMELEN", clnt, namelen);
+ return (-EINVAL);
+ }
+ clnt->ibt_client.mi_clnt_name = kmem_zalloc(namelen + 1, KM_NOSLEEP);
+ if (clnt->ibt_client.mi_clnt_name == NULL) {
+ SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+ "alloc_ibt_client: client: 0x%p => "
+ "no sufficient memory", clnt);
+ return (-ENOMEM);
+ }
+ bcopy(clnt->ib_client->name, clnt->ibt_client.mi_clnt_name, namelen);
+ clnt->ibt_client.mi_ibt_version = IBTI_V_CURR;
+ if (clnt->ib_client->dip) {
+ clnt->ibt_client.mi_clnt_class = IBT_GENERIC;
+ } else {
+ clnt->ibt_client.mi_clnt_class = IBT_GENERIC_MISC;
+ }
+ clnt->ibt_client.mi_async_handler = ofs_async_handler;
+
+ return (0);
+}
+
+static void
+free_ibt_client(ofs_client_t *clnt)
+{
+ int namelen = strlen(clnt->ib_client->name);
+ ASSERT(namelen < MAXNAMELEN);
+
+ SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str,
+ "free_ibt_client: client: 0x%p", clnt);
+
+ kmem_free(clnt->ibt_client.mi_clnt_name, namelen + 1);
+ clnt->ibt_client.mi_clnt_name = NULL;
+}
+
+/*
+ * get_device() returns a pointer to the struct ib_device with
+ * the same guid as the one passed to the function.
+ */
+static ib_device_t *
+get_device(ofs_client_t *ofs_client, ib_guid_t guid)
+{
+ ib_device_t *device;
+ llist_head_t *entry;
+
+ ASSERT(RW_LOCK_HELD(&ofs_client->lock));
+
+ SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str,
+ "get_device: client: 0x%p, guid:0x%p",
+ ofs_client, (void *)(uintptr_t)htonll(guid));
+
+ list_for_each(entry, &ofs_client->device_list) {
+ device = entry->ptr;
+ if (device->node_guid == htonll(guid)) {
+ ASSERT(device->reg_state == IB_DEV_CLOSE);
+ ASSERT(device->node_type == RDMA_NODE_IB_CA);
+ ASSERT(device->clnt_hdl == (ofs_client_p_t)ofs_client);
+ return (device);
+ }
+ }
+
+ SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+ "get_device: client: 0x%p, guid:0x%p => no match guid",
+ ofs_client, (void *)(uintptr_t)htonll(guid));
+
+ return (NULL);
+}
+
+/*
+ * ofs_async_handler() is a delegated function to handle asynchronous events,
+ * which dispatches each event to corresponding qp/cq handlers registered
+ * with ib_create_qp() and/or ib_create_cq().
+ */
+static void
+ofs_async_handler(void *clntp, ibt_hca_hdl_t hdl, ibt_async_code_t code,
+ ibt_async_event_t *event)
+{
+ ofs_client_t *ofs_client = (ofs_client_t *)clntp;
+ struct ib_event ib_event;
+ struct ib_qp *qpp;
+ struct ib_cq *cqp;
+
+
+ ASSERT(ofs_client != NULL);
+
+ cqp = event->ev_cq_hdl ? ibt_get_cq_private(event->ev_cq_hdl) : NULL;
+ qpp = event->ev_chan_hdl ?
+ ibt_get_qp_private(event->ev_chan_hdl) : NULL;
+
+ SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str,
+ "ofs_async_handler: client: 0x%p, hca_hdl: 0x%p, code:0x%x, "
+ "event->qp: 0x%p, event->cq: 0x%p, event->srq: 0x%p "
+ "event->guid: 0x%p, event->port: 0x%x",
+ clntp, hdl, code, qpp, cqp, event->ev_srq_hdl,
+ (void *)(uintptr_t)event->ev_hca_guid, event->ev_port);
+
+ bzero(&ib_event, sizeof (struct ib_event));
+ switch (code) {
+ case IBT_EVENT_PATH_MIGRATED:
+ FIRE_QP_EVENT(ofs_client, hdl, ib_event, qpp,
+ IB_EVENT_PATH_MIG);
+ return;
+ case IBT_EVENT_SQD:
+ FIRE_QP_EVENT(ofs_client, hdl, ib_event, qpp,
+ IB_EVENT_SQ_DRAINED);
+ return;
+ case IBT_EVENT_COM_EST:
+ FIRE_QP_EVENT(ofs_client, hdl, ib_event, qpp,
+ IB_EVENT_COMM_EST);
+ return;
+ case IBT_ERROR_CATASTROPHIC_CHAN:
+ FIRE_QP_EVENT(ofs_client, hdl, ib_event, qpp,
+ IB_EVENT_QP_FATAL);
+ return;
+ case IBT_ERROR_INVALID_REQUEST_CHAN:
+ FIRE_QP_EVENT(ofs_client, hdl, ib_event, qpp,
+ IB_EVENT_QP_REQ_ERR);
+ return;
+ case IBT_ERROR_ACCESS_VIOLATION_CHAN:
+ FIRE_QP_EVENT(ofs_client, hdl, ib_event, qpp,
+ IB_EVENT_QP_ACCESS_ERR);
+ return;
+ case IBT_ERROR_PATH_MIGRATE_REQ:
+ FIRE_QP_EVENT(ofs_client, hdl, ib_event, qpp,
+ IB_EVENT_PATH_MIG);
+ return;
+ case IBT_EVENT_EMPTY_CHAN:
+ FIRE_QP_EVENT(ofs_client, hdl, ib_event, qpp,
+ IB_EVENT_QP_LAST_WQE_REACHED);
+ return;
+ case IBT_ERROR_CQ:
+ FIRE_CQ_EVENT(ofs_client, hdl, ib_event, cqp,
+ IB_EVENT_CQ_ERR);
+ return;
+ case IBT_HCA_ATTACH_EVENT:
+ {
+ ib_device_t *device;
+ int rtn;
+
+		/* re-use the device if it was created before */
+ rw_enter(&ofs_client->lock, RW_WRITER);
+ device = get_device(ofs_client, event->ev_hca_guid);
+ if (device == NULL) {
+ device = kmem_alloc(sizeof (ib_device_t), KM_SLEEP);
+ device->node_type = RDMA_NODE_IB_CA;
+ device->reg_state = IB_DEV_CLOSE;
+ device->clnt_hdl = (ofs_client_p_t)ofs_client;
+ device->node_guid = htonll(event->ev_hca_guid);
+ device->data = NULL;
+ /* add this HCA */
+ ofs_client->hca_num++;
+ llist_head_init(&device->list, device);
+ llist_add_tail(&device->list, &ofs_client->device_list);
+ }
+ device->hca_hdl = NULL;
+ device->local_dma_lkey = 0;
+ device->phys_port_cnt = 0;
+
+ /* open this HCA */
+ rtn = ibt_open_hca(ofs_client->ibt_hdl, event->ev_hca_guid,
+ &device->hca_hdl);
+ if (rtn == IBT_SUCCESS) {
+ ibt_hca_attr_t hattr;
+
+ ofs_client->hca_open_num++;
+ device->reg_state = IB_DEV_OPEN;
+ ibt_set_hca_private(device->hca_hdl, device);
+
+ rtn = ibt_query_hca(device->hca_hdl, &hattr);
+ if (rtn != IBT_SUCCESS) {
+ device->reg_state = IB_DEV_CLOSE;
+ rtn = ibt_close_hca(device->hca_hdl);
+ ASSERT(rtn == IBT_SUCCESS);
+ ofs_client->hca_open_num--;
+ return;
+ }
+
+ (void) sprintf(device->name, "%x:%x:%x",
+ hattr.hca_vendor_id, hattr.hca_device_id,
+ hattr.hca_version_id);
+ device->local_dma_lkey = hattr.hca_reserved_lkey;
+ device->phys_port_cnt = hattr.hca_nports;
+ ibt_set_hca_private(device->hca_hdl, device);
+
+ /* invoke client's callback */
+ if (ofs_client->ib_client->add) {
+ ofs_client->ib_client->add(device);
+ }
+ }
+ rw_exit(&ofs_client->lock);
+
+ return;
+ }
+ case IBT_HCA_DETACH_EVENT:
+ {
+ struct ib_device *device;
+
+ rw_enter(&ofs_client->lock, RW_WRITER);
+ device = ibt_get_hca_private(hdl);
+ if (device->reg_state == IB_DEV_OPEN) {
+ ibt_status_t rtn;
+ /* invoke client's callback */
+ if (ofs_client->ib_client->remove) {
+ ofs_client->ib_client->remove(device);
+ }
+ /* change the state only */
+ device->reg_state = IB_DEV_CLOSE;
+ /* close this HCA */
+ rtn = ibt_close_hca(device->hca_hdl);
+ ASSERT(rtn == IBT_SUCCESS);
+ ofs_client->hca_open_num--;
+ }
+ rw_exit(&ofs_client->lock);
+
+ return;
+ }
+ case IBT_EVENT_LIMIT_REACHED_SRQ:
+ case IBT_ERROR_CATASTROPHIC_SRQ:
+ default:
+ SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str,
+ "sol_ofs does not support this event(0x%x).\n"
+ "\t clntp=0x%p, hca_hdl=0x%p, code=%d, eventp=0x%p\n",
+ code, clntp, hdl, code, event);
+ return;
+ }
+}
+
+/*
+ * ib_register_client - Register an IB client
+ * @client:Client to register
+ *
+ * Upper level users of the IB drivers can use ib_register_client() to
+ * register callbacks for IB device addition and removal. When an IB
+ * device is added, each registered client's add method will be called
+ * (in the order the clients were registered), and when a device is
+ * removed, each client's remove method will be called (in the reverse
+ * order that clients were registered). In addition, when
+ * ib_register_client() is called, the client will receive an add
+ * callback for all devices already registered.
+ *
+ * Note that, unlike the Linux implementation, struct ib_client has
+ * a dip pointer that identifies the client's device node.
+ */
+int
+ib_register_client(struct ib_client *client)
+{
+ uint_t i, nhcas; /* number of HCAs */
+ ib_guid_t *guidp;
+ ofs_client_t *ofs_client;
+ llist_head_t *entry, *tmp;
+ ib_device_t *device;
+ int rtn;
+
+ SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str,
+ "ib_register_client: client: 0x%p", client);
+
+ /* get the number of HCAs on this system */
+ if ((nhcas = ibt_get_hca_list(&guidp)) == 0) {
+ SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+ "ib_register_client: client: 0x%p => no HCA", client);
+ return (-ENXIO);
+ }
+
+ /* allocate a new sol_ofs_client structure */
+ ofs_client = kmem_zalloc(sizeof (ofs_client_t), KM_NOSLEEP);
+ if (ofs_client == NULL) {
+ (void) ibt_free_hca_list(guidp, nhcas);
+ SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+ "ib_register_client: client: 0x%p => "
+ "no sufficient memory for ofs_client", client);
+ return (-ENOMEM);
+ }
+
+ /* set members */
+ ofs_client->ib_client = client;
+ if ((rtn = alloc_ibt_client(ofs_client)) != 0) {
+ kmem_free(ofs_client, sizeof (ofs_client_t));
+ (void) ibt_free_hca_list(guidp, nhcas);
+ SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+ "ib_register_client: client: 0x%p => "
+ "alloc_ibt_client failed w/ 0x%x", client, rtn);
+ return (rtn);
+ }
+ ofs_client->state = IB_OFS_CLNT_INITIALIZED;
+ llist_head_init(&ofs_client->device_list, NULL);
+ llist_head_init(&ofs_client->client_list, ofs_client);
+ rw_init(&ofs_client->lock, NULL, RW_DEFAULT, NULL);
+
+ /* initialize IB client */
+ rw_enter(&ofs_client->lock, RW_WRITER);
+ if (client->state != IB_CLNT_UNINITIALIZED) {
+ rw_exit(&ofs_client->lock);
+ kmem_free(ofs_client, sizeof (ofs_client_t));
+ (void) ibt_free_hca_list(guidp, nhcas);
+ SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+ "ib_register_client: client: 0x%p => "
+ "invalid client state(%d)", client, client->state);
+ return (-EPERM);
+ }
+
+ /* attach this client to IBTF */
+ rtn = ibt_attach(&ofs_client->ibt_client, client->dip, ofs_client,
+ &ofs_client->ibt_hdl);
+ if (rtn != IBT_SUCCESS) {
+ rw_exit(&ofs_client->lock);
+ free_ibt_client(ofs_client);
+ kmem_free(ofs_client, sizeof (ofs_client_t));
+ (void) ibt_free_hca_list(guidp, nhcas);
+ SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+ "ib_register_client: client: 0x%p => "
+ "ibt_attach failed w/ 0x%x", client, rtn);
+ return (-EINVAL);
+ }
+ client->clnt_hdl = (ofs_client_p_t)ofs_client;
+ client->state = IB_CLNT_INITIALIZED;
+
+ /* link this client */
+ mutex_enter(&clist_lock);
+ llist_add_tail(&ofs_client->client_list, &client_list);
+ mutex_exit(&clist_lock);
+
+ /* Open HCAs */
+ ofs_client->hca_num = nhcas;
+ for (i = 0; i < ofs_client->hca_num; i++) {
+ /* allocate the ib_device structure */
+ device = kmem_zalloc(sizeof (ib_device_t), KM_NOSLEEP);
+ if (device == NULL) {
+ rtn = -ENOMEM;
+ SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+ "ib_register_client: client: 0x%p => "
+ "no sufficient memory for ib_device", client);
+ goto err;
+ }
+ device->node_guid = htonll(guidp[i]);
+ device->node_type = RDMA_NODE_IB_CA;
+ device->reg_state = IB_DEV_CLOSE;
+ device->clnt_hdl = (ofs_client_p_t)ofs_client;
+ llist_head_init(&device->list, device);
+ llist_add_tail(&device->list, &ofs_client->device_list);
+
+ rtn = ibt_open_hca(ofs_client->ibt_hdl, guidp[i],
+ &device->hca_hdl);
+ if (rtn == IBT_SUCCESS) {
+ ibt_hca_attr_t hattr;
+
+ ofs_client->hca_open_num++;
+ device->reg_state = IB_DEV_OPEN;
+
+ rtn = ibt_query_hca(device->hca_hdl, &hattr);
+ if (rtn != IBT_SUCCESS) {
+ rtn = -EIO;
+ SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+ "ib_register_client: client: 0x%p,"
+ "hca_hdl: 0x%p ==> "
+ "ibt_query_hca() failed w/ %d",
+ client, device->hca_hdl, rtn);
+ goto err;
+ }
+
+ (void) sprintf(device->name, "%x:%x:%x",
+ hattr.hca_vendor_id, hattr.hca_device_id,
+ hattr.hca_version_id);
+ device->local_dma_lkey = hattr.hca_reserved_lkey;
+ device->phys_port_cnt = hattr.hca_nports;
+ ibt_set_hca_private(device->hca_hdl, device);
+
+ /* invoke client's callback */
+ if (client->add) {
+ client->add(device);
+ }
+ }
+ }
+ if (ofs_client->hca_open_num == 0) {
+ rtn = -ENXIO;
+ SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+ "ib_register_client: client: 0x%p => "
+ "no available HCA", client);
+ goto err;
+ }
+ rw_exit(&ofs_client->lock);
+
+ (void) ibt_free_hca_list(guidp, nhcas);
+ return (0);
+
+err:
+ /* first close all open HCAs */
+ list_for_each(entry, &ofs_client->device_list) {
+ device = entry->ptr;
+ /*
+ * If it's open already, close it after the remove
+ * callback.
+ */
+ if (device->reg_state == IB_DEV_OPEN) {
+ ibt_status_t rtn;
+ /* invoke client's callback */
+ if (client->remove) {
+ client->remove(device);
+ }
+ device->reg_state = IB_DEV_CLOSE;
+ rtn = ibt_close_hca(device->hca_hdl);
+ ASSERT(rtn == IBT_SUCCESS);
+ ofs_client->hca_open_num--;
+ }
+ }
+ ASSERT(ofs_client->hca_open_num == 0);
+
+ /* then free the devices */
+ list_for_each_safe(entry, tmp, &ofs_client->device_list) {
+ device = entry->ptr;
+ /* de-link and free the device */
+ llist_del(entry);
+ kmem_free(device, sizeof (ib_device_t));
+ ofs_client->hca_num--;
+ }
+ ASSERT(ofs_client->hca_num == 0);
+
+ /* delink this client */
+ mutex_enter(&clist_lock);
+ llist_del(&ofs_client->client_list);
+ mutex_exit(&clist_lock);
+
+ /* detach the client */
+ client->clnt_hdl = NULL;
+ client->state = IB_CLNT_UNINITIALIZED;
+ (void) ibt_detach(ofs_client->ibt_hdl);
+ rw_exit(&ofs_client->lock);
+
+ /* free sol_ofs_client */
+ free_ibt_client(ofs_client);
+ kmem_free(ofs_client, sizeof (ofs_client_t));
+
+ (void) ibt_free_hca_list(guidp, nhcas);
+ return (rtn);
+}
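
For illustration, a minimal consumer of ib_register_client() might look like the sketch below. It is not part of this commit; the my_* names are hypothetical, and the explicit state initialization assumes IB_CLNT_UNINITIALIZED is the required starting state, as checked above.

static void my_add(struct ib_device *device);
static void my_remove(struct ib_device *device);

static struct ib_client my_client = {
	.name	= "my_client",
	.add	= my_add,	/* called per opened HCA, under the writer lock */
	.remove	= my_remove,
	/* dip left NULL: registered as IBT_GENERIC_MISC */
};

static void
my_add(struct ib_device *device)
{
	cmn_err(CE_CONT, "?my_client: device %s added\n", device->name);
}

static void
my_remove(struct ib_device *device)
{
	cmn_err(CE_CONT, "?my_client: device %s removed\n", device->name);
}

static int
my_attach(void)
{
	my_client.state = IB_CLNT_UNINITIALIZED;	/* assumed initial state */
	return (ib_register_client(&my_client));	/* 0 or negative errno */
}
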
+
+/*
+ * ib_unregister_client - Unregister an IB client
+ * @client:Client to unregister
+ *
+ * Upper level users use ib_unregister_client() to remove their client
+ * registration. When ib_unregister_client() is called, the client
+ * will receive a remove callback for each IB device still registered.
+ */
+void
+ib_unregister_client(struct ib_client *client)
+{
+ ofs_client_t *ofs_client;
+ ib_device_t *device;
+ llist_head_t *entry, *tmp;
+
+ ASSERT(client->state == IB_CLNT_INITIALIZED &&
+ client->clnt_hdl != NULL);
+
+ SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str,
+ "ib_unregister_client: client: 0x%p", client);
+
+ ofs_client = (ofs_client_t *)client->clnt_hdl;
+ rw_enter(&ofs_client->lock, RW_WRITER);
+
+ /* first close all open HCAs */
+ list_for_each(entry, &ofs_client->device_list) {
+ device = entry->ptr;
+ /*
+ * If it's open already, close it after the remove
+ * callback.
+ */
+ if (device->reg_state == IB_DEV_OPEN) {
+ ibt_status_t rtn;
+ /* invoke client's callback */
+ if (client->remove) {
+ client->remove(device);
+ }
+ device->reg_state = IB_DEV_CLOSE;
+ rtn = ibt_close_hca(device->hca_hdl);
+ if (rtn != IBT_SUCCESS)
+ SOL_OFS_DPRINTF_L3(
+ sol_kverbs_dbg_str,
+ "ib_unregister_client(%p) - "
+ "ibt_close_hca failed %d",
+ client, rtn);
+
+ ofs_client->hca_open_num--;
+ }
+ }
+ ASSERT(ofs_client->hca_open_num == 0);
+
+ /* then free the devices */
+ list_for_each_safe(entry, tmp, &ofs_client->device_list) {
+ device = entry->ptr;
+ /* de-link and free the device */
+ llist_del(entry);
+ kmem_free(device, sizeof (ib_device_t));
+ ofs_client->hca_num--;
+ }
+ ASSERT(ofs_client->hca_num == 0);
+
+ /* delink this client */
+ mutex_enter(&clist_lock);
+ llist_del(&ofs_client->client_list);
+ mutex_exit(&clist_lock);
+
+ /* detach the client */
+ client->clnt_hdl = NULL;
+ client->state = IB_CLNT_UNINITIALIZED;
+ (void) ibt_detach(ofs_client->ibt_hdl);
+ rw_exit(&ofs_client->lock);
+
+ /* free sol_ofs_client */
+ free_ibt_client(ofs_client);
+ kmem_free(ofs_client, sizeof (ofs_client_t));
+}
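
Correspondingly, teardown pairs every successful ib_register_client() with one ib_unregister_client(). A sketch of the detach path for the hypothetical client above (illustrative only):

static void
my_detach(void)
{
	/* each still-registered device receives a remove callback here */
	ib_unregister_client(&my_client);
}
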
+
+/*
+ * ofs_lock_enter() and ofs_lock_exit() are used to avoid recursive
+ * entry into the rwlock while the client callbacks are invoked.
+ *
+ * Note that the writer lock is taken only around the client callbacks,
+ * so the kverb functions wanting to acquire the reader lock can safely
+ * skip the reader lock if the writer lock is already held.
+ * The writer lock must not be used anywhere else.
+ */
+static inline void
+ofs_lock_enter(krwlock_t *lock)
+{
+ if (!RW_WRITE_HELD(lock)) {
+ rw_enter(lock, RW_READER);
+ }
+}
+
+static inline void
+ofs_lock_exit(krwlock_t *lock)
+{
+ if (!RW_WRITE_HELD(lock)) {
+ rw_exit(lock);
+ }
+}
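
The call chain below traces the case the comment above describes; it is illustrative only and not part of this commit.

/*
 * Illustrative call chain (not part of this commit):
 *
 *   ib_register_client()
 *     rw_enter(&ofs_client->lock, RW_WRITER)
 *       -> client->add(device)			client callback
 *          -> ib_set_client_data()		kverb entry point
 *             ofs_lock_enter():		the writer lock is already
 *						held by this thread, so the
 *						reader rw_enter() is skipped
 *						and the thread cannot
 *						deadlock against itself.
 */
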
+
+/*
+ * ib_get_client_data - Get IB client context
+ * @device:Device to get context for
+ * @client:Client to get context for
+ *
+ * ib_get_client_data() returns client context set with
+ * ib_set_client_data() and returns NULL if it's not found.
+ */
+void *ib_get_client_data(struct ib_device *device,
+ struct ib_client *client)
+{
+ ofs_client_t *ofs_client;
+ struct ib_device *ib_device;
+ boolean_t found = B_FALSE;
+ llist_head_t *entry;
+ void *data;
+
+ ASSERT(device != 0 && client != 0);
+
+ ofs_client = (ofs_client_t *)client->clnt_hdl;
+ if (ofs_client == 0) {
+ SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+ "ib_get_client_data: device: 0x%p, client: 0x%p => "
+ "no ofs_client", device, client);
+ return (NULL);
+ }
+
+ ofs_lock_enter(&ofs_client->lock);
+ list_for_each(entry, &ofs_client->device_list) {
+ ib_device = entry->ptr;
+ if (ib_device->node_guid == device->node_guid) {
+ found = B_TRUE;
+ break;
+ }
+ }
+ if (!found) {
+ ofs_lock_exit(&ofs_client->lock);
+ SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+ "ib_get_client_data: device: 0x%p, client: 0x%p => "
+ "no ib_device found", device, client);
+ return (NULL);
+ }
+ data = ib_device->data;
+ ofs_lock_exit(&ofs_client->lock);
+
+ SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str,
+ "ib_get_client_data: device: 0x%p, client: 0x%p",
+ device, client);
+
+ return (data);
+}
+
+/*
+ * ib_set_client_data - Set IB client context
+ * @device:Device to set context for
+ * @client:Client to set context for
+ * @data:Context to set
+ *
+ * ib_set_client_data() sets client context that can be retrieved with
+ * ib_get_client_data(). If the specified device is not found, the
+ * function returns without doing anything.
+ */
+void ib_set_client_data(struct ib_device *device, struct ib_client *client,
+ void *data)
+{
+ ofs_client_t *ofs_client;
+ struct ib_device *ib_device;
+ boolean_t found = B_FALSE;
+ llist_head_t *entry;
+
+ ASSERT(device != 0 && client != 0);
+
+ ofs_client = (ofs_client_t *)client->clnt_hdl;
+ if (ofs_client == 0) {
+ cmn_err(CE_WARN, "No client context found for %s/%s\n",
+ device->name, client->name);
+ return;
+ }
+
+ ofs_lock_enter(&ofs_client->lock);
+ list_for_each(entry, &ofs_client->device_list) {
+ ib_device = entry->ptr;
+ if (ib_device->node_guid == device->node_guid) {
+ found = B_TRUE;
+ break;
+ }
+ }
+ if (!found) {
+ cmn_err(CE_WARN, "No client context found for %s/%s\n",
+ device->name, client->name);
+ ofs_lock_exit(&ofs_client->lock);
+ return;
+ }
+ ib_device->data = data;
+ ofs_lock_exit(&ofs_client->lock);
+
+ SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str,
+ "ib_set_client_data: device: 0x%p, client: 0x%p, "
+ "data: 0x%p", device, client, data);
+}
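
A fuller version of the hypothetical my_add()/my_remove() callbacks from the registration sketch would typically stash per-device soft state with these two calls; my_softc_t is likewise hypothetical.

typedef struct my_softc {
	struct ib_device	*ms_device;
	uint_t			ms_nports;
} my_softc_t;

static void
my_add(struct ib_device *device)
{
	my_softc_t *sp;

	sp = kmem_zalloc(sizeof (my_softc_t), KM_SLEEP);
	sp->ms_device = device;
	sp->ms_nports = device->phys_port_cnt;
	/* safe from a callback: ofs_lock_enter() skips the reader lock */
	ib_set_client_data(device, &my_client, sp);
}

static void
my_remove(struct ib_device *device)
{
	my_softc_t *sp = ib_get_client_data(device, &my_client);

	if (sp != NULL)
		kmem_free(sp, sizeof (my_softc_t));
}
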
+
+/*
+ * ib_query_device - Query IB device attributes
+ * @device:Device to query
+ * @device_attr:Device attributes
+ *
+ * ib_query_device() returns the attributes of a device through the
+ * @device_attr pointer.
+ */
+int
+ib_query_device(struct ib_device *device, struct ib_device_attr *attr)
+{
+ ofs_client_t *ofs_client = (ofs_client_t *)device->clnt_hdl;
+ ibt_hca_attr_t hattr;
+ int rtn;
+
+ ofs_lock_enter(&ofs_client->lock);
+ if (device->reg_state != IB_DEV_OPEN) {
+ ofs_lock_exit(&ofs_client->lock);
+ SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+ "ib_query_device: device: 0x%p => "
+ "invalid device state (%d)", device, device->reg_state);
+ return (-ENXIO);
+ }
+ if ((rtn = ibt_query_hca(device->hca_hdl, &hattr)) != IBT_SUCCESS) {
+ ofs_lock_exit(&ofs_client->lock);
+ SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+ "ib_query_device: device: 0x%p => "
+ "ibt_query_hca failed w/ 0x%x", device, rtn);
+ return (-EIO);
+ }
+ ofs_lock_exit(&ofs_client->lock);
+
+ SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str,
+ "ib_query_device: device: 0x%p, attr: 0x%p, rtn: 0x%p",
+ device, attr, rtn);
+
+	/* OF encodes the firmware version as major.micro.minor; keep that order */
+ attr->fw_ver = (uint64_t)hattr.hca_fw_major_version << 32 |
+ hattr.hca_fw_micro_version << 16 & 0xFFFF0000 |
+ hattr.hca_fw_minor_version & 0xFFFF;
+
+ attr->device_cap_flags = IB_DEVICE_CHANGE_PHY_PORT |
+ IB_DEVICE_PORT_ACTIVE_EVENT |
+ IB_DEVICE_SYS_IMAGE_GUID |
+ IB_DEVICE_RC_RNR_NAK_GEN;
+ if (hattr.hca_flags & IBT_HCA_PKEY_CNTR) {
+ attr->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR;
+ }
+ if (hattr.hca_flags & IBT_HCA_QKEY_CNTR) {
+ attr->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR;
+ }
+ if (hattr.hca_flags & IBT_HCA_AUTO_PATH_MIG) {
+ attr->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG;
+ }
+ if (hattr.hca_flags & IBT_HCA_AH_PORT_CHECK) {
+ attr->device_cap_flags |= IB_DEVICE_UD_AV_PORT_ENFORCE;
+ }
+
+ attr->vendor_id = hattr.hca_vendor_id;
+ attr->vendor_part_id = hattr.hca_device_id;
+ attr->hw_ver = hattr.hca_version_id;
+ attr->sys_image_guid = htonll(hattr.hca_si_guid);
+ attr->max_mr_size = ~0ull;
+ attr->page_size_cap = IBTF2OF_PGSZ(hattr.hca_page_sz);
+ attr->max_qp = hattr.hca_max_qp;
+ attr->max_qp_wr = hattr.hca_max_qp_sz;
+ attr->max_sge = hattr.hca_max_sgl;
+ attr->max_sge_rd = hattr.hca_max_rd_sgl;
+ attr->max_cq = hattr.hca_max_cq;
+ attr->max_cqe = hattr.hca_max_cq_sz;
+ attr->max_mr = hattr.hca_max_memr;
+ attr->max_pd = hattr.hca_max_pd;
+ attr->max_qp_rd_atom = hattr.hca_max_rdma_in_qp;
+ attr->max_qp_init_rd_atom = hattr.hca_max_rdma_in_qp;
+ attr->max_ee_rd_atom = hattr.hca_max_rdma_in_ee;
+ attr->max_ee_init_rd_atom = hattr.hca_max_rdma_in_ee;
+ attr->max_res_rd_atom = hattr.hca_max_rsc;
+ attr->max_srq = hattr.hca_max_srqs;
+	attr->max_srq_wr = hattr.hca_max_srqs_sz - 1;
+ attr->max_srq_sge = hattr.hca_max_srq_sgl;
+ attr->local_ca_ack_delay = hattr.hca_local_ack_delay;
+ attr->atomic_cap = hattr.hca_flags & IBT_HCA_ATOMICS_GLOBAL ?
+ IB_ATOMIC_GLOB : (hattr.hca_flags & IBT_HCA_ATOMICS_HCA ?
+ IB_ATOMIC_HCA : IB_ATOMIC_NONE);
+ attr->max_ee = hattr.hca_max_eec;
+ attr->max_rdd = hattr.hca_max_rdd;
+ attr->max_mw = hattr.hca_max_mem_win;
+ attr->max_pkeys = hattr.hca_max_port_pkey_tbl_sz;
+ attr->max_raw_ipv6_qp = hattr.hca_max_ipv6_qp;
+ attr->max_raw_ethy_qp = hattr.hca_max_ether_qp;
+ attr->max_mcast_grp = hattr.hca_max_mcg;
+ attr->max_mcast_qp_attach = hattr.hca_max_qp_per_mcg;
+ attr->max_total_mcast_qp_attach = hattr.hca_max_mcg_qps;
+ attr->max_ah = hattr.hca_max_ah;
+ attr->max_fmr = hattr.hca_max_fmrs;
+ attr->max_map_per_fmr = hattr.hca_opaque9; /* hca_max_map_per_fmr */
+
+ return (0);
+}
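
Callers typically size their resources from the returned attributes. A sketch (illustrative only; my_pick_qp_depth() is hypothetical, and device is assumed to be an open ib_device such as one delivered to an add callback):

static int
my_pick_qp_depth(struct ib_device *device, int *send_wr)
{
	struct ib_device_attr dattr;
	int rtn;

	rtn = ib_query_device(device, &dattr);
	if (rtn != 0)
		return (rtn);			/* -ENXIO or -EIO, per above */
	if (*send_wr > dattr.max_qp_wr)
		*send_wr = dattr.max_qp_wr;	/* clamp to the HCA limit */
	return (0);
}
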
+
+/* Protection domains */
+struct ib_pd *
+ib_alloc_pd(struct ib_device *device)
+{
+ ofs_client_t *ofs_client = (ofs_client_t *)device->clnt_hdl;
+ struct ib_pd *pd;
+ int rtn;
+
+ if ((pd = kmem_alloc(sizeof (struct ib_pd), KM_NOSLEEP)) == NULL) {
+ SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+ "ib_alloc_pd: device: 0x%p => no sufficient memory",
+ device);
+ return ((struct ib_pd *)-ENOMEM);
+ }
+
+ ofs_lock_enter(&ofs_client->lock);
+ if (device->reg_state != IB_DEV_OPEN) {
+ ofs_lock_exit(&ofs_client->lock);
+ SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+ "ib_alloc_pd: device: 0x%p => invalid device state (%d)",
+ device, device->reg_state);
+ return ((struct ib_pd *)-ENXIO);
+ }
+
+ SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str,
+ "ib_alloc_pd: device: 0x%p", device);
+
+ rtn = ibt_alloc_pd(device->hca_hdl, IBT_PD_NO_FLAGS, &pd->ibt_pd);
+ ofs_lock_exit(&ofs_client->lock);
+
+ if (rtn == IBT_SUCCESS) {
+ pd->device = device;
+ SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str,
+ "ib_alloc_pd: device: 0x%p, pd: 0x%p, ibt_pd: 0x%p, "
+ "rtn: 0x%x", device, pd, pd->ibt_pd, rtn);
+ return (pd);
+ }
+	/* log before freeing pd; pd->ibt_pd is not valid on failure */
+	SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+	    "ib_alloc_pd: device: 0x%p, pd: 0x%p => "
+	    "ibt_alloc_pd failed w/ 0x%x", device, pd, rtn);
+	kmem_free(pd, sizeof (struct ib_pd));
+
+ switch (rtn) {
+ case IBT_INSUFF_RESOURCE:
+ return ((struct ib_pd *)-ENOMEM);
+ case IBT_HCA_HDL_INVALID:
+ return ((struct ib_pd *)-EFAULT);
+ default:
+ return ((struct ib_pd *)-EIO);
+ }
+}
+
+int
+ib_dealloc_pd(struct ib_pd *pd)
+{
+ ofs_client_t *ofs_client = (ofs_client_t *)pd->device->clnt_hdl;
+ int rtn;
+
+ ofs_lock_enter(&ofs_client->lock);
+ if (pd->device->reg_state != IB_DEV_OPEN) {
+ ofs_lock_exit(&ofs_client->lock);
+ SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+ "ib_dealloc_pd: pd: 0x%p => invalid device state (%d)",
+ pd, pd->device->reg_state);
+ return (-ENXIO);
+ }
+
+ SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str,
+ "ib_dealloc_pd: pd: 0x%p", pd);
+
+ rtn = ibt_free_pd(pd->device->hca_hdl, pd->ibt_pd);
+ ofs_lock_exit(&ofs_client->lock);
+
+ if (rtn == IBT_SUCCESS) {
+ SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str,
+ "ib_dealloc_pd: pd: 0x%p, device: 0x%p, ibt_pd: 0x%p, "
+ "rtn: 0x%x", pd, pd->device, pd->ibt_pd, rtn);
+ kmem_free(pd, sizeof (struct ib_pd));
+ return (0);
+ }
+
+ SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+ "ib_dealloc_pd: pd: 0x%p => ibt_free_pd failed w/ 0x%x",
+ pd, rtn);
+
+ switch (rtn) {
+ case IBT_PD_IN_USE:
+ return (-EBUSY);
+ case IBT_HCA_HDL_INVALID:
+ return (-EFAULT);
+ default:
+ return (-EIO);
+ }
+}
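
Since ib_alloc_pd() encodes a negative errno in the returned pointer, as shown above, callers cannot simply test for NULL. A hedged sketch of the caller-side check, open-coding the Linux IS_ERR()/PTR_ERR() convention; the MY_* macros are hypothetical and not defined in this file:

#define	MY_IS_ERR(p)	((uintptr_t)(p) >= (uintptr_t)(-4095))
#define	MY_PTR_ERR(p)	((intptr_t)(p))

static int
my_pd_example(struct ib_device *device)
{
	struct ib_pd *pd;

	pd = ib_alloc_pd(device);
	if (MY_IS_ERR(pd))
		return ((int)MY_PTR_ERR(pd));	/* -ENOMEM, -ENXIO, ... */
	/* ... use pd ... */
	return (ib_dealloc_pd(pd));	/* -EBUSY if the PD is still in use */
}
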
+
+/*
+ * ofs_cq_handler() is a delegated function to handle CQ events,
+ * which dispatches them to corresponding cq handlers registered
+ * with ib_create_cq().
+ */
+static void
+ofs_cq_handler(ibt_cq_hdl_t ibt_cq, void *arg)
+{
+ struct ib_cq *cq = (struct ib_cq *)ibt_get_cq_private(ibt_cq);
+
+ SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str,
+ "ofs_cq_handler: ibt_cq: 0x%p, ib_cq: 0x%p, comp_handler: 0x%p, "
+ "arg: 0x%p", ibt_cq, cq, cq->comp_handler, arg);
+
+ if (cq->comp_handler) {
+ cq->comp_handler(cq, cq->cq_context);
+ }
+}
+
+/*
+ * ib_create_cq - Creates a CQ on the specified device.
+ * @device: The device on which to create the CQ.
+ * @comp_handler: A user-specified callback that is invoked when a
+ * completion event occurs on the CQ.
+ * @event_handler: A user-specified callback that is invoked when an
+ * asynchronous event not associated with a completion occurs on the CQ.
+ * @cq_context: Context associated with the CQ returned to the user via
+ * the associated completion and event handlers.
+ * @cqe: The minimum size of the CQ.
+ * @comp_vector: Completion vector used to signal completion events.
+ * Must be >= 0 and < context->num_comp_vectors.
+ *
+ * Users can examine the cq structure to determine the actual CQ size.
+ *
+ * Note that comp_vector is not supported currently.
+ */
+struct ib_cq *
+ib_create_cq(struct ib_device *device, ib_comp_handler comp_handler,
+ void (*event_handler)(struct ib_event *, void *), void *cq_context,
+ int cqe, int comp_vector)
+{
+ ofs_client_t *ofs_client = (ofs_client_t *)device->clnt_hdl;
+ ibt_cq_attr_t cq_attr;
+ uint32_t real_size;
+ struct ib_cq *cq;
+ int rtn;
+
+ if ((cq = kmem_alloc(sizeof (struct ib_cq), KM_NOSLEEP)) == NULL) {
+ SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+ "ib_create_cq: device: 0x%p, comp_handler: 0x%p, "
+ "event_handler: 0x%p, cq_context: 0x%p, cqe: 0x%x, "
+ "comp_vector: %d => no sufficient memory", device,
+ comp_handler, event_handler, cq_context, cqe, comp_vector);
+ return ((struct ib_cq *)-ENOMEM);
+ }
+
+ ofs_lock_enter(&ofs_client->lock);
+ if (device->reg_state != IB_DEV_OPEN) {
+ ofs_lock_exit(&ofs_client->lock);
+ SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+ "ib_create_cq: device: 0x%p, comp_handler: 0x%p, "
+ "event_handler: 0x%p, cq_context: 0x%p, cqe: 0x%x, "
+ "comp_vector: %d => invalid device state (%d)", device,
+ comp_handler, event_handler, cq_context, cqe, comp_vector,
+ device->reg_state);
+ return ((struct ib_cq *)-ENXIO);
+ }
+
+ SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str,
+ "ib_create_cq: device: 0x%p, comp_handler: 0x%p, "
+ "event_handler: 0x%p, cq_context: 0x%p, cqe: 0x%x, "
+ "comp_vector: %d", device, comp_handler, event_handler,
+ cq_context, cqe, comp_vector);
+
+ cq_attr.cq_size = cqe;
+ cq_attr.cq_sched = 0; /* no hint */
+ cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
+ rtn = ibt_alloc_cq(device->hca_hdl, &cq_attr, &cq->ibt_cq, &real_size);
+ ofs_lock_exit(&ofs_client->lock);
+
+ if (rtn == IBT_SUCCESS) {
+ cq->device = device;
+ cq->comp_handler = comp_handler;
+ cq->event_handler = event_handler;
+ cq->cq_context = cq_context;
+ cq->cqe = real_size;
+ ibt_set_cq_private(cq->ibt_cq, cq);
+ ibt_set_cq_handler(cq->ibt_cq, ofs_cq_handler, cq_context);
+ mutex_init(&cq->lock, NULL, MUTEX_DEFAULT, NULL);
+ SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str,
+ "ib_create_cq: device: 0x%p, cqe: 0x%x, ibt_cq: 0x%p, "
+ "rtn: 0x%x", device, cqe, cq->ibt_cq, rtn);
+ return (cq);
+ }
+	/* log before freeing cq; cq->ibt_cq is not valid on failure */
+	SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+	    "ib_create_cq: device: 0x%p, cqe: 0x%x => "
+	    "ibt_alloc_cq failed w/ 0x%x", device, cqe, rtn);
+	kmem_free(cq, sizeof (struct ib_cq));
+
+ switch (rtn) {
+ case IBT_HCA_CQ_EXCEEDED:
+ case IBT_INVALID_PARAM:
+ case IBT_HCA_HDL_INVALID:
+ return ((struct ib_cq *)-EINVAL);
+ case IBT_INSUFF_RESOURCE:
+ return ((struct ib_cq *)-ENOMEM);
+ default:
+ return ((struct ib_cq *)-EIO);
+ }
+}
+
+int
+ib_destroy_cq(struct ib_cq *cq)
+{
+ ofs_client_t *ofs_client = (ofs_client_t *)cq->device->clnt_hdl;
+ int rtn;
+
+ ofs_lock_enter(&ofs_client->lock);
+ if (cq->device->reg_state != IB_DEV_OPEN) {
+ ofs_lock_exit(&ofs_client->lock);
+ SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+ "ib_destroy_cq: cq: 0x%p => invalid device state (%d)",
+ cq, cq->device->reg_state);
+ return (-ENXIO);
+ }
+
+ SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str,
+ "ib_destroy_cq: cq: 0x%p", cq);
+
+ /*
+	 * If IBTL_ASYNC_PENDING is set, the ibt_cq is not freed
+	 * at this moment, but stays alive for a while, so there
+	 * is a possibility that this cq is used even after
+	 * ib_destroy_cq() is called. To distinguish this case from
+	 * others, clear the cq private here.
+ */
+ ibt_set_cq_private(cq->ibt_cq, NULL);
+
+ rtn = ibt_free_cq(cq->ibt_cq);
+ if (rtn == IBT_SUCCESS) {
+ ofs_lock_exit(&ofs_client->lock);
+ kmem_free(cq, sizeof (struct ib_cq));
+ SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str,
+ "ib_destroy_cq: cq: 0x%p, rtn: 0x%x", cq, rtn);
+ return (0);
+ }
+ ibt_set_cq_private(cq->ibt_cq, cq);
+ ofs_lock_exit(&ofs_client->lock);
+
+ SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+ "ib_destroy_cq: cq: 0x%p => ibt_free_cq failed w/ 0x%x", cq, rtn);
+
+ switch (rtn) {
+ case IBT_CQ_BUSY:
+ return (-EBUSY);
+ case IBT_HCA_HDL_INVALID:
+ case IBT_CQ_HDL_INVALID:
+ return (-EINVAL);
+ default:
+ return (-EIO);
+ }
+}
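
Putting the two calls together, a consumer creates the CQ with a completion callback and destroys it once idle. Illustrative sketch only; my_cq_comp() is hypothetical, and the error check reuses the MY_IS_ERR() convention from the PD sketch above:

static void
my_cq_comp(struct ib_cq *cq, void *ctx)
{
	/* ofs_cq_handler() above dispatches here on each CQ event */
}

static struct ib_cq *
my_cq_setup(struct ib_device *device, void *ctx)
{
	struct ib_cq *cq;

	/* comp_vector must be 0; it is not supported, per the note above */
	cq = ib_create_cq(device, my_cq_comp, NULL, ctx, 256, 0);
	if (MY_IS_ERR(cq))
		return (NULL);		/* encoded negative errno dropped */
	/* cq->cqe now holds the actual size granted by the HCA */
	return (cq);
}

/* teardown: (void) ib_destroy_cq(cq) -- returns -EBUSY if still in use */
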
+
+struct ib_qp *
+ib_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *qp_init_attr)
+{
+ ofs_client_t *ofs_client = pd->device->clnt_hdl;
+ ibt_qp_alloc_attr_t attrs;
+ ibt_chan_sizes_t sizes;
+ ib_qpn_t qpn;
+ ibt_qp_hdl_t ibt_qp;
+ struct ib_qp *qp;
+ int rtn;
+
+ /* sanity check */
+ if (!(qp_init_attr->send_cq && qp_init_attr->recv_cq)) {
+ SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+ "ib_create_qp: pd: 0x%p => invalid cqs "
+ "(send_cq=0x%p, recv_cq=0x%p)", pd,
+ qp_init_attr->send_cq, qp_init_attr->recv_cq);
+ return ((struct ib_qp *)-EINVAL);
+ }
+
+ /* UC, Raw IPv6 and Raw Ethernet are not supported */
+ if (qp_init_attr->qp_type == IB_QPT_UC ||
+ qp_init_attr->qp_type == IB_QPT_RAW_IPV6 ||
+ qp_init_attr->qp_type == IB_QPT_RAW_ETY) {
+ SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+ "ib_create_qp: pd: 0x%p => invalid qp_type",
+ pd, qp_init_attr->qp_type);
+ return ((struct ib_qp *)-EINVAL);
+ }
+
+ if ((qp = kmem_alloc(sizeof (struct ib_qp), KM_NOSLEEP)) == NULL) {
+ SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+ "ib_create_qp: pd: 0x%p, init_attr: 0x%p => "
+ "no sufficient memory", pd, qp_init_attr);
+ return ((struct ib_qp *)-ENOMEM);
+ }
+
+ ofs_lock_enter(&ofs_client->lock);
+ if (pd->device->reg_state != IB_DEV_OPEN) {
+ ofs_lock_exit(&ofs_client->lock);
+ kmem_free(qp, sizeof (struct ib_qp));
+ SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+ "ib_create_qp: pd: 0x%p, init_attr: 0x%p => "
+ "invalid device state (%d)", pd, qp_init_attr,
+ pd->device->reg_state);
+ return ((struct ib_qp *)-ENXIO);
+ }
+
+ SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str,
+ "ib_create_qp: pd: 0x%p, event_handler: 0x%p, qp_context: 0x%p, "
+ "send_cq: 0x%p, recv_cq: 0x%p, srq: 0x%p, max_send_wr: 0x%x, "
+ "max_recv_wr: 0x%x, max_send_sge: 0x%x, max_recv_sge: 0x%x, "
+ "max_inline_data: 0x%x, sq_sig_type: %d, qp_type: %d, "
+ "port_num: %d",
+ pd, qp_init_attr->event_handler, qp_init_attr->qp_context,
+ qp_init_attr->send_cq, qp_init_attr->recv_cq, qp_init_attr->srq,
+ qp_init_attr->cap.max_send_wr, qp_init_attr->cap.max_recv_wr,
+ qp_init_attr->cap.max_send_sge, qp_init_attr->cap.max_recv_sge,
+ qp_init_attr->cap.max_inline_data, qp_init_attr->sq_sig_type,
+ qp_init_attr->qp_type, qp_init_attr->port_num);
+
+ attrs.qp_alloc_flags = IBT_QP_NO_FLAGS;
+ if (qp_init_attr->srq) {
+ attrs.qp_alloc_flags |= IBT_QP_USES_SRQ;
+ }
+
+ attrs.qp_flags = IBT_ALL_SIGNALED | IBT_FAST_REG_RES_LKEY;
+ if (qp_init_attr->sq_sig_type == IB_SIGNAL_REQ_WR) {
+ attrs.qp_flags |= IBT_WR_SIGNALED;
+ }
+
+ attrs.qp_scq_hdl = qp_init_attr->send_cq->ibt_cq;
+ attrs.qp_rcq_hdl = qp_init_attr->recv_cq->ibt_cq;
+ attrs.qp_pd_hdl = pd->ibt_pd;
+
+ attrs.qp_sizes.cs_sq = qp_init_attr->cap.max_send_wr;
+ attrs.qp_sizes.cs_rq = qp_init_attr->cap.max_recv_wr;
+ attrs.qp_sizes.cs_sq_sgl = qp_init_attr->cap.max_send_sge;
+ attrs.qp_sizes.cs_rq_sgl = qp_init_attr->cap.max_recv_sge;
+ attrs.qp_sizes.cs_inline = qp_init_attr->cap.max_inline_data;
+
+ switch (qp_init_attr->qp_type) {
+ case IB_QPT_RC:
+ rtn = ibt_alloc_qp(pd->device->hca_hdl, IBT_RC_RQP, &attrs,
+ &sizes, &qpn, &ibt_qp);
+ break;
+ case IB_QPT_UD:
+ rtn = ibt_alloc_qp(pd->device->hca_hdl, IBT_UD_RQP, &attrs,
+ &sizes, &qpn, &ibt_qp);
+ break;
+ case IB_QPT_SMI:
+ rtn = ibt_alloc_special_qp(pd->device->hca_hdl,
+ qp_init_attr->port_num, IBT_SMI_SQP, &attrs, &sizes,
+ &ibt_qp);
+ break;
+ case IB_QPT_GSI:
+ rtn = ibt_alloc_special_qp(pd->device->hca_hdl,
+ qp_init_attr->port_num, IBT_GSI_SQP, &attrs, &sizes,
+ &ibt_qp);
+ break;
+ default:
+		/* this should never happen */
+ ofs_lock_exit(&ofs_client->lock);
+ kmem_free(qp, sizeof (struct ib_qp));
+ return ((struct ib_qp *)-EINVAL);
+ }
+ ofs_lock_exit(&ofs_client->lock);
+
+ if (rtn == IBT_SUCCESS) {
+ /* fill in ib_qp_cap w/ the real values */
+ qp_init_attr->cap.max_send_wr = sizes.cs_sq;
+ qp_init_attr->cap.max_recv_wr = sizes.cs_rq;
+ qp_init_attr->cap.max_send_sge = sizes.cs_sq_sgl;
+ qp_init_attr->cap.max_recv_sge = sizes.cs_rq_sgl;
+ /* max_inline_data is not supported */
+ qp_init_attr->cap.max_inline_data = 0;
+ /* fill in ib_qp */
+ qp->device = pd->device;
+ qp->pd = pd;
+ qp->send_cq = qp_init_attr->send_cq;
+ qp->recv_cq = qp_init_attr->recv_cq;
+ qp->srq = qp_init_attr->srq;
+ qp->event_handler = qp_init_attr->event_handler;
+ qp->qp_context = qp_init_attr->qp_context;
+ qp->qp_num = qp_init_attr->qp_type == IB_QPT_SMI ? 0 :
+ qp_init_attr->qp_type == IB_QPT_GSI ? 1 : qpn;
+ qp->qp_type = qp_init_attr->qp_type;
+ qp->ibt_qp = ibt_qp;
+ ibt_set_qp_private(qp->ibt_qp, qp);
+ mutex_init(&qp->lock, NULL, MUTEX_DEFAULT, NULL);
+ SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str,
+ "ib_create_qp: device: 0x%p, pd: 0x%x, init_attr: 0x%p, "
+ "rtn: 0x%x", pd->device, pd, qp_init_attr, rtn);
+ return (qp);
+ }
+ kmem_free(qp, sizeof (struct ib_qp));
+
+ SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+ "ib_create_qp: device: 0x%p, pd: 0x%x, init_attr: 0x%p => "
+ "ibt_alloc_(special)_qp failed w/ rtn: 0x%x", pd->device, pd,
+ qp_init_attr, rtn);
+
+ switch (rtn) {
+ case IBT_NOT_SUPPORTED:
+ case IBT_QP_SRV_TYPE_INVALID:
+ case IBT_CQ_HDL_INVALID:
+ case IBT_HCA_HDL_INVALID:
+ case IBT_INVALID_PARAM:
+ case IBT_SRQ_HDL_INVALID:
+ case IBT_PD_HDL_INVALID:
+ case IBT_HCA_SGL_EXCEEDED:
+ case IBT_HCA_WR_EXCEEDED:
+ return ((struct ib_qp *)-EINVAL);
+ case IBT_INSUFF_RESOURCE:
+ return ((struct ib_qp *)-ENOMEM);
+ default:
+ return ((struct ib_qp *)-EIO);
+ }
+}
+
+int
+ib_destroy_qp(struct ib_qp *qp)
+{
+ ofs_client_t *ofs_client = (ofs_client_t *)qp->device->clnt_hdl;
+ int rtn;
+
+ ofs_lock_enter(&ofs_client->lock);
+ if (qp->device->reg_state != IB_DEV_OPEN) {
+ ofs_lock_exit(&ofs_client->lock);
+ SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+ "ib_destroy_qp: qp: 0x%p => invalid device state (%d)",
+ qp, qp->device->reg_state);
+ return (-ENXIO);
+ }
+
+ /*
+	 * If IBTL_ASYNC_PENDING is set, the ibt_qp is not freed
+	 * at this moment, but stays alive for a while, so there
+	 * is a possibility that this qp is used even after
+	 * ib_destroy_qp() is called. To distinguish this case from
+	 * others, clear the qp private here.
+ */
+ ibt_set_qp_private(qp->ibt_qp, NULL);
+
+ rtn = ibt_free_qp(qp->ibt_qp);
+ if (rtn == IBT_SUCCESS) {
+ ofs_lock_exit(&ofs_client->lock);
+ kmem_free(qp, sizeof (struct ib_qp));
+ SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str,
+ "ib_destroy_qp: qp: 0x%p, rtn: 0x%x", qp, rtn);
+ return (0);
+ }
+ ibt_set_qp_private(qp->ibt_qp, qp);
+ ofs_lock_exit(&ofs_client->lock);
+
+ SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+ "ib_destroy_qp: qp: 0x%p => ibt_free_qp failed w/ 0x%x", qp, rtn);
+
+ switch (rtn) {
+ case IBT_CHAN_STATE_INVALID:
+ case IBT_HCA_HDL_INVALID:
+ case IBT_QP_HDL_INVALID:
+ return (-EINVAL);
+ default:
+ return (-EIO);
+ }
+}
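
An RC QP is then created by filling struct ib_qp_init_attr and letting ib_create_qp() write the granted sizes back into the cap structure. Illustrative sketch only; pd, scq and rcq are assumed valid, and MY_IS_ERR()/MY_PTR_ERR() are the hypothetical macros from the PD sketch:

static int
my_rc_qp_create(struct ib_pd *pd, struct ib_cq *scq, struct ib_cq *rcq,
    struct ib_qp **qpp)
{
	struct ib_qp_init_attr init_attr;
	struct ib_qp *qp;

	bzero(&init_attr, sizeof (init_attr));
	init_attr.send_cq = scq;
	init_attr.recv_cq = rcq;
	init_attr.qp_type = IB_QPT_RC;
	init_attr.sq_sig_type = IB_SIGNAL_REQ_WR;	/* per-WR signaling */
	init_attr.cap.max_send_wr = 64;
	init_attr.cap.max_recv_wr = 64;
	init_attr.cap.max_send_sge = 2;
	init_attr.cap.max_recv_sge = 2;
	/* cap.max_inline_data is not supported and comes back as 0 */

	qp = ib_create_qp(pd, &init_attr);
	if (MY_IS_ERR(qp))
		return ((int)MY_PTR_ERR(qp));
	/* init_attr.cap now holds the actual sizes granted by the HCA */
	*qpp = qp;
	return (0);
}
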
+
+/*
+ * ib_req_notify_cq - Request completion notification on a CQ.
+ * @cq: The CQ to generate an event for.
+ * @flags:
+ * Must contain exactly one of %IB_CQ_SOLICITED or %IB_CQ_NEXT_COMP
+ * to request an event on the next solicited event or next work
+ * completion at any type, respectively. %IB_CQ_REPORT_MISSED_EVENTS
+ * may also be |ed in to request a hint about missed events, as
+ * described below.
+ *
+ * Return Value:
+ * < 0 means an error occurred while requesting notification
+ * == 0 means notification was requested successfully, and if
+ * IB_CQ_REPORT_MISSED_EVENTS was passed in, then no events
+ * were missed and it is safe to wait for another event. In
+ * this case it is guaranteed that any work completions added
+ * to the CQ since the last CQ poll will trigger a completion
+ * notification event.
+ * > 0 is only returned if IB_CQ_REPORT_MISSED_EVENTS was passed
+ * in. It means that the consumer must poll the CQ again to
+ * make sure it is empty to avoid missing an event because of a
+ * race between requesting notification and an entry being
+ * added to the CQ. This return value means it is possible
+ * (but not guaranteed) that a work completion has been added
+ * to the CQ since the last poll without triggering a
+ * completion notification event.
+ *
+ * Note that IB_CQ_REPORT_MISSED_EVENTS is currently not supported.
+ */
+int
+ib_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags)
+{
+ ibt_cq_notify_flags_t notify_type;
+ int rtn;
+ ofs_client_t *ofs_client = cq->device->clnt_hdl;
+
+ ofs_lock_enter(&ofs_client->lock);
+ if (cq->device->reg_state != IB_DEV_OPEN) {
+ ofs_lock_exit(&ofs_client->lock);
+ SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+ "ib_req_notify_cq: cq: 0x%p, flag: 0x%x", cq, flags);
+ return (-ENXIO);
+ }
+
+ SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str,
+ "ib_req_notify_cq: cq: 0x%p, flag: 0x%x", cq, flags);
+
+ switch (flags & IB_CQ_SOLICITED_MASK) {
+ case IB_CQ_SOLICITED:
+ notify_type = IBT_NEXT_SOLICITED;
+ break;
+ case IB_CQ_NEXT_COMP:
+ notify_type = IBT_NEXT_COMPLETION;
+ break;
+ default:
+ /* Currently only two flags are supported */
+ ofs_lock_exit(&ofs_client->lock);
+ SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+ "ib_req_notify_cq: cq: 0x%p, flag: 0x%x => invalid flag",
+ cq, flags);
+ return (-EINVAL);
+ }
+
+ rtn = ibt_enable_cq_notify(cq->ibt_cq, notify_type);
+ ofs_lock_exit(&ofs_client->lock);
+
+ if (rtn == IBT_SUCCESS) {
+ SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str,
+ "ib_req_notify_cq: cq: 0x%p, flag: 0x%x rtn: 0x%x",
+ cq, flags, rtn);
+ return (0);
+ }
+
+ SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+ "ib_req_notify_cq: cq: 0x%p, flag: 0x%x => ibt_enable_cq_notify "
+ "failed w/ 0x%x", cq, flags, rtn);
+
+ switch (rtn) {
+ case IBT_HCA_HDL_INVALID:
+ case IBT_CQ_HDL_INVALID:
+ case IBT_CQ_NOTIFY_TYPE_INVALID:
+ return (-EINVAL);
+ default:
+ return (-EIO);
+ }
+}
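
Because arming and polling race, per the return-value semantics above, the usual consumer pattern is drain, arm, drain again, so a completion that arrived before the arm is not lost. Illustrative sketch only; ib_poll_cq() with the usual OFED signature, and struct ib_wc, are assumed to be provided elsewhere in this file:

static void
my_cq_drain(struct ib_cq *cq)
{
	struct ib_wc wc;

	while (ib_poll_cq(cq, 1, &wc) > 0) {
		/* process the work completion in wc */
	}
	(void) ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
	/* re-poll: a completion may have slipped in before the arm */
	while (ib_poll_cq(cq, 1, &wc) > 0) {
		/* process the work completion in wc */
	}
}
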
+
+static const struct {
+ int valid;
+ enum ib_qp_attr_mask req_param[IB_QPT_RAW_ETY + 1];
+ enum ib_qp_attr_mask opt_param[IB_QPT_RAW_ETY + 1];
+} qp_state_table[IB_QPS_ERR + 1][IB_QPS_ERR + 1] = {
+
+ [IB_QPS_RESET] = {
+ [IB_QPS_RESET] = { .valid = 1 },
+ [IB_QPS_INIT] = {
+ .valid = 1,
+ .req_param = {
+ [IB_QPT_UD] = (IB_QP_PKEY_INDEX | IB_QP_PORT |
+ IB_QP_QKEY),
+ [IB_QPT_UC] = (IB_QP_PKEY_INDEX | IB_QP_PORT |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_RC] = (IB_QP_PKEY_INDEX | IB_QP_PORT |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_SMI] = (IB_QP_PKEY_INDEX | IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_PKEY_INDEX | IB_QP_QKEY),
+ }
+ },
+ },
+ [IB_QPS_INIT] = {
+ [IB_QPS_RESET] = { .valid = 1 },
+ [IB_QPS_ERR] = { .valid = 1 },
+ [IB_QPS_INIT] = {
+ .valid = 1,
+ .opt_param = {
+ [IB_QPT_UD] = (IB_QP_PKEY_INDEX | IB_QP_PORT |
+ IB_QP_QKEY),
+ [IB_QPT_UC] = (IB_QP_PKEY_INDEX | IB_QP_PORT |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_RC] = (IB_QP_PKEY_INDEX | IB_QP_PORT |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_SMI] = (IB_QP_PKEY_INDEX | IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_PKEY_INDEX | IB_QP_QKEY),
+ }
+ },
+ [IB_QPS_RTR] = {
+ .valid = 1,
+ .req_param = {
+ [IB_QPT_UC] = (IB_QP_AV | IB_QP_PATH_MTU |
+ IB_QP_DEST_QPN | IB_QP_RQ_PSN),
+ [IB_QPT_RC] = (IB_QP_AV | IB_QP_PATH_MTU |
+ IB_QP_DEST_QPN | IB_QP_RQ_PSN |
+ IB_QP_MAX_DEST_RD_ATOMIC |
+ IB_QP_MIN_RNR_TIMER),
+ },
+ .opt_param = {
+ [IB_QPT_UD] = (IB_QP_PKEY_INDEX | IB_QP_QKEY),
+ [IB_QPT_UC] = (IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX),
+ [IB_QPT_RC] = (IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX),
+ [IB_QPT_SMI] = (IB_QP_PKEY_INDEX | IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_PKEY_INDEX | IB_QP_QKEY),
+ }
+ }
+ },
+ [IB_QPS_RTR] = {
+ [IB_QPS_RESET] = { .valid = 1 },
+ [IB_QPS_ERR] = { .valid = 1 },
+ [IB_QPS_RTS] = {
+ .valid = 1,
+ .req_param = {
+ [IB_QPT_UD] = IB_QP_SQ_PSN,
+ [IB_QPT_UC] = IB_QP_SQ_PSN,
+ [IB_QPT_RC] = (IB_QP_TIMEOUT |
+ IB_QP_RETRY_CNT | IB_QP_RNR_RETRY |
+ IB_QP_SQ_PSN | IB_QP_MAX_QP_RD_ATOMIC),
+ [IB_QPT_SMI] = IB_QP_SQ_PSN,
+ [IB_QPT_GSI] = IB_QP_SQ_PSN,
+ },
+ .opt_param = {
+ [IB_QPT_UD] = (IB_QP_CUR_STATE | IB_QP_QKEY),
+ [IB_QPT_UC] = (IB_QP_CUR_STATE |
+ IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_RC] = (IB_QP_CUR_STATE |
+ IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS |
+ IB_QP_MIN_RNR_TIMER | IB_QP_PATH_MIG_STATE),
+ [IB_QPT_SMI] = (IB_QP_CUR_STATE | IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_CUR_STATE | IB_QP_QKEY),
+ }
+ }
+ },
+ [IB_QPS_RTS] = {
+ [IB_QPS_RESET] = { .valid = 1 },
+ [IB_QPS_ERR] = { .valid = 1 },
+ [IB_QPS_RTS] = {
+ .valid = 1,
+ .opt_param = {
+ [IB_QPT_UD] = (IB_QP_CUR_STATE | IB_QP_QKEY),
+ [IB_QPT_UC] = (IB_QP_CUR_STATE |
+ IB_QP_ACCESS_FLAGS | IB_QP_ALT_PATH |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_RC] = (IB_QP_CUR_STATE |
+ IB_QP_ACCESS_FLAGS | IB_QP_ALT_PATH |
+ IB_QP_PATH_MIG_STATE | IB_QP_MIN_RNR_TIMER),
+ [IB_QPT_SMI] = (IB_QP_CUR_STATE | IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_CUR_STATE | IB_QP_QKEY),
+ }
+ },
+ [IB_QPS_SQD] = {
+ .valid = 1,
+ .opt_param = {
+ [IB_QPT_UD] = IB_QP_EN_SQD_ASYNC_NOTIFY,
+ [IB_QPT_UC] = IB_QP_EN_SQD_ASYNC_NOTIFY,
+ [IB_QPT_RC] = IB_QP_EN_SQD_ASYNC_NOTIFY,
+ [IB_QPT_SMI] = IB_QP_EN_SQD_ASYNC_NOTIFY,
+ [IB_QPT_GSI] = IB_QP_EN_SQD_ASYNC_NOTIFY
+ }
+ },
+ },
+ [IB_QPS_SQD] = {
+ [IB_QPS_RESET] = { .valid = 1 },
+ [IB_QPS_ERR] = { .valid = 1 },
+ [IB_QPS_RTS] = {
+ .valid = 1,
+ .opt_param = {
+ [IB_QPT_UD] = (IB_QP_CUR_STATE | IB_QP_QKEY),
+ [IB_QPT_UC] = (IB_QP_CUR_STATE |
+ IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_RC] = (IB_QP_CUR_STATE |
+ IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS |
+ IB_QP_MIN_RNR_TIMER | IB_QP_PATH_MIG_STATE),
+ [IB_QPT_SMI] = (IB_QP_CUR_STATE | IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_CUR_STATE | IB_QP_QKEY),
+ }
+ },
+ [IB_QPS_SQD] = {
+ .valid = 1,
+ .opt_param = {
+ [IB_QPT_UD] = (IB_QP_PKEY_INDEX | IB_QP_QKEY),
+ [IB_QPT_UC] = (IB_QP_AV | IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_RC] = (IB_QP_PORT | IB_QP_AV |
+ IB_QP_TIMEOUT | IB_QP_RETRY_CNT |
+ IB_QP_RNR_RETRY | IB_QP_MAX_QP_RD_ATOMIC |
+ IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX |
+ IB_QP_MIN_RNR_TIMER | IB_QP_PATH_MIG_STATE),
+ [IB_QPT_SMI] = (IB_QP_PKEY_INDEX | IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_PKEY_INDEX | IB_QP_QKEY),
+ }
+ }
+ },
+ [IB_QPS_SQE] = {
+ [IB_QPS_RESET] = { .valid = 1 },
+ [IB_QPS_ERR] = { .valid = 1 },
+ [IB_QPS_RTS] = {
+ .valid = 1,
+ .opt_param = {
+ [IB_QPT_UD] = (IB_QP_CUR_STATE | IB_QP_QKEY),
+ [IB_QPT_UC] = (IB_QP_CUR_STATE |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_SMI] = (IB_QP_CUR_STATE | IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_CUR_STATE | IB_QP_QKEY),
+ }
+ }
+ },
+ [IB_QPS_ERR] = {
+ [IB_QPS_RESET] = { .valid = 1 },
+ [IB_QPS_ERR] = { .valid = 1 }
+ }
+};
+
+static inline int
+ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state,
+ enum ib_qp_type type, enum ib_qp_attr_mask mask)
+{
+ enum ib_qp_attr_mask req_param, opt_param;
+
+ if (cur_state < 0 || cur_state > IB_QPS_ERR ||
+ next_state < 0 || next_state > IB_QPS_ERR) {
+ SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+ "ib_modify_qp_is_ok: cur_state: %d, next_state: %d, "
+ "qp_type: %d, attr_mask: 0x%x => invalid state(1)",
+ cur_state, next_state, type, mask);
+ return (0);
+ }
+
+ if (mask & IB_QP_CUR_STATE &&
+ cur_state != IB_QPS_RTR && cur_state != IB_QPS_RTS &&
+ cur_state != IB_QPS_SQD && cur_state != IB_QPS_SQE) {
+ SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+ "ib_modify_qp_is_ok: cur_state: %d, next_state: %d, "
+ "qp_type: %d, attr_mask: 0x%x => invalid state(2)",
+ cur_state, next_state, type, mask);
+ return (0);
+ }
+
+ if (!qp_state_table[cur_state][next_state].valid) {
+ SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+ "ib_modify_qp_is_ok: cur_state: %d, next_state: %d, "
+ "qp_type: %d, attr_mask: 0x%x => state is not valid",
+ cur_state, next_state, type, mask);
+ return (0);
+ }
+
+ req_param = qp_state_table[cur_state][next_state].req_param[type];
+ opt_param = qp_state_table[cur_state][next_state].opt_param[type];
+
+ if ((mask & req_param) != req_param) {
+ SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+ "ib_modify_qp_is_ok: cur_state: %d, next_state: %d, "
+ "qp_type: %d, attr_mask: 0x%x => "
+ "required param doesn't match. req_param = 0x%x",
+ cur_state, next_state, type, mask, req_param);
+ return (0);
+ }
+
+ if (mask & ~(req_param | opt_param | IB_QP_STATE)) {
+ SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+ "ib_modify_qp_is_ok: cur_state: %d, next_state: %d, "
+ "qp_type: %d, attr_mask: 0x%x => "
+ "unsupported options. req_param = 0x%x, opt_param = 0x%x",
+ cur_state, next_state, type, mask, req_param, opt_param);
+ return (0);
+ }
+
+ return (1);
+}
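
As a concrete reading of the table, taking an RC QP from RESET to INIT requires exactly the pkey index, port and access flags, plus IB_QP_STATE itself. Illustrative sketch only, using ib_modify_qp() defined below on a hypothetical freshly created RC QP:

static int
my_rc_qp_to_init(struct ib_qp *qp)
{
	struct ib_qp_attr attr;

	bzero(&attr, sizeof (attr));
	attr.qp_state = IB_QPS_INIT;
	attr.pkey_index = 0;
	attr.port_num = 1;
	attr.qp_access_flags = IB_ACCESS_REMOTE_READ |
	    IB_ACCESS_REMOTE_WRITE;

	/* mask = req_param for RC RESET->INIT, plus IB_QP_STATE */
	return (ib_modify_qp(qp, &attr, IB_QP_STATE | IB_QP_PKEY_INDEX |
	    IB_QP_PORT | IB_QP_ACCESS_FLAGS));
}
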
+
+static inline enum ib_qp_state
+qp_current_state(ibt_qp_query_attr_t *qp_attr)
+{
+ ASSERT(qp_attr->qp_info.qp_state != IBT_STATE_SQDRAIN);
+ return (enum ib_qp_state)(qp_attr->qp_info.qp_state);
+}
+
+static inline ibt_tran_srv_t
+of2ibtf_qp_type(enum ib_qp_type type)
+{
+ switch (type) {
+ case IB_QPT_SMI:
+ case IB_QPT_GSI:
+ case IB_QPT_UD:
+ return (IBT_UD_SRV);
+ case IB_QPT_RC:
+ return (IBT_RC_SRV);
+ case IB_QPT_UC:
+ return (IBT_UC_SRV);
+ case IB_QPT_RAW_IPV6:
+ return (IBT_RAWIP_SRV);
+ case IB_QPT_RAW_ETY:
+ default:
+ ASSERT(type == IB_QPT_RAW_ETY);
+ return (IBT_RAWETHER_SRV);
+ }
+}
+
+static inline void
+set_av(struct ib_ah_attr *attr, ibt_cep_path_t *pathp)
+{
+ ibt_adds_vect_t *av = &pathp->cep_adds_vect;
+
+ pathp->cep_hca_port_num = attr->port_num;
+ av->av_srate = OF2IBTF_SRATE(attr->static_rate);
+ av->av_srvl = attr->sl & 0xF;
+ av->av_send_grh = attr->ah_flags & IB_AH_GRH ? 1 : 0;
+
+ if (av->av_send_grh) {
+ av->av_dgid.gid_prefix =
+ attr->grh.dgid.global.subnet_prefix;
+ av->av_dgid.gid_guid =
+ attr->grh.dgid.global.interface_id;
+ av->av_flow = attr->grh.flow_label & 0xFFFFF;
+ av->av_tclass = attr->grh.traffic_class;
+ av->av_hop = attr->grh.hop_limit;
+ av->av_sgid_ix = attr->grh.sgid_index;
+ }
+ av->av_dlid = attr->dlid;
+ av->av_src_path = attr->src_path_bits;
+}
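
set_av() consumes a struct ib_ah_attr; for an in-subnet, LID-routed destination only the non-GRH fields matter. Illustrative sketch only (remote_lid and sl are hypothetical inputs):

static void
my_fill_ah_attr(struct ib_ah_attr *ah, uint8_t port, uint16_t remote_lid,
    uint8_t sl)
{
	bzero(ah, sizeof (*ah));
	ah->port_num = port;		/* local HCA port */
	ah->dlid = remote_lid;		/* destination LID */
	ah->sl = sl;			/* service level, low 4 bits used */
	ah->src_path_bits = 0;
	ah->static_rate = 0;
	ah->ah_flags = 0;		/* no IB_AH_GRH: GRH fields ignored */
}
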
+
+int
+ib_modify_qp(struct ib_qp *qp, struct ib_qp_attr *attr, int attr_mask)
+{
+ enum ib_qp_state cur_state, new_state;
+ ibt_hca_attr_t hattr;
+ ibt_qp_query_attr_t qp_attr;
+ ibt_qp_info_t modify_attr;
+ ibt_cep_modify_flags_t flags;
+ int rtn;
+ ofs_client_t *ofs_client = qp->device->clnt_hdl;
+
+ ofs_lock_enter(&ofs_client->lock);
+ if (qp->device->reg_state != IB_DEV_OPEN) {
+ ofs_lock_exit(&ofs_client->lock);
+ SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+ "ib_modify_qp: qp: 0x%p => invalid device state (%d)",
+ qp, qp->device->reg_state);
+ return (-ENXIO);
+ }
+
+ rtn = ibt_query_hca(qp->device->hca_hdl, &hattr);
+ if (rtn != IBT_SUCCESS) {
+ ofs_lock_exit(&ofs_client->lock);
+ SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+ "ib_modify_qp: qp: 0x%p, hca_hdl: 0x%p => "
+ "ibt_query_hca() failed w/ %d",
+ qp, qp->device->hca_hdl, rtn);
+ return (-EIO);
+ }
+
+ /* only one thread per qp is allowed during the qp modification */
+ mutex_enter(&qp->lock);
+
+ /* Get the current QP attributes first */
+ bzero(&qp_attr, sizeof (ibt_qp_query_attr_t));
+ if ((rtn = ibt_query_qp(qp->ibt_qp, &qp_attr)) != IBT_SUCCESS) {
+ mutex_exit(&qp->lock);
+ ofs_lock_exit(&ofs_client->lock);
+ SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+ "ib_modify_qp: qp: 0x%p, attr: 0x%p, attr_mask: 0x%x => "
+ "ibt_query_qp failed w/ 0x%x", qp, attr, attr_mask, rtn);
+ return (-EIO);
+ }
+
+ /* Get the current and new state for this QP */
+ cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state :
+ qp_current_state(&qp_attr);
+ new_state = attr_mask & IB_QP_STATE ? attr->qp_state :
+ cur_state;
+
+ /* Sanity check of the current/new states */
+ if (cur_state == new_state && cur_state == IB_QPS_RESET) {
+ /* Linux OF returns 0 in this case */
+ mutex_exit(&qp->lock);
+ ofs_lock_exit(&ofs_client->lock);
+ SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+ "ib_modify_qp: qp: 0x%p, attr: 0x%p, attr_mask: 0x%x => "
+ "invalid state (both of current/new states are RESET)",
+ qp, attr, attr_mask);
+ return (0);
+ }
+
+ /*
+ * Check if this modification request is supported with the new
+ * and/or current state.
+ */
+ if (!ib_modify_qp_is_ok(cur_state, new_state, qp->qp_type, attr_mask)) {
+ mutex_exit(&qp->lock);
+ ofs_lock_exit(&ofs_client->lock);
+ SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+ "ib_modify_qp: qp: 0x%p, attr: 0x%p, attr_mask: 0x%x => "
+ "invalid arguments",
+ qp, attr, attr_mask);
+ return (-EINVAL);
+ }
+
+ /* Sanity checks */
+ if (attr_mask & IB_QP_PORT && (attr->port_num == 0 ||
+ attr->port_num > hattr.hca_nports)) {
+ mutex_exit(&qp->lock);
+ ofs_lock_exit(&ofs_client->lock);
+ SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+ "ib_modify_qp: qp: 0x%p, attr: 0x%p, attr_mask: 0x%x => "
+ "invalid attr->port_num(%d), max_nports(%d)",
+ qp, attr, attr_mask, attr->port_num, hattr.hca_nports);
+ return (-EINVAL);
+ }
+
+ if (attr_mask & IB_QP_PKEY_INDEX &&
+ attr->pkey_index >= hattr.hca_max_port_pkey_tbl_sz) {
+ mutex_exit(&qp->lock);
+ ofs_lock_exit(&ofs_client->lock);
+ SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+ "ib_modify_qp: qp: 0x%p, attr: 0x%p, attr_mask: 0x%x => "
+ "invalid attr->pkey_index(%d), max_pkey_index(%d)",
+ qp, attr, attr_mask, attr->pkey_index,
+ hattr.hca_max_port_pkey_tbl_sz);
+ return (-EINVAL);
+ }
+
+ if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC &&
+ attr->max_rd_atomic > hattr.hca_max_rdma_out_qp) {
+ mutex_exit(&qp->lock);
+ ofs_lock_exit(&ofs_client->lock);
+ SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+ "ib_modify_qp: qp: 0x%p, attr: 0x%p, attr_mask: 0x%x => "
+ "invalid attr->max_rd_atomic(0x%x), max_rdma_out_qp(0x%x)",
+ qp, attr, attr_mask, attr->max_rd_atomic,
+ hattr.hca_max_rdma_out_qp);
+ return (-EINVAL);
+ }
+
+ if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC &&
+ attr->max_dest_rd_atomic > hattr.hca_max_rdma_in_qp) {
+ mutex_exit(&qp->lock);
+ ofs_lock_exit(&ofs_client->lock);
+ SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+ "ib_modify_qp: qp: 0x%p, attr: 0x%p, attr_mask: 0x%x => "
+ "invalid attr->max_dest_rd_atomic(0x%x), "
+ "max_rdma_in_qp(0x%x)", qp, attr, attr_mask,
+ attr->max_dest_rd_atomic, hattr.hca_max_rdma_in_qp);
+ return (-EINVAL);
+ }
+
+ /* copy the current setting */
+ modify_attr = qp_attr.qp_info;
+
+ /*
+	 * Since it has already been checked that the modification request
+	 * matches the new and/or current states, just assign both states
+	 * to modify_attr here. The current state is required only if
+	 * qp_state is RTR, but it's harmless otherwise, so it is always set.
+ */
+ modify_attr.qp_current_state = OF2IBTF_STATE(cur_state);
+ modify_attr.qp_state = OF2IBTF_STATE(new_state);
+ modify_attr.qp_trans = of2ibtf_qp_type(qp->qp_type);
+
+ /* Convert OF modification requests into IBTF ones */
+ flags = IBT_CEP_SET_STATE; /* IBTF needs IBT_CEP_SET_STATE */
+ if (cur_state == IB_QPS_RESET &&
+ new_state == IB_QPS_INIT) {
+ flags |= IBT_CEP_SET_RESET_INIT;
+ } else if (cur_state == IB_QPS_INIT &&
+ new_state == IB_QPS_RTR) {
+ flags |= IBT_CEP_SET_INIT_RTR;
+ } else if (cur_state == IB_QPS_RTR &&
+ new_state == IB_QPS_RTS) {
+ flags |= IBT_CEP_SET_RTR_RTS;
+ }
+ if (attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY) {
+ flags |= IBT_CEP_SET_SQD_EVENT;
+ }
+ if (attr_mask & IB_QP_ACCESS_FLAGS) {
+ modify_attr.qp_flags &= ~(IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR |
+ IBT_CEP_ATOMIC);
+ if (attr->qp_access_flags & IB_ACCESS_REMOTE_READ) {
+ flags |= IBT_CEP_SET_RDMA_R;
+ modify_attr.qp_flags |= IBT_CEP_RDMA_RD;
+ }
+ if (attr->qp_access_flags & IB_ACCESS_REMOTE_WRITE) {
+ flags |= IBT_CEP_SET_RDMA_W;
+ modify_attr.qp_flags |= IBT_CEP_RDMA_WR;
+ }
+ if (attr->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC) {
+ flags |= IBT_CEP_SET_ATOMIC;
+ modify_attr.qp_flags |= IBT_CEP_ATOMIC;
+ }
+ }
+ if (attr_mask & IB_QP_PKEY_INDEX) {
+ flags |= IBT_CEP_SET_PKEY_IX;
+ switch (qp->qp_type) {
+ case IB_QPT_SMI:
+ case IB_QPT_GSI:
+ case IB_QPT_UD:
+ modify_attr.qp_transport.ud.ud_pkey_ix =
+ attr->pkey_index;
+ break;
+ case IB_QPT_RC:
+ modify_attr.qp_transport.rc.rc_path.cep_pkey_ix =
+ attr->pkey_index;
+ break;
+ case IB_QPT_UC:
+ modify_attr.qp_transport.uc.uc_path.cep_pkey_ix =
+ attr->pkey_index;
+ break;
+ default:
+ /* This should never happen */
+ mutex_exit(&qp->lock);
+ ofs_lock_exit(&ofs_client->lock);
+ SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+ "ib_modify_qp(IB_QP_PKEY_INDEX): qp: 0x%p, "
+ "attr: 0x%p, attr_mask: 0x%x => "
+ "invalid qp->qp_type(%d)",
+ qp, attr, attr_mask, qp->qp_type);
+ return (-EINVAL);
+ }
+ }
+ if (attr_mask & IB_QP_PORT) {
+ flags |= IBT_CEP_SET_PORT;
+ switch (qp->qp_type) {
+ case IB_QPT_SMI:
+ case IB_QPT_GSI:
+ case IB_QPT_UD:
+ modify_attr.qp_transport.ud.ud_port = attr->port_num;
+ break;
+ case IB_QPT_RC:
+ modify_attr.qp_transport.rc.rc_path.cep_hca_port_num =
+ attr->port_num;
+ break;
+ case IB_QPT_UC:
+ modify_attr.qp_transport.uc.uc_path.cep_hca_port_num =
+ attr->port_num;
+ break;
+ default:
+ /* This should never happen */
+ mutex_exit(&qp->lock);
+ ofs_lock_exit(&ofs_client->lock);
+ SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+ "ib_modify_qp(IB_QP_PORT): qp: 0x%p, "
+ "attr: 0x%p, attr_mask: 0x%x => "
+ "invalid qp->qp_type(%d)",
+ qp, attr, attr_mask, qp->qp_type);
+ return (-EINVAL);
+ }
+ }
+ if (attr_mask & IB_QP_QKEY) {
+ ASSERT(qp->qp_type == IB_QPT_UD || qp->qp_type == IB_QPT_SMI ||
+ qp->qp_type == IB_QPT_GSI);
+ flags |= IBT_CEP_SET_QKEY;
+ modify_attr.qp_transport.ud.ud_qkey = attr->qkey;
+ }
+ if (attr_mask & IB_QP_AV) {
+ flags |= IBT_CEP_SET_ADDS_VECT;
+ switch (qp->qp_type) {
+ case IB_QPT_RC:
+ set_av(&attr->ah_attr,
+ &modify_attr.qp_transport.rc.rc_path);
+ break;
+ case IB_QPT_UC:
+ set_av(&attr->ah_attr,
+ &modify_attr.qp_transport.uc.uc_path);
+ break;
+ case IB_QPT_SMI:
+ case IB_QPT_GSI:
+ case IB_QPT_UD:
+ default:
+ /* This should never happen */
+ mutex_exit(&qp->lock);
+ ofs_lock_exit(&ofs_client->lock);
+ SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+ "ib_modify_qp(IB_QP_AV): qp: 0x%p, "
+ "attr: 0x%p, attr_mask: 0x%x => "
+ "invalid qp->qp_type(%d)",
+ qp, attr, attr_mask, qp->qp_type);
+ return (-EINVAL);
+ }
+ }
+ if (attr_mask & IB_QP_PATH_MTU) {
+ switch (qp->qp_type) {
+ case IB_QPT_RC:
+ modify_attr.qp_transport.rc.rc_path_mtu =
+ OF2IBTF_PATH_MTU(attr->path_mtu);
+ break;
+ case IB_QPT_UC:
+ modify_attr.qp_transport.uc.uc_path_mtu =
+ OF2IBTF_PATH_MTU(attr->path_mtu);
+ break;
+ case IB_QPT_SMI:
+ case IB_QPT_GSI:
+ case IB_QPT_UD:
+ default:
+ /* nothing to do */
+ break;
+ }
+ }
+ if (attr_mask & IB_QP_TIMEOUT && qp->qp_type == IB_QPT_RC) {
+ flags |= IBT_CEP_SET_TIMEOUT;
+ modify_attr.qp_transport.rc.rc_path.cep_timeout =
+ attr->timeout;
+ }
+ if (attr_mask & IB_QP_RETRY_CNT && qp->qp_type == IB_QPT_RC) {
+ flags |= IBT_CEP_SET_RETRY;
+ modify_attr.qp_transport.rc.rc_retry_cnt =
+ attr->retry_cnt & 0x7;
+ }
+ if (attr_mask & IB_QP_RNR_RETRY && qp->qp_type == IB_QPT_RC) {
+ flags |= IBT_CEP_SET_RNR_NAK_RETRY;
+ modify_attr.qp_transport.rc.rc_rnr_retry_cnt =
+ attr->rnr_retry & 0x7;
+ }
+ if (attr_mask & IB_QP_RQ_PSN) {
+ switch (qp->qp_type) {
+ case IB_QPT_RC:
+ modify_attr.qp_transport.rc.rc_rq_psn =
+ attr->rq_psn & 0xFFFFFF;
+ break;
+ case IB_QPT_UC:
+ modify_attr.qp_transport.uc.uc_rq_psn =
+ attr->rq_psn & 0xFFFFFF;
+ break;
+ case IB_QPT_SMI:
+ case IB_QPT_GSI:
+ case IB_QPT_UD:
+ default:
+ /* nothing to do */
+ break;
+ }
+ }
+ if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC && qp->qp_type == IB_QPT_RC) {
+ if (attr->max_rd_atomic) {
+ flags |= IBT_CEP_SET_RDMARA_OUT;
+ modify_attr.qp_transport.rc.rc_rdma_ra_out =
+ attr->max_rd_atomic;
+ }
+ }
+ if (attr_mask & IB_QP_ALT_PATH) {
+ /* Sanity checks */
+ if (attr->alt_port_num == 0 ||
+ attr->alt_port_num > hattr.hca_nports) {
+ mutex_exit(&qp->lock);
+ ofs_lock_exit(&ofs_client->lock);
+ SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+ "ib_modify_qp: qp: 0x%p, attr: 0x%p, "
+ "attr_mask: 0x%x => invalid attr->alt_port_num"
+ "(%d), max_nports(%d)",
+ qp, attr, attr_mask, attr->alt_port_num,
+ hattr.hca_nports);
+ return (-EINVAL);
+ }
+ if (attr->alt_pkey_index >= hattr.hca_max_port_pkey_tbl_sz) {
+ mutex_exit(&qp->lock);
+ ofs_lock_exit(&ofs_client->lock);
+ SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+ "ib_modify_qp: qp: 0x%p, attr: 0x%p, "
+ "attr_mask: 0x%x => invalid attr->alt_pkey_index"
+ "(%d), max_port_key_index(%d)",
+ qp, attr, attr_mask, attr->alt_pkey_index,
+ hattr.hca_max_port_pkey_tbl_sz);
+ return (-EINVAL);
+ }
+ flags |= IBT_CEP_SET_ALT_PATH;
+ switch (qp->qp_type) {
+ case IB_QPT_RC:
+ modify_attr.qp_transport.rc.rc_alt_path.
+ cep_pkey_ix = attr->alt_pkey_index;
+ modify_attr.qp_transport.rc.rc_alt_path.
+ cep_hca_port_num = attr->alt_port_num;
+ set_av(&attr->alt_ah_attr,
+ &modify_attr.qp_transport.rc.rc_alt_path);
+ modify_attr.qp_transport.rc.rc_alt_path.
+ cep_timeout = attr->alt_timeout;
+ break;
+ case IB_QPT_UC:
+ modify_attr.qp_transport.uc.uc_alt_path.
+ cep_pkey_ix = attr->alt_pkey_index;
+ modify_attr.qp_transport.uc.uc_alt_path.
+ cep_hca_port_num = attr->alt_port_num;
+ set_av(&attr->alt_ah_attr,
+ &modify_attr.qp_transport.uc.uc_alt_path);
+ modify_attr.qp_transport.uc.uc_alt_path.
+ cep_timeout = attr->alt_timeout;
+ break;
+ case IB_QPT_SMI:
+ case IB_QPT_GSI:
+ case IB_QPT_UD:
+ default:
+ /* This should never happen */
+ mutex_exit(&qp->lock);
+ ofs_lock_exit(&ofs_client->lock);
+ SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+ "ib_modify_qp(IB_QP_ALT_PATH): qp: 0x%p, "
+ "attr: 0x%p, attr_mask: 0x%x => "
+ "invalid qp->qp_type(%d)",
+ qp, attr, attr_mask, qp->qp_type);
+ return (-EINVAL);
+ }
+ }
+ if (attr_mask & IB_QP_MIN_RNR_TIMER && qp->qp_type == IB_QPT_RC) {
+ flags |= IBT_CEP_SET_MIN_RNR_NAK;
+ modify_attr.qp_transport.rc.rc_min_rnr_nak =
+ attr->min_rnr_timer & 0x1F;
+ }
+ if (attr_mask & IB_QP_SQ_PSN) {
+ switch (qp->qp_type) {
+ case IB_QPT_SMI:
+ case IB_QPT_GSI:
+ case IB_QPT_UD:
+ modify_attr.qp_transport.ud.ud_sq_psn =
+ attr->sq_psn;
+ break;
+ case IB_QPT_RC:
+ modify_attr.qp_transport.rc.rc_sq_psn =
+ attr->sq_psn;
+ break;
+ case IB_QPT_UC:
+ modify_attr.qp_transport.uc.uc_sq_psn =
+ attr->sq_psn;
+ break;
+ default:
+ /* This should never happen */
+ mutex_exit(&qp->lock);
+ ofs_lock_exit(&ofs_client->lock);
+ SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+ "ib_modify_qp(IB_QP_SQ_PSN): qp: 0x%p, "
+ "attr: 0x%p, attr_mask: 0x%x => "
+ "invalid qp->qp_type(%d)",
+ qp, attr, attr_mask, qp->qp_type);
+ return (-EINVAL);
+ }
+ }
+ if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC && qp->qp_type == IB_QPT_RC) {
+ /* Linux OF sets the value if max_dest_rd_atomic is not zero */
+ if (attr->max_dest_rd_atomic) {
+ flags |= IBT_CEP_SET_RDMARA_IN;
+ modify_attr.qp_transport.rc.rc_rdma_ra_in =
+ attr->max_dest_rd_atomic;
+ }
+ }
+ if (attr_mask & IB_QP_PATH_MIG_STATE) {
+ flags |= IBT_CEP_SET_MIG;
+ switch (qp->qp_type) {
+ case IB_QPT_RC:
+ modify_attr.qp_transport.rc.rc_mig_state =
+ OF2IBTF_PATH_MIG_STATE(attr->path_mig_state);
+ break;
+ case IB_QPT_UC:
+ modify_attr.qp_transport.uc.uc_mig_state =
+ OF2IBTF_PATH_MIG_STATE(attr->path_mig_state);
+ break;
+ case IB_QPT_SMI:
+ case IB_QPT_GSI:
+ case IB_QPT_UD:
+ default:
+ /* This should never happen */
+ mutex_exit(&qp->lock);
+ ofs_lock_exit(&ofs_client->lock);
+ SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+ "ib_modify_qp(IB_QP_PATH_MIG_STATE): qp: 0x%p, "
+ "attr: 0x%p, attr_mask: 0x%x => "
+ "invalid qp->qp_type(%d)",
+ qp, attr, attr_mask, qp->qp_type);
+ return (-EINVAL);
+ }
+ }
+ if (attr_mask & IB_QP_CAP) {
+ /* IB_QP_CAP is not supported */
+ mutex_exit(&qp->lock);
+ ofs_lock_exit(&ofs_client->lock);
+ SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+ "ib_modify_qp: qp: 0x%p, attr: 0x%p, "
+ "attr_mask: 0x%x => IB_QP_CAP is not supported",
+ qp, attr, attr_mask);
+ return (-EINVAL);
+ }
+ if (attr_mask & IB_QP_DEST_QPN) {
+ switch (qp->qp_type) {
+ case IB_QPT_RC:
+ modify_attr.qp_transport.rc.rc_dst_qpn =
+ attr->dest_qp_num;
+ break;
+ case IB_QPT_UC:
+ modify_attr.qp_transport.uc.uc_dst_qpn =
+ attr->dest_qp_num;
+ break;
+ case IB_QPT_SMI:
+ case IB_QPT_GSI:
+ case IB_QPT_UD:
+ default:
+ /* This should never happen */
+ mutex_exit(&qp->lock);
+ ofs_lock_exit(&ofs_client->lock);
+ SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+ "ib_modify_qp(IB_QP_DEST_PSN): qp: 0x%p, "
+ "attr: 0x%p, attr_mask: 0x%x => "
+ "invalid qp->qp_type(%d)",
+ qp, attr, attr_mask, qp->qp_type);
+ return (-EINVAL);
+ }
+ }
+
+ SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str,
+ "ib_modify_qp: qp: 0x%p, attr: 0x%p, attr_mask: 0x%x, "
+ "flags: 0x%x, modify_attr: 0x%p",
+ qp, attr, attr_mask, flags, &modify_attr);
+
+ /* Modify the QP attributes */
+ rtn = ibt_modify_qp(qp->ibt_qp, flags, &modify_attr, NULL);
+ if (rtn == IBT_SUCCESS) {
+ mutex_exit(&qp->lock);
+ ofs_lock_exit(&ofs_client->lock);
+ return (0);
+ }
+ mutex_exit(&qp->lock);
+ ofs_lock_exit(&ofs_client->lock);
+
+ SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+ "ib_modify_qp: qp: 0x%p, attr: 0x%p, attr_mask: 0x%x => "
+ "ibt_modify_qp failed w/ %d, flags: 0x%x",
+ qp, attr, attr_mask, rtn, flags);
+
+ switch (rtn) {
+ case IBT_HCA_HDL_INVALID:
+ case IBT_QP_HDL_INVALID:
+ case IBT_QP_SRV_TYPE_INVALID:
+ case IBT_QP_STATE_INVALID:
+ case IBT_HCA_PORT_INVALID:
+ case IBT_PKEY_IX_ILLEGAL:
+ return (-EINVAL);
+ default:
+ return (-EIO);
+ }
+}
+
+static inline enum ib_wc_status
+ibt2of_wc_status(ibt_wc_status_t status)
+{
+ switch (status) {
+ case IBT_WC_LOCAL_LEN_ERR:
+ return (IB_WC_LOC_LEN_ERR);
+ case IBT_WC_LOCAL_CHAN_OP_ERR:
+ return (IB_WC_LOC_QP_OP_ERR);
+ case IBT_WC_LOCAL_PROTECT_ERR:
+ return (IB_WC_LOC_PROT_ERR);
+ case IBT_WC_WR_FLUSHED_ERR:
+ return (IB_WC_WR_FLUSH_ERR);
+ case IBT_WC_MEM_WIN_BIND_ERR:
+ return (IB_WC_MW_BIND_ERR);
+ case IBT_WC_BAD_RESPONSE_ERR:
+ return (IB_WC_BAD_RESP_ERR);
+ case IBT_WC_LOCAL_ACCESS_ERR:
+ return (IB_WC_LOC_ACCESS_ERR);
+ case IBT_WC_REMOTE_INVALID_REQ_ERR:
+ return (IB_WC_REM_INV_REQ_ERR);
+ case IBT_WC_REMOTE_ACCESS_ERR:
+ return (IB_WC_REM_ACCESS_ERR);
+ case IBT_WC_REMOTE_OP_ERR:
+ return (IB_WC_REM_OP_ERR);
+ case IBT_WC_TRANS_TIMEOUT_ERR:
+ return (IB_WC_RETRY_EXC_ERR);
+ case IBT_WC_RNR_NAK_TIMEOUT_ERR:
+ return (IB_WC_RNR_RETRY_EXC_ERR);
+ case IBT_WC_SUCCESS:
+ default:
+ /* Hermon doesn't support EEC yet */
+ ASSERT(status == IBT_WC_SUCCESS);
+ return (IB_WC_SUCCESS);
+ }
+}
+
+static inline enum ib_wc_opcode
+ibt2of_wc_opcode(ibt_wrc_opcode_t wc_type)
+{
+ switch (wc_type) {
+ case IBT_WRC_SEND:
+ return (IB_WC_SEND);
+ case IBT_WRC_RDMAR:
+ return (IB_WC_RDMA_READ);
+ case IBT_WRC_RDMAW:
+ return (IB_WC_RDMA_WRITE);
+ case IBT_WRC_CSWAP:
+ return (IB_WC_COMP_SWAP);
+ case IBT_WRC_FADD:
+ return (IB_WC_FETCH_ADD);
+ case IBT_WRC_BIND:
+ return (IB_WC_BIND_MW);
+ case IBT_WRC_RECV:
+ return (IB_WC_RECV);
+ case IBT_WRC_RECV_RDMAWI:
+ default:
+ ASSERT(wc_type == IBT_WRC_RECV_RDMAWI);
+ return (IB_WC_RECV_RDMA_WITH_IMM);
+ }
+}
+
+static inline int
+ibt2of_wc_flags(ibt_wc_flags_t wc_flags)
+{
+ return (wc_flags & ~IBT_WC_CKSUM_OK);
+}
+
+static inline void
+set_wc(ibt_wc_t *ibt_wc, struct ib_wc *wc)
+{
+ wc->wr_id = ibt_wc->wc_id;
+ wc->status = ibt2of_wc_status(ibt_wc->wc_status);
+ /* opcode can be undefined if status is not success */
+ if (wc->status == IB_WC_SUCCESS) {
+ wc->opcode = ibt2of_wc_opcode(ibt_wc->wc_type);
+ }
+ wc->vendor_err = 0; /* not supported */
+ wc->byte_len = ibt_wc->wc_bytes_xfer;
+ wc->qp = NULL; /* not supported */
+ wc->imm_data = htonl(ibt_wc->wc_immed_data);
+ wc->src_qp = ibt_wc->wc_qpn;
+ wc->wc_flags = ibt2of_wc_flags(ibt_wc->wc_flags);
+ wc->pkey_index = ibt_wc->wc_pkey_ix;
+ wc->slid = ibt_wc->wc_slid;
+ wc->sl = ibt_wc->wc_sl;
+ wc->dlid_path_bits = ibt_wc->wc_path_bits;
+ wc->port_num = 0; /* not supported */
+}
+
+/*
+ * ib_poll_cq - poll a CQ for completion(s)
+ * @cq:the CQ being polled
+ * @num_entries:maximum number of completions to return
+ * @wc:array of at least @num_entries &struct ib_wc where completions
+ * will be returned
+ *
+ * Poll a CQ for (possibly multiple) completions. If the return value
+ * is < 0, an error occurred. If the return value is >= 0, it is the
+ * number of completions returned. If the return value is
+ * non-negative and < num_entries, then the CQ was emptied.
+ *
+ * Note that the following three members of struct ib_wc are not
+ * currently supported; their values are always either 0 or NULL:
+ * u32 vendor_err;
+ * struct ib_qp *qp;
+ * u8 port_num;
+ */
+int
+ib_poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc)
+{
+ ibt_wc_t ibt_wc;
+ int npolled;
+ ibt_status_t rtn = IBT_SUCCESS; /* covers the num_entries == 0 case */
+ ofs_client_t *ofs_client = (ofs_client_t *)cq->device->clnt_hdl;
+
+ ofs_lock_enter(&ofs_client->lock);
+ if (cq->device->reg_state != IB_DEV_OPEN) {
+ ofs_lock_exit(&ofs_client->lock);
+ SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+ "ib_poll_cq: cq: 0x%p => invalid device state (%d)",
+ cq, cq->device->reg_state);
+ return (-ENXIO);
+ }
+
+ SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str,
+ "ib_poll_cq: cq: 0x%p, num_entries: %d, wc: 0x%p, "
+ "ibt_cq: 0x%p, ibt_wc: 0x%p",
+ cq, num_entries, wc, cq->ibt_cq, &ibt_wc);
+
+ /* only one thread per cq is allowed during ibt_poll_cq() */
+ mutex_enter(&cq->lock);
+ for (npolled = 0; npolled < num_entries; ++npolled) {
+ bzero(&ibt_wc, sizeof (ibt_wc_t));
+ rtn = ibt_poll_cq(cq->ibt_cq, &ibt_wc, 1, NULL);
+ if (rtn != IBT_SUCCESS) {
+ break;
+ }
+ /* save this result to struct ib_wc */
+ set_wc(&ibt_wc, wc + npolled);
+ }
+ mutex_exit(&cq->lock);
+ ofs_lock_exit(&ofs_client->lock);
+
+ if (rtn == IBT_SUCCESS || rtn == IBT_CQ_EMPTY) {
+ return (npolled);
+ }
+
+ SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+ "ib_poll_cq: cq: 0x%p, num_entries: %d, wc: 0x%p => "
+ "ibt_poll_cq failed w/ %d, npolled = %d",
+ cq, num_entries, wc, rtn, npolled);
+
+ switch (rtn) {
+ case IBT_HCA_HDL_INVALID:
+ case IBT_CQ_HDL_INVALID:
+ case IBT_INVALID_PARAM:
+ return (-EINVAL);
+ default:
+ return (-EIO);
+ }
+}
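+
+/*
+ * Usage sketch (illustration only, not part of the driver): a caller
+ * draining a CQ with ib_poll_cq(). The batch size and the handle_wc()
+ * callback are hypothetical; the sketch relies only on the return
+ * convention documented above (< 0 on error, otherwise the count of
+ * completions written, where a short count means the CQ was emptied
+ * and the loop can stop).
+ *
+ *	struct ib_wc wcs[8];
+ *	int n, i;
+ *
+ *	while ((n = ib_poll_cq(cq, 8, wcs)) > 0) {
+ *		for (i = 0; i < n; i++)
+ *			handle_wc(&wcs[i]);
+ *		if (n < 8)
+ *			break;
+ *	}
+ *	if (n < 0)
+ *		cmn_err(CE_WARN, "ib_poll_cq failed: %d", n);
+ */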
+
+ibt_hca_hdl_t
+ib_get_ibt_hca_hdl(struct ib_device *device)
+{
+ return (device->hca_hdl);
+}
+
+ibt_channel_hdl_t
+ib_get_ibt_channel_hdl(struct rdma_cm_id *cm)
+{
+ return (cm->qp == NULL ? NULL : cm->qp->ibt_qp);
+}
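+
+/*
+ * Illustration (hypothetical consumer, not part of this patch): these
+ * accessors let a kernel client that holds OF-style handles drop down
+ * to native IBTF calls, e.g. flushing the channel behind an rdma_cm_id:
+ *
+ *	ibt_channel_hdl_t chan = ib_get_ibt_channel_hdl(cm_id);
+ *	if (chan != NULL)
+ *		(void) ibt_flush_channel(chan);
+ */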
diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/LICENSE b/usr/src/uts/common/io/ib/clients/rdsv3/LICENSE
new file mode 100644
index 0000000000..29c3aeca4f
--- /dev/null
+++ b/usr/src/uts/common/io/ib/clients/rdsv3/LICENSE
@@ -0,0 +1,40 @@
+DO NOT TRANSLATE OR LOCALIZE.
+
+This package includes software from the OpenFabrics SW Stack.
+Use of any of this software is governed by the terms of the license below:
+
+OpenIB BSD License
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice, this
+list of conditions and the following disclaimer in the documentation and/or
+other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+The Oracle components of this package are licensed under version 1.0 of
+the Common Development and Distribution License ("CDDL").
+You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE or
+http://www.opensolaris.org/os/licensing. See the License for the
+specific language governing permissions and limitations under the License.
+
+When distributing Covered Code, include this CDDL HEADER in each
+file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+If applicable, add the following below this CDDL HEADER, with the
+fields enclosed by brackets "[]" replaced with your own identifying
+information: Portions Copyright [yyyy] [name of copyright owner]
diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/LICENSE.descrip b/usr/src/uts/common/io/ib/clients/rdsv3/LICENSE.descrip
new file mode 100644
index 0000000000..dc724239e2
--- /dev/null
+++ b/usr/src/uts/common/io/ib/clients/rdsv3/LICENSE.descrip
@@ -0,0 +1 @@
+Support of Reliable Datagram Service (RDS)
diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/af_rds.c b/usr/src/uts/common/io/ib/clients/rdsv3/af_rds.c
new file mode 100644
index 0000000000..49789637f6
--- /dev/null
+++ b/usr/src/uts/common/io/ib/clients/rdsv3/af_rds.c
@@ -0,0 +1,1009 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/conf.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/modctl.h>
+#include <sys/rds.h>
+#include <sys/stropts.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sockio.h>
+#include <sys/sysmacros.h>
+
+#include <inet/ip.h>
+#include <net/if_types.h>
+
+#include <sys/ib/clients/rdsv3/rdsv3.h>
+#include <sys/ib/clients/rdsv3/rdma.h>
+#include <sys/ib/clients/rdsv3/rdma_transport.h>
+#include <sys/ib/clients/rdsv3/rdsv3_debug.h>
+
+extern void rdsv3_remove_bound(struct rdsv3_sock *rds);
+extern int rdsv3_verify_bind_address(ipaddr_t addr);
+
+extern ddi_taskq_t *rdsv3_taskq;
+extern struct rdma_cm_id *rdsv3_rdma_listen_id;
+
+/* this is just used for stats gathering :/ */
+kmutex_t rdsv3_sock_lock;
+static unsigned long rdsv3_sock_count;
+list_t rdsv3_sock_list;
+rdsv3_wait_queue_t rdsv3_poll_waitq;
+
+/*
+ * This is called as the final descriptor referencing this socket is closed.
+ * We have to unbind the socket so that another socket can be bound to the
+ * address it was using.
+ *
+ * We have to be careful about racing with the incoming path. sock_orphan()
+ * sets SOCK_DEAD and we use that as an indicator to the rx path that new
+ * messages shouldn't be queued.
+ */
+/* ARGSUSED */
+static int
+rdsv3_release(sock_lower_handle_t proto_handle, int flgs, cred_t *cr)
+{
+ struct rsock *sk = (struct rsock *)proto_handle;
+ struct rdsv3_sock *rs;
+
+ if (sk == NULL)
+ goto out;
+
+ rs = rdsv3_sk_to_rs(sk);
+ RDSV3_DPRINTF4("rdsv3_release", "Enter(rs: %p, sk: %p)", rs, sk);
+
+ rdsv3_sk_sock_orphan(sk);
+ rdsv3_cong_remove_socket(rs);
+ rdsv3_remove_bound(rs);
+ /*
+ * Note - rdsv3_clear_recv_queue grabs rs_recv_lock, which
+ * ensures that the recv path has finished messing
+ * with the socket.
+ */
+ rdsv3_clear_recv_queue(rs);
+ rdsv3_send_drop_to(rs, NULL);
+ rdsv3_rdma_drop_keys(rs);
+ (void) rdsv3_notify_queue_get(rs, NULL);
+
+ mutex_enter(&rdsv3_sock_lock);
+ list_remove_node(&rs->rs_item);
+ rdsv3_sock_count--;
+ mutex_exit(&rdsv3_sock_lock);
+
+ rdsv3_sk_sock_put(sk);
+
+ RDSV3_DPRINTF4("rdsv3_release", "Return (rds: %p)", rs);
+out:
+ return (0);
+}
+
+void
+__rdsv3_wake_sk_sleep(struct rsock *sk)
+{
+ /* wake up anyone waiting in recvmsg */
+ if (!rdsv3_sk_sock_flag(sk, SOCK_DEAD) && sk->sk_sleep)
+ rdsv3_wake_up(sk->sk_sleep);
+}
+
+/*
+ * Careful not to race with rdsv3_release -> sock_orphan which clears sk_sleep.
+ * _bh() isn't OK here; we're called from interrupt handlers. It's probably OK
+ * to wake the waitqueue after sk_sleep is clear as we hold a sock ref, but
+ * this seems more conservative.
+ * NB - normally, one would use sk_callback_lock for this, but we can
+ * get here from interrupts, whereas the network code grabs sk_callback_lock
+ * with _lock_bh only - so relying on sk_callback_lock introduces livelocks.
+ */
+void
+rdsv3_wake_sk_sleep(struct rdsv3_sock *rs)
+{
+ RDSV3_DPRINTF4("rdsv3_wake_sk_sleep", "Enter(rs: %p)", rs);
+
+ rw_enter(&rs->rs_recv_lock, RW_READER);
+ __rdsv3_wake_sk_sleep(rdsv3_rs_to_sk(rs));
+ rw_exit(&rs->rs_recv_lock);
+}
+
+/*ARGSUSED*/
+static int
+rdsv3_getname(sock_lower_handle_t proto_handle, struct sockaddr *addr,
+ socklen_t *addr_len, cred_t *cr)
+{
+ struct rsock *sk = (struct rsock *)proto_handle;
+ struct sockaddr_in *sin = (struct sockaddr_in *)addr;
+ struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk);
+
+ RDSV3_DPRINTF4("rdsv3_getname", "Enter(rs: %p, port: %d)", rs,
+ rs->rs_bound_port);
+
+ sin->sin_port = rs->rs_bound_port;
+ sin->sin_addr.s_addr = rs->rs_bound_addr;
+
+ sin->sin_family = AF_INET_OFFLOAD;
+
+ *addr_len = sizeof (*sin);
+ return (0);
+}
+
+/*
+ * RDS' poll is without a doubt the least intuitive part of the interface,
+ * as POLLIN and POLLOUT do not behave entirely as you would expect from
+ * a network protocol.
+ *
+ * POLLIN is asserted if
+ * - there is data on the receive queue,
+ * - a previously congested destination may have become uncongested, or
+ * - a notification has been queued to the socket (this can be a congestion
+ * update or an RDMA completion).
+ *
+ * POLLOUT is asserted if there is room on the send queue. This does not
+ * mean, however, that the next sendmsg() call will succeed. If the
+ * application tries to send to a congested destination, the system call
+ * may still fail (and return ENOBUFS).
+ */
+/* ARGSUSED */
+static short
+rdsv3_poll(sock_lower_handle_t proto_handle, short events, int anyyet,
+ cred_t *cr)
+{
+ struct rsock *sk = (struct rsock *)proto_handle;
+ struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk);
+ unsigned short mask = 0;
+
+#if 0
+ RDSV3_DPRINTF4("rdsv3_poll", "enter(%p %x %d)", rs, events, anyyet);
+#endif
+
+ rw_enter(&rs->rs_recv_lock, RW_READER);
+ if (!rs->rs_cong_monitor) {
+ /*
+ * When a congestion map was updated, we signal POLLIN for
+ * "historical" reasons. Applications can also poll for
+ * WRBAND instead.
+ */
+ if (rdsv3_cong_updated_since(&rs->rs_cong_track))
+ mask |= (POLLIN | POLLRDNORM | POLLWRBAND);
+ } else {
+ mutex_enter(&rs->rs_lock);
+ if (rs->rs_cong_notify)
+ mask |= (POLLIN | POLLRDNORM);
+ mutex_exit(&rs->rs_lock);
+ }
+ if (!list_is_empty(&rs->rs_recv_queue) ||
+ !list_is_empty(&rs->rs_notify_queue))
+ mask |= (POLLIN | POLLRDNORM);
+ if (rs->rs_snd_bytes < rdsv3_sk_sndbuf(rs))
+ mask |= (POLLOUT | POLLWRNORM);
+ rw_exit(&rs->rs_recv_lock);
+
+#if 0
+ RDSV3_DPRINTF4("rdsv3_poll", "return(%p %x)", rs, mask);
+#endif
+
+ return (mask);
+}
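+
+/*
+ * Illustration (hypothetical application code, not part of this file;
+ * msg setup is omitted): given the semantics above, POLLOUT is not a
+ * guarantee, so a sender must still be prepared for sendmsg() to fail
+ * with ENOBUFS and retry after the next POLLIN wakeup (which may just
+ * be a congestion update):
+ *
+ *	struct pollfd pfd = { .fd = rds_fd, .events = POLLIN | POLLOUT };
+ *
+ *	while (poll(&pfd, 1, -1) >= 0) {
+ *		if ((pfd.revents & POLLOUT) &&
+ *		    sendmsg(rds_fd, &msg, 0) < 0 && errno != ENOBUFS)
+ *			break;
+ *		if (pfd.revents & POLLIN)
+ *			(void) recvmsg(rds_fd, &msg, 0);
+ *	}
+ */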
+
+/* ARGSUSED */
+static int
+rdsv3_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg,
+ int mode, int32_t *rvalp, cred_t *cr)
+{
+ ksocket_t so4;
+ struct lifconf lifc;
+ struct lifreq lifr, *lifrp;
+ struct ifconf ifc;
+ struct ifreq ifr;
+ int rval = 0, rc, len;
+ int numifs;
+ int bufsize;
+ void *buf;
+
+ RDSV3_DPRINTF4("rdsv3_ioctl", "enter: cmd: %d", cmd);
+
+ /* Only ipv4 for now */
+ rval = ksocket_socket(&so4, PF_INET, SOCK_DGRAM, 0, KSOCKET_NOSLEEP,
+ CRED());
+ if (rval != 0) {
+ RDSV3_DPRINTF2("rdsv3_ioctl", "ksocket_socket returned %d",
+ rval);
+ return (rval);
+ }
+
+ switch (cmd) {
+ case SIOCGLIFNUM :
+ case SIOCGIFNUM :
+ rval = rdsv3_do_ip_ioctl(so4, &buf, &bufsize, &numifs);
+ if (rval != 0)
+ break;
+ if (cmd == SIOCGLIFNUM) {
+ (void) ddi_copyout(&numifs, (void *)arg,
+ sizeof (int), 0);
+ } else {
+ len = 0;
+ for (lifrp = (struct lifreq *)buf, rc = 0; rc < numifs;
+ rc++, lifrp++) {
+ if (strlen(lifrp->lifr_name) <= IFNAMSIZ) {
+ len++;
+ }
+ }
+ (void) ddi_copyout(&len, (void *)arg,
+ sizeof (int), 0);
+ }
+ kmem_free(buf, bufsize);
+ break;
+
+ case SIOCGLIFCONF :
+ if (ddi_copyin((void *)arg, &lifc, sizeof (struct lifconf), 0)
+ != 0) {
+ RDSV3_DPRINTF2("rdsv3_ioctl", "ddi_copyin failed lifc");
+ rval = EFAULT;
+ break;
+ }
+
+ rval = rdsv3_do_ip_ioctl(so4, &buf, &bufsize, &numifs);
+ if (rval != 0) {
+ RDSV3_DPRINTF2("rdsv3_ioctl",
+ "rdsv3_do_ip_ioctl failed: %d", rval);
+ break;
+ }
+
+ if ((lifc.lifc_len > 0) && (numifs > 0)) {
+ if (ddi_copyout(buf, (void *)lifc.lifc_req,
+ (lifc.lifc_len < bufsize) ? lifc.lifc_len :
+ bufsize, 0) != 0) {
+ RDSV3_DPRINTF2("rdsv3_ioctl",
+ "copyout of records failed");
+ rval = EFAULT;
+ }
+
+ }
+
+ lifc.lifc_len = bufsize;
+ if (ddi_copyout(&lifc, (void *)arg, sizeof (struct lifconf),
+ 0) != 0) {
+ RDSV3_DPRINTF2("rdsv3_ioctl",
+ "copyout of lifconf failed");
+ rval = EFAULT;
+ }
+
+ kmem_free(buf, bufsize);
+ break;
+
+ case SIOCGIFCONF :
+ case O_SIOCGIFCONF :
+ if (ddi_copyin((void *)arg, &ifc, sizeof (struct ifconf), 0)
+ != 0) {
+ RDSV3_DPRINTF2("rdsv3_ioctl", "ddi_copyin failed ifc");
+ rval = EFAULT;
+ break;
+ }
+
+ RDSV3_DPRINTF2("rdsv3_ioctl",
+ "O_SIOCGIFCONF: ifc_len: %d, req: %p",
+ ifc.ifc_len, ifc.ifc_req);
+
+ rval = rdsv3_do_ip_ioctl_old(so4, &buf, &bufsize, &numifs);
+ if (rval != 0) {
+ RDSV3_DPRINTF2("rdsv3_ioctl",
+ "rdsv3_do_ip_ioctl_old failed: %d", rval);
+ break;
+ }
+
+ if ((ifc.ifc_len > 0) && (numifs > 0)) {
+ if (ddi_copyout(buf, (void *)ifc.ifc_req,
+ (ifc.ifc_len < bufsize) ? ifc.ifc_len :
+ bufsize, 0) != 0) {
+ RDSV3_DPRINTF2("rdsv3_ioctl",
+ "copyout of records failed");
+ rval = EFAULT;
+ }
+
+ }
+
+ ifc.ifc_len = bufsize;
+ if (ddi_copyout(&ifc, (void *)arg, sizeof (struct ifconf),
+ 0) != 0) {
+ RDSV3_DPRINTF2("rdsv3_ioctl",
+ "copyout of ifconf failed");
+ rval = EFAULT;
+ }
+
+ kmem_free(buf, bufsize);
+ break;
+
+ case SIOCGLIFFLAGS :
+ case SIOCSLIFFLAGS :
+ case SIOCGLIFMTU :
+ case SIOCGLIFNETMASK :
+ case SIOCGLIFINDEX :
+ if (ddi_copyin((void *)arg, &lifr, sizeof (struct lifreq), 0)
+ != 0) {
+ RDSV3_DPRINTF2("rdsv3_ioctl", "ddi_copyin failed lifr");
+ rval = EFAULT;
+ break;
+ }
+
+ rc = ksocket_ioctl(so4, cmd, (intptr_t)&lifr, &rval, CRED());
+ if (rc != 0) {
+ RDSV3_DPRINTF2("rdsv3_ioctl",
+ "ksocket_ioctl failed: %d, name: %s cmd: 0x%x",
+ rc, lifr.lifr_name, cmd);
+ break;
+ }
+
+ (void) ddi_copyout(&lifr, (void *)arg,
+ sizeof (struct lifreq), 0);
+ break;
+
+ case SIOCGIFFLAGS :
+ case SIOCSIFFLAGS :
+ case SIOCGIFMTU :
+ case SIOCGIFNETMASK :
+ case SIOCGIFINDEX :
+ if (ddi_copyin((void *)arg, &ifr, sizeof (struct ifreq), 0)
+ != 0) {
+ RDSV3_DPRINTF2("rdsv3_ioctl", "ddi_copyin failed ifr");
+ rval = EFAULT;
+ break;
+ }
+
+ RDSV3_DPRINTF2("rdsv3_ioctl", "1. name: %s", ifr.ifr_name);
+
+ rc = ksocket_ioctl(so4, cmd, (intptr_t)&ifr, &rval, CRED());
+ if (rc != 0) {
+ RDSV3_DPRINTF2("rdsv3_ioctl",
+ "ksocket_ioctl failed: %d, name: %s cmd: 0x%x",
+ rc, ifr.ifr_name, cmd);
+
+ break;
+ }
+
+ RDSV3_DPRINTF2("rdsv3_ioctl", "2. name: %s", ifr.ifr_name);
+
+ (void) ddi_copyout(&ifr, (void *)arg,
+ sizeof (struct ifreq), 0);
+ break;
+
+ default:
+ cmn_err(CE_CONT, "unsupported IOCTL cmd: %d \n", cmd);
+ rval = EOPNOTSUPP;
+ }
+
+ (void) ksocket_close(so4, CRED());
+
+ RDSV3_DPRINTF4("rdsv3_ioctl", "return: %d cmd: %d", rval, cmd);
+
+ *rvalp = rval;
+ return (rval);
+}
+
+static int
+rdsv3_cancel_sent_to(struct rdsv3_sock *rs, char *optval, int len)
+{
+ struct sockaddr_in sin;
+
+ /* racing with another thread binding seems ok here */
+ if (rs->rs_bound_addr == 0)
+ return (-ENOTCONN); /* XXX not a great errno */
+
+ if (len < sizeof (struct sockaddr_in))
+ return (-EINVAL);
+
+ if (ddi_copyin((void *)optval, &sin, sizeof (struct sockaddr_in),
+ 0) != 0) {
+ RDSV3_DPRINTF2("rdsv3_cancel_sent_to", "ddi_copyin failed sin");
+ return (-EFAULT);
+ }
+
+ rdsv3_send_drop_to(rs, &sin);
+
+ return (0);
+}
+
+static int
+rdsv3_set_bool_option(unsigned char *optvar, char *optval, int optlen)
+{
+ int value = *optval;
+
+ if (optlen < sizeof (int))
+ return (-EINVAL);
+ *optvar = !!value;
+ return (0);
+}
+
+static int
+rdsv3_cong_monitor(struct rdsv3_sock *rs, char *optval, int optlen)
+{
+ int ret;
+
+ ret = rdsv3_set_bool_option(&rs->rs_cong_monitor, optval, optlen);
+ if (ret == 0) {
+ if (rs->rs_cong_monitor) {
+ rdsv3_cong_add_socket(rs);
+ } else {
+ rdsv3_cong_remove_socket(rs);
+ rs->rs_cong_mask = 0;
+ rs->rs_cong_notify = 0;
+ }
+ }
+ return (ret);
+}
+
+/*ARGSUSED*/
+static int
+rdsv3_setsockopt(sock_lower_handle_t proto_handle, int level,
+ int optname, const void *optval, socklen_t optlen, cred_t *cr)
+{
+ struct rsock *sk = (struct rsock *)proto_handle;
+ struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk);
+ int ret = 0;
+
+ RDSV3_DPRINTF4("rdsv3_setsockopt", "enter(%p %d %d)",
+ rs, level, optname);
+
+ switch (optname) {
+ case RDSV3_CANCEL_SENT_TO:
+ ret = rdsv3_cancel_sent_to(rs, (char *)optval, optlen);
+ break;
+ case RDSV3_GET_MR:
+ ret = rdsv3_get_mr(rs, optval, optlen);
+ break;
+ case RDSV3_FREE_MR:
+ ret = rdsv3_free_mr(rs, optval, optlen);
+ break;
+ case RDSV3_RECVERR:
+ ret = rdsv3_set_bool_option(&rs->rs_recverr,
+ (char *)optval, optlen);
+ break;
+ case RDSV3_CONG_MONITOR:
+ ret = rdsv3_cong_monitor(rs, (char *)optval, optlen);
+ break;
+ case SO_SNDBUF:
+ sk->sk_sndbuf = *(uint_t *)optval;
+ return (ret);
+ case SO_RCVBUF:
+ sk->sk_rcvbuf = *(uint_t *)optval;
+ return (ret);
+ default:
+#if 1
+ break;
+#else
+ ret = -ENOPROTOOPT;
+#endif
+ }
+out:
+ return (ret);
+}
+
+/* XXX */
+/*ARGSUSED*/
+static int
+rdsv3_getsockopt(sock_lower_handle_t proto_handle, int level,
+ int optname, void *optval, socklen_t *optlen, cred_t *cr)
+{
+ struct rsock *sk = (struct rsock *)proto_handle;
+ struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk);
+ int ret = 0;
+
+ RDSV3_DPRINTF4("rdsv3_getsockopt", "enter(%p %d %d)",
+ rs, optname, *optlen);
+
+ switch (optname) {
+ case SO_SNDBUF:
+ RDSV3_DPRINTF4("rdsv3_getsockopt", "SO_SNDBUF(%d)",
+ sk->sk_sndbuf);
+ if (*optlen != 0) {
+ *((int *)optval) = sk->sk_sndbuf;
+ *optlen = sizeof (uint_t);
+ }
+ return (ret);
+ case SO_RCVBUF:
+ RDSV3_DPRINTF4("rdsv3_getsockopt", "SO_RCVBUF(%d)",
+ sk->sk_rcvbuf);
+ if (*optlen != 0) {
+ *((int *)optval) = sk->sk_rcvbuf;
+ *optlen = sizeof (uint_t);
+ }
+ return (ret);
+ case RDSV3_RECVERR:
+ RDSV3_DPRINTF4("rdsv3_getsockopt", "RDSV3_RECVERR(%d)",
+ rs->rs_recverr);
+ if (*optlen < sizeof (int))
+ return (-EINVAL);
+ else {
+ *(int *)optval = rs->rs_recverr;
+ *optlen = sizeof (int);
+ }
+ return (0);
+ default:
+ if ((optname >= RDSV3_INFO_FIRST) &&
+ (optname <= RDSV3_INFO_LAST)) {
+ return (rdsv3_info_getsockopt(sk, optname, optval,
+ optlen));
+ }
+ RDSV3_DPRINTF2("rdsv3_getsockopt",
+ "Unknown: level: %d optname: %d", level, optname);
+ ret = -ENOPROTOOPT;
+ }
+
+ RDSV3_DPRINTF4("rdsv3_getsockopt", "return(%p %d %d)",
+ rs, optname, ret);
+ return (ret);
+}
+
+/*ARGSUSED*/
+static int rdsv3_connect(sock_lower_handle_t proto_handle,
+ const struct sockaddr *addr, socklen_t addr_len, sock_connid_t *conn,
+ cred_t *cr)
+{
+ struct rsock *sk = (struct rsock *)proto_handle;
+ struct sockaddr_in *sin = (struct sockaddr_in *)addr;
+ struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk);
+ int ret = 0;
+
+ RDSV3_DPRINTF4("rdsv3_connect", "Enter(rs: %p)", rs);
+
+ mutex_enter(&sk->sk_lock);
+
+ if (addr_len != sizeof (struct sockaddr_in)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (sin->sin_family != AF_INET_OFFLOAD) {
+ ret = -EAFNOSUPPORT;
+ goto out;
+ }
+
+ if (sin->sin_addr.s_addr == htonl(INADDR_ANY)) {
+ ret = -EDESTADDRREQ;
+ goto out;
+ }
+
+ rs->rs_conn_addr = sin->sin_addr.s_addr;
+ rs->rs_conn_port = sin->sin_port;
+
+ sk->sk_upcalls->su_connected(sk->sk_upper_handle, 0, NULL, -1);
+
+ RDSV3_DPRINTF4("rdsv3_connect", "Return(rs: %p)", rs);
+
+out:
+ mutex_exit(&sk->sk_lock);
+ return (ret);
+}
+
+/*ARGSUSED*/
+static int
+rdsv3_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr)
+{
+ struct rsock *sk = (struct rsock *)proto_handle;
+ struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk);
+
+ RDSV3_DPRINTF4("rdsv3_shutdown", "Enter(rs: %p)", rs);
+
+ return (0);
+}
+
+/*ARGSUSED*/
+void
+rdsv3_activate(sock_lower_handle_t proto_handle,
+ sock_upper_handle_t sock_handle, sock_upcalls_t *sock_upcalls,
+ int flags, cred_t *cr)
+{
+ struct rsock *sk = (struct rsock *)proto_handle;
+ struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk);
+
+ RDSV3_DPRINTF4("rdsv3_activate", "Enter(rs: %p)", rs);
+
+ sk->sk_upcalls = sock_upcalls;
+ sk->sk_upper_handle = sock_handle;
+
+ RDSV3_DPRINTF4("rdsv3_activate", "Return (rs: %p)", rs);
+}
+
+
+/* ARGSUSED */
+int
+rdsv3_send_uio(sock_lower_handle_t proto_handle, uio_t *uio,
+ struct nmsghdr *msg, cred_t *cr)
+{
+ struct rsock *sk = (struct rsock *)proto_handle;
+ struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk);
+ int ret;
+
+ RDSV3_DPRINTF4("rdsv3_send_uio", "Enter(rs: %p)", rs);
+ ret = rdsv3_sendmsg(rs, uio, msg, uio->uio_resid);
+
+ RDSV3_DPRINTF4("rdsv3_send_uio", "Return(rs: %p ret %d)", rs, ret);
+ if (ret < 0) {
+ return (-ret);
+ }
+
+ return (0);
+}
+
+/* ARGSUSED */
+int
+rdsv3_recv_uio(sock_lower_handle_t proto_handle, uio_t *uio,
+ struct nmsghdr *msg, cred_t *cr)
+{
+ struct rsock *sk = (struct rsock *)proto_handle;
+ struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk);
+ int ret;
+
+ RDSV3_DPRINTF4("rdsv3_recv_uio", "Enter (rs: %p)", rs);
+ ret = rdsv3_recvmsg(rs, uio, msg, uio->uio_resid, msg->msg_flags);
+
+ RDSV3_DPRINTF4("rdsv3_recv_uio", "Return(rs: %p ret %d)", rs, ret);
+
+ if (ret < 0) {
+ return (-ret);
+ }
+
+ return (0);
+}
+
+/*ARGSUSED*/
+int
+rdsv3_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *addr,
+ socklen_t *addr_len, cred_t *cr)
+{
+ struct sockaddr_in *sin = (struct sockaddr_in *)addr;
+ struct rsock *sk = (struct rsock *)proto_handle;
+ struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk);
+
+ RDSV3_DPRINTF2("rdsv3_getpeername", "enter(rs: %p)", rs);
+
+ (void) memset(sin->sin_zero, 0, sizeof (sin->sin_zero));
+
+ /* racy, don't care */
+ if (!rs->rs_conn_addr)
+ return (-ENOTCONN);
+
+ sin->sin_port = rs->rs_conn_port;
+ sin->sin_addr.s_addr = rs->rs_conn_addr;
+
+ sin->sin_family = AF_INET_OFFLOAD;
+
+ *addr_len = sizeof (*sin);
+ return (0);
+}
+
+void
+rdsv3_clrflowctrl(sock_lower_handle_t proto_handle)
+{
+ struct rsock *sk = (struct rsock *)proto_handle;
+ struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk);
+
+ RDSV3_DPRINTF2("rdsv3_clrflowctrl", "enter(rs: %p)", rs);
+}
+
+#ifndef __lock_lint
+static struct sock_downcalls_s rdsv3_sock_downcalls = {
+ .sd_close = rdsv3_release,
+ .sd_bind = rdsv3_bind,
+ .sd_connect = rdsv3_connect,
+ .sd_accept = NULL,
+ .sd_getsockname = rdsv3_getname,
+ .sd_poll = rdsv3_poll,
+ .sd_ioctl = rdsv3_ioctl,
+ .sd_listen = NULL,
+ .sd_shutdown = rdsv3_shutdown,
+ .sd_setsockopt = rdsv3_setsockopt,
+ .sd_getsockopt = rdsv3_getsockopt,
+ .sd_send_uio = rdsv3_send_uio,
+ .sd_recv_uio = rdsv3_recv_uio,
+ .sd_activate = rdsv3_activate,
+ .sd_getpeername = rdsv3_getpeername,
+ .sd_send = NULL,
+ .sd_clr_flowctrl = NULL
+};
+#else
+static struct sock_downcalls_s rdsv3_sock_downcalls = {
+ rdsv3_activate,
+ NULL,
+ rdsv3_bind,
+ NULL,
+ rdsv3_connect,
+ rdsv3_getpeername,
+ rdsv3_getname,
+ rdsv3_getsockopt,
+ rdsv3_setsockopt,
+ NULL,
+ rdsv3_send_uio,
+ rdsv3_recv_uio,
+ rdsv3_poll,
+ rdsv3_shutdown,
+ NULL,
+ rdsv3_ioctl,
+ rdsv3_release
+};
+#endif
+
+sock_lower_handle_t
+rdsv3_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls,
+ uint_t *smodep, int *errorp, int flags, cred_t *credp)
+{
+ struct rdsv3_sock *rs;
+ struct rsock *sk;
+
+ RDSV3_DPRINTF4("rdsv3_create", "Enter (family: %d type: %d, proto: %d "
+ "flags: %d", family, type, proto, flags);
+
+ sk = rdsv3_sk_alloc();
+ if (sk == NULL)
+ return (NULL);
+ rdsv3_sock_init_data(sk);
+
+ rs = rdsv3_sk_to_rs(sk);
+ rs->rs_sk = sk;
+ mutex_init(&rs->rs_lock, NULL, MUTEX_DRIVER, NULL);
+ rw_init(&rs->rs_recv_lock, NULL, RW_DRIVER, NULL);
+ list_create(&rs->rs_send_queue, sizeof (struct rdsv3_message),
+ offsetof(struct rdsv3_message, m_sock_item));
+ list_create(&rs->rs_recv_queue, sizeof (struct rdsv3_incoming),
+ offsetof(struct rdsv3_incoming, i_item));
+ list_create(&rs->rs_notify_queue, sizeof (struct rdsv3_notifier),
+ offsetof(struct rdsv3_notifier, n_list));
+ mutex_init(&rs->rs_rdma_lock, NULL, MUTEX_DRIVER, NULL);
+ avl_create(&rs->rs_rdma_keys, rdsv3_mr_compare,
+ sizeof (struct rdsv3_mr), offsetof(struct rdsv3_mr, r_rb_node));
+ mutex_init(&rs->rs_conn_lock, NULL, MUTEX_DRIVER, NULL);
+ rs->rs_cred = credp;
+ rs->rs_zoneid = getzoneid();
+ crhold(credp);
+
+ mutex_enter(&rdsv3_sock_lock);
+ list_insert_tail(&rdsv3_sock_list, rs);
+ rdsv3_sock_count++;
+ /* Initialize RDMA/IB on the 1st socket if not done at attach */
+ if (rdsv3_sock_count == 1) {
+ rdsv3_rdma_init();
+ }
+ mutex_exit(&rdsv3_sock_lock);
+
+ *errorp = 0;
+ *smodep = SM_ATOMIC;
+ *sock_downcalls = &rdsv3_sock_downcalls;
+
+ RDSV3_DPRINTF4("rdsv3_create", "Return: %p", rs);
+
+ return ((sock_lower_handle_t)rdsv3_rs_to_sk(rs));
+}
+
+void
+rdsv3_sock_addref(struct rdsv3_sock *rs)
+{
+ RDSV3_DPRINTF4("rdsv3_sock_addref", "Enter(rs: %p)", rs);
+ rdsv3_sk_sock_hold(rdsv3_rs_to_sk(rs));
+}
+
+void
+rdsv3_sock_put(struct rdsv3_sock *rs)
+{
+ RDSV3_DPRINTF4("rdsv3_sock_put", "Enter(rs: %p)", rs);
+ rdsv3_sk_sock_put(rdsv3_rs_to_sk(rs));
+}
+
+static void
+rdsv3_sock_inc_info(struct rsock *sock, unsigned int len,
+ struct rdsv3_info_iterator *iter, struct rdsv3_info_lengths *lens)
+{
+ struct rdsv3_sock *rs;
+ struct rdsv3_incoming *inc;
+ unsigned int total = 0;
+
+ RDSV3_DPRINTF4("rdsv3_sock_inc_info", "Enter(rs: %p)",
+ rdsv3_sk_to_rs(sock));
+
+ len /= sizeof (struct rdsv3_info_message);
+
+ mutex_enter(&rdsv3_sock_lock);
+
+ RDSV3_FOR_EACH_LIST_NODE(rs, &rdsv3_sock_list, rs_item) {
+ rw_enter(&rs->rs_recv_lock, RW_READER);
+
+ /* XXX too lazy to maintain counts.. */
+ RDSV3_FOR_EACH_LIST_NODE(inc, &rs->rs_recv_queue, i_item) {
+ total++;
+ if (total <= len)
+ rdsv3_inc_info_copy(inc, iter, inc->i_saddr,
+ rs->rs_bound_addr, 1);
+ }
+
+ rw_exit(&rs->rs_recv_lock);
+ }
+
+ mutex_exit(&rdsv3_sock_lock);
+
+ lens->nr = total;
+ lens->each = sizeof (struct rdsv3_info_message);
+
+ RDSV3_DPRINTF4("rdsv3_sock_inc_info", "return(rs: %p)",
+ rdsv3_sk_to_rs(sock));
+}
+
+static void
+rdsv3_sock_info(struct rsock *sock, unsigned int len,
+ struct rdsv3_info_iterator *iter, struct rdsv3_info_lengths *lens)
+{
+ struct rdsv3_info_socket sinfo;
+ struct rdsv3_sock *rs;
+ unsigned long bytes;
+
+ RDSV3_DPRINTF4("rdsv3_sock_info", "Enter(rs: %p)",
+ rdsv3_sk_to_rs(sock));
+
+ len /= sizeof (struct rdsv3_info_socket);
+
+ mutex_enter(&rdsv3_sock_lock);
+
+ if ((len < rdsv3_sock_count) || (iter->addr == NULL))
+ goto out;
+
+ bytes = sizeof (struct rdsv3_info_socket);
+ RDSV3_FOR_EACH_LIST_NODE(rs, &rdsv3_sock_list, rs_item) {
+ sinfo.sndbuf = rdsv3_sk_sndbuf(rs);
+ sinfo.rcvbuf = rdsv3_sk_rcvbuf(rs);
+ sinfo.bound_addr = rs->rs_bound_addr;
+ sinfo.connected_addr = rs->rs_conn_addr;
+ sinfo.bound_port = rs->rs_bound_port;
+ sinfo.connected_port = rs->rs_conn_port;
+
+ rdsv3_info_copy(iter, &sinfo, bytes);
+ }
+
+ RDSV3_DPRINTF4("rdsv3_sock_info", "Return(rs: %p)",
+ rdsv3_sk_to_rs(sock));
+
+out:
+ lens->nr = rdsv3_sock_count;
+ lens->each = sizeof (struct rdsv3_info_socket);
+
+ mutex_exit(&rdsv3_sock_lock);
+}
+
+rdsv3_delayed_work_t *rdsv3_rdma_dwp = NULL;
+uint_t rdsv3_rdma_init_delay = 5; /* secs */
+extern void rdsv3_rdma_init_worker(struct rdsv3_work_s *work);
+
+void
+rdsv3_exit(void)
+{
+ RDSV3_DPRINTF4("rdsv3_exit", "Enter");
+
+ if (rdsv3_rdma_dwp) {
+ rdsv3_cancel_delayed_work(rdsv3_rdma_dwp);
+ }
+
+ (void) ddi_taskq_dispatch(rdsv3_taskq, rdsv3_rdma_exit,
+ NULL, DDI_SLEEP);
+ while (rdsv3_rdma_listen_id != NULL) {
+#ifndef __lock_lint
+ RDSV3_DPRINTF5("rdsv3", "%s-%d Waiting for rdsv3_rdma_exit",
+ __func__, __LINE__);
+#endif
+ delay(drv_usectohz(1000));
+ }
+
+ rdsv3_conn_exit();
+ rdsv3_cong_exit();
+ rdsv3_sysctl_exit();
+ rdsv3_threads_exit();
+ rdsv3_stats_exit();
+ rdsv3_info_deregister_func(RDSV3_INFO_SOCKETS, rdsv3_sock_info);
+ rdsv3_info_deregister_func(RDSV3_INFO_RECV_MESSAGES,
+ rdsv3_sock_inc_info);
+
+ if (rdsv3_rdma_dwp) {
+ kmem_free(rdsv3_rdma_dwp, sizeof (rdsv3_delayed_work_t));
+ rdsv3_rdma_dwp = NULL;
+ }
+
+ RDSV3_DPRINTF4("rdsv3_exit", "Return");
+}
+
+/*ARGSUSED*/
+int
+rdsv3_init()
+{
+ int ret;
+
+ RDSV3_DPRINTF4("rdsv3_init", "Enter");
+
+ rdsv3_cong_init();
+ ret = rdsv3_conn_init();
+ if (ret)
+ goto out;
+ ret = rdsv3_threads_init();
+ if (ret)
+ goto out_conn;
+ ret = rdsv3_sysctl_init();
+ if (ret)
+ goto out_threads;
+ ret = rdsv3_stats_init();
+ if (ret)
+ goto out_sysctl;
+
+ rdsv3_info_register_func(RDSV3_INFO_SOCKETS, rdsv3_sock_info);
+ rdsv3_info_register_func(RDSV3_INFO_RECV_MESSAGES, rdsv3_sock_inc_info);
+
+ /* rdsv3_rdma_init needs to be called with a little delay */
+ rdsv3_rdma_dwp = kmem_zalloc(sizeof (rdsv3_delayed_work_t), KM_SLEEP);
+ RDSV3_INIT_DELAYED_WORK(rdsv3_rdma_dwp, rdsv3_rdma_init_worker);
+ rdsv3_queue_delayed_work(rdsv3_wq, rdsv3_rdma_dwp,
+ rdsv3_rdma_init_delay);
+
+ RDSV3_DPRINTF4("rdsv3_init", "Return");
+
+ goto out;
+
+out_stats:
+ rdsv3_stats_exit();
+out_sysctl:
+ rdsv3_sysctl_exit();
+out_threads:
+ rdsv3_threads_exit();
+out_conn:
+ rdsv3_conn_exit();
+ rdsv3_cong_exit();
+out:
+ return (ret);
+}
diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/bind.c b/usr/src/uts/common/io/ib/clients/rdsv3/bind.c
new file mode 100644
index 0000000000..965b2977d0
--- /dev/null
+++ b/usr/src/uts/common/io/ib/clients/rdsv3/bind.c
@@ -0,0 +1,202 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/random.h>
+#include <sys/rds.h>
+
+#include <sys/ib/clients/rdsv3/rdsv3.h>
+#include <sys/ib/clients/rdsv3/rdsv3_debug.h>
+
+/*
+ * XXX this probably still needs more work.. no INADDR_ANY, and the AVL
+ * tree isn't particularly zippy.
+ *
+ * This is now called for every incoming frame so we arguably care much more
+ * about it than we used to.
+ */
+kmutex_t rdsv3_bind_lock;
+avl_tree_t rdsv3_bind_tree;
+
+static struct rdsv3_sock *
+rdsv3_bind_tree_walk(uint32_be_t addr, uint16_be_t port,
+ struct rdsv3_sock *insert)
+{
+ struct rdsv3_sock *rs;
+ avl_index_t where;
+
+ rs = avl_find(&rdsv3_bind_tree, &port, &where);
+ if ((rs == NULL) && (insert != NULL)) {
+ insert->rs_bound_addr = addr;
+ insert->rs_bound_port = port;
+ avl_insert(&rdsv3_bind_tree, insert, where);
+ }
+
+ return (rs);
+}
+
+/*
+ * Return the rdsv3_sock bound at the given local address.
+ *
+ * The rx path can race with rdsv3_release. We notice if rdsv3_release() has
+ * marked this socket and don't return an rs ref to the rx path.
+ */
+struct rdsv3_sock *
+rdsv3_find_bound(uint32_be_t addr, uint16_be_t port)
+{
+ struct rdsv3_sock *rs;
+
+ RDSV3_DPRINTF4("rdsv3_find_bound", "Enter(port: %x)", port);
+
+ mutex_enter(&rdsv3_bind_lock);
+ rs = rdsv3_bind_tree_walk(addr, port, NULL);
+ if (rs && !rdsv3_sk_sock_flag(rdsv3_rs_to_sk(rs), SOCK_DEAD))
+ rdsv3_sock_addref(rs);
+ else
+ rs = NULL;
+ mutex_exit(&rdsv3_bind_lock);
+
+ RDSV3_DPRINTF5("rdsv3_find_bound", "returning rs %p for %u.%u.%u.%u:%x",
+ rs, NIPQUAD(addr), port);
+ return (rs);
+}
+
+/* returns -ve errno or +ve port */
+static int
+rdsv3_add_bound(struct rdsv3_sock *rs, uint32_be_t addr, uint16_be_t *port)
+{
+ int ret = -EADDRINUSE;
+ uint16_t rover, last;
+
+ RDSV3_DPRINTF4("rdsv3_add_bound", "Enter(port: %x)", *port);
+
+ if (*port != 0) {
+ rover = ntohs(*port);
+ last = rover;
+ } else {
+ (void) random_get_pseudo_bytes((uint8_t *)&rover,
+ sizeof (uint16_t));
+ rover = MAX(rover, 2);
+ last = rover - 1;
+ }
+
+ mutex_enter(&rdsv3_bind_lock);
+
+ do {
+ if (rover == 0)
+ rover++;
+ if (rdsv3_bind_tree_walk(addr, htons(rover), rs) == NULL) {
+ *port = htons(rover);
+ ret = 0;
+ break;
+ }
+ } while (rover++ != last);
+
+ if (ret == 0) {
+ rs->rs_bound_addr = addr;
+ rs->rs_bound_port = *port;
+ rdsv3_sock_addref(rs);
+
+ RDSV3_DPRINTF5("rdsv3_add_bound",
+ "rs %p binding to %u.%u.%u.%u:%x",
+ rs, NIPQUAD(addr), *port);
+
+ }
+
+ mutex_exit(&rdsv3_bind_lock);
+
+ RDSV3_DPRINTF4("rdsv3_add_bound", "Return(port: %x)", *port);
+
+ return (ret);
+}
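+
+/*
+ * Worked example of the search above: if *port is 0 and the random
+ * starting point is rover == 17, the loop probes ports 17, 18, ...,
+ * 65535, then 1, 2, ..., 16 (port 0 is always skipped) and stops at
+ * the first port that inserts into the bind tree, so every usable
+ * port is tried exactly once before -EADDRINUSE is returned.
+ */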
+
+void
+rdsv3_remove_bound(struct rdsv3_sock *rs)
+{
+ RDSV3_DPRINTF4("rdsv3_remove_bound", "Enter(rs: %p)", rs);
+
+ mutex_enter(&rdsv3_bind_lock);
+
+ if (rs->rs_bound_addr) {
+ RDSV3_DPRINTF5("rdsv3_remove_bound",
+ "rs %p unbinding from %u.%u.%u.%u:%x",
+ rs, NIPQUAD(rs->rs_bound_addr), rs->rs_bound_port);
+
+ avl_remove(&rdsv3_bind_tree, rs);
+ rdsv3_sock_put(rs);
+ rs->rs_bound_addr = 0;
+ }
+
+ mutex_exit(&rdsv3_bind_lock);
+
+ RDSV3_DPRINTF4("rdsv3_remove_bound", "Return(rs: %p)", rs);
+}
+
+/* ARGSUSED */
+int
+rdsv3_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa,
+ socklen_t len, cred_t *cr)
+{
+ struct rsock *sk = (struct rsock *)proto_handle;
+ sin_t *sin = (sin_t *)sa;
+ struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk);
+ int ret;
+
+ if (len != sizeof (sin_t) || (sin == NULL) ||
+ !OK_32PTR((char *)sin)) {
+ RDSV3_DPRINTF2("rdsv3_bind", "address to bind not specified");
+ return (EINVAL);
+ }
+
+ RDSV3_DPRINTF4("rdsv3_bind", "Enter(rs: %p, addr: 0x%x, port: %x)",
+ rs, ntohl(sin->sin_addr.s_addr), htons(sin->sin_port));
+
+ if (sin->sin_addr.s_addr == INADDR_ANY) {
+ RDSV3_DPRINTF2("rdsv3_bind", "Invalid address");
+ return (EINVAL);
+ }
+
+ /* We don't allow multiple binds */
+ if (rs->rs_bound_addr) {
+ RDSV3_DPRINTF2("rdsv3_bind", "Multiple binds not allowed");
+ return (EINVAL);
+ }
+
+ ret = rdsv3_add_bound(rs, sin->sin_addr.s_addr, &sin->sin_port);
+ if (ret) {
+ return (ret);
+ }
+
+ rs->rs_transport = rdsv3_trans_get_preferred(sin->sin_addr.s_addr);
+ if (rs->rs_transport == NULL) {
+ rdsv3_remove_bound(rs);
+ return (EADDRNOTAVAIL);
+ }
+
+ RDSV3_DPRINTF4("rdsv3_bind", "Return: Assigned port: %x to sock: %p",
+ sin->sin_port, rs);
+
+ return (0);
+}
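+
+/*
+ * Note for callers (sketch, under the downcall contract above): when
+ * sin->sin_port is 0 on entry, rdsv3_add_bound() picks a free port and
+ * writes it back into the caller's sockaddr, so the port assigned here
+ * is what rdsv3_getname() will later report:
+ *
+ *	sin->sin_port = 0;
+ *	if (rdsv3_bind(handle, (struct sockaddr *)sin,
+ *	    sizeof (sin_t), cr) == 0)
+ *		bound_port = sin->sin_port;
+ */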
diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/cong.c b/usr/src/uts/common/io/ib/clients/rdsv3/cong.c
new file mode 100644
index 0000000000..634459f0fe
--- /dev/null
+++ b/usr/src/uts/common/io/ib/clients/rdsv3/cong.c
@@ -0,0 +1,523 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+/*
+ * Copyright (c) 2007 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <sys/rds.h>
+
+#include <sys/ib/clients/rdsv3/rdsv3.h>
+#include <sys/ib/clients/rdsv3/rdsv3_impl.h>
+#include <sys/ib/clients/rdsv3/rdsv3_debug.h>
+
+/*
+ * This file implements the receive side of the unconventional congestion
+ * management in RDS.
+ *
+ * Messages waiting in the receive queue on the receiving socket are accounted
+ * against the socket's SO_RCVBUF option value. Only the payload bytes in the
+ * message are accounted for. If the number of bytes queued equals or exceeds
+ * rcvbuf then the socket is congested. All sends attempted to this socket's
+ * address should either block or fail with -EWOULDBLOCK.
+ *
+ * Applications are expected to be reasonably tuned such that this situation
+ * very rarely occurs. An application encountering this "back-pressure" is
+ * considered a bug.
+ *
+ * This is implemented by having each node maintain bitmaps which indicate
+ * which ports on bound addresses are congested. As the bitmap changes it is
+ * sent through all the connections which terminate in the local address of the
+ * bitmap which changed.
+ *
+ * The bitmaps are allocated as connections are brought up. This avoids
+ * allocation in the interrupt-handling path, which queues messages on
+ * sockets. The dense bitmaps let transports send the entire bitmap on any
+ * bitmap change reasonably efficiently. This is much easier to implement
+ * than some finer-grained communication of per-port congestion. The sender
+ * does a very inexpensive bit test to check whether the port it's about to
+ * send to is congested.
+ */
+
+/*
+ * Interaction with poll is a tad tricky. We want all processes stuck in
+ * poll to wake up and check whether a congested destination became uncongested.
+ * The really sad thing is we have no idea which destinations the application
+ * wants to send to - we don't even know which rdsv3_connections are involved.
+ * So until we implement a more flexible rds poll interface, we have to make
+ * do with this:
+ * We maintain a global counter that is incremented each time a congestion map
+ * update is received. Each rds socket tracks this value, and if rdsv3_poll
+ * finds that the saved generation number is smaller than the global generation
+ * number, it wakes up the process.
+ */
+static atomic_t rdsv3_cong_generation = ATOMIC_INIT(0);
+
+/*
+ * Congestion monitoring
+ */
+static struct list rdsv3_cong_monitor;
+static krwlock_t rdsv3_cong_monitor_lock;
+
+/*
+ * Yes, a global lock. It's used so infrequently that it's worth keeping it
+ * global to simplify the locking. It's only used in the following
+ * circumstances:
+ *
+ * - on connection buildup to associate a conn with its maps
+ * - on map changes to inform conns of a new map to send
+ *
+ * It's sadly ordered under the socket callback lock and the connection lock.
+ * Receive paths can mark ports congested from interrupt context, so the
+ * lock masks interrupts.
+ */
+static kmutex_t rdsv3_cong_lock;
+static struct avl_tree rdsv3_cong_tree;
+
+static struct rdsv3_cong_map *
+rdsv3_cong_tree_walk(uint32_be_t addr, struct rdsv3_cong_map *insert)
+{
+ struct rdsv3_cong_map *map;
+ avl_index_t where;
+
+ if (insert) {
+ map = avl_find(&rdsv3_cong_tree, insert, &where);
+ if (map == NULL) {
+ avl_insert(&rdsv3_cong_tree, insert, where);
+ return (NULL);
+ }
+ } else {
+ struct rdsv3_cong_map map1;
+ map1.m_addr = addr;
+ map = avl_find(&rdsv3_cong_tree, &map1, &where);
+ }
+
+ return (map);
+}
+
+/*
+ * There is only ever one bitmap for any address. Connections try and allocate
+ * these bitmaps in the process getting pointers to them. The bitmaps are only
+ * ever freed as the module is removed after all connections have been freed.
+ */
+static struct rdsv3_cong_map *
+rdsv3_cong_from_addr(uint32_be_t addr)
+{
+ struct rdsv3_cong_map *map;
+ struct rdsv3_cong_map *ret = NULL;
+ unsigned long zp;
+ unsigned long i;
+
+ RDSV3_DPRINTF4("rdsv3_cong_from_addr", "Enter(addr: %x)", ntohl(addr));
+
+ map = kmem_zalloc(sizeof (struct rdsv3_cong_map), KM_NOSLEEP);
+ if (map == NULL)
+ return (NULL);
+
+ map->m_addr = addr;
+ rdsv3_init_waitqueue(&map->m_waitq);
+ list_create(&map->m_conn_list, sizeof (struct rdsv3_connection),
+ offsetof(struct rdsv3_connection, c_map_item));
+
+ for (i = 0; i < RDSV3_CONG_MAP_PAGES; i++) {
+ zp = (unsigned long)kmem_zalloc(PAGE_SIZE, KM_NOSLEEP);
+ if (zp == 0)
+ goto out;
+ map->m_page_addrs[i] = zp;
+ }
+
+ mutex_enter(&rdsv3_cong_lock);
+ ret = rdsv3_cong_tree_walk(addr, map);
+ mutex_exit(&rdsv3_cong_lock);
+
+ if (ret == NULL) {
+ ret = map;
+ map = NULL;
+ }
+
+out:
+ if (map) {
+ for (i = 0; i < RDSV3_CONG_MAP_PAGES && map->m_page_addrs[i];
+ i++)
+ kmem_free((void *)map->m_page_addrs[i], PAGE_SIZE);
+ kmem_free(map, sizeof (*map));
+ }
+
+ RDSV3_DPRINTF5("rdsv3_cong_from_addr", "map %p for addr %x",
+ ret, ntohl(addr));
+
+ return (ret);
+}
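+
+/*
+ * Note on the pattern above (sketch): two racing callers may both
+ * allocate a map for the same address; rdsv3_cong_tree_walk() only
+ * inserts the first one, and the loser falls through to the cleanup
+ * path and frees its copy, so no lock is held across the allocation.
+ */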
+
+/*
+ * Put the conn on its local map's list. This is called when the conn is
+ * really added to the hash. It's nested under the rdsv3_conn_lock, sadly.
+ */
+void
+rdsv3_cong_add_conn(struct rdsv3_connection *conn)
+{
+ RDSV3_DPRINTF4("rdsv3_cong_add_conn", "Enter(conn: %p)", conn);
+
+ RDSV3_DPRINTF5("rdsv3_cong_add_conn", "conn %p now on map %p",
+ conn, conn->c_lcong);
+ mutex_enter(&rdsv3_cong_lock);
+ list_insert_tail(&conn->c_lcong->m_conn_list, conn);
+ mutex_exit(&rdsv3_cong_lock);
+
+ RDSV3_DPRINTF4("rdsv3_cong_add_conn", "Return(conn: %p)", conn);
+}
+
+void
+rdsv3_cong_remove_conn(struct rdsv3_connection *conn)
+{
+ RDSV3_DPRINTF4("rdsv3_cong_remove_conn", "Enter(conn: %p)", conn);
+
+ RDSV3_DPRINTF5("rdsv3_cong_remove_conn", "removing conn %p from map %p",
+ conn, conn->c_lcong);
+ mutex_enter(&rdsv3_cong_lock);
+ list_remove_node(&conn->c_map_item);
+ mutex_exit(&rdsv3_cong_lock);
+
+ RDSV3_DPRINTF4("rdsv3_cong_remove_conn", "Return(conn: %p)", conn);
+}
+
+int
+rdsv3_cong_get_maps(struct rdsv3_connection *conn)
+{
+ conn->c_lcong = rdsv3_cong_from_addr(conn->c_laddr);
+ conn->c_fcong = rdsv3_cong_from_addr(conn->c_faddr);
+
+ if (conn->c_lcong == NULL || conn->c_fcong == NULL)
+ return (-ENOMEM);
+
+ return (0);
+}
+
+void
+rdsv3_cong_queue_updates(struct rdsv3_cong_map *map)
+{
+ struct rdsv3_connection *conn;
+
+ RDSV3_DPRINTF4("rdsv3_cong_queue_updates", "Enter(map: %p)", map);
+
+ mutex_enter(&rdsv3_cong_lock);
+
+ RDSV3_FOR_EACH_LIST_NODE(conn, &map->m_conn_list, c_map_item) {
+ if (!test_and_set_bit(0, &conn->c_map_queued)) {
+ rdsv3_stats_inc(s_cong_update_queued);
+ rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_send_w, 0);
+ }
+ }
+
+ mutex_exit(&rdsv3_cong_lock);
+
+ RDSV3_DPRINTF4("rdsv3_cong_queue_updates", "Return(map: %p)", map);
+}
+
+void
+rdsv3_cong_map_updated(struct rdsv3_cong_map *map, uint64_t portmask)
+{
+ RDSV3_DPRINTF4("rdsv3_cong_map_updated",
+ "waking map %p for %u.%u.%u.%u",
+ map, NIPQUAD(map->m_addr));
+ rdsv3_stats_inc(s_cong_update_received);
+ atomic_add_32(&rdsv3_cong_generation, 1);
+#if 0
+XXX
+ if (waitqueue_active(&map->m_waitq))
+#endif
+ rdsv3_wake_up(&map->m_waitq);
+#if 0
+XXX
+ if (waitqueue_active(&rds_poll_waitq))
+#endif
+ rdsv3_wake_up_all(&rdsv3_poll_waitq);
+
+ if (portmask && !list_is_empty(&rdsv3_cong_monitor)) {
+ struct rdsv3_sock *rs;
+
+ rw_enter(&rdsv3_cong_monitor_lock, RW_READER);
+ RDSV3_FOR_EACH_LIST_NODE(rs, &rdsv3_cong_monitor,
+ rs_cong_list) {
+ mutex_enter(&rs->rs_lock);
+ rs->rs_cong_notify |= (rs->rs_cong_mask & portmask);
+ rs->rs_cong_mask &= ~portmask;
+ mutex_exit(&rs->rs_lock);
+ if (rs->rs_cong_notify)
+ rdsv3_wake_sk_sleep(rs);
+ }
+ rw_exit(&rdsv3_cong_monitor_lock);
+ }
+
+ RDSV3_DPRINTF4("rdsv3_cong_map_updated", "Return(map: %p)", map);
+}
+
+int
+rdsv3_cong_updated_since(unsigned long *recent)
+{
+ unsigned long gen = atomic_get(&rdsv3_cong_generation);
+
+ if (*recent == gen)
+ return (0);
+ *recent = gen;
+ return (1);
+}
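+
+/*
+ * Sketch of the intended use (mirroring rdsv3_poll() in af_rds.c): each
+ * socket caches the generation it last saw, so a single bump of the
+ * global counter in rdsv3_cong_map_updated() wakes every poller at most
+ * once per update:
+ *
+ *	if (rdsv3_cong_updated_since(&rs->rs_cong_track))
+ *		mask |= (POLLIN | POLLRDNORM | POLLWRBAND);
+ */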
+
+/*
+ * These should be using generic_{test,__{clear,set}}_le_bit() but some old
+ * kernels don't have them. Sigh.
+ */
+#if defined(sparc)
+#define LE_BIT_XOR ((BITS_PER_LONG-1) & ~0x7)
+#else
+#define LE_BIT_XOR 0
+#endif
+
+/*
+ * We're called under the locking that protects the socket's receive buffer
+ * consumption. This makes it a lot easier for the caller to only call us
+ * when it knows that an existing set bit needs to be cleared, and vice versa.
+ * We can't block and we need to deal with concurrent sockets working against
+ * the same per-address map.
+ */
+void
+rdsv3_cong_set_bit(struct rdsv3_cong_map *map, uint16_be_t port)
+{
+ unsigned long i;
+ unsigned long off;
+
+ RDSV3_DPRINTF4("rdsv3_cong_set_bit",
+ "setting congestion for %u.%u.%u.%u:%u in map %p",
+ NIPQUAD(map->m_addr), ntohs(port), map);
+
+ i = ntohs(port) / RDSV3_CONG_MAP_PAGE_BITS;
+ off = ntohs(port) % RDSV3_CONG_MAP_PAGE_BITS;
+
+ set_bit(off ^ LE_BIT_XOR, (void *)map->m_page_addrs[i]);
+}
+
+void
+rdsv3_cong_clear_bit(struct rdsv3_cong_map *map, uint16_be_t port)
+{
+ unsigned long i;
+ unsigned long off;
+
+ RDSV3_DPRINTF4("rdsv3_cong_clear_bit",
+ "clearing congestion for %u.%u.%u.%u:%u in map %p\n",
+ NIPQUAD(map->m_addr), ntohs(port), map);
+
+ i = ntohs(port) / RDSV3_CONG_MAP_PAGE_BITS;
+ off = ntohs(port) % RDSV3_CONG_MAP_PAGE_BITS;
+
+ clear_bit(off ^ LE_BIT_XOR, (void *)map->m_page_addrs[i]);
+}
+
+static int
+rdsv3_cong_test_bit(struct rdsv3_cong_map *map, uint16_be_t port)
+{
+ unsigned long i;
+ unsigned long off;
+
+ i = ntohs(port) / RDSV3_CONG_MAP_PAGE_BITS;
+ off = ntohs(port) % RDSV3_CONG_MAP_PAGE_BITS;
+
+ RDSV3_DPRINTF5("rdsv3_cong_test_bit", "port: 0x%x i = %lx off = %lx",
+ ntohs(port), i, off);
+
+ return (test_bit(off ^ LE_BIT_XOR, (void *)map->m_page_addrs[i]));
+}
+
+#undef LE_BIT_XOR
+
+void
+rdsv3_cong_add_socket(struct rdsv3_sock *rs)
+{
+ RDSV3_DPRINTF4("rdsv3_cong_add_socket", "Enter(rs: %p)", rs);
+
+ rw_enter(&rdsv3_cong_monitor_lock, RW_WRITER);
+ if (!list_link_active(&rs->rs_cong_list))
+ list_insert_head(&rdsv3_cong_monitor, rs);
+ rw_exit(&rdsv3_cong_monitor_lock);
+}
+
+void
+rdsv3_cong_remove_socket(struct rdsv3_sock *rs)
+{
+ struct rdsv3_cong_map *map;
+
+ RDSV3_DPRINTF4("rdsv3_cong_remove_socket", "Enter(rs: %p)", rs);
+
+ rw_enter(&rdsv3_cong_monitor_lock, RW_WRITER);
+ list_remove_node(&rs->rs_cong_list);
+ rw_exit(&rdsv3_cong_monitor_lock);
+
+ /* update congestion map for now-closed port */
+ mutex_enter(&rdsv3_cong_lock);
+ map = rdsv3_cong_tree_walk(rs->rs_bound_addr, NULL);
+ mutex_exit(&rdsv3_cong_lock);
+
+ if (map && rdsv3_cong_test_bit(map, rs->rs_bound_port)) {
+ rdsv3_cong_clear_bit(map, rs->rs_bound_port);
+ rdsv3_cong_queue_updates(map);
+ }
+}
+
+int
+rdsv3_cong_wait(struct rdsv3_cong_map *map, uint16_be_t port, int nonblock,
+ struct rdsv3_sock *rs)
+{
+ int ret = 0;
+
+ RDSV3_DPRINTF4("rdsv3_cong_wait", "Enter(rs: %p, mode: %d)",
+ rs, nonblock);
+
+ if (!rdsv3_cong_test_bit(map, port))
+ return (0);
+ if (nonblock) {
+ if (rs && rs->rs_cong_monitor) {
+ /*
+ * It would have been nice to have an atomic set_bit on
+ * a uint64_t.
+ */
+ mutex_enter(&rs->rs_lock);
+ rs->rs_cong_mask |=
+ RDSV3_CONG_MONITOR_MASK(ntohs(port));
+ mutex_exit(&rs->rs_lock);
+
+ /*
+ * Test again - a congestion update may have arrived in
+ * the meantime.
+ */
+ if (!rdsv3_cong_test_bit(map, port))
+ return (0);
+ }
+ rdsv3_stats_inc(s_cong_send_error);
+ return (-ENOBUFS);
+ }
+
+ rdsv3_stats_inc(s_cong_send_blocked);
+ RDSV3_DPRINTF3("rdsv3_cong_wait", "waiting on map %p for port %u",
+ map, ntohs(port));
+
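+	/*
+	 * Solaris counterpart of an interruptible wait: sleep on the map's
+	 * condvar until the port's bit clears. cv_wait_sig() returns 0 when
+	 * interrupted by a signal, which we surface as -ERESTART.
+	 */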
+ mutex_enter(&map->m_waitq.waitq_mutex);
+ while (rdsv3_cong_test_bit(map, port)) {
+ if (cv_wait_sig(&map->m_waitq.waitq_cv,
+ &map->m_waitq.waitq_mutex) == 0) {
+ ret = -ERESTART;
+ break;
+ }
+ }
+ mutex_exit(&map->m_waitq.waitq_mutex);
+
+ return (ret);
+}
+
+void
+rdsv3_cong_exit(void)
+{
+ struct rdsv3_cong_map *map;
+ unsigned long i;
+
+ RDSV3_DPRINTF4("rdsv3_cong_exit", "Enter");
+
+ while ((map = avl_first(&rdsv3_cong_tree))) {
+ RDSV3_DPRINTF5("rdsv3_cong_exit", "freeing map %p\n", map);
+ avl_remove(&rdsv3_cong_tree, map);
+ for (i = 0; i < RDSV3_CONG_MAP_PAGES && map->m_page_addrs[i];
+ i++)
+ kmem_free((void *)map->m_page_addrs[i], PAGE_SIZE);
+ kmem_free(map, sizeof (*map));
+ }
+
+ RDSV3_DPRINTF4("rdsv3_cong_exit", "Return");
+}
+
+/*
+ * Allocate a RDS message containing a congestion update.
+ */
+struct rdsv3_message *
+rdsv3_cong_update_alloc(struct rdsv3_connection *conn)
+{
+ struct rdsv3_cong_map *map = conn->c_lcong;
+ struct rdsv3_message *rm;
+
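+	/*
+	 * Map the congestion bitmap's pages into the message directly (no
+	 * copy) and flag it so the receiver treats the payload as a
+	 * congestion bitmap rather than application data.
+	 */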
+ rm = rdsv3_message_map_pages(map->m_page_addrs, RDSV3_CONG_MAP_BYTES);
+ if (!IS_ERR(rm))
+ rm->m_inc.i_hdr.h_flags = RDSV3_FLAG_CONG_BITMAP;
+
+ return (rm);
+}
+
+static int
+rdsv3_cong_compare(const void *map1, const void *map2)
+{
+#define addr1 ((struct rdsv3_cong_map *)map1)->m_addr
+#define addr2 ((struct rdsv3_cong_map *)map2)->m_addr
+
+ if (addr1 < addr2)
+ return (-1);
+ if (addr1 > addr2)
+ return (1);
+	return (0);
+}
+#undef addr1
+#undef addr2
+
+void
+rdsv3_cong_init(void)
+{
+ list_create(&rdsv3_cong_monitor, sizeof (struct rdsv3_sock),
+ offsetof(struct rdsv3_sock, rs_cong_list));
+ rw_init(&rdsv3_cong_monitor_lock, NULL, RW_DRIVER, NULL);
+ mutex_init(&rdsv3_cong_lock, NULL, MUTEX_DRIVER, NULL);
+ avl_create(&rdsv3_cong_tree, rdsv3_cong_compare,
+ sizeof (struct rdsv3_cong_map), offsetof(struct rdsv3_cong_map,
+ m_rb_node));
+}
diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/connection.c b/usr/src/uts/common/io/ib/clients/rdsv3/connection.c
new file mode 100644
index 0000000000..4df9489c9f
--- /dev/null
+++ b/usr/src/uts/common/io/ib/clients/rdsv3/connection.c
@@ -0,0 +1,546 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <sys/types.h>
+#include <sys/kmem.h>
+#include <sys/rds.h>
+
+#include <sys/ib/clients/rdsv3/rdsv3.h>
+#include <sys/ib/clients/rdsv3/loop.h>
+#include <sys/ib/clients/rdsv3/rdsv3_debug.h>
+
+/* converting this to RCU is a chore for another day.. */
+static krwlock_t rdsv3_conn_lock;
+static unsigned long rdsv3_conn_count;
+struct avl_tree rdsv3_conn_hash;
+static struct kmem_cache *rdsv3_conn_slab = NULL;
+
+#define rdsv3_conn_info_set(var, test, suffix) do { \
+ if (test) \
+ var |= RDSV3_INFO_CONNECTION_FLAG_##suffix; \
+} while (0)
+
+static inline int
+rdsv3_conn_is_sending(struct rdsv3_connection *conn)
+{
+ int ret = 0;
+
+ if (!mutex_tryenter(&conn->c_send_lock))
+ ret = 1;
+ else
+ mutex_exit(&conn->c_send_lock);
+
+ return (ret);
+}
+
+static struct rdsv3_connection *
+rdsv3_conn_lookup(uint32_be_t laddr, uint32_be_t faddr, avl_index_t *pos)
+{
+ struct rdsv3_connection *conn;
+ struct rdsv3_conn_info_s conn_info;
+ avl_index_t place = 0;
+
+ conn_info.c_laddr = laddr;
+ conn_info.c_faddr = faddr;
+
+ conn = avl_find(&rdsv3_conn_hash, &conn_info, &place);
+
+ RDSV3_DPRINTF5("rdsv3_conn_lookup",
+ "returning conn %p for %u.%u.%u.%u -> %u.%u.%u.%u",
+ conn, NIPQUAD(laddr), NIPQUAD(faddr));
+
+ if (pos != NULL)
+ *pos = place;
+
+ return (conn);
+}
+
+/*
+ * This is called by transports as they're bringing down a connection.
+ * It clears partial message state so that the transport can start sending
+ * and receiving over this connection again in the future. It is up to
+ * the transport to have serialized this call with its send and recv.
+ */
+void
+rdsv3_conn_reset(struct rdsv3_connection *conn)
+{
+ RDSV3_DPRINTF2("rdsv3_conn_reset",
+ "connection %u.%u.%u.%u to %u.%u.%u.%u reset",
+ NIPQUAD(conn->c_laddr), NIPQUAD(conn->c_faddr));
+
+ rdsv3_stats_inc(s_conn_reset);
+ rdsv3_send_reset(conn);
+ conn->c_flags = 0;
+
+ /*
+ * Do not clear next_rx_seq here, else we cannot distinguish
+ * retransmitted packets from new packets, and will hand all
+ * of them to the application. That is not consistent with the
+ * reliability guarantees of RDS.
+ */
+}
+
+/*
+ * There is only ever one 'conn' for a given pair of addresses in the
+ * system at a time. They contain messages to be retransmitted and so
+ * span the lifetime of the actual underlying transport connections.
+ *
+ * For now they are not garbage collected once they're created. They
+ * are torn down as the module is removed, if ever.
+ */
+static struct rdsv3_connection *
+__rdsv3_conn_create(uint32_be_t laddr, uint32_be_t faddr,
+ struct rdsv3_transport *trans, int gfp,
+ int is_outgoing)
+{
+ struct rdsv3_connection *conn, *parent = NULL;
+ avl_index_t pos;
+ int ret;
+
+ rw_enter(&rdsv3_conn_lock, RW_READER);
+ conn = rdsv3_conn_lookup(laddr, faddr, &pos);
+ if (conn &&
+ conn->c_loopback &&
+ conn->c_trans != &rdsv3_loop_transport &&
+ !is_outgoing) {
+ /*
+ * This is a looped back IB connection, and we're
+ * called by the code handling the incoming connect.
+ * We need a second connection object into which we
+ * can stick the other QP.
+ */
+ parent = conn;
+ conn = parent->c_passive;
+ }
+ rw_exit(&rdsv3_conn_lock);
+ if (conn)
+ goto out;
+
+ RDSV3_DPRINTF2("__rdsv3_conn_create", "Enter(%x -> %x)",
+ ntohl(laddr), ntohl(faddr));
+
+ conn = kmem_cache_alloc(rdsv3_conn_slab, gfp);
+ if (conn == NULL) {
+ conn = ERR_PTR(-ENOMEM);
+ goto out;
+ }
+
+ /* see rdsv3_conn_constructor */
+ conn->c_laddr = laddr;
+ conn->c_faddr = faddr;
+
+ ret = rdsv3_cong_get_maps(conn);
+ if (ret) {
+ kmem_cache_free(rdsv3_conn_slab, conn);
+ conn = ERR_PTR(ret);
+ goto out;
+ }
+
+ /*
+ * This is where a connection becomes loopback. If *any* RDS sockets
+ * can bind to the destination address then we'd rather the messages
+ * flow through loopback rather than either transport.
+ */
+ if (rdsv3_trans_get_preferred(faddr)) {
+ conn->c_loopback = 1;
+ if (is_outgoing && trans->t_prefer_loopback) {
+ /*
+ * "outgoing" connection - and the transport
+ * says it wants the connection handled by the
+ * loopback transport. This is what TCP does.
+ */
+ trans = &rdsv3_loop_transport;
+ }
+ }
+
+ conn->c_trans = trans;
+
+ ret = trans->conn_alloc(conn, gfp);
+ if (ret) {
+ kmem_cache_free(rdsv3_conn_slab, conn);
+ conn = ERR_PTR(ret);
+ goto out;
+ }
+
+ conn->c_state = RDSV3_CONN_DOWN;
+ conn->c_reconnect_jiffies = 0;
+ RDSV3_INIT_DELAYED_WORK(&conn->c_send_w, rdsv3_send_worker);
+ RDSV3_INIT_DELAYED_WORK(&conn->c_recv_w, rdsv3_recv_worker);
+ RDSV3_INIT_DELAYED_WORK(&conn->c_conn_w, rdsv3_connect_worker);
+ RDSV3_INIT_WORK(&conn->c_down_w, rdsv3_shutdown_worker);
+ mutex_init(&conn->c_cm_lock, NULL, MUTEX_DRIVER, NULL);
+ conn->c_flags = 0;
+
+ RDSV3_DPRINTF2("__rdsv3_conn_create",
+ "allocated conn %p for %u.%u.%u.%u -> %u.%u.%u.%u over %s %s",
+ conn, NIPQUAD(laddr), NIPQUAD(faddr),
+ trans->t_name ? trans->t_name : "[unknown]",
+ is_outgoing ? "(outgoing)" : "");
+
+ /*
+ * Since we ran without holding the conn lock, someone could
+ * have created the same conn (either normal or passive) in the
+ * interim. We check while holding the lock. If we won, we complete
+ * init and return our conn. If we lost, we roll back and return the
+ * other one.
+ */
+ rw_enter(&rdsv3_conn_lock, RW_WRITER);
+ if (parent) {
+ /* Creating passive conn */
+ if (parent->c_passive) {
+ trans->conn_free(conn->c_transport_data);
+ kmem_cache_free(rdsv3_conn_slab, conn);
+ conn = parent->c_passive;
+ } else {
+ parent->c_passive = conn;
+ rdsv3_cong_add_conn(conn);
+ rdsv3_conn_count++;
+ }
+ } else {
+ /* Creating normal conn */
+ struct rdsv3_connection *found;
+
+ found = rdsv3_conn_lookup(laddr, faddr, &pos);
+ if (found) {
+ trans->conn_free(conn->c_transport_data);
+ kmem_cache_free(rdsv3_conn_slab, conn);
+ conn = found;
+ } else {
+ avl_insert(&rdsv3_conn_hash, conn, pos);
+ rdsv3_cong_add_conn(conn);
+ rdsv3_conn_count++;
+ }
+ }
+
+ rw_exit(&rdsv3_conn_lock);
+
+ RDSV3_DPRINTF2("__rdsv3_conn_create", "Return(conn: %p)", conn);
+
+out:
+ return (conn);
+}
+
+struct rdsv3_connection *
+rdsv3_conn_create(uint32_be_t laddr, uint32_be_t faddr,
+ struct rdsv3_transport *trans, int gfp)
+{
+ return (__rdsv3_conn_create(laddr, faddr, trans, gfp, 0));
+}
+
+struct rdsv3_connection *
+rdsv3_conn_create_outgoing(uint32_be_t laddr, uint32_be_t faddr,
+ struct rdsv3_transport *trans, int gfp)
+{
+ return (__rdsv3_conn_create(laddr, faddr, trans, gfp, 1));
+}
+
+void
+rdsv3_conn_destroy(struct rdsv3_connection *conn)
+{
+ struct rdsv3_message *rm, *rtmp;
+
+ RDSV3_DPRINTF4("rdsv3_conn_destroy",
+ "freeing conn %p for %u.%u.%u.%u -> %u.%u.%u.%u",
+ conn, NIPQUAD(conn->c_laddr), NIPQUAD(conn->c_faddr));
+
+ avl_remove(&rdsv3_conn_hash, conn);
+
+ /* wait for the rds thread to shut it down */
+ conn->c_state = RDSV3_CONN_ERROR;
+ rdsv3_cancel_delayed_work(&conn->c_conn_w);
+ rdsv3_cancel_delayed_work(&conn->c_send_w);
+ rdsv3_cancel_delayed_work(&conn->c_recv_w);
+ rdsv3_shutdown_worker(&conn->c_down_w);
+ rdsv3_flush_workqueue(rdsv3_wq);
+
+ /* tear down queued messages */
+ RDSV3_FOR_EACH_LIST_NODE_SAFE(rm, rtmp,
+ &conn->c_send_queue,
+ m_conn_item) {
+ list_remove_node(&rm->m_conn_item);
+ ASSERT(!list_link_active(&rm->m_sock_item));
+ rdsv3_message_put(rm);
+ }
+ if (conn->c_xmit_rm)
+ rdsv3_message_put(conn->c_xmit_rm);
+
+ conn->c_trans->conn_free(conn->c_transport_data);
+
+ /*
+ * The congestion maps aren't freed up here. They're
+ * freed by rdsv3_cong_exit() after all the connections
+ * have been freed.
+ */
+ rdsv3_cong_remove_conn(conn);
+
+ ASSERT(list_is_empty(&conn->c_retrans));
+ kmem_cache_free(rdsv3_conn_slab, conn);
+
+ rdsv3_conn_count--;
+}
+
+/* ARGSUSED */
+static void
+rdsv3_conn_message_info(struct rsock *sock, unsigned int len,
+ struct rdsv3_info_iterator *iter,
+ struct rdsv3_info_lengths *lens,
+ int want_send)
+{
+ struct list *list;
+ struct rdsv3_connection *conn;
+ struct rdsv3_message *rm;
+ unsigned int total = 0;
+
+ RDSV3_DPRINTF4("rdsv3_conn_message_info", "Enter");
+
+ len /= sizeof (struct rdsv3_info_message);
+
+ rw_enter(&rdsv3_conn_lock, RW_READER);
+
+ if (avl_is_empty(&rdsv3_conn_hash)) {
+ /* no connections */
+ rw_exit(&rdsv3_conn_lock);
+ return;
+ }
+
+ conn = (struct rdsv3_connection *)avl_first(&rdsv3_conn_hash);
+
+ do {
+ if (want_send)
+ list = &conn->c_send_queue;
+ else
+ list = &conn->c_retrans;
+
+ mutex_enter(&conn->c_lock);
+
+ /* XXX too lazy to maintain counts.. */
+ RDSV3_FOR_EACH_LIST_NODE(rm, list, m_conn_item) {
+ total++;
+ if (total <= len)
+ rdsv3_inc_info_copy(&rm->m_inc, iter,
+ conn->c_laddr, conn->c_faddr, 0);
+ }
+
+ mutex_exit(&conn->c_lock);
+
+ conn = AVL_NEXT(&rdsv3_conn_hash, conn);
+ } while (conn != NULL);
+
+ rw_exit(&rdsv3_conn_lock);
+
+ lens->nr = total;
+ lens->each = sizeof (struct rdsv3_info_message);
+
+ RDSV3_DPRINTF4("rdsv3_conn_message_info", "Return");
+}
+
+static void
+rdsv3_conn_message_info_send(struct rsock *sock, unsigned int len,
+ struct rdsv3_info_iterator *iter,
+ struct rdsv3_info_lengths *lens)
+{
+ rdsv3_conn_message_info(sock, len, iter, lens, 1);
+}
+
+static void
+rdsv3_conn_message_info_retrans(struct rsock *sock,
+ unsigned int len,
+ struct rdsv3_info_iterator *iter,
+ struct rdsv3_info_lengths *lens)
+{
+ rdsv3_conn_message_info(sock, len, iter, lens, 0);
+}
+
+/* ARGSUSED */
+void
+rdsv3_for_each_conn_info(struct rsock *sock, unsigned int len,
+ struct rdsv3_info_iterator *iter,
+ struct rdsv3_info_lengths *lens,
+ int (*visitor)(struct rdsv3_connection *, void *),
+ size_t item_len)
+{
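+	/*
+	 * Stack buffer sized in 8-byte units to hold a single info item;
+	 * warlock (__lock_lint) cannot handle the variable-length array,
+	 * so it is shown a fixed worst-case size instead.
+	 */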
+#ifndef __lock_lint
+ uint64_t buffer[(item_len + 7) / 8];
+#else
+ uint64_t buffer[256];
+#endif
+ struct rdsv3_connection *conn;
+
+ rw_enter(&rdsv3_conn_lock, RW_READER);
+
+ lens->nr = 0;
+ lens->each = item_len;
+
+ if (avl_is_empty(&rdsv3_conn_hash)) {
+ /* no connections */
+ rw_exit(&rdsv3_conn_lock);
+ return;
+ }
+
+ conn = (struct rdsv3_connection *)avl_first(&rdsv3_conn_hash);
+
+ do {
+ /* XXX no c_lock usage.. */
+ if (!visitor(conn, buffer))
+ continue;
+
+ /*
+ * We copy as much as we can fit in the buffer,
+ * but we count all items so that the caller
+ * can resize the buffer.
+ */
+ if (len >= item_len) {
+ rdsv3_info_copy(iter, buffer, item_len);
+ len -= item_len;
+ }
+ lens->nr++;
+ conn = AVL_NEXT(&rdsv3_conn_hash, conn);
+ } while (conn != NULL);
+
+ rw_exit(&rdsv3_conn_lock);
+}
+
+static int
+rdsv3_conn_info_visitor(struct rdsv3_connection *conn, void *buffer)
+{
+ struct rdsv3_info_connection *cinfo = buffer;
+
+ cinfo->next_tx_seq = conn->c_next_tx_seq;
+ cinfo->next_rx_seq = conn->c_next_rx_seq;
+ cinfo->laddr = conn->c_laddr;
+ cinfo->faddr = conn->c_faddr;
+ (void) strncpy((char *)cinfo->transport, conn->c_trans->t_name,
+ sizeof (cinfo->transport));
+ cinfo->flags = 0;
+
+ rdsv3_conn_info_set(cinfo->flags,
+ rdsv3_conn_is_sending(conn), SENDING);
+ /* XXX Future: return the state rather than these funky bits */
+ rdsv3_conn_info_set(cinfo->flags,
+ atomic_get(&conn->c_state) == RDSV3_CONN_CONNECTING,
+ CONNECTING);
+ rdsv3_conn_info_set(cinfo->flags,
+ atomic_get(&conn->c_state) == RDSV3_CONN_UP,
+ CONNECTED);
+ return (1);
+}
+
+static void
+rdsv3_conn_info(struct rsock *sock, unsigned int len,
+ struct rdsv3_info_iterator *iter, struct rdsv3_info_lengths *lens)
+{
+ rdsv3_for_each_conn_info(sock, len, iter, lens,
+ rdsv3_conn_info_visitor, sizeof (struct rdsv3_info_connection));
+}
+
+int
+rdsv3_conn_init()
+{
+ RDSV3_DPRINTF4("rdsv3_conn_init", "Enter");
+
+ rdsv3_conn_slab = kmem_cache_create("rdsv3_connection",
+ sizeof (struct rdsv3_connection), 0, rdsv3_conn_constructor,
+ rdsv3_conn_destructor, NULL, NULL, NULL, 0);
+ if (rdsv3_conn_slab == NULL) {
+ RDSV3_DPRINTF1("rdsv3_conn_init",
+ "kmem_cache_create(rdsv3_conn_slab) failed");
+ return (-1);
+ }
+
+ avl_create(&rdsv3_conn_hash, rdsv3_conn_compare,
+ sizeof (struct rdsv3_connection), offsetof(struct rdsv3_connection,
+ c_hash_node));
+
+ rw_init(&rdsv3_conn_lock, NULL, RW_DRIVER, NULL);
+
+ rdsv3_loop_init();
+
+ rdsv3_info_register_func(RDSV3_INFO_CONNECTIONS, rdsv3_conn_info);
+ rdsv3_info_register_func(RDSV3_INFO_SEND_MESSAGES,
+ rdsv3_conn_message_info_send);
+ rdsv3_info_register_func(RDSV3_INFO_RETRANS_MESSAGES,
+ rdsv3_conn_message_info_retrans);
+
+ RDSV3_DPRINTF4("rdsv3_conn_init", "Return");
+
+ return (0);
+}
+
+void
+rdsv3_conn_exit()
+{
+ RDSV3_DPRINTF4("rdsv3_conn_exit", "Enter");
+
+ rdsv3_loop_exit();
+
+ rw_destroy(&rdsv3_conn_lock);
+ avl_destroy(&rdsv3_conn_hash);
+
+ ASSERT(rdsv3_conn_slab);
+ kmem_cache_destroy(rdsv3_conn_slab);
+
+ RDSV3_DPRINTF4("rdsv3_conn_exit", "Return");
+}
+
+/*
+ * Force a disconnect
+ */
+void
+rdsv3_conn_drop(struct rdsv3_connection *conn)
+{
+ conn->c_state = RDSV3_CONN_ERROR;
+ rdsv3_queue_work(rdsv3_wq, &conn->c_down_w);
+}
diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/ib.c b/usr/src/uts/common/io/ib/clients/rdsv3/ib.c
new file mode 100644
index 0000000000..3b2adb3932
--- /dev/null
+++ b/usr/src/uts/common/io/ib/clients/rdsv3/ib.c
@@ -0,0 +1,410 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <sys/sysmacros.h>
+#include <sys/rds.h>
+
+#include <sys/ib/ibtl/ibti.h>
+#include <sys/ib/clients/rdsv3/rdsv3.h>
+#include <sys/ib/clients/rdsv3/ib.h>
+#include <sys/ib/clients/rdsv3/rdsv3_debug.h>
+
+unsigned int rdsv3_ib_retry_count = RDSV3_IB_DEFAULT_RETRY_COUNT;
+
+struct list rdsv3_ib_devices;
+
+/* NOTE: if also grabbing ibdev lock, grab this first */
+kmutex_t ib_nodev_conns_lock;
+list_t ib_nodev_conns;
+
+void
+rdsv3_ib_add_one(ib_device_t *device)
+{
+ struct rdsv3_ib_device *rds_ibdev;
+ ibt_hca_attr_t *dev_attr;
+
+ RDSV3_DPRINTF4("rdsv3_ib_add_one", "device: %p", device);
+
+ /* Only handle IB (no iWARP) devices */
+ if (device->node_type != RDMA_NODE_IB_CA)
+ return;
+
+ dev_attr = (ibt_hca_attr_t *)kmem_alloc(sizeof (*dev_attr),
+ KM_NOSLEEP);
+ if (!dev_attr)
+ return;
+
+ if (ibt_query_hca(ib_get_ibt_hca_hdl(device), dev_attr)) {
+ RDSV3_DPRINTF5("rdsv3_ib_add_one",
+ "Query device failed for %s", device->name);
+ goto free_attr;
+ }
+
+ /* We depend on Reserved Lkey */
+ if (!(dev_attr->hca_flags2 & IBT_HCA2_RES_LKEY)) {
+ RDSV3_DPRINTF5("rdsv3_ib_add_one",
+ "Reserved Lkey support is required: %s",
+ device->name);
+ goto free_attr;
+ }
+
+ rds_ibdev = kmem_zalloc(sizeof (*rds_ibdev), KM_NOSLEEP);
+ if (!rds_ibdev)
+ goto free_attr;
+
+ mutex_init(&rds_ibdev->spinlock, NULL, MUTEX_DRIVER, NULL);
+
+ rds_ibdev->max_wrs = dev_attr->hca_max_chan_sz;
+ rds_ibdev->max_sge = min(dev_attr->hca_max_sgl, RDSV3_IB_MAX_SGE);
+
+ rds_ibdev->dev = device;
+ rds_ibdev->pd = ib_alloc_pd(device);
+ if (IS_ERR(rds_ibdev->pd))
+ goto free_dev;
+
+	if (rdsv3_ib_create_mr_pool(rds_ibdev) != 0) {
+		/* release the protection domain allocated just above */
+		goto err_pd;
+	}
+
+ list_create(&rds_ibdev->ipaddr_list, sizeof (struct rdsv3_ib_ipaddr),
+ offsetof(struct rdsv3_ib_ipaddr, list));
+ list_create(&rds_ibdev->conn_list, sizeof (struct rdsv3_ib_connection),
+ offsetof(struct rdsv3_ib_connection, ib_node));
+
+ list_insert_tail(&rdsv3_ib_devices, rds_ibdev);
+
+ ib_set_client_data(device, &rdsv3_ib_client, rds_ibdev);
+
+ RDSV3_DPRINTF4("rdsv3_ib_add_one", "Return: device: %p", device);
+
+ goto free_attr;
+
+err_pd:
+ (void) ib_dealloc_pd(rds_ibdev->pd);
+free_dev:
+ kmem_free(rds_ibdev, sizeof (*rds_ibdev));
+free_attr:
+ kmem_free(dev_attr, sizeof (*dev_attr));
+}
+
+void
+rdsv3_ib_remove_one(struct ib_device *device)
+{
+ struct rdsv3_ib_device *rds_ibdev;
+ struct rdsv3_ib_ipaddr *i_ipaddr, *i_next;
+
+ RDSV3_DPRINTF4("rdsv3_ib_remove_one", "device: %p", device);
+
+ rds_ibdev = ib_get_client_data(device, &rdsv3_ib_client);
+ if (!rds_ibdev)
+ return;
+
+ RDSV3_FOR_EACH_LIST_NODE_SAFE(i_ipaddr, i_next, &rds_ibdev->ipaddr_list,
+ list) {
+ list_remove_node(&i_ipaddr->list);
+ kmem_free(i_ipaddr, sizeof (*i_ipaddr));
+ }
+
+ rdsv3_ib_destroy_conns(rds_ibdev);
+
+ rdsv3_ib_destroy_mr_pool(rds_ibdev);
+
+#if 0
+ while (ib_dealloc_pd(rds_ibdev->pd)) {
+#ifndef __lock_lint
+ RDSV3_DPRINTF5("rdsv3_ib_remove_one",
+ "%s-%d Failed to dealloc pd %p",
+ __func__, __LINE__, rds_ibdev->pd);
+#endif
+ delay(drv_usectohz(1000));
+ }
+#else
+ if (ib_dealloc_pd(rds_ibdev->pd)) {
+#ifndef __lock_lint
+ RDSV3_DPRINTF2("rdsv3_ib_remove_one",
+ "%s-%d Failed to dealloc pd %p",
+ __func__, __LINE__, rds_ibdev->pd);
+#endif
+ }
+#endif
+
+ list_destroy(&rds_ibdev->ipaddr_list);
+ list_destroy(&rds_ibdev->conn_list);
+ list_remove_node(&rds_ibdev->list);
+ kmem_free(rds_ibdev, sizeof (*rds_ibdev));
+
+ RDSV3_DPRINTF4("rdsv3_ib_remove_one", "Return: device: %p", device);
+}
+
+#ifndef __lock_lint
+struct ib_client rdsv3_ib_client = {
+ .name = "rdsv3_ib",
+ .add = rdsv3_ib_add_one,
+ .remove = rdsv3_ib_remove_one,
+ .clnt_hdl = NULL,
+ .state = IB_CLNT_UNINITIALIZED
+};
+#else
+struct ib_client rdsv3_ib_client = {
+ "rdsv3_ib",
+ rdsv3_ib_add_one,
+ rdsv3_ib_remove_one,
+ NULL,
+ NULL,
+ IB_CLNT_UNINITIALIZED
+};
+#endif
+
+static int
+rds_ib_conn_info_visitor(struct rdsv3_connection *conn,
+ void *buffer)
+{
+ struct rdsv3_info_rdma_connection *iinfo = buffer;
+ struct rdsv3_ib_connection *ic;
+
+ RDSV3_DPRINTF4("rds_ib_conn_info_visitor", "conn: %p buffer: %p",
+ conn, buffer);
+
+ /* We will only ever look at IB transports */
+ if (conn->c_trans != &rdsv3_ib_transport)
+ return (0);
+
+ iinfo->src_addr = conn->c_laddr;
+ iinfo->dst_addr = conn->c_faddr;
+
+ (void) memset(&iinfo->src_gid, 0, sizeof (iinfo->src_gid));
+ (void) memset(&iinfo->dst_gid, 0, sizeof (iinfo->dst_gid));
+ if (rdsv3_conn_state(conn) == RDSV3_CONN_UP) {
+ struct rdsv3_ib_device *rds_ibdev;
+ struct rdma_dev_addr *dev_addr;
+
+ ic = conn->c_transport_data;
+ dev_addr = &ic->i_cm_id->route.addr.dev_addr;
+
+ ib_addr_get_sgid(dev_addr, (union ib_gid *)&iinfo->src_gid);
+ ib_addr_get_dgid(dev_addr, (union ib_gid *)&iinfo->dst_gid);
+
+ rds_ibdev = ib_get_client_data(ic->i_cm_id->device,
+ &rdsv3_ib_client);
+ iinfo->max_send_wr = ic->i_send_ring.w_nr;
+ iinfo->max_recv_wr = ic->i_recv_ring.w_nr;
+ iinfo->max_send_sge = rds_ibdev->max_sge;
+ }
+
+ RDSV3_DPRINTF4("rds_ib_conn_info_visitor", "conn: %p buffer: %p",
+ conn, buffer);
+ return (1);
+}
+
+static void
+rds_ib_ic_info(struct rsock *sock, unsigned int len,
+ struct rdsv3_info_iterator *iter,
+ struct rdsv3_info_lengths *lens)
+{
+ RDSV3_DPRINTF4("rds_ib_ic_info", "sk: %p iter: %p, lens: %p, len: %d",
+ sock, iter, lens, len);
+
+ rdsv3_for_each_conn_info(sock, len, iter, lens,
+ rds_ib_conn_info_visitor,
+ sizeof (struct rdsv3_info_rdma_connection));
+}
+
+/*
+ * Early RDS/IB was built to only bind to an address if there is an IPoIB
+ * device with that address set.
+ *
+ * If it were me, I'd advocate for something more flexible. Sending and
+ * receiving should be device-agnostic. Transports would try and maintain
+ * connections between peers who have messages queued. Userspace would be
+ * allowed to influence which paths have priority. We could call userspace
+ * asserting this policy "routing".
+ */
+static int
+rds_ib_laddr_check(uint32_be_t addr)
+{
+ int ret;
+ struct rdma_cm_id *cm_id;
+ struct sockaddr_in sin;
+
+ RDSV3_DPRINTF4("rds_ib_laddr_check", "addr: %x", ntohl(addr));
+
+ /*
+ * Create a CMA ID and try to bind it. This catches both
+ * IB and iWARP capable NICs.
+ */
+ cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP);
+ if (IS_ERR(cm_id))
+ return (PTR_ERR(cm_id));
+
+ (void) memset(&sin, 0, sizeof (sin));
+ sin.sin_family = AF_INET;
+ sin.sin_addr.s_addr = rdsv3_scaddr_to_ibaddr(addr);
+
+ /* rdma_bind_addr will only succeed for IB & iWARP devices */
+ ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
+ /*
+ * Due to this, we will claim to support iWARP devices unless we
+ * check node_type.
+ */
+ if (ret || cm_id->device->node_type != RDMA_NODE_IB_CA)
+ ret = -EADDRNOTAVAIL;
+
+ RDSV3_DPRINTF5("rds_ib_laddr_check",
+ "addr %u.%u.%u.%u ret %d node type %d",
+ NIPQUAD(addr), ret,
+ cm_id->device ? cm_id->device->node_type : -1);
+
+ rdma_destroy_id(cm_id);
+
+ return (ret);
+}
+
+void
+rdsv3_ib_exit(void)
+{
+ RDSV3_DPRINTF4("rds_ib_exit", "Enter");
+
+ rdsv3_info_deregister_func(RDSV3_INFO_IB_CONNECTIONS, rds_ib_ic_info);
+ rdsv3_ib_destroy_nodev_conns();
+ ib_unregister_client(&rdsv3_ib_client);
+ rdsv3_ib_sysctl_exit();
+ rdsv3_ib_recv_exit();
+ rdsv3_trans_unregister(&rdsv3_ib_transport);
+ mutex_destroy(&ib_nodev_conns_lock);
+ list_destroy(&ib_nodev_conns);
+ list_destroy(&rdsv3_ib_devices);
+
+ RDSV3_DPRINTF4("rds_ib_exit", "Return");
+}
+
+#ifndef __lock_lint
+struct rdsv3_transport rdsv3_ib_transport = {
+ .laddr_check = rds_ib_laddr_check,
+ .xmit_complete = rdsv3_ib_xmit_complete,
+ .xmit = rdsv3_ib_xmit,
+ .xmit_cong_map = NULL,
+ .xmit_rdma = rdsv3_ib_xmit_rdma,
+ .recv = rdsv3_ib_recv,
+ .conn_alloc = rdsv3_ib_conn_alloc,
+ .conn_free = rdsv3_ib_conn_free,
+ .conn_connect = rdsv3_ib_conn_connect,
+ .conn_shutdown = rdsv3_ib_conn_shutdown,
+ .inc_copy_to_user = rdsv3_ib_inc_copy_to_user,
+ .inc_purge = rdsv3_ib_inc_purge,
+ .inc_free = rdsv3_ib_inc_free,
+ .cm_initiate_connect = rdsv3_ib_cm_initiate_connect,
+ .cm_handle_connect = rdsv3_ib_cm_handle_connect,
+ .cm_connect_complete = rdsv3_ib_cm_connect_complete,
+ .stats_info_copy = rdsv3_ib_stats_info_copy,
+ .exit = rdsv3_ib_exit,
+ .get_mr = rdsv3_ib_get_mr,
+ .sync_mr = rdsv3_ib_sync_mr,
+ .free_mr = rdsv3_ib_free_mr,
+ .flush_mrs = rdsv3_ib_flush_mrs,
+ .t_name = "infiniband",
+};
+#else
+struct rdsv3_transport rdsv3_ib_transport;
+#endif
+
+int
+rdsv3_ib_init(void)
+{
+ int ret;
+
+ RDSV3_DPRINTF4("rds_ib_init", "Enter");
+
+ list_create(&rdsv3_ib_devices, sizeof (struct rdsv3_ib_device),
+ offsetof(struct rdsv3_ib_device, list));
+ list_create(&ib_nodev_conns, sizeof (struct rdsv3_ib_connection),
+ offsetof(struct rdsv3_ib_connection, ib_node));
+ mutex_init(&ib_nodev_conns_lock, NULL, MUTEX_DRIVER, NULL);
+
+ rdsv3_ib_client.dip = rdsv3_dev_info;
+ ret = ib_register_client(&rdsv3_ib_client);
+ if (ret)
+ goto out;
+
+ ret = rdsv3_ib_sysctl_init();
+ if (ret)
+ goto out_ibreg;
+
+ ret = rdsv3_ib_recv_init();
+ if (ret)
+ goto out_sysctl;
+
+ ret = rdsv3_trans_register(&rdsv3_ib_transport);
+ if (ret)
+ goto out_recv;
+
+ rdsv3_info_register_func(RDSV3_INFO_IB_CONNECTIONS, rds_ib_ic_info);
+
+ RDSV3_DPRINTF4("rds_ib_init", "Return");
+
+ return (0);
+
+out_recv:
+ rdsv3_ib_recv_exit();
+out_sysctl:
+ rdsv3_ib_sysctl_exit();
+out_ibreg:
+ ib_unregister_client(&rdsv3_ib_client);
+out:
+ mutex_destroy(&ib_nodev_conns_lock);
+ list_destroy(&ib_nodev_conns);
+ list_destroy(&rdsv3_ib_devices);
+ return (ret);
+}
diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/ib_cm.c b/usr/src/uts/common/io/ib/clients/rdsv3/ib_cm.c
new file mode 100644
index 0000000000..df1f73ca2f
--- /dev/null
+++ b/usr/src/uts/common/io/ib/clients/rdsv3/ib_cm.c
@@ -0,0 +1,978 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <sys/rds.h>
+
+#include <sys/ib/clients/of/ofed_kernel.h>
+#include <sys/ib/clients/of/rdma/ib_addr.h>
+#include <sys/ib/clients/of/rdma/rdma_cm.h>
+
+#include <sys/ib/clients/rdsv3/rdsv3.h>
+#include <sys/ib/clients/rdsv3/ib.h>
+#include <sys/ib/clients/rdsv3/rdsv3_debug.h>
+
+extern ddi_taskq_t *rdsv3_taskq;
+
+/*
+ * Set the selected protocol version
+ */
+static void
+rdsv3_ib_set_protocol(struct rdsv3_connection *conn, unsigned int version)
+{
+ RDSV3_DPRINTF4("rdsv3_ib_set_protocol", "conn: %p version: %d",
+ conn, version);
+ conn->c_version = version;
+}
+
+/*
+ * Set up flow control
+ */
+static void
+rdsv3_ib_set_flow_control(struct rdsv3_connection *conn, uint32_t credits)
+{
+ struct rdsv3_ib_connection *ic = conn->c_transport_data;
+
+ RDSV3_DPRINTF2("rdsv3_ib_set_flow_control",
+ "Enter: conn: %p credits: %d", conn, credits);
+
+ if (rdsv3_ib_sysctl_flow_control && credits != 0) {
+ /* We're doing flow control */
+ ic->i_flowctl = 1;
+ rdsv3_ib_send_add_credits(conn, credits);
+ } else {
+ ic->i_flowctl = 0;
+ }
+
+ RDSV3_DPRINTF2("rdsv3_ib_set_flow_control",
+ "Return: conn: %p credits: %d",
+ conn, credits);
+}
+
+/*
+ * Tune RNR behavior. Without flow control, we use a rather
+ * low timeout, but not the absolute minimum - this should
+ * be tunable.
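+/*
+ * Probe whether the send path currently holds c_send_lock: if the
+ * tryenter fails, a sender is active. Advisory only - the answer can
+ * change as soon as we drop the lock again.
+ */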
+ *
+ * We already set the RNR retry count to 7 (which is the
+ * smallest infinite number :-) above.
+ * If flow control is off, we want to change this back to 0
+ * so that we learn quickly when our credit accounting is
+ * buggy.
+ *
+ * Caller passes in a qp_attr pointer - don't waste stack space
+ * by allocating this twice.
+ */
+static void
+rdsv3_ib_tune_rnr(struct rdsv3_ib_connection *ic, struct ib_qp_attr *attr)
+{
+ int ret;
+
+ RDSV3_DPRINTF2("rdsv3_ib_tune_rnr", "Enter ic: %p attr: %p",
+ ic, attr);
+
+ attr->min_rnr_timer = IB_RNR_TIMER_000_32;
+ ret = ib_modify_qp(ic->i_cm_id->qp, attr, IB_QP_MIN_RNR_TIMER);
+ if (ret)
+ RDSV3_DPRINTF0("rdsv3_ib_tune_rnr",
+ "ib_modify_qp(IB_QP_MIN_RNR_TIMER): err=%d", -ret);
+}
+
+/*
+ * Connection established.
+ * We get here for both outgoing and incoming connection.
+ */
+void
+rdsv3_ib_cm_connect_complete(struct rdsv3_connection *conn,
+ struct rdma_cm_event *event)
+{
+ const struct rdsv3_ib_connect_private *dp = NULL;
+ struct rdsv3_ib_connection *ic = conn->c_transport_data;
+ struct rdsv3_ib_device *rds_ibdev;
+ struct ib_qp_attr qp_attr;
+ int err;
+
+ RDSV3_DPRINTF2("rdsv3_ib_cm_connect_complete",
+ "Enter conn: %p event: %p", conn, event);
+
+ if (event->param.conn.private_data_len >= sizeof (*dp)) {
+ dp = event->param.conn.private_data;
+
+ /* make sure it isn't empty data */
+ if (dp->dp_protocol_major) {
+ rdsv3_ib_set_protocol(conn,
+ RDS_PROTOCOL(dp->dp_protocol_major,
+ dp->dp_protocol_minor));
+ rdsv3_ib_set_flow_control(conn,
+ ntohl(dp->dp_credit));
+ }
+ }
+
+ RDSV3_DPRINTF2("rdsv3_ib_cm_connect_complete",
+ "RDS/IB: connected to %u.%u.%u.%u version %u.%u%s",
+ NIPQUAD(conn->c_faddr),
+ RDS_PROTOCOL_MAJOR(conn->c_version),
+ RDS_PROTOCOL_MINOR(conn->c_version),
+ ic->i_flowctl ? ", flow control" : "");
+
+	/*
+	 * Init rings and fill recv. This needs to wait until protocol
+	 * negotiation is complete, since the ring layout differs between
+	 * 3.0 and 3.1.
+	 */
+ rdsv3_ib_send_init_ring(ic);
+ rdsv3_ib_recv_init_ring(ic);
+ /*
+ * Post receive buffers - as a side effect, this will update
+ * the posted credit count.
+ */
+ (void) rdsv3_ib_recv_refill(conn, KM_NOSLEEP, 0, 1);
+
+ /* Tune RNR behavior */
+ rdsv3_ib_tune_rnr(ic, &qp_attr);
+
+ qp_attr.qp_state = IB_QPS_RTS;
+ err = ib_modify_qp(ic->i_cm_id->qp, &qp_attr, IB_QP_STATE);
+ if (err)
+ RDSV3_DPRINTF0("rdsv3_ib_cm_connect_complete",
+ "ib_modify_qp(IB_QP_STATE, RTS): err=%d", err);
+
+ /* update ib_device with this local ipaddr & conn */
+ rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rdsv3_ib_client);
+ err = rdsv3_ib_update_ipaddr(rds_ibdev, conn->c_laddr);
+ if (err)
+ RDSV3_DPRINTF0("rdsv3_ib_cm_connect_complete",
+ "rdsv3_ib_update_ipaddr failed (%d)", err);
+ rdsv3_ib_add_conn(rds_ibdev, conn);
+
+ /*
+ * If the peer gave us the last packet it saw, process this as if
+ * we had received a regular ACK.
+ */
+ if (dp && dp->dp_ack_seq)
+ rdsv3_send_drop_acked(conn, ntohll(dp->dp_ack_seq), NULL);
+
+ rdsv3_connect_complete(conn);
+
+ RDSV3_DPRINTF2("rdsv3_ib_cm_connect_complete",
+ "Return conn: %p event: %p",
+ conn, event);
+}
+
+static void
+rdsv3_ib_cm_fill_conn_param(struct rdsv3_connection *conn,
+ struct rdma_conn_param *conn_param,
+ struct rdsv3_ib_connect_private *dp,
+ uint32_t protocol_version)
+{
+ RDSV3_DPRINTF2("rdsv3_ib_cm_fill_conn_param",
+ "Enter conn: %p conn_param: %p private: %p version: %d",
+ conn, conn_param, dp, protocol_version);
+
+ (void) memset(conn_param, 0, sizeof (struct rdma_conn_param));
+ /* XXX tune these? */
+ conn_param->responder_resources = 1;
+ conn_param->initiator_depth = 1;
+ conn_param->retry_count = min(rdsv3_ib_retry_count, 7);
+ conn_param->rnr_retry_count = 7;
+
+ if (dp) {
+ struct rdsv3_ib_connection *ic = conn->c_transport_data;
+
+ (void) memset(dp, 0, sizeof (*dp));
+ dp->dp_saddr = conn->c_laddr;
+ dp->dp_daddr = conn->c_faddr;
+ dp->dp_protocol_major = RDS_PROTOCOL_MAJOR(protocol_version);
+ dp->dp_protocol_minor = RDS_PROTOCOL_MINOR(protocol_version);
+ dp->dp_protocol_minor_mask =
+ htons(RDSV3_IB_SUPPORTED_PROTOCOLS);
+ dp->dp_ack_seq = rdsv3_ib_piggyb_ack(ic);
+
+ /* Advertise flow control */
+ if (ic->i_flowctl) {
+ unsigned int credits;
+
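+			/*
+			 * Judging by the IB_{GET,SET}_POST_CREDITS macros,
+			 * i_credits packs send credits and newly posted
+			 * receive credits into one atomic word; take the
+			 * posted half here and advertise it in dp_credit.
+			 */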
+ credits = IB_GET_POST_CREDITS(
+ atomic_get(&ic->i_credits));
+ dp->dp_credit = htonl(credits);
+ atomic_add_32(&ic->i_credits,
+ -IB_SET_POST_CREDITS(credits));
+ }
+
+ conn_param->private_data = dp;
+ conn_param->private_data_len = sizeof (*dp);
+ }
+
+ RDSV3_DPRINTF2("rdsv3_ib_cm_fill_conn_param",
+ "Return conn: %p conn_param: %p private: %p version: %d",
+ conn, conn_param, dp, protocol_version);
+}
+
+static void
+rdsv3_ib_cq_event_handler(struct ib_event *event, void *data)
+{
+ RDSV3_DPRINTF3("rdsv3_ib_cq_event_handler", "event %u data %p",
+ event->event, data);
+}
+
+static void
+rdsv3_ib_qp_event_handler(struct ib_event *event, void *data)
+{
+ struct rdsv3_connection *conn = data;
+ struct rdsv3_ib_connection *ic = conn->c_transport_data;
+
+ RDSV3_DPRINTF2("rdsv3_ib_qp_event_handler", "conn %p ic %p event %u",
+ conn, ic, event->event);
+
+ switch (event->event) {
+ case IB_EVENT_COMM_EST:
+ (void) rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST);
+ break;
+ default:
+ if (conn) {
+ RDSV3_DPRINTF2("rdsv3_ib_qp_event_handler",
+ "RDS/IB: Fatal QP Event %u - "
+ "connection %u.%u.%u.%u ->%u.%u.%u.%u "
+ "...reconnecting",
+ event->event, NIPQUAD(conn->c_laddr),
+ NIPQUAD(conn->c_faddr));
+ rdsv3_conn_drop(conn);
+ } else {
+ RDSV3_DPRINTF2("rdsv3_ib_qp_event_handler",
+ "RDS/IB: Fatal QP Event %u - connection"
+ "...reconnecting", event->event);
+ }
+ break;
+ }
+
+ RDSV3_DPRINTF2("rdsv3_ib_qp_event_handler", "Return conn: %p event: %p",
+ conn, event);
+}
+
+extern int rdsv3_ib_alloc_hdrs(ib_device_t *dev,
+ struct rdsv3_ib_connection *ic);
+extern void rdsv3_ib_free_hdrs(ib_device_t *dev,
+ struct rdsv3_ib_connection *ic);
+
+/*
+ * This needs to be very careful to not leave IS_ERR pointers around for
+ * cleanup to trip over.
+ */
+static int
+rdsv3_ib_setup_qp(struct rdsv3_connection *conn)
+{
+ struct rdsv3_ib_connection *ic = conn->c_transport_data;
+ struct ib_device *dev = ic->i_cm_id->device;
+ struct ib_qp_init_attr attr;
+ struct rdsv3_ib_device *rds_ibdev;
+ ibt_send_wr_t *wrp;
+ ibt_wr_ds_t *sgl;
+ int ret, i;
+
+ RDSV3_DPRINTF2("rdsv3_ib_setup_qp", "Enter conn: %p", conn);
+
+ /*
+ * rdsv3_ib_add_one creates a rdsv3_ib_device object per IB device,
+ * and allocates a protection domain, memory range and FMR pool
+ * for each. If that fails for any reason, it will not register
+ * the rds_ibdev at all.
+ */
+ rds_ibdev = ib_get_client_data(dev, &rdsv3_ib_client);
+ if (rds_ibdev == NULL) {
+ RDSV3_DPRINTF0("rdsv3_ib_setup_qp",
+ "RDS/IB: No client_data for device %s", dev->name);
+ return (-EOPNOTSUPP);
+ }
+
+ if (rds_ibdev->max_wrs < ic->i_send_ring.w_nr + 1)
+ rdsv3_ib_ring_resize(&ic->i_send_ring, rds_ibdev->max_wrs - 1);
+ if (rds_ibdev->max_wrs < ic->i_recv_ring.w_nr + 1)
+ rdsv3_ib_ring_resize(&ic->i_recv_ring, rds_ibdev->max_wrs - 1);
+
+ /* Protection domain and memory range */
+ ic->i_pd = rds_ibdev->pd;
+
+ ic->i_send_cq = ib_create_cq(dev, rdsv3_ib_send_cq_comp_handler,
+ rdsv3_ib_cq_event_handler, conn,
+ ic->i_send_ring.w_nr + 1, 0);
+ if (IS_ERR(ic->i_send_cq)) {
+ ret = PTR_ERR(ic->i_send_cq);
+ ic->i_send_cq = NULL;
+ RDSV3_DPRINTF2("rdsv3_ib_setup_qp",
+ "ib_create_cq send failed: %d", ret);
+ goto out;
+ }
+
+ ic->i_recv_cq = ib_create_cq(dev, rdsv3_ib_recv_cq_comp_handler,
+ rdsv3_ib_cq_event_handler, conn,
+ ic->i_recv_ring.w_nr, 0);
+ if (IS_ERR(ic->i_recv_cq)) {
+ ret = PTR_ERR(ic->i_recv_cq);
+ ic->i_recv_cq = NULL;
+ RDSV3_DPRINTF2("rdsv3_ib_setup_qp",
+ "ib_create_cq recv failed: %d", ret);
+ goto out;
+ }
+
+ ret = ib_req_notify_cq(ic->i_send_cq, IB_CQ_NEXT_COMP);
+ if (ret) {
+ RDSV3_DPRINTF2("rdsv3_ib_setup_qp",
+ "ib_req_notify_cq send failed: %d", ret);
+ goto out;
+ }
+
+ ret = ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED);
+ if (ret) {
+ RDSV3_DPRINTF2("rdsv3_ib_setup_qp",
+ "ib_req_notify_cq recv failed: %d", ret);
+ goto out;
+ }
+
+ /* XXX negotiate max send/recv with remote? */
+ (void) memset(&attr, 0, sizeof (attr));
+ attr.event_handler = rdsv3_ib_qp_event_handler;
+ attr.qp_context = conn;
+ /* + 1 to allow for the single ack message */
+ attr.cap.max_send_wr = ic->i_send_ring.w_nr + 1;
+ attr.cap.max_recv_wr = ic->i_recv_ring.w_nr + 1;
+ attr.cap.max_send_sge = rds_ibdev->max_sge;
+ attr.cap.max_recv_sge = RDSV3_IB_RECV_SGE;
+ attr.sq_sig_type = IB_SIGNAL_REQ_WR;
+ attr.qp_type = IB_QPT_RC;
+ attr.send_cq = ic->i_send_cq;
+ attr.recv_cq = ic->i_recv_cq;
+
+ /*
+ * XXX this can fail if max_*_wr is too large? Are we supposed
+ * to back off until we get a value that the hardware can support?
+ */
+ ret = rdma_create_qp(ic->i_cm_id, ic->i_pd, &attr);
+ if (ret) {
+ RDSV3_DPRINTF2("rdsv3_ib_setup_qp",
+ "rdma_create_qp failed: %d", ret);
+ goto out;
+ }
+
+ ret = rdsv3_ib_alloc_hdrs(dev, ic);
+ if (ret != 0) {
+ ret = -ENOMEM;
+ RDSV3_DPRINTF2("rdsv3_ib_setup_qp",
+ "rdsv3_ib_alloc_hdrs failed: %d", ret);
+ goto out;
+ }
+
+ ic->i_sends = kmem_alloc(ic->i_send_ring.w_nr *
+ sizeof (struct rdsv3_ib_send_work), KM_NOSLEEP);
+ if (ic->i_sends == NULL) {
+ ret = -ENOMEM;
+ RDSV3_DPRINTF2("rdsv3_ib_setup_qp",
+ "send allocation failed: %d", ret);
+ goto out;
+ }
+ (void) memset(ic->i_sends, 0, ic->i_send_ring.w_nr *
+ sizeof (struct rdsv3_ib_send_work));
+
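+	/*
+	 * A single allocation holds the send WR array followed by a pool
+	 * of scatter/gather entries; each WR's wr_sgl is then pointed at
+	 * its own RDSV3_IB_MAX_SGE-entry slice.
+	 */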
+ ic->i_send_wrs =
+ kmem_alloc(RDSV3_IB_SEND_WRS * (sizeof (ibt_send_wr_t) +
+ RDSV3_IB_MAX_SGE * sizeof (ibt_wr_ds_t)), KM_NOSLEEP);
+ if (ic->i_send_wrs == NULL) {
+ ret = -ENOMEM;
+ RDSV3_DPRINTF2("rdsv3_ib_setup_qp",
+ "WR allocation failed: %d", ret);
+ goto out;
+ }
+ sgl = (ibt_wr_ds_t *)((uint8_t *)ic->i_send_wrs +
+ (RDSV3_IB_SEND_WRS * sizeof (ibt_send_wr_t)));
+ RDSV3_DPRINTF4("rdsv3_ib_setup_qp", "i_send_wrs: %p sgl: %p",
+ ic->i_send_wrs, sgl);
+ for (i = 0; i < RDSV3_IB_SEND_WRS; i++) {
+ wrp = &ic->i_send_wrs[i];
+ wrp->wr_sgl = &sgl[i * RDSV3_IB_MAX_SGE];
+ }
+
+ ic->i_recvs = kmem_alloc(ic->i_recv_ring.w_nr *
+ sizeof (struct rdsv3_ib_recv_work), KM_NOSLEEP);
+ if (ic->i_recvs == NULL) {
+ ret = -ENOMEM;
+ RDSV3_DPRINTF2("rdsv3_ib_setup_qp",
+ "recv allocation failed: %d", ret);
+ goto out;
+ }
+ (void) memset(ic->i_recvs, 0, ic->i_recv_ring.w_nr *
+ sizeof (struct rdsv3_ib_recv_work));
+
+ rdsv3_ib_recv_init_ack(ic);
+
+ RDSV3_DPRINTF2("rdsv3_ib_setup_qp", "conn %p pd %p mr %p cq %p %p",
+ conn, ic->i_pd, ic->i_mr, ic->i_send_cq, ic->i_recv_cq);
+
+out:
+ return (ret);
+}
+
+static uint32_t
+rdsv3_ib_protocol_compatible(struct rdma_cm_event *event)
+{
+ const struct rdsv3_ib_connect_private *dp =
+ event->param.conn.private_data;
+ uint16_t common;
+ uint32_t version = 0;
+
+ RDSV3_DPRINTF2("rdsv3_ib_protocol_compatible", "Enter event: %p",
+ event);
+
+	/*
+	 * rdma_cm private data is odd - when there is any private data in
+	 * the request, we will be given a pretty large buffer without
+	 * telling us the original size. The only way to tell the difference
+	 * is by looking at the contents, which are initialized to zero.
+	 * If the protocol version fields aren't set, this is a connection
+	 * attempt from an older version. This could be 3.0 or 2.0 -
+	 * we can't tell. We really should have changed this for OFED 1.3 :-(
+	 */
+
+ /* Be paranoid. RDS always has privdata */
+ if (!event->param.conn.private_data_len) {
+ RDSV3_DPRINTF2("rdsv3_ib_protocol_compatible",
+ "RDS incoming connection has no private data, rejecting");
+ return (0);
+ }
+
+ /* Even if len is crap *now* I still want to check it. -ASG */
+ if (event->param.conn.private_data_len < sizeof (*dp) ||
+ dp->dp_protocol_major == 0)
+ return (RDS_PROTOCOL_3_0);
+
+ common = ntohs(dp->dp_protocol_minor_mask) &
+ RDSV3_IB_SUPPORTED_PROTOCOLS;
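+	/*
+	 * Pick the highest minor version both sides support: start at 3.0
+	 * and bump once for each remaining bit in the common mask.
+	 */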
+ if (dp->dp_protocol_major == 3 && common) {
+ version = RDS_PROTOCOL_3_0;
+ while ((common >>= 1) != 0)
+ version++;
+ } else {
+ RDSV3_DPRINTF0("rdsv3_ib_protocol_compatible",
+ "RDS: Connection from %u.%u.%u.%u using "
+ "incompatible protocol version %u.%u\n",
+ NIPQUAD(dp->dp_saddr),
+ dp->dp_protocol_major,
+ dp->dp_protocol_minor);
+ }
+
+ RDSV3_DPRINTF2("rdsv3_ib_protocol_compatible", "Return event: %p",
+ event);
+
+ return (version);
+}
+
+int
+rdsv3_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
+ struct rdma_cm_event *event)
+{
+ uint64_be_t lguid = cm_id->route.path_rec->sgid.global.interface_id;
+ uint64_be_t fguid = cm_id->route.path_rec->dgid.global.interface_id;
+ const struct rdsv3_ib_connect_private *dp =
+ event->param.conn.private_data;
+ struct rdsv3_ib_connect_private dp_rep;
+ struct rdsv3_connection *conn = NULL;
+ struct rdsv3_ib_connection *ic = NULL;
+ struct rdma_conn_param conn_param;
+ uint32_t version;
+ int err, destroy = 1;
+ boolean_t conn_created = B_FALSE;
+
+ RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect",
+ "Enter cm_id: %p event: %p", cm_id, event);
+
+ /* Check whether the remote protocol version matches ours. */
+ version = rdsv3_ib_protocol_compatible(event);
+ if (!version) {
+ RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect",
+ "version mismatch");
+ goto out;
+ }
+
+ RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect",
+ "saddr %u.%u.%u.%u daddr %u.%u.%u.%u RDSv%d.%d lguid 0x%llx fguid "
+ "0x%llx", NIPQUAD(dp->dp_saddr), NIPQUAD(dp->dp_daddr),
+ RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version),
+ (unsigned long long)ntohll(lguid),
+ (unsigned long long)ntohll(fguid));
+
+ conn = rdsv3_conn_create(dp->dp_daddr, dp->dp_saddr,
+ &rdsv3_ib_transport, KM_NOSLEEP);
+ if (IS_ERR(conn)) {
+ RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect",
+ "rdsv3_conn_create failed (%ld)", PTR_ERR(conn));
+ conn = NULL;
+ goto out;
+ }
+
+ /*
+ * The connection request may occur while the
+	 * previous connection exists, e.g. in case of failover.
+ * But as connections may be initiated simultaneously
+ * by both hosts, we have a random backoff mechanism -
+ * see the comment above rdsv3_queue_reconnect()
+ */
+ mutex_enter(&conn->c_cm_lock);
+ if (!rdsv3_conn_transition(conn, RDSV3_CONN_DOWN,
+ RDSV3_CONN_CONNECTING)) {
+ if (rdsv3_conn_state(conn) == RDSV3_CONN_UP) {
+ RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect",
+ "incoming connect when connected: %p",
+ conn);
+ rdsv3_conn_drop(conn);
+ rdsv3_ib_stats_inc(s_ib_listen_closed_stale);
+ mutex_exit(&conn->c_cm_lock);
+ goto out;
+ } else if (rdsv3_conn_state(conn) == RDSV3_CONN_CONNECTING) {
+ /* Wait and see - our connect may still be succeeding */
+ RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect",
+ "peer-to-peer connection request: %p, "
+ "lguid: 0x%llx fguid: 0x%llx",
+ conn, lguid, fguid);
+ rdsv3_ib_stats_inc(s_ib_connect_raced);
+ }
+ mutex_exit(&conn->c_cm_lock);
+ goto out;
+ }
+
+ ic = conn->c_transport_data;
+
+ rdsv3_ib_set_protocol(conn, version);
+ rdsv3_ib_set_flow_control(conn, ntohl(dp->dp_credit));
+
+ /*
+ * If the peer gave us the last packet it saw, process this as if
+ * we had received a regular ACK.
+ */
+ if (dp->dp_ack_seq)
+ rdsv3_send_drop_acked(conn, ntohll(dp->dp_ack_seq), NULL);
+
+ ASSERT(!cm_id->context);
+ ASSERT(!ic->i_cm_id);
+
+ if (ic->i_cm_id != NULL)
+ RDSV3_PANIC();
+
+ ic->i_cm_id = cm_id;
+ cm_id->context = conn;
+
+ /*
+ * We got halfway through setting up the ib_connection, if we
+ * fail now, we have to take the long route out of this mess.
+ */
+ destroy = 0;
+
+ err = rdsv3_ib_setup_qp(conn);
+ if (err) {
+ RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect",
+ "rdsv3_ib_setup_qp failed (%d)", err);
+ rdsv3_conn_drop(conn);
+ goto out;
+ }
+
+ rdsv3_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version);
+
+ /* rdma_accept() calls rdma_reject() internally if it fails */
+ err = rdma_accept(cm_id, &conn_param);
+ mutex_exit(&conn->c_cm_lock);
+ if (err) {
+ RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect",
+ "rdma_accept failed (%d)", err);
+ rdsv3_conn_drop(conn);
+ goto out;
+ }
+
+ RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect",
+ "Return cm_id: %p event: %p", cm_id, event);
+
+ return (0);
+
+out:
+ (void) rdma_reject(cm_id, NULL, 0);
+ return (destroy);
+}
+
+int
+rdsv3_ib_cm_initiate_connect(struct rdma_cm_id *cm_id)
+{
+ struct rdsv3_connection *conn = cm_id->context;
+ struct rdsv3_ib_connection *ic = conn->c_transport_data;
+ struct rdma_conn_param conn_param;
+ struct rdsv3_ib_connect_private dp;
+ int ret;
+
+ RDSV3_DPRINTF2("rdsv3_ib_cm_initiate_connect", "Enter: cm_id: %p",
+ cm_id);
+
+ /*
+ * If the peer doesn't do protocol negotiation, we must
+ * default to RDSv3.0
+ */
+ rdsv3_ib_set_protocol(conn, RDS_PROTOCOL_3_0);
+ ic->i_flowctl =
+ rdsv3_ib_sysctl_flow_control; /* advertise flow control */
+
+ ret = rdsv3_ib_setup_qp(conn);
+ if (ret) {
+ RDSV3_DPRINTF2("rdsv3_ib_cm_initiate_connect",
+ "rdsv3_ib_setup_qp failed (%d)", ret);
+ rdsv3_conn_drop(conn);
+ goto out;
+ }
+
+	rdsv3_ib_cm_fill_conn_param(conn, &conn_param, &dp,
+	    RDS_PROTOCOL_VERSION);
+
+ ret = rdma_connect(cm_id, &conn_param);
+ if (ret) {
+ RDSV3_DPRINTF2("rdsv3_ib_cm_initiate_connect",
+ "rdma_connect failed (%d)", ret);
+ rdsv3_conn_drop(conn);
+ }
+
+ RDSV3_DPRINTF2("rdsv3_ib_cm_initiate_connect",
+ "Return: cm_id: %p", cm_id);
+
+out:
+ /*
+ * Beware - returning non-zero tells the rdma_cm to destroy
+ * the cm_id. We should certainly not do it as long as we still
+ * "own" the cm_id.
+ */
+ if (ret) {
+ if (ic->i_cm_id == cm_id)
+ ret = 0;
+ }
+ return (ret);
+}
+
+int
+rdsv3_ib_conn_connect(struct rdsv3_connection *conn)
+{
+ struct rdsv3_ib_connection *ic = conn->c_transport_data;
+ struct sockaddr_in src, dest;
+ ipaddr_t laddr, faddr;
+ int ret;
+
+ RDSV3_DPRINTF2("rdsv3_ib_conn_connect", "Enter: conn: %p", conn);
+
+	/*
+	 * XXX I wonder what effect the port space has
+	 */
+ /* delegate cm event handler to rdma_transport */
+ ic->i_cm_id = rdma_create_id(rdsv3_rdma_cm_event_handler, conn,
+ RDMA_PS_TCP);
+ if (IS_ERR(ic->i_cm_id)) {
+ ret = PTR_ERR(ic->i_cm_id);
+ ic->i_cm_id = NULL;
+ RDSV3_DPRINTF2("rdsv3_ib_conn_connect",
+ "rdma_create_id() failed: %d", ret);
+ goto out;
+ }
+
+ RDSV3_DPRINTF3("rdsv3_ib_conn_connect",
+ "created cm id %p for conn %p", ic->i_cm_id, conn);
+
+ /* The ipaddr should be in the network order */
+ laddr = conn->c_laddr;
+ faddr = conn->c_faddr;
+ ret = rdsv3_sc_path_lookup(&laddr, &faddr);
+ if (ret == 0) {
+		RDSV3_DPRINTF2("rdsv3_ib_conn_connect",
+		    "Path not found (0x%x 0x%x)",
+ ntohl(laddr), ntohl(faddr));
+ }
+
+ src.sin_family = AF_INET;
+ src.sin_addr.s_addr = (uint32_t)laddr;
+ src.sin_port = (uint16_t)htons(0);
+
+ dest.sin_family = AF_INET;
+ dest.sin_addr.s_addr = (uint32_t)faddr;
+ dest.sin_port = (uint16_t)htons(RDSV3_PORT);
+
+ ret = rdma_resolve_addr(ic->i_cm_id, (struct sockaddr *)&src,
+ (struct sockaddr *)&dest,
+ RDSV3_RDMA_RESOLVE_TIMEOUT_MS);
+ if (ret) {
+ RDSV3_DPRINTF2("rdsv3_ib_conn_connect",
+ "addr resolve failed for cm id %p: %d", ic->i_cm_id, ret);
+ rdma_destroy_id(ic->i_cm_id);
+ ic->i_cm_id = NULL;
+ }
+
+ RDSV3_DPRINTF2("rdsv3_ib_conn_connect", "Return: conn: %p", conn);
+
+out:
+ return (ret);
+}
+
+/*
+ * This is careful to clean up only those resources that were actually
+ * built up, so that it can be called at any point during startup. In
+ * fact it can be called multiple times for a given connection.
+ */
+void
+rdsv3_ib_conn_shutdown(struct rdsv3_connection *conn)
+{
+ struct rdsv3_ib_connection *ic = conn->c_transport_data;
+ int err = 0;
+
+ RDSV3_DPRINTF2("rdsv3_ib_conn_shutdown",
+ "cm %p pd %p cq %p %p qp %p", ic->i_cm_id,
+ ic->i_pd, ic->i_send_cq, ic->i_recv_cq,
+ ic->i_cm_id ? ic->i_cm_id->qp : NULL);
+
+ if (ic->i_cm_id) {
+ struct ib_device *dev = ic->i_cm_id->device;
+
+ RDSV3_DPRINTF2("rdsv3_ib_conn_shutdown",
+ "disconnecting cm %p", ic->i_cm_id);
+ err = rdma_disconnect(ic->i_cm_id);
+ if (err) {
+ /*
+ * Actually this may happen quite frequently, when
+ * an outgoing connect raced with an incoming connect.
+ */
+ RDSV3_DPRINTF2("rdsv3_ib_conn_shutdown",
+ "failed to disconnect, cm: %p err %d",
+ ic->i_cm_id, err);
+ }
+
+ if (ic->i_cm_id->qp) {
+ (void) ibt_flush_qp(
+ ib_get_ibt_channel_hdl(ic->i_cm_id));
+
+ /* wait until all WRs are flushed */
+ rdsv3_wait_event(rdsv3_ib_ring_empty_wait,
+ rdsv3_ib_ring_empty(&ic->i_send_ring) &&
+ rdsv3_ib_ring_empty(&ic->i_recv_ring));
+
+ rdma_destroy_qp(ic->i_cm_id);
+ }
+
+ if (ic->i_mr)
+ rdsv3_ib_free_hdrs(dev, ic);
+
+ if (ic->i_sends)
+ rdsv3_ib_send_clear_ring(ic);
+ if (ic->i_recvs)
+ rdsv3_ib_recv_clear_ring(ic);
+
+ if (ic->i_send_cq)
+ (void) ib_destroy_cq(ic->i_send_cq);
+ if (ic->i_recv_cq)
+ (void) ib_destroy_cq(ic->i_recv_cq);
+ rdma_destroy_id(ic->i_cm_id);
+
+ /*
+ * Move connection back to the nodev list.
+ */
+ if (ic->rds_ibdev)
+ rdsv3_ib_remove_conn(ic->rds_ibdev, conn);
+
+ ic->i_cm_id = NULL;
+ ic->i_pd = NULL;
+ ic->i_mr = NULL;
+ ic->i_send_cq = NULL;
+ ic->i_recv_cq = NULL;
+ ic->i_send_hdrs = NULL;
+ ic->i_recv_hdrs = NULL;
+ ic->i_ack = NULL;
+ }
+ ASSERT(!ic->rds_ibdev);
+
+ /* Clear pending transmit */
+ if (ic->i_rm) {
+ rdsv3_message_put(ic->i_rm);
+ ic->i_rm = NULL;
+ }
+
+ /* Clear the ACK state */
+ clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
+ ic->i_ack_next = 0;
+ ic->i_ack_recv = 0;
+
+ /* Clear flow control state */
+ ic->i_flowctl = 0;
+ ic->i_credits = 0;
+
+ rdsv3_ib_ring_init(&ic->i_send_ring, rdsv3_ib_sysctl_max_send_wr);
+ rdsv3_ib_ring_init(&ic->i_recv_ring, rdsv3_ib_sysctl_max_recv_wr);
+
+ if (ic->i_ibinc) {
+ rdsv3_inc_put(&ic->i_ibinc->ii_inc);
+ ic->i_ibinc = NULL;
+ }
+
+ if (ic->i_sends) {
+ kmem_free(ic->i_sends,
+ ic->i_send_ring.w_nr * sizeof (struct rdsv3_ib_send_work));
+ ic->i_sends = NULL;
+ }
+ if (ic->i_send_wrs) {
+ kmem_free(ic->i_send_wrs, RDSV3_IB_SEND_WRS *
+ (sizeof (ibt_send_wr_t) +
+ RDSV3_IB_MAX_SGE * sizeof (ibt_wr_ds_t)));
+ ic->i_send_wrs = NULL;
+ }
+ if (ic->i_recvs) {
+ kmem_free(ic->i_recvs,
+ ic->i_recv_ring.w_nr * sizeof (struct rdsv3_ib_recv_work));
+ ic->i_recvs = NULL;
+ }
+
+ RDSV3_DPRINTF2("rdsv3_ib_conn_shutdown", "Return conn: %p", conn);
+}
+
+/*
+ * A connection can be allocated from either rdsv3_conn_create_outgoing()
+ * or rdsv3_conn_create(), so ddi_taskq_create() could otherwise be called
+ * more than once with the same name, which makes kstat print a warning on
+ * the console. To prevent that, this counter is folded into the taskq
+ * name (modulo 100, to bound the set of names). Note that
+ * rdsv3_conn_create_outgoing() consults the cached connection under the
+ * mutex before allocating a new one, so the duplicate name can only arise
+ * between rdsv3_conn_create() and rdsv3_conn_create_outgoing().
+ */
+static int conn_cnt;
+
+/* ARGSUSED */
+int
+rdsv3_ib_conn_alloc(struct rdsv3_connection *conn, int gfp)
+{
+ struct rdsv3_ib_connection *ic;
+ char tq_name[TASKQ_NAMELEN];
+
+ RDSV3_DPRINTF2("rdsv3_ib_conn_alloc", "conn: %p", conn);
+
+ /* XXX too lazy? */
+ ic = kmem_zalloc(sizeof (struct rdsv3_ib_connection), gfp);
+ if (ic == NULL)
+ return (-ENOMEM);
+
+ list_link_init(&ic->ib_node);
+ (void) snprintf(tq_name, TASKQ_NAMELEN, "RDSV3_CONN_to_%x:%u",
+ htonl(conn->c_faddr), conn_cnt++ % 100);
+ ic->i_recv_tasklet =
+ ddi_taskq_create(NULL, tq_name, 1, TASKQ_DEFAULTPRI, 0);
+
+
+ mutex_init(&ic->i_recv_mutex, NULL, MUTEX_DRIVER, NULL);
+ mutex_init(&ic->i_ack_lock, NULL, MUTEX_DRIVER, NULL);
+
+ /*
+ * rdsv3_ib_conn_shutdown() waits for these to be emptied so they
+ * must be initialized before it can be called.
+ */
+ rdsv3_ib_ring_init(&ic->i_send_ring, rdsv3_ib_sysctl_max_send_wr);
+ rdsv3_ib_ring_init(&ic->i_recv_ring, rdsv3_ib_sysctl_max_recv_wr);
+
+ ic->conn = conn;
+ conn->c_transport_data = ic;
+
+ mutex_enter(&ib_nodev_conns_lock);
+ list_insert_tail(&ib_nodev_conns, ic);
+ mutex_exit(&ib_nodev_conns_lock);
+
+
+ RDSV3_DPRINTF2("rdsv3_ib_conn_alloc", "conn %p conn ic %p",
+ conn, conn->c_transport_data);
+ return (0);
+}
+
+/*
+ * Free a connection. Connection must be shut down and not set for reconnect.
+ */
+void
+rdsv3_ib_conn_free(void *arg)
+{
+ struct rdsv3_ib_connection *ic = arg;
+ kmutex_t *lock_ptr;
+
+ RDSV3_DPRINTF2("rdsv3_ib_conn_free", "ic %p\n", ic);
+
+#ifndef __lock_lint
+ /*
+ * Conn is either on a dev's list or on the nodev list.
+ * A race with shutdown() or connect() would cause problems
+ * (since rds_ibdev would change) but that should never happen.
+ */
+ lock_ptr = ic->rds_ibdev ?
+ &ic->rds_ibdev->spinlock : &ib_nodev_conns_lock;
+
+ mutex_enter(lock_ptr);
+ list_remove_node(&ic->ib_node);
+ mutex_exit(lock_ptr);
+#endif
+
+ ddi_taskq_destroy(ic->i_recv_tasklet);
+ kmem_free(ic, sizeof (*ic));
+}
+
+/*
+ * An error occurred on the connection
+ */
+void
+__rdsv3_ib_conn_error(struct rdsv3_connection *conn)
+{
+ rdsv3_conn_drop(conn);
+}
diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/ib_rdma.c b/usr/src/uts/common/io/ib/clients/rdsv3/ib_rdma.c
new file mode 100644
index 0000000000..fce01b7b1d
--- /dev/null
+++ b/usr/src/uts/common/io/ib/clients/rdsv3/ib_rdma.c
@@ -0,0 +1,551 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <sys/rds.h>
+#include <netinet/in.h>
+
+#include <sys/ib/clients/rdsv3/rdsv3.h>
+#include <sys/ib/clients/rdsv3/rdma.h>
+#include <sys/ib/clients/rdsv3/ib.h>
+#include <sys/ib/clients/rdsv3/rdsv3_debug.h>
+
+/*
+ * This is stored as mr->r_trans_private.
+ */
+struct rdsv3_ib_mr {
+ struct rdsv3_ib_device *device;
+ struct rdsv3_ib_mr_pool *pool;
+ struct ib_fmr *fmr;
+ struct list list;
+ unsigned int remap_count;
+
+ struct rdsv3_scatterlist *sg;
+ unsigned int sg_len;
+ uint64_t *dma;
+ int sg_dma_len;
+
+ /* DDI pinned memory */
+ ddi_umem_cookie_t umem_cookie;
+ /* IBTF type definitions */
+ ibt_fmr_pool_hdl_t fmr_pool_hdl;
+ ibt_ma_hdl_t rc_ma_hdl;
+ ibt_mr_hdl_t rc_fmr_hdl;
+ ibt_pmr_desc_t rc_mem_desc;
+};
+
+/*
+ * Our own little FMR pool
+ */
+struct rdsv3_ib_mr_pool {
+ struct mutex flush_lock; /* serialize fmr invalidate */
+ struct rdsv3_work_s flush_worker; /* flush worker */
+
+ kmutex_t list_lock; /* protect variables below */
+ atomic_t item_count; /* total # of MRs */
+	atomic_t dirty_count;		/* # of dirty MRs */
+ /* MRs that have reached their max_maps limit */
+ struct list drop_list;
+ struct list free_list; /* unused MRs */
+	struct list clean_list;		/* unused & unmapped MRs */
+ atomic_t free_pinned; /* memory pinned by free MRs */
+ unsigned long max_items;
+ unsigned long max_items_soft;
+ unsigned long max_free_pinned;
+};
+
+static int rdsv3_ib_flush_mr_pool(struct rdsv3_ib_device *rds_ibdev,
+ ibt_fmr_pool_hdl_t pool_hdl, int free_all);
+static void rdsv3_ib_teardown_mr(struct rdsv3_ib_mr *ibmr);
+static void rdsv3_ib_mr_pool_flush_worker(struct rdsv3_work_s *work);
+static struct rdsv3_ib_mr *rdsv3_ib_alloc_fmr(struct rdsv3_ib_device
+ *rds_ibdev);
+static int rdsv3_ib_map_fmr(struct rdsv3_ib_device *rds_ibdev,
+ struct rdsv3_ib_mr *ibmr, struct buf *bp, unsigned int nents);
+
+static struct rdsv3_ib_device *
+rdsv3_ib_get_device(uint32_be_t ipaddr)
+{
+ struct rdsv3_ib_device *rds_ibdev;
+ struct rdsv3_ib_ipaddr *i_ipaddr;
+
+ RDSV3_DPRINTF4("rdsv3_ib_get_device", "Enter: ipaddr: 0x%x", ipaddr);
+
+ RDSV3_FOR_EACH_LIST_NODE(rds_ibdev, &rdsv3_ib_devices, list) {
+ mutex_enter(&rds_ibdev->spinlock);
+ RDSV3_FOR_EACH_LIST_NODE(i_ipaddr, &rds_ibdev->ipaddr_list,
+ list) {
+ if (i_ipaddr->ipaddr == ipaddr) {
+ mutex_exit(&rds_ibdev->spinlock);
+ return (rds_ibdev);
+ }
+ }
+ mutex_exit(&rds_ibdev->spinlock);
+ }
+
+ RDSV3_DPRINTF4("rdsv3_ib_get_device", "Return: ipaddr: 0x%x", ipaddr);
+
+ return (NULL);
+}
+
+static int
+rdsv3_ib_add_ipaddr(struct rdsv3_ib_device *rds_ibdev, uint32_be_t ipaddr)
+{
+ struct rdsv3_ib_ipaddr *i_ipaddr;
+
+ RDSV3_DPRINTF4("rdsv3_ib_add_ipaddr", "rds_ibdev: %p ipaddr: %x",
+ rds_ibdev, ipaddr);
+
+ i_ipaddr = kmem_alloc(sizeof (*i_ipaddr), KM_NOSLEEP);
+ if (!i_ipaddr)
+ return (-ENOMEM);
+
+ i_ipaddr->ipaddr = ipaddr;
+
+ mutex_enter(&rds_ibdev->spinlock);
+ list_insert_tail(&rds_ibdev->ipaddr_list, i_ipaddr);
+ mutex_exit(&rds_ibdev->spinlock);
+
+ return (0);
+}
+
+static void
+rdsv3_ib_remove_ipaddr(struct rdsv3_ib_device *rds_ibdev, uint32_be_t ipaddr)
+{
+ struct rdsv3_ib_ipaddr *i_ipaddr, *next;
+
+ RDSV3_DPRINTF4("rdsv3_ib_remove_ipaddr", "rds_ibdev: %p, ipaddr: %x",
+ rds_ibdev, ipaddr);
+
+ mutex_enter(&rds_ibdev->spinlock);
+ RDSV3_FOR_EACH_LIST_NODE_SAFE(i_ipaddr, next, &rds_ibdev->ipaddr_list,
+ list) {
+ if (i_ipaddr->ipaddr == ipaddr) {
+ list_remove_node(&i_ipaddr->list);
+ kmem_free(i_ipaddr, sizeof (*i_ipaddr));
+ break;
+ }
+ }
+ mutex_exit(&rds_ibdev->spinlock);
+
+ RDSV3_DPRINTF4("rdsv3_ib_remove_ipaddr",
+ "Return: rds_ibdev: %p, ipaddr: %x", rds_ibdev, ipaddr);
+}
+
+int
+rdsv3_ib_update_ipaddr(struct rdsv3_ib_device *rds_ibdev, uint32_be_t ipaddr)
+{
+ struct rdsv3_ib_device *rds_ibdev_old;
+
+ RDSV3_DPRINTF4("rdsv3_ib_update_ipaddr", "rds_ibdev: %p, ipaddr: %x",
+ rds_ibdev, ipaddr);
+
+ rds_ibdev_old = rdsv3_ib_get_device(ipaddr);
+ if (rds_ibdev_old)
+ rdsv3_ib_remove_ipaddr(rds_ibdev_old, ipaddr);
+
+ return (rdsv3_ib_add_ipaddr(rds_ibdev, ipaddr));
+}
+
+void
+rdsv3_ib_add_conn(struct rdsv3_ib_device *rds_ibdev,
+ struct rdsv3_connection *conn)
+{
+ struct rdsv3_ib_connection *ic = conn->c_transport_data;
+
+ RDSV3_DPRINTF4("rdsv3_ib_add_conn", "rds_ibdev: %p, conn: %p",
+ rds_ibdev, conn);
+
+ /* conn was previously on the nodev_conns_list */
+ mutex_enter(&ib_nodev_conns_lock);
+ ASSERT(!list_is_empty(&ib_nodev_conns));
+ ASSERT(list_link_active(&ic->ib_node));
+ list_remove_node(&ic->ib_node);
+
+ mutex_enter(&rds_ibdev->spinlock);
+ list_insert_tail(&rds_ibdev->conn_list, ic);
+ mutex_exit(&rds_ibdev->spinlock);
+ mutex_exit(&ib_nodev_conns_lock);
+
+ ic->rds_ibdev = rds_ibdev;
+}
+
+void
+rdsv3_ib_remove_conn(struct rdsv3_ib_device *rds_ibdev,
+ struct rdsv3_connection *conn)
+{
+ struct rdsv3_ib_connection *ic = conn->c_transport_data;
+
+ RDSV3_DPRINTF4("rdsv3_ib_remove_conn", "rds_ibdev: %p, conn: %p",
+ rds_ibdev, conn);
+
+ /* place conn on nodev_conns_list */
+ mutex_enter(&ib_nodev_conns_lock);
+
+ mutex_enter(&rds_ibdev->spinlock);
+ ASSERT(list_link_active(&ic->ib_node));
+ list_remove_node(&ic->ib_node);
+ mutex_exit(&rds_ibdev->spinlock);
+
+ list_insert_tail(&ib_nodev_conns, ic);
+
+ mutex_exit(&ib_nodev_conns_lock);
+
+ ic->rds_ibdev = NULL;
+
+ RDSV3_DPRINTF4("rdsv3_ib_remove_conn",
+ "Return: rds_ibdev: %p, conn: %p", rds_ibdev, conn);
+}
+
+void
+__rdsv3_ib_destroy_conns(struct list *list, kmutex_t *list_lock)
+{
+ struct rdsv3_ib_connection *ic, *_ic;
+ list_t tmp_list;
+
+ RDSV3_DPRINTF4("__rdsv3_ib_destroy_conns", "Enter: list: %p", list);
+
+ /* avoid calling conn_destroy with irqs off */
+ mutex_enter(list_lock);
+ list_splice(list, &tmp_list);
+ mutex_exit(list_lock);
+
+ RDSV3_FOR_EACH_LIST_NODE_SAFE(ic, _ic, &tmp_list, ib_node) {
+ rdsv3_conn_destroy(ic->conn);
+ }
+
+ RDSV3_DPRINTF4("__rdsv3_ib_destroy_conns", "Return: list: %p", list);
+}
+
+void
+rdsv3_ib_destroy_mr_pool(struct rdsv3_ib_device *rds_ibdev)
+{
+ RDSV3_DPRINTF4("rdsv3_ib_destroy_mr_pool", "Enter: ibdev: %p",
+ rds_ibdev);
+
+ if (rds_ibdev->fmr_pool_hdl == NULL)
+ return;
+
+ (void) rdsv3_ib_flush_mr_pool(rds_ibdev, rds_ibdev->fmr_pool_hdl, 1);
+ (void) ibt_destroy_fmr_pool(ib_get_ibt_hca_hdl(rds_ibdev->dev),
+ rds_ibdev->fmr_pool_hdl);
+}
+
+#define IB_FMR_MAX_BUF_SIZE 0x1000000 /* 16MB max buf */
+int
+rdsv3_ib_create_mr_pool(struct rdsv3_ib_device *rds_ibdev)
+{
+ uint_t h_page_sz;
+ ibt_fmr_pool_attr_t fmr_attr;
+ ibt_status_t ibt_status;
+ ibt_hca_hdl_t hca_hdl;
+
+ RDSV3_DPRINTF4("rdsv3_ib_create_mr_pool",
+ "Enter: ibdev: %p", rds_ibdev);
+
+ hca_hdl = ib_get_ibt_hca_hdl(rds_ibdev->dev);
+ /* get hca attributes */
+ ibt_status = ibt_query_hca(hca_hdl, &rds_ibdev->hca_attr);
+ if (ibt_status != IBT_SUCCESS) {
+ return (-ENOMEM);
+ }
+
+ /* setup FMR pool attributes */
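+	/* hca_page_sz is reported in KB; the multiply converts to bytes */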
+ h_page_sz = rds_ibdev->hca_attr.hca_page_sz * 1024;
+
+ fmr_attr.fmr_max_pages_per_fmr = (IB_FMR_MAX_BUF_SIZE / h_page_sz) + 2;
+ fmr_attr.fmr_pool_size = RDSV3_FMR_POOL_SIZE;
+ fmr_attr.fmr_dirty_watermark = 128;
+ fmr_attr.fmr_cache = B_FALSE;
+ fmr_attr.fmr_flags = IBT_MR_NOSLEEP | IBT_MR_ENABLE_LOCAL_WRITE |
+ IBT_MR_ENABLE_REMOTE_WRITE | IBT_MR_ENABLE_REMOTE_READ;
+ fmr_attr.fmr_page_sz = h_page_sz;
+ fmr_attr.fmr_func_hdlr = NULL;
+ fmr_attr.fmr_func_arg = (void *) NULL;
+
+ /* create the FMR pool */
+ ibt_status = ibt_create_fmr_pool(hca_hdl, rds_ibdev->pd->ibt_pd,
+ &fmr_attr, &rds_ibdev->fmr_pool_hdl);
+ if (ibt_status != IBT_SUCCESS) {
+ return (-ENOMEM);
+ }
+ rds_ibdev->max_fmrs = fmr_attr.fmr_pool_size;
+ rds_ibdev->fmr_message_size = fmr_attr.fmr_max_pages_per_fmr;
+ return (0);
+}
+
+void
+rdsv3_ib_get_mr_info(struct rdsv3_ib_device *rds_ibdev,
+ struct rdsv3_info_rdma_connection *iinfo)
+{
+ iinfo->rdma_mr_max = rds_ibdev->max_fmrs;
+ iinfo->rdma_mr_size = rds_ibdev->fmr_message_size;
+}
+
+static void
+rdsv3_umem_cb(ddi_umem_cookie_t *umem_cookie)
+{
+ /* LINTED E_FUNC_SET_NOT_USED */
+ ddi_umem_cookie_t *cp = umem_cookie;
+ RDSV3_DPRINTF5("rdsv3_umem_cb", "Enter: umem_cookie %p", umem_cookie);
+ /* all umem_cookies are freed at socket fd close */
+ /* there should be no umem_cookies when clearing the addr space */
+}
+
+struct umem_callback_ops rdsv3_umem_cbops = {
+ UMEM_CALLBACK_VERSION,
+ rdsv3_umem_cb,
+};
+
+void *
+rdsv3_ib_get_mr(struct rdsv3_iovec *args, unsigned long nents,
+ struct rdsv3_sock *rs, uint32_t *key_ret)
+{
+ struct rdsv3_ib_device *rds_ibdev;
+ struct rdsv3_ib_mr *ibmr = NULL;
+ ddi_umem_cookie_t umem_cookie;
+ size_t umem_len;
+ caddr_t umem_addr;
+ int umem_flags;
+ int ret;
+ struct buf *bp;
+
+ RDSV3_DPRINTF4("rdsv3_ib_get_mr", "Enter: args.addr: %p", args->addr);
+
+ rds_ibdev = rdsv3_ib_get_device(rs->rs_bound_addr);
+
+ if (rds_ibdev == NULL)
+		return (ERR_PTR(-EFAULT));
+
+ ibmr = rdsv3_ib_alloc_fmr(rds_ibdev);
+ if (IS_ERR(ibmr))
+ return (ibmr);
+
+ /* pin user memory pages */
+ umem_len = ptob(btopr(args->bytes +
+ ((uintptr_t)args->addr & PAGEOFFSET)));
+ umem_addr = (caddr_t)((uintptr_t)args->addr & ~PAGEOFFSET);
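+	/*
+	 * The address was rounded down to a page boundary and the length
+	 * rounded up to cover whole pages, since umem_lockmemory() pins
+	 * full pages.
+	 */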
+ umem_flags = (DDI_UMEMLOCK_WRITE | DDI_UMEMLOCK_READ |
+ DDI_UMEMLOCK_LONGTERM);
+ ret = umem_lockmemory(umem_addr, umem_len, umem_flags,
+ &umem_cookie, &rdsv3_umem_cbops, NULL);
+ if (ret != 0) {
+ kmem_free((void *) ibmr, sizeof (*ibmr));
+ ibmr = ERR_PTR(ret);
+ return (ibmr);
+ }
+
+	/* wrap the umem_cookie in a buf structure for rdsv3_ib_map_fmr() */
+ bp = ddi_umem_iosetup(umem_cookie, 0, umem_len,
+ B_WRITE, 0, 0, NULL, DDI_UMEM_SLEEP);
+
+ ret = rdsv3_ib_map_fmr(rds_ibdev, ibmr, bp, nents);
+ freerbuf(bp); /* free bp */
+ if (ret == 0) {
+ ibmr->umem_cookie = umem_cookie;
+ *key_ret = (uint32_t)ibmr->rc_mem_desc.pmd_rkey;
+ ibmr->device = rds_ibdev;
+ RDSV3_DPRINTF4("rdsv3_ib_get_mr",
+ "Return: ibmr: %p umem_cookie %p", ibmr, ibmr->umem_cookie);
+ return (ibmr);
+ } else { /* error return */
+ RDSV3_DPRINTF1("rdsv3_ib_get_mr", "map_fmr failed (errno=%d)\n",
+ ret);
+ ddi_umem_unlock(umem_cookie);
+ kmem_free((void *)ibmr, sizeof (*ibmr));
+ return (ERR_PTR(ret));
+ }
+}
+
+static struct rdsv3_ib_mr *
+rdsv3_ib_alloc_fmr(struct rdsv3_ib_device *rds_ibdev)
+{
+ struct rdsv3_ib_mr *ibmr;
+
+ RDSV3_DPRINTF4("rdsv3_ib_alloc_fmr", "Enter: ibdev: %p", rds_ibdev);
+
+ if (rds_ibdev->fmr_pool_hdl) {
+ ibmr = (struct rdsv3_ib_mr *)kmem_zalloc(sizeof (*ibmr),
+ KM_SLEEP);
+ ibmr->fmr_pool_hdl = rds_ibdev->fmr_pool_hdl;
+ return (ibmr);
+ }
+	return ((struct rdsv3_ib_mr *)ERR_PTR(-ENOMEM));
+}
+
+static int
+rdsv3_ib_map_fmr(struct rdsv3_ib_device *rds_ibdev, struct rdsv3_ib_mr *ibmr,
+ struct buf *bp, unsigned int nents)
+{
+ ibt_va_attr_t va_attr;
+ ibt_reg_req_t reg_req;
+ uint_t paddr_list_len;
+ uint_t page_sz;
+ ibt_status_t ibt_status;
+ /* LINTED E_FUNC_SET_NOT_USED */
+ unsigned int l_nents = nents;
+
+ RDSV3_DPRINTF4("rdsv3_ib_map_fmr", "Enter: ibmr: %p", ibmr);
+ RDSV3_DPRINTF4("rdsv3_ib_map_fmr", "buf addr: %p", bp->b_un.b_addr);
+
+ /* setup ibt_map_mem_area attributes */
+ bzero(&va_attr, sizeof (ibt_va_attr_t));
+ va_attr.va_buf = bp;
+ va_attr.va_flags = IBT_VA_FMR | IBT_VA_BUF;
+
+	page_sz = rds_ibdev->hca_attr.hca_page_sz * 1024; /* KB to bytes */
+ paddr_list_len = (bp->b_bcount / page_sz) + 2; /* start + end pg */
+
+ /* map user buffer to HCA address */
+ ibt_status = ibt_map_mem_area(ib_get_ibt_hca_hdl(rds_ibdev->dev),
+ &va_attr, paddr_list_len, &reg_req, &ibmr->rc_ma_hdl);
+ if (ibt_status != IBT_SUCCESS) {
+ return (-ENOMEM);
+ }
+
+ /* use a free entry from FMR pool to register the specified memory */
+ ibt_status = ibt_register_physical_fmr(
+ ib_get_ibt_hca_hdl(rds_ibdev->dev), ibmr->fmr_pool_hdl,
+ &reg_req.fn_arg, &ibmr->rc_fmr_hdl, &ibmr->rc_mem_desc);
+ if (ibt_status != IBT_SUCCESS) {
+ (void) ibt_unmap_mem_area(ib_get_ibt_hca_hdl(rds_ibdev->dev),
+ ibmr->rc_ma_hdl);
+ if (ibt_status == IBT_INSUFF_RESOURCE) {
+ return (-ENOBUFS);
+ }
+ return (-EINVAL);
+ }
+ RDSV3_DPRINTF4("rdsv3_ib_map_fmr", "Return: ibmr: %p rkey: 0x%x",
+ ibmr, (uint32_t)ibmr->rc_mem_desc.pmd_rkey);
+ return (0);
+}
+
+void
+rdsv3_ib_sync_mr(void *trans_private, int direction)
+{
+ /* LINTED E_FUNC_SET_NOT_USED */
+ void *l_trans_private = trans_private;
+ /* LINTED E_FUNC_SET_NOT_USED */
+ int l_direction = direction;
+
+ /* FMR Sync not needed in Solaris on PCI-ex systems */
+
+ RDSV3_DPRINTF4("rdsv3_ib_sync_mr", "Enter:");
+}
+
+void
+rdsv3_ib_flush_mrs(void)
+{
+ struct rdsv3_ib_device *rds_ibdev;
+
+ RDSV3_DPRINTF4("rdsv3_ib_flush_mrs", "Enter:");
+
+ RDSV3_FOR_EACH_LIST_NODE(rds_ibdev, &rdsv3_ib_devices, list) {
+ if (rds_ibdev->fmr_pool_hdl) {
+ (void) rdsv3_ib_flush_mr_pool(rds_ibdev,
+ rds_ibdev->fmr_pool_hdl, 0);
+ }
+ }
+}
+
+static void
+__rdsv3_ib_teardown_mr(struct rdsv3_ib_mr *ibmr)
+{
+ RDSV3_DPRINTF4("__rdsv3_ib_teardown_mr",
+ "Enter: ibmr: %p umem_cookie %p", ibmr, ibmr->umem_cookie);
+
+ /* unpin memory pages */
+ (void) ddi_umem_unlock(ibmr->umem_cookie);
+}
+
+void
+rdsv3_ib_free_mr(void *trans_private, int invalidate)
+{
+ struct rdsv3_ib_mr *ibmr = trans_private;
+ struct rdsv3_ib_device *rds_ibdev = ibmr->device;
+
+ RDSV3_DPRINTF4("rdsv3_ib_free_mr", "Enter: ibmr: %p inv: %d",
+ ibmr, invalidate);
+
+ /* return the fmr to the IBTF pool */
+ /* the final punch will come from the ibt_flush_fmr_pool() */
+ (void) ibt_deregister_fmr(ib_get_ibt_hca_hdl(rds_ibdev->dev),
+ ibmr->rc_fmr_hdl);
+ (void) ibt_unmap_mem_area(ib_get_ibt_hca_hdl(rds_ibdev->dev),
+ ibmr->rc_ma_hdl);
+ __rdsv3_ib_teardown_mr(ibmr);
+ if (invalidate) {
+ rds_ibdev = ibmr->device;
+ (void) rdsv3_ib_flush_mr_pool(rds_ibdev,
+ rds_ibdev->fmr_pool_hdl, 0);
+ }
+ kmem_free((void *) ibmr, sizeof (*ibmr));
+}
+
+static int
+rdsv3_ib_flush_mr_pool(struct rdsv3_ib_device *rds_ibdev,
+ ibt_fmr_pool_hdl_t pool_hdl, int free_all)
+{
+ /* LINTED E_FUNC_SET_NOT_USED */
+ int l_free_all = free_all;
+
+ RDSV3_DPRINTF4("rdsv3_ib_flush_mr_pool", "Enter: pool: %p", pool_hdl);
+
+ rdsv3_ib_stats_inc(s_ib_rdma_mr_pool_flush);
+
+ (void) ibt_flush_fmr_pool(ib_get_ibt_hca_hdl(rds_ibdev->dev),
+ pool_hdl);
+ return (0);
+}
diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/ib_recv.c b/usr/src/uts/common/io/ib/clients/rdsv3/ib_recv.c
new file mode 100644
index 0000000000..21cbfb08f3
--- /dev/null
+++ b/usr/src/uts/common/io/ib/clients/rdsv3/ib_recv.c
@@ -0,0 +1,1129 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <sys/types.h>
+#include <sys/kmem.h>
+#include <sys/cpuvar.h>
+#include <sys/rds.h>
+
+#include <sys/ib/clients/rdsv3/rdsv3.h>
+#include <sys/ib/clients/rdsv3/ib.h>
+#include <sys/ib/clients/rdsv3/rdsv3_debug.h>
+
+static struct kmem_cache *rdsv3_ib_incoming_slab;
+static struct kmem_cache *rdsv3_ib_frag_slab;
+static atomic_t rdsv3_ib_allocation = ATOMIC_INIT(0);
+
+static void
+rdsv3_ib_frag_drop_page(struct rdsv3_page_frag *frag)
+{
+ RDSV3_DPRINTF5("rdsv3_ib_frag_drop_page",
+ "frag %p page %p offset %d", frag, frag->f_page, frag->f_offset);
+ kmem_free(frag->f_page, PAGE_SIZE);
+ frag->f_page = NULL;
+}
+
+static void
+rdsv3_ib_frag_free(struct rdsv3_page_frag *frag)
+{
+ RDSV3_DPRINTF5("rdsv3_ib_frag_free", "frag %p page %p",
+ frag, frag->f_page);
+ ASSERT(frag->f_page == NULL);
+ kmem_cache_free(rdsv3_ib_frag_slab, frag);
+}
+
+/*
+ * We map a page at a time. Its fragments are posted in order. This
+ * is called in fragment order as the fragments get receive completion
+ * events. Only the last frag in the page performs the unmapping.
+ *
+ * It's OK for ring cleanup to call this in whatever order it likes because
+ * DMA is not in flight and so we can unmap while other ring entries still
+ * hold page references in their frags.
+ */
+static void
+rdsv3_ib_recv_unmap_page(struct rdsv3_ib_connection *ic,
+ struct rdsv3_ib_recv_work *recv)
+{
+ struct rdsv3_page_frag *frag = recv->r_frag;
+
+#if 0
+ RDSV3_DPRINTF5("rdsv3_ib_recv_unmap_page",
+ "recv %p frag %p page %p\n", recv, frag, frag->f_page);
+#endif
+ if (frag->f_mapped) {
+ (void) ibt_unmap_mem_iov(
+ ib_get_ibt_hca_hdl(ic->i_cm_id->device), frag->f_mapped);
+ frag->f_mapped = 0;
+ }
+}
+
+void
+rdsv3_ib_recv_init_ring(struct rdsv3_ib_connection *ic)
+{
+ struct rdsv3_ib_recv_work *recv;
+ struct rdsv3_header *hdrp;
+ uint32_t i;
+
+ RDSV3_DPRINTF4("rdsv3_ib_recv_init_ring", "ic: %p", ic);
+
+ hdrp = ic->i_recv_hdrs;
+ for (i = 0, recv = ic->i_recvs; i < ic->i_recv_ring.w_nr; i++, recv++) {
+ recv->r_ibinc = NULL;
+ recv->r_frag = NULL;
+
+ recv->r_wr.recv.wr_id = i;
+
+ /* initialize the hdr sgl permanently */
+ recv->r_sge[0].ds_va = (ib_vaddr_t)(uintptr_t)hdrp++;
+ recv->r_sge[0].ds_len = sizeof (struct rdsv3_header);
+ recv->r_sge[0].ds_key = ic->i_mr->lkey;
+ }
+}
+
+static void
+rdsv3_ib_recv_clear_one(struct rdsv3_ib_connection *ic,
+ struct rdsv3_ib_recv_work *recv)
+{
+ RDSV3_DPRINTF4("rdsv3_ib_recv_clear_one", "ic: %p, recv: %p",
+ ic, recv);
+
+ if (recv->r_ibinc) {
+ rdsv3_inc_put(&recv->r_ibinc->ii_inc);
+ recv->r_ibinc = NULL;
+ }
+ if (recv->r_frag) {
+ rdsv3_ib_recv_unmap_page(ic, recv);
+ if (recv->r_frag->f_page)
+ rdsv3_ib_frag_drop_page(recv->r_frag);
+ rdsv3_ib_frag_free(recv->r_frag);
+ recv->r_frag = NULL;
+ }
+
+ RDSV3_DPRINTF4("rdsv3_ib_recv_clear_one", "Return: ic: %p, recv: %p",
+ ic, recv);
+}
+
+void
+rdsv3_ib_recv_clear_ring(struct rdsv3_ib_connection *ic)
+{
+ uint32_t i;
+
+ RDSV3_DPRINTF4("rdsv3_ib_recv_clear_ring", "ic: %p", ic);
+
+ for (i = 0; i < ic->i_recv_ring.w_nr; i++)
+ rdsv3_ib_recv_clear_one(ic, &ic->i_recvs[i]);
+
+ if (ic->i_frag.f_page)
+ rdsv3_ib_frag_drop_page(&ic->i_frag);
+}
+
+static int
+rdsv3_ib_recv_refill_one(struct rdsv3_connection *conn,
+ struct rdsv3_ib_recv_work *recv,
+ int kptr_gfp, int page_gfp)
+{
+ struct rdsv3_ib_connection *ic = conn->c_transport_data;
+ ibt_mi_hdl_t mi_hdl;
+ ibt_iov_attr_t iov_attr;
+ ibt_iov_t iov_arr[1];
+ int ret = -ENOMEM;
+
+ RDSV3_DPRINTF5("rdsv3_ib_recv_refill_one", "conn: %p, recv: %p",
+ conn, recv);
+
+ if (recv->r_ibinc == NULL) {
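+		/*
+		 * Charge the allocation against the global cap up front,
+		 * and back the charge out if the cap is exceeded or the
+		 * slab allocation fails.
+		 */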
+ if (atomic_add_32_nv(&rdsv3_ib_allocation, 1) >
+ rdsv3_ib_sysctl_max_recv_allocation) {
+ atomic_add_32(&rdsv3_ib_allocation, -1);
+ rdsv3_ib_stats_inc(s_ib_rx_alloc_limit);
+ goto out;
+ }
+ recv->r_ibinc = kmem_cache_alloc(rdsv3_ib_incoming_slab,
+ kptr_gfp);
+ if (recv->r_ibinc == NULL) {
+ atomic_add_32(&rdsv3_ib_allocation, -1);
+ goto out;
+ }
+ list_create(&recv->r_ibinc->ii_frags,
+ sizeof (struct rdsv3_page_frag),
+ offsetof(struct rdsv3_page_frag, f_item));
+ rdsv3_inc_init(&recv->r_ibinc->ii_inc, conn, conn->c_faddr);
+ }
+
+ if (recv->r_frag == NULL) {
+ recv->r_frag = kmem_cache_alloc(rdsv3_ib_frag_slab, kptr_gfp);
+ if (recv->r_frag == NULL)
+ goto out;
+ list_link_init(&recv->r_frag->f_item);
+ recv->r_frag->f_page = NULL;
+ }
+
+ if (ic->i_frag.f_page == NULL) {
+ ic->i_frag.f_page = kmem_alloc(PAGE_SIZE, page_gfp);
+ if (ic->i_frag.f_page == NULL)
+ goto out;
+ ic->i_frag.f_offset = 0;
+ }
+
+ iov_attr.iov_as = NULL;
+ iov_attr.iov = &iov_arr[0];
+ iov_attr.iov_buf = NULL;
+ iov_attr.iov_list_len = 1;
+ iov_attr.iov_wr_nds = 1;
+ iov_attr.iov_lso_hdr_sz = 0;
+ iov_attr.iov_flags = IBT_IOV_SLEEP | IBT_IOV_RECV;
+
+ /* Data */
+ iov_arr[0].iov_addr = ic->i_frag.f_page + ic->i_frag.f_offset;
+ iov_arr[0].iov_len = RDSV3_FRAG_SIZE;
+
+ /*
+ * Header comes from pre-registered buffer, so don't map it.
+ * Map the data only and stick in the header sgl quietly after
+ * the call.
+ */
+ recv->r_wr.recv.wr_sgl = &recv->r_sge[1];
+ recv->r_wr.recv.wr_nds = 1;
+
+ ret = ibt_map_mem_iov(ib_get_ibt_hca_hdl(ic->i_cm_id->device),
+ &iov_attr, &recv->r_wr, &mi_hdl);
+ if (ret != IBT_SUCCESS) {
+ RDSV3_DPRINTF2("rdsv3_ib_recv_refill_one",
+ "ibt_map_mem_iov failed: %d", ret);
+ goto out;
+ }
+
+ /* stick in the header */
+ recv->r_wr.recv.wr_sgl = &recv->r_sge[0];
+ recv->r_wr.recv.wr_nds = RDSV3_IB_RECV_SGE;
+
+ /*
+ * Once we get the RDSV3_PAGE_LAST_OFF frag then rdsv3_ib_frag_unmap()
+ * must be called on this recv. This happens as completions hit
+ * in order or on connection shutdown.
+ */
+ recv->r_frag->f_page = ic->i_frag.f_page;
+ recv->r_frag->f_offset = ic->i_frag.f_offset;
+ recv->r_frag->f_mapped = mi_hdl;
+
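+	/*
+	 * Advance to the next fragment slot in the shared page, or drop
+	 * our reference once the last fragment in the page has been
+	 * handed out to a recv.
+	 */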
+ if (ic->i_frag.f_offset < RDSV3_PAGE_LAST_OFF) {
+ ic->i_frag.f_offset += RDSV3_FRAG_SIZE;
+ } else {
+ ic->i_frag.f_page = NULL;
+ ic->i_frag.f_offset = 0;
+ }
+
+ ret = 0;
+
+ RDSV3_DPRINTF5("rdsv3_ib_recv_refill_one", "Return: conn: %p, recv: %p",
+ conn, recv);
+out:
+ return (ret);
+}
+
+/*
+ * This tries to allocate and post unused work requests after making sure that
+ * they have all the allocations they need to queue received fragments into
+ * sockets. The i_recv_mutex is held here so that ring_alloc and _unalloc
+ * pairs don't go unmatched.
+ *
+ * -1 is returned if posting fails due to temporary resource exhaustion.
+ */
+int
+rdsv3_ib_recv_refill(struct rdsv3_connection *conn, int kptr_gfp,
+ int page_gfp, int prefill)
+{
+ struct rdsv3_ib_connection *ic = conn->c_transport_data;
+ struct rdsv3_ib_recv_work *recv;
+ unsigned int succ_wr;
+ unsigned int posted = 0;
+ int ret = 0;
+ uint32_t pos;
+
+ RDSV3_DPRINTF4("rdsv3_ib_recv_refill", "conn: %p, prefill: %d",
+ conn, prefill);
+
+ while ((prefill || rdsv3_conn_up(conn)) &&
+ rdsv3_ib_ring_alloc(&ic->i_recv_ring, 1, &pos)) {
+ if (pos >= ic->i_recv_ring.w_nr) {
+ RDSV3_DPRINTF0("rdsv3_ib_recv_refill",
+ "Argh - ring alloc returned pos=%u",
+ pos);
+ ret = -EINVAL;
+ break;
+ }
+
+ recv = &ic->i_recvs[pos];
+ ret = rdsv3_ib_recv_refill_one(conn, recv, kptr_gfp, page_gfp);
+ if (ret) {
+ ret = -1;
+ break;
+ }
+
+ /* XXX when can this fail? */
+ ret = ibt_post_recv(ib_get_ibt_channel_hdl(ic->i_cm_id),
+ &recv->r_wr.recv, 1, &succ_wr);
+ RDSV3_DPRINTF5("rdsv3_ib_recv_refill",
+ "recv %p ibinc %p frag %p ret %d\n", recv,
+ recv->r_ibinc, recv->r_frag, ret);
+ if (ret) {
+ RDSV3_DPRINTF2("rdsv3_ib_recv_refill",
+ "Return: conn: %p, posted: %d", conn, ret);
+ rdsv3_conn_drop(conn);
+ ret = -1;
+ break;
+ }
+
+ posted++;
+ }
+
+ /* We're doing flow control - update the window. */
+ if (ic->i_flowctl && posted)
+ rdsv3_ib_advertise_credits(conn, posted);
+
+ if (ret)
+ rdsv3_ib_ring_unalloc(&ic->i_recv_ring, 1);
+
+ RDSV3_DPRINTF4("rdsv3_ib_recv_refill", "Return: conn: %p, posted: %d",
+ conn, posted);
+ return (ret);
+}
+
+void
+rdsv3_ib_inc_purge(struct rdsv3_incoming *inc)
+{
+ struct rdsv3_ib_incoming *ibinc;
+ struct rdsv3_page_frag *frag;
+ struct rdsv3_page_frag *pos;
+
+ RDSV3_DPRINTF4("rdsv3_ib_inc_purge", "inc: %p", inc);
+
+ ibinc = container_of(inc, struct rdsv3_ib_incoming, ii_inc);
+ RDSV3_DPRINTF5("rdsv3_ib_inc_purge",
+ "purging ibinc %p inc %p\n", ibinc, inc);
+
+ RDSV3_FOR_EACH_LIST_NODE_SAFE(frag, pos, &ibinc->ii_frags, f_item) {
+ list_remove_node(&frag->f_item);
+ rdsv3_ib_frag_drop_page(frag);
+ rdsv3_ib_frag_free(frag);
+ }
+
+ RDSV3_DPRINTF4("rdsv3_ib_inc_purge", "Return: inc: %p", inc);
+}
+
+void
+rdsv3_ib_inc_free(struct rdsv3_incoming *inc)
+{
+ struct rdsv3_ib_incoming *ibinc;
+
+ RDSV3_DPRINTF4("rdsv3_ib_inc_free", "inc: %p", inc);
+
+ ibinc = container_of(inc, struct rdsv3_ib_incoming, ii_inc);
+
+ rdsv3_ib_inc_purge(inc);
+ RDSV3_DPRINTF5("rdsv3_ib_inc_free", "freeing ibinc %p inc %p",
+ ibinc, inc);
+ ASSERT(list_is_empty(&ibinc->ii_frags));
+ kmem_cache_free(rdsv3_ib_incoming_slab, ibinc);
+ atomic_dec_uint(&rdsv3_ib_allocation);
+
+ RDSV3_DPRINTF4("rdsv3_ib_inc_free", "Return: inc: %p", inc);
+}
+
+int
+rdsv3_ib_inc_copy_to_user(struct rdsv3_incoming *inc, uio_t *uiop,
+ size_t size)
+{
+ struct rdsv3_ib_incoming *ibinc;
+ struct rdsv3_page_frag *frag;
+ unsigned long to_copy;
+ unsigned long frag_off = 0;
+ int copied = 0;
+ int ret;
+ uint32_t len;
+
+ ibinc = container_of(inc, struct rdsv3_ib_incoming, ii_inc);
+ frag = list_head(&ibinc->ii_frags);
+ len = ntohl(inc->i_hdr.h_len);
+
+ RDSV3_DPRINTF4("rdsv3_ib_inc_copy_to_user", "inc: %p, size: %d len: %d",
+ inc, size, len);
+
+ while (copied < size && copied < len) {
+ if (frag_off == RDSV3_FRAG_SIZE) {
+ frag = list_next(&ibinc->ii_frags, frag);
+ frag_off = 0;
+ }
+
+ to_copy = min(len - copied, RDSV3_FRAG_SIZE - frag_off);
+ to_copy = min(size - copied, to_copy);
+
+ RDSV3_DPRINTF5("rdsv3_ib_inc_copy_to_user",
+ "%lu bytes to user %p from frag [%p, %u] + %lu",
+ to_copy, uiop,
+ frag->f_page, frag->f_offset, frag_off);
+
+ ret = uiomove((caddr_t)(frag->f_page +
+ frag->f_offset + frag_off),
+ to_copy, UIO_READ, uiop);
+ if (ret) {
+ RDSV3_DPRINTF2("rdsv3_ib_inc_copy_to_user",
+ "uiomove (%d) returned: %d", to_copy, ret);
+ break;
+ }
+
+ frag_off += to_copy;
+ copied += to_copy;
+ }
+
+ RDSV3_DPRINTF4("rdsv3_ib_inc_copy_to_user",
+ "Return: inc: %p, copied: %d", inc, copied);
+
+ return (copied);
+}
+
+/* ic starts out kmem_zalloc()ed */
+void
+rdsv3_ib_recv_init_ack(struct rdsv3_ib_connection *ic)
+{
+ ibt_send_wr_t *wr = &ic->i_ack_wr;
+ ibt_wr_ds_t *sge = &ic->i_ack_sge;
+
+ RDSV3_DPRINTF4("rdsv3_ib_recv_init_ack", "ic: %p", ic);
+
+ sge->ds_va = ic->i_ack_dma;
+ sge->ds_len = sizeof (struct rdsv3_header);
+ sge->ds_key = ic->i_mr->lkey;
+
+ wr->wr_sgl = sge;
+ wr->wr_nds = 1;
+ wr->wr_opcode = IBT_WRC_SEND;
+ wr->wr_id = RDSV3_IB_ACK_WR_ID;
+ wr->wr_flags = IBT_WR_SEND_SIGNAL | IBT_WR_SEND_SOLICIT;
+}
+
+/*
+ * You'd think that with reliable IB connections you wouldn't need to ack
+ * messages that have been received. The problem is that IB hardware generates
+ * an ack message before it has DMAed the message into memory. This creates a
+ * potential message loss if the HCA is disabled for any reason between when it
+ * sends the ack and before the message is DMAed and processed. This is only a
+ * potential issue if another HCA is available for fail-over.
+ *
+ * When the remote host receives our ack they'll free the sent message from
+ * their send queue. To decrease the latency of this we always send an ack
+ * immediately after we've received messages.
+ *
+ * For simplicity, we only have one ack in flight at a time. This puts
+ * pressure on senders to have deep enough send queues to absorb the latency of
+ * a single ack frame being in flight. This might not be good enough.
+ *
+ * This is implemented by having a long-lived send_wr and sge which point to a
+ * statically allocated ack frame. This ack wr does not fall under the ring
+ * accounting that the tx and rx wrs do. The QP attribute specifically makes
+ * room for it beyond the ring size. Send completion notices its special
+ * wr_id and avoids working with the ring in that case.
+ */
+static void
+rdsv3_ib_set_ack(struct rdsv3_ib_connection *ic, uint64_t seq,
+ int ack_required)
+{
+ RDSV3_DPRINTF4("rdsv3_ib_set_ack", "ic: %p, seq: %lld ack: %d",
+ ic, seq, ack_required);
+
+ mutex_enter(&ic->i_ack_lock);
+ ic->i_ack_next = seq;
+ if (ack_required)
+ set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+ mutex_exit(&ic->i_ack_lock);
+}
+
+static uint64_t
+rdsv3_ib_get_ack(struct rdsv3_ib_connection *ic)
+{
+ uint64_t seq;
+
+ RDSV3_DPRINTF4("rdsv3_ib_get_ack", "ic: %p", ic);
+
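+	/*
+	 * Clear REQUESTED before sampling i_ack_next: a sequence number
+	 * set concurrently re-arms REQUESTED and triggers a later ack
+	 * instead of being lost.
+	 */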
+ clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+
+ mutex_enter(&ic->i_ack_lock);
+ seq = ic->i_ack_next;
+ mutex_exit(&ic->i_ack_lock);
+
+ return (seq);
+}
+
+static void
+rdsv3_ib_send_ack(struct rdsv3_ib_connection *ic, unsigned int adv_credits)
+{
+ struct rdsv3_header *hdr = ic->i_ack;
+ uint64_t seq;
+ int ret;
+
+ RDSV3_DPRINTF4("rdsv3_ib_send_ack", "ic: %p adv_credits: %d",
+ ic, adv_credits);
+
+ seq = rdsv3_ib_get_ack(ic);
+
+ RDSV3_DPRINTF4("rdsv3_ib_send_ack", "send_ack: ic %p ack %llu",
+ ic, (unsigned long long) seq);
+ rdsv3_message_populate_header(hdr, 0, 0, 0);
+ hdr->h_ack = htonll(seq);
+ hdr->h_credit = adv_credits;
+ rdsv3_message_make_checksum(hdr);
+ ic->i_ack_queued = jiffies;
+
+ ret = ibt_post_send(RDSV3_QP2CHANHDL(ic->i_cm_id->qp), &ic->i_ack_wr, 1,
+ NULL);
+ if (ret) {
+ /*
+ * Failed to send. Release the WR, and
+ * force another ACK.
+ */
+ clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
+ set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+ rdsv3_ib_stats_inc(s_ib_ack_send_failure);
+#if 1
+ RDSV3_DPRINTF2("rdsv3_ib_send_ack", "ibt_post_send FAIL");
+#else
+ /* Need to finesse this later. */
+ RDSV3_PANIC();
+#endif
+ } else {
+ rdsv3_ib_stats_inc(s_ib_ack_sent);
+ }
+ RDSV3_DPRINTF4("rdsv3_ib_send_ack", "Return: ic: %p adv_credits: %d",
+ ic, adv_credits);
+}
+
+/*
+ * There are 3 ways of getting acknowledgements to the peer:
+ * 1. We call rdsv3_ib_attempt_ack from the recv completion handler
+ * to send an ACK-only frame.
+ * However, there can be only one such frame in the send queue
+ * at any time, so we may have to postpone it.
+ * 2. When another (data) packet is transmitted while there's
+ * an ACK in the queue, we piggyback the ACK sequence number
+ * on the data packet.
+ * 3. If the ACK WR is done sending, we get called from the
+ * send queue completion handler, and check whether there's
+ * another ACK pending (postponed because the WR was on the
+ * queue). If so, we transmit it.
+ *
+ * We maintain 2 variables:
+ * - i_ack_flags, which keeps track of whether the ACK WR
+ * is currently in the send queue or not (IB_ACK_IN_FLIGHT)
+ * - i_ack_next, which is the last sequence number we received
+ *
+ * Potentially, send queue and receive queue handlers can run concurrently.
+ * It would be nice to not have to use a spinlock to synchronize things,
+ * but the one problem that rules this out is that 64bit updates are
+ * not atomic on all platforms. Things would be a lot simpler if
+ * we had atomic64 or maybe cmpxchg64 everywhere.
+ *
+ * Reconnecting complicates this picture just slightly. When we
+ * reconnect, we may be seeing duplicate packets. The peer
+ * is retransmitting them, because it hasn't seen an ACK for
+ * them. It is important that we ACK these.
+ *
+ * ACK mitigation adds a header flag "ACK_REQUIRED"; any packet with
+ * this flag set *MUST* be acknowledged immediately.
+ */
+
+/*
+ * When we get here, we're called from the recv queue handler.
+ * Check whether we ought to transmit an ACK.
+ */
+void
+rdsv3_ib_attempt_ack(struct rdsv3_ib_connection *ic)
+{
+ unsigned int adv_credits;
+
+ RDSV3_DPRINTF4("rdsv3_ib_attempt_ack", "ic: %p", ic);
+
+ if (!test_bit(IB_ACK_REQUESTED, &ic->i_ack_flags))
+ return;
+
+ if (test_and_set_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags)) {
+ rdsv3_ib_stats_inc(s_ib_ack_send_delayed);
+ return;
+ }
+
+ /* Can we get a send credit? */
+ if (!rdsv3_ib_send_grab_credits(ic, 1, &adv_credits, 0,
+ RDSV3_MAX_ADV_CREDIT)) {
+ rdsv3_ib_stats_inc(s_ib_tx_throttle);
+ clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
+ return;
+ }
+
+ clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+ rdsv3_ib_send_ack(ic, adv_credits);
+
+ RDSV3_DPRINTF4("rdsv3_ib_attempt_ack", "Return: ic: %p", ic);
+}
+
+/*
+ * We get here from the send completion handler, when the
+ * adapter tells us the ACK frame was sent.
+ */
+void
+rdsv3_ib_ack_send_complete(struct rdsv3_ib_connection *ic)
+{
+ RDSV3_DPRINTF4("rdsv3_ib_ack_send_complete", "ic: %p", ic);
+ clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
+ rdsv3_ib_attempt_ack(ic);
+}
+
+/*
+ * This is called by the regular xmit code when it wants to piggyback
+ * an ACK on an outgoing frame.
+ */
+uint64_t
+rdsv3_ib_piggyb_ack(struct rdsv3_ib_connection *ic)
+{
+ RDSV3_DPRINTF4("rdsv3_ib_piggyb_ack", "ic: %p", ic);
+ if (test_and_clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags)) {
+ rdsv3_ib_stats_inc(s_ib_ack_send_piggybacked);
+ }
+ return (rdsv3_ib_get_ack(ic));
+}
+
+static struct rdsv3_header *
+rdsv3_ib_get_header(struct rdsv3_connection *conn,
+ struct rdsv3_ib_recv_work *recv,
+ uint32_t data_len)
+{
+ struct rdsv3_ib_connection *ic = conn->c_transport_data;
+ void *hdr_buff = &ic->i_recv_hdrs[recv - ic->i_recvs];
+
+ RDSV3_DPRINTF4("rdsv3_ib_get_header", "conn: %p, recv: %p len: %d",
+ conn, recv, data_len);
+
+ /*
+ * Support header at the front (RDS 3.1+) as well as header-at-end.
+ *
+ * Cases:
+ * 1) header all in header buff (great!)
+ * 2) header all in data page (copy all to header buff)
+ * 3) header split across hdr buf + data page
+ * (move bit in hdr buff to end before copying other bit from
+ * data page)
+ */
+ if (conn->c_version > RDS_PROTOCOL_3_0 || data_len == RDSV3_FRAG_SIZE)
+ return (hdr_buff);
+ /*
+ * XXX - Need to discuss the support for version < RDS_PROTOCOL_3_1.
+ */
+ if (conn->c_version == RDS_PROTOCOL_3_0)
+ return (hdr_buff);
+
+ /* version < RDS_PROTOCOL_3_0 */
+ RDSV3_DPRINTF2("rdsv3_ib_get_header",
+ "NULL header (version: 0x%x, data_len: %d)", conn->c_version,
+ data_len);
+ return (NULL);
+}
+
+/*
+ * It's kind of lame that we're copying from the posted receive pages into
+ * long-lived bitmaps. We could have posted the bitmaps and rdma written into
+ * them. But receiving new congestion bitmaps should be a *rare* event, so
+ * hopefully we won't need to invest that complexity in making it more
+ * efficient. By copying we can share a simpler core with TCP which has to
+ * copy.
+ */
+static void
+rdsv3_ib_cong_recv(struct rdsv3_connection *conn,
+ struct rdsv3_ib_incoming *ibinc)
+{
+ struct rdsv3_cong_map *map;
+ unsigned int map_off;
+ unsigned int map_page;
+ struct rdsv3_page_frag *frag;
+ unsigned long frag_off;
+ unsigned long to_copy;
+ unsigned long copied;
+ uint64_t uncongested = 0;
+ caddr_t addr;
+
+ RDSV3_DPRINTF4("rdsv3_ib_cong_recv", "conn: %p, ibinc: %p",
+ conn, ibinc);
+
+ /* catch completely corrupt packets */
+ if (ntohl(ibinc->ii_inc.i_hdr.h_len) != RDSV3_CONG_MAP_BYTES)
+ return;
+
+ map = conn->c_fcong;
+ map_page = 0;
+ map_off = 0;
+
+ frag = list_head(&ibinc->ii_frags);
+ frag_off = 0;
+
+ copied = 0;
+
+ while (copied < RDSV3_CONG_MAP_BYTES) {
+ uint64_t *src, *dst;
+ unsigned int k;
+
+ to_copy = min(RDSV3_FRAG_SIZE - frag_off, PAGE_SIZE - map_off);
+ ASSERT(!(to_copy & 7)); /* Must be 64bit aligned. */
+
+ addr = frag->f_page + frag->f_offset;
+
+ src = (uint64_t *)(addr + frag_off);
+ dst = (uint64_t *)(map->m_page_addrs[map_page] + map_off);
+ RDSV3_DPRINTF4("rdsv3_ib_cong_recv",
+ "src: %p dst: %p copied: %d", src, dst, copied);
+ for (k = 0; k < to_copy; k += 8) {
+ /*
+			 * Record ports that became uncongested, i.e.,
+			 * bits that changed from 0 to 1.
+ */
+ uncongested |= ~(*src) & *dst;
+ *dst++ = *src++;
+ }
+
+ copied += to_copy;
+ RDSV3_DPRINTF4("rdsv3_ib_cong_recv",
+ "src: %p dst: %p copied: %d", src, dst, copied);
+
+ map_off += to_copy;
+ if (map_off == PAGE_SIZE) {
+ map_off = 0;
+ map_page++;
+ }
+
+ frag_off += to_copy;
+ if (frag_off == RDSV3_FRAG_SIZE) {
+ frag = list_next(&ibinc->ii_frags, frag);
+ frag_off = 0;
+ }
+ }
+
+#if 0
+XXX
+ /* the congestion map is in little endian order */
+ uncongested = le64_to_cpu(uncongested);
+#endif
+
+ rdsv3_cong_map_updated(map, uncongested);
+
+ RDSV3_DPRINTF4("rdsv3_ib_cong_recv", "Return: conn: %p, ibinc: %p",
+ conn, ibinc);
+}
+
+/*
+ * Rings are posted with all the allocations they'll need to queue the
+ * incoming message to the receiving socket so this can't fail.
+ * All fragments start with a header, so we can make sure we're not receiving
+ * garbage, and we can tell a small 8 byte fragment from an ACK frame.
+ */
+struct rdsv3_ib_ack_state {
+ uint64_t ack_next;
+ uint64_t ack_recv;
+ unsigned int ack_required:1;
+ unsigned int ack_next_valid:1;
+ unsigned int ack_recv_valid:1;
+};
+
+static void
+rdsv3_ib_process_recv(struct rdsv3_connection *conn,
+ struct rdsv3_ib_recv_work *recv, uint32_t data_len,
+ struct rdsv3_ib_ack_state *state)
+{
+ struct rdsv3_ib_connection *ic = conn->c_transport_data;
+ struct rdsv3_ib_incoming *ibinc = ic->i_ibinc;
+ struct rdsv3_header *ihdr, *hdr;
+
+ /* XXX shut down the connection if port 0,0 are seen? */
+
+ RDSV3_DPRINTF5("rdsv3_ib_process_recv",
+ "ic %p ibinc %p recv %p byte len %u", ic, ibinc, recv, data_len);
+
+ if (data_len < sizeof (struct rdsv3_header)) {
+ RDSV3_DPRINTF2("rdsv3_ib_process_recv",
+ "incoming message from %u.%u.%u.%u didn't include a "
+ "header, disconnecting and reconnecting",
+ NIPQUAD(conn->c_faddr));
+ rdsv3_conn_drop(conn);
+ return;
+ }
+ data_len -= sizeof (struct rdsv3_header);
+
+ if ((ihdr = rdsv3_ib_get_header(conn, recv, data_len)) == NULL) {
+ RDSV3_DPRINTF2("rdsv3_ib_process_recv", "incoming message "
+ "from %u.%u.%u.%u didn't have a proper version (0x%x) or"
+ "data_len (0x%x), disconnecting and "
+ "reconnecting",
+ NIPQUAD(conn->c_faddr), conn->c_version, data_len);
+ rdsv3_conn_drop(conn);
+ return;
+ }
+
+ /* Validate the checksum. */
+ if (!rdsv3_message_verify_checksum(ihdr)) {
+ RDSV3_DPRINTF2("rdsv3_ib_process_recv", "incoming message "
+ "from %u.%u.%u.%u has corrupted header - "
+ "forcing a reconnect",
+ NIPQUAD(conn->c_faddr));
+ rdsv3_conn_drop(conn);
+ rdsv3_stats_inc(s_recv_drop_bad_checksum);
+ return;
+ }
+
+ /* Process the ACK sequence which comes with every packet */
+ state->ack_recv = ntohll(ihdr->h_ack);
+ state->ack_recv_valid = 1;
+
+ /* Process the credits update if there was one */
+ if (ihdr->h_credit)
+ rdsv3_ib_send_add_credits(conn, ihdr->h_credit);
+
+ if (ihdr->h_sport == 0 && ihdr->h_dport == 0 && data_len == 0) {
+ /*
+		 * This is an ACK-only packet. It gets special treatment
+		 * here because, historically, ACKs were rather special
+		 * beasts.
+ */
+ rdsv3_ib_stats_inc(s_ib_ack_received);
+
+ /*
+		 * Usually the frags make their way on to incs and are then
+		 * freed as the inc is freed. We don't go that route, so we
+		 * have to drop the page ref ourselves. We can't just leave
+		 * the page on the recv because that confuses the dma mapping
+		 * of pages and each recv's use of a partial page. We can
+		 * leave the frag, though, it will be reused.
+ *
+ * FIXME: Fold this into the code path below.
+ */
+ rdsv3_ib_frag_drop_page(recv->r_frag);
+ return;
+ }
+
+ /*
+ * If we don't already have an inc on the connection then this
+	 * fragment has a header and starts a message. Copy its header
+ * into the inc and save the inc so we can hang upcoming fragments
+ * off its list.
+ */
+ if (ibinc == NULL) {
+ ibinc = recv->r_ibinc;
+ recv->r_ibinc = NULL;
+ ic->i_ibinc = ibinc;
+
+ hdr = &ibinc->ii_inc.i_hdr;
+ (void) memcpy(hdr, ihdr, sizeof (*hdr));
+ ic->i_recv_data_rem = ntohl(hdr->h_len);
+
+ RDSV3_DPRINTF5("rdsv3_ib_process_recv",
+ "ic %p ibinc %p rem %u flag 0x%x", ic, ibinc,
+ ic->i_recv_data_rem, hdr->h_flags);
+ } else {
+ hdr = &ibinc->ii_inc.i_hdr;
+ /*
+ * We can't just use memcmp here; fragments of a
+ * single message may carry different ACKs
+ */
+ if (hdr->h_sequence != ihdr->h_sequence ||
+ hdr->h_len != ihdr->h_len ||
+ hdr->h_sport != ihdr->h_sport ||
+ hdr->h_dport != ihdr->h_dport) {
+ RDSV3_DPRINTF2("rdsv3_ib_process_recv",
+ "fragment header mismatch; forcing reconnect");
+ rdsv3_conn_drop(conn);
+ return;
+ }
+ }
+
+ list_insert_tail(&ibinc->ii_frags, recv->r_frag);
+ recv->r_frag = NULL;
+
+ if (ic->i_recv_data_rem > RDSV3_FRAG_SIZE)
+ ic->i_recv_data_rem -= RDSV3_FRAG_SIZE;
+ else {
+ ic->i_recv_data_rem = 0;
+ ic->i_ibinc = NULL;
+
+ if (ibinc->ii_inc.i_hdr.h_flags == RDSV3_FLAG_CONG_BITMAP)
+ rdsv3_ib_cong_recv(conn, ibinc);
+ else {
+ rdsv3_recv_incoming(conn, conn->c_faddr, conn->c_laddr,
+ &ibinc->ii_inc, KM_NOSLEEP);
+ state->ack_next = ntohll(hdr->h_sequence);
+ state->ack_next_valid = 1;
+ }
+
+ /*
+ * Evaluate the ACK_REQUIRED flag *after* we received
+ * the complete frame, and after bumping the next_rx
+ * sequence.
+ */
+ if (hdr->h_flags & RDSV3_FLAG_ACK_REQUIRED) {
+ rdsv3_stats_inc(s_recv_ack_required);
+ state->ack_required = 1;
+ }
+
+ rdsv3_inc_put(&ibinc->ii_inc);
+ }
+
+ RDSV3_DPRINTF4("rdsv3_ib_process_recv",
+ "Return: conn: %p recv: %p len: %d state: %p",
+ conn, recv, data_len, state);
+}
+
+/*
+ * Plucking the oldest entry from the ring can be done concurrently with
+ * the thread refilling the ring. Each ring operation is protected by
+ * spinlocks and the transient state of refilling doesn't change the
+ * recording of which entry is oldest.
+ *
+ * This relies on IB only calling one cq comp_handler for each cq so that
+ * there will only be one caller of rdsv3_recv_incoming() per RDS connection.
+ */
+
+void
+rdsv3_ib_recv_cq_comp_handler(struct ib_cq *cq, void *context)
+{
+ struct rdsv3_connection *conn = context;
+ struct rdsv3_ib_connection *ic = conn->c_transport_data;
+
+ RDSV3_DPRINTF4("rdsv3_ib_recv_cq_comp_handler",
+ "Enter(conn: %p cq: %p)", conn, cq);
+
+ rdsv3_ib_stats_inc(s_ib_rx_cq_call);
+
+ (void) ddi_taskq_dispatch(ic->i_recv_tasklet, rdsv3_ib_recv_tasklet_fn,
+ (void *)ic, DDI_SLEEP);
+}
+
+static inline void
+rdsv3_poll_cq(struct rdsv3_ib_connection *ic, struct rdsv3_ib_ack_state *state)
+{
+ struct rdsv3_connection *conn = ic->conn;
+ ibt_wc_t wc;
+ struct rdsv3_ib_recv_work *recv;
+ uint_t polled;
+
+ while (ibt_poll_cq(RDSV3_CQ2CQHDL(ic->i_recv_cq), &wc, 1, &polled) ==
+ IBT_SUCCESS) {
+ RDSV3_DPRINTF5("rdsv3_ib_recv_cq_comp_handler",
+ "rwc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
+ (unsigned long long)wc.wc_id, wc.wc_status,
+ wc.wc_bytes_xfer, ntohl(wc.wc_immed_data));
+ rdsv3_ib_stats_inc(s_ib_rx_cq_event);
+
+ recv = &ic->i_recvs[rdsv3_ib_ring_oldest(&ic->i_recv_ring)];
+
+ rdsv3_ib_recv_unmap_page(ic, recv);
+
+ /*
+ * Also process recvs in connecting state because it is possible
+ * to get a recv completion _before_ the rdmacm ESTABLISHED
+ * event is processed.
+ */
+ if (rdsv3_conn_up(conn) || rdsv3_conn_connecting(conn)) {
+ /*
+ * We expect errors as the qp is drained during
+ * shutdown
+ */
+ if (wc.wc_status == IBT_WC_SUCCESS) {
+ rdsv3_ib_process_recv(conn, recv,
+ wc.wc_bytes_xfer, state);
+ } else {
+ RDSV3_DPRINTF2("rdsv3_ib_recv_cq_comp_handler",
+ "recv completion on "
+ "%u.%u.%u.%u had status %u, "
+ "disconnecting and reconnecting\n",
+ NIPQUAD(conn->c_faddr),
+ wc.wc_status);
+ rdsv3_conn_drop(conn);
+ }
+ }
+
+ rdsv3_ib_ring_free(&ic->i_recv_ring, 1);
+ }
+}
+
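+/*
+ * CPU to which each connection's receive tasklet thread binds itself.
+ * It is only read at dispatch time, so it is effectively a tunable.
+ */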
+static processorid_t rdsv3_taskq_bind_cpuid = 0;
+void
+rdsv3_ib_recv_tasklet_fn(void *data)
+{
+ struct rdsv3_ib_connection *ic = (struct rdsv3_ib_connection *)data;
+ struct rdsv3_connection *conn = ic->conn;
+ struct rdsv3_ib_ack_state state = { 0, };
+ cpu_t *cp;
+
+ RDSV3_DPRINTF4("rdsv3_ib_recv_tasklet_fn", "Enter: ic: %p", ic);
+
+ /* If not already bound, bind this thread to a CPU */
+ if (ic->i_recv_tasklet_cpuid != rdsv3_taskq_bind_cpuid) {
+ cp = cpu[rdsv3_taskq_bind_cpuid];
+ mutex_enter(&cpu_lock);
+ if (cpu_is_online(cp)) {
+ if (ic->i_recv_tasklet_cpuid >= 0)
+ thread_affinity_clear(curthread);
+ thread_affinity_set(curthread, rdsv3_taskq_bind_cpuid);
+ ic->i_recv_tasklet_cpuid = rdsv3_taskq_bind_cpuid;
+ }
+ mutex_exit(&cpu_lock);
+ }
+
+ rdsv3_poll_cq(ic, &state);
+ (void) ibt_enable_cq_notify(RDSV3_CQ2CQHDL(ic->i_recv_cq),
+ IBT_NEXT_SOLICITED);
+ rdsv3_poll_cq(ic, &state);
+
+ if (state.ack_next_valid)
+ rdsv3_ib_set_ack(ic, state.ack_next, state.ack_required);
+ if (state.ack_recv_valid && state.ack_recv > ic->i_ack_recv) {
+ rdsv3_send_drop_acked(conn, state.ack_recv, NULL);
+ ic->i_ack_recv = state.ack_recv;
+ }
+ if (rdsv3_conn_up(conn))
+ rdsv3_ib_attempt_ack(ic);
+
+ /*
+ * If we ever end up with a really empty receive ring, we're
+ * in deep trouble, as the sender will definitely see RNR
+ * timeouts.
+ */
+ if (rdsv3_ib_ring_empty(&ic->i_recv_ring))
+ rdsv3_ib_stats_inc(s_ib_rx_ring_empty);
+
+ /*
+ * If the ring is running low, then schedule the thread to refill.
+ */
+ if (rdsv3_ib_ring_low(&ic->i_recv_ring) &&
+ (rdsv3_conn_up(conn) || rdsv3_conn_connecting(conn)))
+ rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_recv_w, 0);
+
+ RDSV3_DPRINTF4("rdsv3_ib_recv_tasklet_fn", "Return: ic: %p", ic);
+}
+
+int
+rdsv3_ib_recv(struct rdsv3_connection *conn)
+{
+ struct rdsv3_ib_connection *ic = conn->c_transport_data;
+ int ret = 0;
+
+ RDSV3_DPRINTF4("rdsv3_ib_recv", "conn %p\n", conn);
+
+ /*
+ * If we get a temporary posting failure in this context then
+ * we're really low and we want the caller to back off for a bit.
+ */
+ mutex_enter(&ic->i_recv_mutex);
+ if (rdsv3_ib_recv_refill(conn, KM_NOSLEEP, 0, 0))
+ ret = -ENOMEM;
+ else
+ rdsv3_ib_stats_inc(s_ib_rx_refill_from_thread);
+ mutex_exit(&ic->i_recv_mutex);
+
+ if (rdsv3_conn_up(conn))
+ rdsv3_ib_attempt_ack(ic);
+
+ RDSV3_DPRINTF4("rdsv3_ib_recv", "Return: conn: %p", conn);
+
+ return (ret);
+}
+
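+/*
+ * Upper bound on the memory posted for receives; converted into a
+ * fragment count in rdsv3_ib_recv_init() below.
+ */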
+uint_t MaxRecvMemory = 128 * 1024 * 1024;
+
+int
+rdsv3_ib_recv_init(void)
+{
+ int ret = -ENOMEM;
+
+ RDSV3_DPRINTF4("rdsv3_ib_recv_init", "Enter");
+
+ /* XXX - hard code it to 128 MB */
+ rdsv3_ib_sysctl_max_recv_allocation = MaxRecvMemory / RDSV3_FRAG_SIZE;
+
+ rdsv3_ib_incoming_slab = kmem_cache_create("rdsv3_ib_incoming",
+ sizeof (struct rdsv3_ib_incoming), 0, NULL, NULL, NULL,
+ NULL, NULL, 0);
+ if (rdsv3_ib_incoming_slab == NULL)
+ goto out;
+
+ rdsv3_ib_frag_slab = kmem_cache_create("rdsv3_ib_frag",
+ sizeof (struct rdsv3_page_frag),
+ 0, NULL, NULL, NULL, NULL, NULL, 0);
+ if (rdsv3_ib_frag_slab == NULL)
+ kmem_cache_destroy(rdsv3_ib_incoming_slab);
+ else
+ ret = 0;
+
+ RDSV3_DPRINTF4("rdsv3_ib_recv_init", "Return");
+out:
+ return (ret);
+}
+
+void
+rdsv3_ib_recv_exit(void)
+{
+ RDSV3_DPRINTF4("rdsv3_ib_recv_exit", "Enter");
+ kmem_cache_destroy(rdsv3_ib_incoming_slab);
+ kmem_cache_destroy(rdsv3_ib_frag_slab);
+ RDSV3_DPRINTF4("rdsv3_ib_recv_exit", "Return");
+}
diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/ib_ring.c b/usr/src/uts/common/io/ib/clients/rdsv3/ib_ring.c
new file mode 100644
index 0000000000..889cc016d8
--- /dev/null
+++ b/usr/src/uts/common/io/ib/clients/rdsv3/ib_ring.c
@@ -0,0 +1,208 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <sys/rds.h>
+
+#include <sys/ib/clients/rdsv3/rdsv3.h>
+#include <sys/ib/clients/rdsv3/ib.h>
+#include <sys/ib/clients/rdsv3/rdsv3_debug.h>
+
+/*
+ * Locking for IB rings.
+ * We assume that allocation is always protected by a mutex
+ * in the caller (this is a valid assumption for the current
+ * implementation).
+ *
+ * Freeing always happens in an interrupt, and hence only
+ * races with allocations, but not with other free()s.
+ *
+ * The interaction between allocation and freeing is that
+ * the alloc code has to determine the number of free entries.
+ * To this end, we maintain two counters; an allocation counter
+ * and a free counter. Both are allowed to run freely, and wrap
+ * around.
+ * The number of used entries is always (alloc_ctr - free_ctr) % NR.
+ *
+ * The current implementation makes free_ctr atomic. When the
+ * caller finds that an allocation has failed, it should set an
+ * "alloc fail" bit and retry the allocation. The "alloc fail" bit
+ * essentially tells the CQ completion handlers to wake it up after
+ * freeing some more entries.
+ */
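+
+/*
+ * A worked example (illustrative values only): with w_nr = 8,
+ * w_alloc_ctr = 0xfffffffe and w_free_ctr = 0xfffffffa, the
+ * unsigned subtraction 0xfffffffe - 0xfffffffa = 4 still yields
+ * the correct number of used entries even though both counters
+ * are about to wrap. Because the difference never exceeds w_nr,
+ * __rdsv3_ib_ring_used() below can get away with a plain uint32_t
+ * subtraction in place of the modulo in the formula above.
+ */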
+
+/*
+ * This only happens on shutdown.
+ */
+rdsv3_wait_queue_t rdsv3_ib_ring_empty_wait;
+
+void
+rdsv3_ib_ring_init(struct rdsv3_ib_work_ring *ring, uint32_t nr)
+{
+ (void) memset(ring, 0, sizeof (*ring));
+ ring->w_nr = nr;
+ RDSV3_DPRINTF5("rdsv3_ib_ring_init", "ring %p nr %u", ring, ring->w_nr);
+}
+
+static inline uint32_t
+__rdsv3_ib_ring_used(struct rdsv3_ib_work_ring *ring)
+{
+ uint32_t diff;
+
+ /* This assumes that atomic_t has at least as many bits as uint32_t */
+ diff = ring->w_alloc_ctr - (uint32_t)atomic_get(&ring->w_free_ctr);
+ ASSERT(diff <= ring->w_nr);
+
+ return (diff);
+}
+
+void
+rdsv3_ib_ring_resize(struct rdsv3_ib_work_ring *ring, uint32_t nr)
+{
+ /*
+ * We only ever get called from the connection setup code,
+ * prior to creating the QP.
+ */
+ ASSERT(!__rdsv3_ib_ring_used(ring));
+ ring->w_nr = nr;
+}
+
+static int
+__rdsv3_ib_ring_empty(struct rdsv3_ib_work_ring *ring)
+{
+ return (__rdsv3_ib_ring_used(ring) == 0);
+}
+
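+/*
+ * A usage sketch (not from the original source): a sender wanting
+ * 3 entries calls rdsv3_ib_ring_alloc(ring, 3, &pos); if only 2
+ * are free it is granted 2, with pos naming the first of them,
+ * and it must cope with the partial allocation. A return of 0
+ * means the ring is full, at which point the caller sets the
+ * "alloc fail" bit described above and retries later.
+ */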
+uint32_t
+rdsv3_ib_ring_alloc(struct rdsv3_ib_work_ring *ring, uint32_t val,
+ uint32_t *pos)
+{
+ uint32_t ret = 0, avail;
+
+ avail = ring->w_nr - __rdsv3_ib_ring_used(ring);
+
+ RDSV3_DPRINTF5("rdsv3_ib_ring_alloc",
+ "ring %p val %u next %u free %u", ring, val,
+ ring->w_alloc_ptr, avail);
+
+ if (val && avail) {
+ ret = min(val, avail);
+ *pos = ring->w_alloc_ptr;
+
+ ring->w_alloc_ptr = (ring->w_alloc_ptr + ret) % ring->w_nr;
+ ring->w_alloc_ctr += ret;
+ }
+
+ return (ret);
+}
+
+void
+rdsv3_ib_ring_free(struct rdsv3_ib_work_ring *ring, uint32_t val)
+{
+ ring->w_free_ptr = (ring->w_free_ptr + val) % ring->w_nr;
+ atomic_add_32(&ring->w_free_ctr, val);
+
+ if (__rdsv3_ib_ring_empty(ring))
+ rdsv3_wake_up(&rdsv3_ib_ring_empty_wait);
+}
+
+void
+rdsv3_ib_ring_unalloc(struct rdsv3_ib_work_ring *ring, uint32_t val)
+{
+ ring->w_alloc_ptr = (ring->w_alloc_ptr - val) % ring->w_nr;
+ ring->w_alloc_ctr -= val;
+}
+
+int
+rdsv3_ib_ring_empty(struct rdsv3_ib_work_ring *ring)
+{
+ return (__rdsv3_ib_ring_empty(ring));
+}
+
+int
+rdsv3_ib_ring_low(struct rdsv3_ib_work_ring *ring)
+{
+ return (__rdsv3_ib_ring_used(ring) <= (ring->w_nr >> 1));
+}
+
+/*
+ * Returns the oldest allocated ring entry. This will be the next one
+ * freed. This must not be called when no entries are allocated.
+ */
+uint32_t
+rdsv3_ib_ring_oldest(struct rdsv3_ib_work_ring *ring)
+{
+ return (ring->w_free_ptr);
+}
+
+/*
+ * Returns the number of completed work requests, allowing for the
+ * ring indices wrapping around.
+ */
+
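+/*
+ * A worked example (illustrative values only): with w_nr = 8,
+ * oldest = 6 and wr_id = 1, the completion index has wrapped, so
+ * completed = 8 - 6 + 1 + 1 = 4, covering ring entries 6, 7, 0
+ * and 1.
+ */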
+uint32_t
+rdsv3_ib_ring_completed(struct rdsv3_ib_work_ring *ring,
+ uint32_t wr_id, uint32_t oldest)
+{
+ uint32_t ret;
+
+ if (oldest <= (unsigned long long)wr_id)
+ ret = (unsigned long long)wr_id - oldest + 1;
+ else
+ ret = ring->w_nr - oldest + (unsigned long long)wr_id + 1;
+
+ RDSV3_DPRINTF5("rdsv3_ib_ring_completed",
+ "ring %p ret %u wr_id %u oldest %u", ring, ret, wr_id, oldest);
+ return (ret);
+}
diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/ib_send.c b/usr/src/uts/common/io/ib/clients/rdsv3/ib_send.c
new file mode 100644
index 0000000000..9a8ba2fd6c
--- /dev/null
+++ b/usr/src/uts/common/io/ib/clients/rdsv3/ib_send.c
@@ -0,0 +1,1148 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <sys/rds.h>
+
+#include <sys/ib/clients/rdsv3/rdsv3.h>
+#include <sys/ib/clients/rdsv3/rdma.h>
+#include <sys/ib/clients/rdsv3/ib.h>
+#include <sys/ib/clients/rdsv3/rdsv3_debug.h>
+
+static void
+rdsv3_ib_send_rdma_complete(struct rdsv3_message *rm,
+ int wc_status)
+{
+ int notify_status;
+
+ RDSV3_DPRINTF4("rdsv3_ib_send_rdma_complete", "rm: %p, wc_status: %d",
+ rm, wc_status);
+
+ switch (wc_status) {
+ case IBT_WC_WR_FLUSHED_ERR:
+ return;
+
+ case IBT_WC_SUCCESS:
+ notify_status = RDSV3_RDMA_SUCCESS;
+ break;
+
+ case IBT_WC_REMOTE_ACCESS_ERR:
+ notify_status = RDSV3_RDMA_REMOTE_ERROR;
+ break;
+
+ default:
+ notify_status = RDSV3_RDMA_OTHER_ERROR;
+ break;
+ }
+ rdsv3_rdma_send_complete(rm, notify_status);
+
+ RDSV3_DPRINTF4("rdsv3_ib_send_rdma_complete", "rm: %p, wc_status: %d",
+ rm, wc_status);
+}
+
+static void rdsv3_ib_dma_unmap_sg_rdma(struct ib_device *dev,
+ uint_t num, struct rdsv3_rdma_sg scat[]);
+
+void
+rdsv3_ib_send_unmap_rdma(struct rdsv3_ib_connection *ic,
+ struct rdsv3_rdma_op *op)
+{
+ RDSV3_DPRINTF4("rdsv3_ib_send_unmap_rdma", "ic: %p, op: %p", ic, op);
+ if (op->r_mapped) {
+ op->r_mapped = 0;
+ if (ic->i_cm_id) {
+ rdsv3_ib_dma_unmap_sg_rdma(ic->i_cm_id->device,
+ op->r_nents, op->r_rdma_sg);
+ } else {
+ rdsv3_ib_dma_unmap_sg_rdma((struct ib_device *)NULL,
+ op->r_nents, op->r_rdma_sg);
+ }
+ }
+}
+
+static void
+rdsv3_ib_send_unmap_rm(struct rdsv3_ib_connection *ic,
+ struct rdsv3_ib_send_work *send,
+ int wc_status)
+{
+ struct rdsv3_message *rm = send->s_rm;
+
+ RDSV3_DPRINTF4("rdsv3_ib_send_unmap_rm", "ic %p send %p rm %p\n",
+ ic, send, rm);
+
+ rdsv3_ib_dma_unmap_sg(ic->i_cm_id->device,
+ rm->m_sg, rm->m_nents);
+
+ if (rm->m_rdma_op != NULL) {
+ rdsv3_ib_send_unmap_rdma(ic, rm->m_rdma_op);
+
+ /*
+ * If the user asked for a completion notification on this
+ * message, we can implement three different semantics:
+ * 1. Notify when we received the ACK on the RDS message
+ * that was queued with the RDMA. This provides reliable
+ * notification of RDMA status at the expense of a one-way
+ * packet delay.
+ * 2. Notify when the IB stack gives us the completion
+ * event for the RDMA operation.
+ * 3. Notify when the IB stack gives us the completion
+ * event for the accompanying RDS messages.
+ * Here, we implement approach #3. To implement approach #2,
+ * call rdsv3_rdma_send_complete from the cq_handler. To
+ * implement #1, don't call rdsv3_rdma_send_complete at all,
+ * and fall back to the notify handling in the ACK processing
+ * code.
+ *
+ * Note: There's no need to explicitly sync any RDMA buffers
+ * using ib_dma_sync_sg_for_cpu - the completion for the RDMA
+ * operation itself unmapped the RDMA buffers, which takes
+ * care of syncing.
+ */
+ rdsv3_ib_send_rdma_complete(rm, wc_status);
+
+ if (rm->m_rdma_op->r_write)
+ rdsv3_stats_add(s_send_rdma_bytes,
+ rm->m_rdma_op->r_bytes);
+ else
+ rdsv3_stats_add(s_recv_rdma_bytes,
+ rm->m_rdma_op->r_bytes);
+ }
+
+ /*
+ * If anyone waited for this message to get flushed out, wake
+ * them up now
+ */
+ rdsv3_message_unmapped(rm);
+
+ rdsv3_message_put(rm);
+ send->s_rm = NULL;
+}
+
+void
+rdsv3_ib_send_init_ring(struct rdsv3_ib_connection *ic)
+{
+ struct rdsv3_ib_send_work *send;
+ uint32_t i;
+
+ RDSV3_DPRINTF4("rdsv3_ib_send_init_ring", "ic: %p", ic);
+
+ for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) {
+ send->s_rm = NULL;
+ send->s_op = NULL;
+ }
+}
+
+void
+rdsv3_ib_send_clear_ring(struct rdsv3_ib_connection *ic)
+{
+ struct rdsv3_ib_send_work *send;
+ uint32_t i;
+
+ RDSV3_DPRINTF4("rdsv3_ib_send_clear_ring", "ic: %p", ic);
+
+ for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) {
+ if (send->s_opcode == 0xdd)
+ continue;
+ if (send->s_rm)
+ rdsv3_ib_send_unmap_rm(ic, send, IBT_WC_WR_FLUSHED_ERR);
+ if (send->s_op)
+ rdsv3_ib_send_unmap_rdma(ic, send->s_op);
+ }
+
+ RDSV3_DPRINTF4("rdsv3_ib_send_clear_ring", "Return: ic: %p", ic);
+}
+
+/*
+ * The _oldest/_free ring operations here race cleanly with the alloc/unalloc
+ * operations performed in the send path. As the sender allocs and potentially
+ * unallocs the next free entry in the ring it doesn't alter which is
+ * the next to be freed, which is what this is concerned with.
+ */
+void
+rdsv3_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
+{
+ struct rdsv3_connection *conn = context;
+ struct rdsv3_ib_connection *ic = conn->c_transport_data;
+ ibt_wc_t wc;
+ struct rdsv3_ib_send_work *send;
+ uint32_t completed, polled;
+ uint32_t oldest;
+ uint32_t i = 0;
+ int ret;
+
+ RDSV3_DPRINTF4("rdsv3_ib_send_cq_comp_handler", "conn: %p cq: %p",
+ conn, cq);
+
+ rdsv3_ib_stats_inc(s_ib_tx_cq_call);
+ ret = ibt_enable_cq_notify(RDSV3_CQ2CQHDL(cq), IBT_NEXT_COMPLETION);
+ if (ret)
+ RDSV3_DPRINTF2("rdsv3_ib_send_cq_comp_handler",
+ "ib_req_notify_cq send failed: %d", ret);
+
+ while (ibt_poll_cq(RDSV3_CQ2CQHDL(cq), &wc, 1, &polled) ==
+ IBT_SUCCESS) {
+ RDSV3_DPRINTF5("rdsv3_ib_send_cq_comp_handler",
+ "swc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
+ (unsigned long long)wc.wc_id, wc.wc_status,
+ wc.wc_bytes_xfer, ntohl(wc.wc_immed_data));
+ rdsv3_ib_stats_inc(s_ib_tx_cq_event);
+
+ if (wc.wc_id == RDSV3_IB_ACK_WR_ID) {
+ if (ic->i_ack_queued + HZ/2 < jiffies)
+ rdsv3_ib_stats_inc(s_ib_tx_stalled);
+ rdsv3_ib_ack_send_complete(ic);
+ continue;
+ }
+
+ oldest = rdsv3_ib_ring_oldest(&ic->i_send_ring);
+
+ completed = rdsv3_ib_ring_completed(&ic->i_send_ring,
+ wc.wc_id, oldest);
+
+ for (i = 0; i < completed; i++) {
+ send = &ic->i_sends[oldest];
+
+ /*
+ * In the error case, wc.opcode sometimes contains
+ * garbage
+ */
+ switch (send->s_opcode) {
+ case IBT_WRC_SEND:
+ if (send->s_rm)
+ rdsv3_ib_send_unmap_rm(ic, send,
+ wc.wc_status);
+ break;
+ case IBT_WRC_RDMAW:
+ case IBT_WRC_RDMAR:
+ /*
+ * Nothing to be done - the SG list will be
+ * unmapped when the SEND completes.
+ */
+ break;
+ default:
+#ifndef __lock_lint
+ RDSV3_DPRINTF0("rdsv3_ib_send_cq_comp_handler",
+ "RDS/IB: %s: unexpected opcode "
+ "0x%x in WR!",
+ __func__, send->s_opcode);
+#endif
+ break;
+ }
+
+ send->s_opcode = 0xdd;
+ if (send->s_queued + HZ/2 < jiffies)
+ rdsv3_ib_stats_inc(s_ib_tx_stalled);
+
+ /*
+ * If an RDMA operation produced an error, signal
+ * this right away. If we don't, the subsequent SEND
+ * that goes with this RDMA will be canceled with
+ * ERR_WFLUSH, and the application will never learn
+ * that the RDMA failed.
+ */
+ if (wc.wc_status ==
+ IBT_WC_REMOTE_ACCESS_ERR && send->s_op) {
+ struct rdsv3_message *rm;
+
+ rm = rdsv3_send_get_message(conn, send->s_op);
+ if (rm) {
+ if (rm->m_rdma_op != NULL)
+ rdsv3_ib_send_unmap_rdma(ic,
+ rm->m_rdma_op);
+ rdsv3_ib_send_rdma_complete(rm,
+ wc.wc_status);
+ rdsv3_message_put(rm);
+ }
+ }
+
+ oldest = (oldest + 1) % ic->i_send_ring.w_nr;
+ }
+
+ RDSV3_DPRINTF4("rdsv3_ib_send_cq_comp_handler", "compl: %d",
+ completed);
+ rdsv3_ib_ring_free(&ic->i_send_ring, completed);
+
+ if (test_and_clear_bit(RDSV3_LL_SEND_FULL, &conn->c_flags) ||
+ test_bit(0, &conn->c_map_queued))
+ rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_send_w, 0);
+
+ /* We expect errors as the qp is drained during shutdown */
+ if (wc.wc_status != IBT_WC_SUCCESS && rdsv3_conn_up(conn)) {
+ RDSV3_DPRINTF2("rdsv3_ib_send_cq_comp_handler",
+ "send completion on %u.%u.%u.%u "
+ "had status %u, disconnecting and reconnecting\n",
+ NIPQUAD(conn->c_faddr), wc.wc_status);
+ rdsv3_conn_drop(conn);
+ }
+ }
+
+ RDSV3_DPRINTF4("rdsv3_ib_send_cq_comp_handler",
+ "Return: conn: %p, cq: %p", conn, cq);
+}
+
+/*
+ * This is the main function for allocating credits when sending
+ * messages.
+ *
+ * Conceptually, we have two counters:
+ * - send credits: this tells us how many WRs we're allowed
+ * to submit without overrunning the receiver's queue. For
+ * each SEND WR we post, we decrement this by one.
+ *
+ * - posted credits: this tells us how many WRs we recently
+ * posted to the receive queue. This value is transferred
+ * to the peer as a "credit update" in a RDS header field.
+ * Every time we transmit credits to the peer, we subtract
+ * the amount of transferred credits from this counter.
+ *
+ * It is essential that we avoid situations where both sides have
+ * exhausted their send credits, and are unable to send new credits
+ * to the peer. We achieve this by requiring that we send at least
+ * one credit update to the peer before exhausting our credits.
+ * When new credits arrive, we subtract one credit that is withheld
+ * until we've posted new buffers and are ready to transmit these
+ * credits (see rdsv3_ib_send_add_credits below).
+ *
+ * The RDS send code is essentially single-threaded; rdsv3_send_xmit
+ * grabs c_send_lock to ensure exclusive access to the send ring.
+ * However, the ACK sending code is independent and can race with
+ * message SENDs.
+ *
+ * In the send path, we need to update the counters for send credits
+ * and the counter of posted buffers atomically - when we use the
+ * last available credit, we cannot allow another thread to race us
+ * and grab the posted credits counter. Hence, we have to use a
+ * spinlock to protect the credit counter, or use atomics.
+ *
+ * Spinlocks shared between the send and the receive path are bad,
+ * because they create unnecessary delays. An early implementation
+ * using a spinlock showed a 5% degradation in throughput at some
+ * loads.
+ *
+ * This implementation avoids spinlocks completely, putting both
+ * counters into a single atomic, updating that atomic with
+ * atomic_add in the receive path (when fresh credits arrive) and
+ * with atomic_cmpxchg in the send path (when both counters must
+ * change together).
+ */
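+
+/*
+ * A minimal sketch of the lock-free update used below (assuming
+ * the send-credit count lives in one half of the atomic and the
+ * posted-buffer count in the other, per the IB_GET_ and IB_SET_
+ * macros):
+ *
+ *	do {
+ *		oldval = newval = atomic_get(&ic->i_credits);
+ *		... adjust the two halves of newval ...
+ *	} while (atomic_cmpxchg(&ic->i_credits, oldval, newval)
+ *	    != oldval);
+ *
+ * If another thread races in between the read and the cmpxchg,
+ * the exchange fails and the computation is simply redone.
+ */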
+int
+rdsv3_ib_send_grab_credits(struct rdsv3_ib_connection *ic,
+ uint32_t wanted, uint32_t *adv_credits, int need_posted, int max_posted)
+{
+ unsigned int avail, posted, got = 0, advertise;
+ long oldval, newval;
+
+ RDSV3_DPRINTF4("rdsv3_ib_send_grab_credits", "ic: %p, %d %d %d %d",
+ ic, wanted, *adv_credits, need_posted, max_posted);
+
+ *adv_credits = 0;
+ if (!ic->i_flowctl)
+ return (wanted);
+
+try_again:
+ advertise = 0;
+ oldval = newval = atomic_get(&ic->i_credits);
+ posted = IB_GET_POST_CREDITS(oldval);
+ avail = IB_GET_SEND_CREDITS(oldval);
+
+ RDSV3_DPRINTF5("rdsv3_ib_send_grab_credits",
+ "wanted (%u): credits=%u posted=%u\n", wanted, avail, posted);
+
+ /* The last credit must be used to send a credit update. */
+ if (avail && !posted)
+ avail--;
+
+ if (avail < wanted) {
+ struct rdsv3_connection *conn = ic->i_cm_id->context;
+
+ /* Oops, there aren't that many credits left! */
+ set_bit(RDSV3_LL_SEND_FULL, &conn->c_flags);
+ got = avail;
+ } else {
+ /* Sometimes you get what you want, lalala. */
+ got = wanted;
+ }
+ newval -= IB_SET_SEND_CREDITS(got);
+
+ /*
+ * If need_posted is non-zero, then the caller wants
+ * the posted credits returned regardless of whether any
+ * send credits are available.
+ */
+ if (posted && (got || need_posted)) {
+ advertise = min(posted, max_posted);
+ newval -= IB_SET_POST_CREDITS(advertise);
+ }
+
+ /* Finally bill everything */
+ if (atomic_cmpxchg(&ic->i_credits, oldval, newval) != oldval)
+ goto try_again;
+
+ *adv_credits = advertise;
+
+ RDSV3_DPRINTF4("rdsv3_ib_send_grab_credits", "ic: %p, %d %d %d %d",
+ ic, got, *adv_credits, need_posted, max_posted);
+ return (got);
+}
+
+void
+rdsv3_ib_send_add_credits(struct rdsv3_connection *conn, unsigned int credits)
+{
+ struct rdsv3_ib_connection *ic = conn->c_transport_data;
+
+ if (credits == 0)
+ return;
+
+ RDSV3_DPRINTF5("rdsv3_ib_send_add_credits",
+ "credits (%u): current=%u%s\n",
+ credits,
+ IB_GET_SEND_CREDITS(atomic_get(&ic->i_credits)),
+ test_bit(RDSV3_LL_SEND_FULL, &conn->c_flags) ?
+ ", ll_send_full" : "");
+
+ atomic_add_32(&ic->i_credits, IB_SET_SEND_CREDITS(credits));
+ if (test_and_clear_bit(RDSV3_LL_SEND_FULL, &conn->c_flags))
+ rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_send_w, 0);
+
+ ASSERT(!(IB_GET_SEND_CREDITS(credits) >= 16384));
+
+ rdsv3_ib_stats_inc(s_ib_rx_credit_updates);
+
+ RDSV3_DPRINTF4("rdsv3_ib_send_add_credits",
+ "Return: conn: %p, credits: %d",
+ conn, credits);
+}
+
+void
+rdsv3_ib_advertise_credits(struct rdsv3_connection *conn, unsigned int posted)
+{
+ struct rdsv3_ib_connection *ic = conn->c_transport_data;
+
+ RDSV3_DPRINTF4("rdsv3_ib_advertise_credits", "conn: %p, posted: %d",
+ conn, posted);
+
+ if (posted == 0)
+ return;
+
+ atomic_add_32(&ic->i_credits, IB_SET_POST_CREDITS(posted));
+
+ /*
+ * Decide whether to send an update to the peer now.
+ * If we would send a credit update for every single buffer we
+ * post, we would end up with an ACK storm (ACK arrives,
+ * consumes buffer, we refill the ring, send ACK to remote
+ * advertising the newly posted buffer... ad infinitum).
+ *
+ * Performance pretty much depends on how often we send
+ * credit updates - too frequent updates mean lots of ACKs.
+ * Too infrequent updates, and the peer will run out of
+ * credits and have to throttle.
+ * For the time being, 16 seems to be a good compromise.
+ */
+ if (IB_GET_POST_CREDITS(atomic_get(&ic->i_credits)) >= 16)
+ set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+}
+
+static inline void
+rdsv3_ib_xmit_populate_wr(struct rdsv3_ib_connection *ic,
+ ibt_send_wr_t *wr, unsigned int pos,
+ struct rdsv3_scatterlist *scat, unsigned int off, unsigned int length,
+ int send_flags)
+{
+ ibt_wr_ds_t *sge;
+
+ RDSV3_DPRINTF4("rdsv3_ib_xmit_populate_wr",
+ "ic: %p, wr: %p scat: %p %d %d %d %d",
+ ic, wr, scat, pos, off, length, send_flags);
+
+ wr->wr_id = pos;
+ wr->wr_trans = IBT_RC_SRV;
+ wr->wr_flags = send_flags;
+ wr->wr_opcode = IBT_WRC_SEND;
+
+ if (length != 0) {
+ int ix, len, assigned;
+ ibt_wr_ds_t *sgl;
+
+ ASSERT(length <= scat->length - off);
+
+ sgl = scat->sgl;
+ if (off != 0) {
+ /* find the right sgl to begin with */
+ while (sgl->ds_len <= off) {
+ off -= sgl->ds_len;
+ sgl++;
+ }
+ }
+
+ ix = 1; /* first data sgl is at 1 */
+ assigned = 0;
+ len = length;
+ do {
+ sge = &wr->wr_sgl[ix++];
+ sge->ds_va = sgl->ds_va + off;
+ assigned = min(len, sgl->ds_len - off);
+ sge->ds_len = assigned;
+ sge->ds_key = sgl->ds_key;
+ len -= assigned;
+ if (len != 0) {
+ sgl++;
+ off = 0;
+ }
+ } while (len > 0);
+
+ wr->wr_nds = ix;
+ } else {
+ /*
+ * We're sending a packet with no payload. There is only
+ * one SGE (the header).
+ */
+ wr->wr_nds = 1;
+ }
+
+ sge = &wr->wr_sgl[0];
+ sge->ds_va = ic->i_send_hdrs_dma + (pos * sizeof (struct rdsv3_header));
+ sge->ds_len = sizeof (struct rdsv3_header);
+ sge->ds_key = ic->i_mr->lkey;
+
+ RDSV3_DPRINTF4("rdsv3_ib_xmit_populate_wr",
+ "Return: ic: %p, wr: %p scat: %p", ic, wr, scat);
+}
+
+/*
+ * This can be called multiple times for a given message. The first time
+ * we see a message we map its scatterlist into the IB device so that
+ * we can provide that mapped address to the IB scatter gather entries
+ * in the IB work requests. We translate the scatterlist into a series
+ * of work requests that fragment the message. These work requests complete
+ * in order so we pass ownership of the message to the completion handler
+ * once we send the final fragment.
+ *
+ * The RDS core uses the c_send_lock to only enter this function once
+ * per connection. This makes sure that the tx ring alloc/unalloc pairs
+ * don't get out of sync and confuse the ring.
+ */
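+
+/*
+ * For example (illustrative numbers only, assuming a
+ * RDSV3_FRAG_SIZE of 4096): a 9000-byte message is carried by
+ * ceil(9000 / 4096) = 3 SEND work requests, and ownership of the
+ * message passes to the completion handler along with the third
+ * and final fragment.
+ */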
+int
+rdsv3_ib_xmit(struct rdsv3_connection *conn, struct rdsv3_message *rm,
+ unsigned int hdr_off, unsigned int sg, unsigned int off)
+{
+ struct rdsv3_ib_connection *ic = conn->c_transport_data;
+ struct ib_device *dev = ic->i_cm_id->device;
+ struct rdsv3_ib_send_work *send = NULL;
+ struct rdsv3_ib_send_work *first;
+ struct rdsv3_ib_send_work *prev;
+ ibt_send_wr_t *wr;
+ struct rdsv3_scatterlist *scat;
+ uint32_t pos;
+ uint32_t i;
+ uint32_t work_alloc;
+ uint32_t credit_alloc;
+ uint32_t posted;
+ uint32_t adv_credits = 0;
+ int send_flags = 0;
+ int sent;
+ int ret;
+ int flow_controlled = 0;
+
+ RDSV3_DPRINTF4("rdsv3_ib_xmit", "conn: %p, rm: %p", conn, rm);
+
+ ASSERT(!(off % RDSV3_FRAG_SIZE));
+ ASSERT(!(hdr_off != 0 && hdr_off != sizeof (struct rdsv3_header)));
+
+ /* Do not send cong updates to IB loopback */
+ if (conn->c_loopback &&
+ rm->m_inc.i_hdr.h_flags & RDSV3_FLAG_CONG_BITMAP) {
+ rdsv3_cong_map_updated(conn->c_fcong, ~(uint64_t)0);
+ return (sizeof (struct rdsv3_header) + RDSV3_CONG_MAP_BYTES);
+ }
+
+#ifndef __lock_lint
+ /* FIXME we may overallocate here */
+ if (ntohl(rm->m_inc.i_hdr.h_len) == 0)
+ i = 1;
+ else
+ i = ceil(ntohl(rm->m_inc.i_hdr.h_len), RDSV3_FRAG_SIZE);
+#endif
+
+ work_alloc = rdsv3_ib_ring_alloc(&ic->i_send_ring, i, &pos);
+ if (work_alloc == 0) {
+ set_bit(RDSV3_LL_SEND_FULL, &conn->c_flags);
+ rdsv3_ib_stats_inc(s_ib_tx_ring_full);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ credit_alloc = work_alloc;
+ if (ic->i_flowctl) {
+ credit_alloc = rdsv3_ib_send_grab_credits(ic, work_alloc,
+ &posted, 0, RDSV3_MAX_ADV_CREDIT);
+ adv_credits += posted;
+ if (credit_alloc < work_alloc) {
+ rdsv3_ib_ring_unalloc(&ic->i_send_ring,
+ work_alloc - credit_alloc);
+ work_alloc = credit_alloc;
+ flow_controlled++;
+ }
+ if (work_alloc == 0) {
+ set_bit(RDSV3_LL_SEND_FULL, &conn->c_flags);
+ rdsv3_ib_stats_inc(s_ib_tx_throttle);
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+
+ /* map the message the first time we see it */
+ if (ic->i_rm == NULL) {
+ /*
+ * printk(KERN_NOTICE
+ * "rdsv3_ib_xmit prep msg dport=%u flags=0x%x len=%d\n",
+ * be16_to_cpu(rm->m_inc.i_hdr.h_dport),
+ * rm->m_inc.i_hdr.h_flags,
+ * be32_to_cpu(rm->m_inc.i_hdr.h_len));
+ */
+ if (rm->m_nents) {
+ rm->m_count = rdsv3_ib_dma_map_sg(dev,
+ rm->m_sg, rm->m_nents);
+ RDSV3_DPRINTF5("rdsv3_ib_xmit",
+ "ic %p mapping rm %p: %d\n", ic, rm, rm->m_count);
+ if (rm->m_count == 0) {
+ rdsv3_ib_stats_inc(s_ib_tx_sg_mapping_failure);
+ rdsv3_ib_ring_unalloc(&ic->i_send_ring,
+ work_alloc);
+ ret = -ENOMEM; /* XXX ? */
+ RDSV3_DPRINTF2("rdsv3_ib_xmit",
+ "fail: ic %p mapping rm %p: %d\n",
+ ic, rm, rm->m_count);
+ goto out;
+ }
+ } else {
+ rm->m_count = 0;
+ }
+
+ ic->i_unsignaled_wrs = rdsv3_ib_sysctl_max_unsig_wrs;
+ ic->i_unsignaled_bytes = rdsv3_ib_sysctl_max_unsig_bytes;
+ rdsv3_message_addref(rm);
+ ic->i_rm = rm;
+
+ /* Finalize the header */
+ if (test_bit(RDSV3_MSG_ACK_REQUIRED, &rm->m_flags))
+ rm->m_inc.i_hdr.h_flags |= RDSV3_FLAG_ACK_REQUIRED;
+ if (test_bit(RDSV3_MSG_RETRANSMITTED, &rm->m_flags))
+ rm->m_inc.i_hdr.h_flags |= RDSV3_FLAG_RETRANSMITTED;
+
+ /*
+ * If it has a RDMA op, tell the peer we did it. This is
+ * used by the peer to release use-once RDMA MRs.
+ */
+ if (rm->m_rdma_op) {
+ struct rdsv3_ext_header_rdma ext_hdr;
+
+ ext_hdr.h_rdma_rkey = htonl(rm->m_rdma_op->r_key);
+ (void) rdsv3_message_add_extension(&rm->m_inc.i_hdr,
+ RDSV3_EXTHDR_RDMA, &ext_hdr,
+ sizeof (ext_hdr));
+ }
+ if (rm->m_rdma_cookie) {
+ (void) rdsv3_message_add_rdma_dest_extension(
+ &rm->m_inc.i_hdr,
+ rdsv3_rdma_cookie_key(rm->m_rdma_cookie),
+ rdsv3_rdma_cookie_offset(rm->m_rdma_cookie));
+ }
+
+ /*
+ * Note - rdsv3_ib_piggyb_ack clears the ACK_REQUIRED bit, so
+ * we should not do this unless we have a chance of at least
+ * sticking the header into the send ring. Which is why we
+ * should call rdsv3_ib_ring_alloc first.
+ */
+ rm->m_inc.i_hdr.h_ack = htonll(rdsv3_ib_piggyb_ack(ic));
+ rdsv3_message_make_checksum(&rm->m_inc.i_hdr);
+
+ /*
+ * Update adv_credits since we reset the ACK_REQUIRED bit.
+ */
+ (void) rdsv3_ib_send_grab_credits(ic, 0, &posted, 1,
+ RDSV3_MAX_ADV_CREDIT - adv_credits);
+ adv_credits += posted;
+ ASSERT(adv_credits <= 255);
+ } else if (ic->i_rm != rm)
+ RDSV3_PANIC();
+
+ send = &ic->i_sends[pos];
+ first = send;
+ prev = NULL;
+ scat = &rm->m_sg[sg];
+ sent = 0;
+ i = 0;
+
+ /*
+ * Sometimes you want to put a fence between an RDMA
+ * READ and the following SEND.
+ * We could either do this all the time
+ * or when requested by the user. Right now, we let
+ * the application choose.
+ */
+ if (rm->m_rdma_op && rm->m_rdma_op->r_fence)
+ send_flags = IBT_WR_SEND_FENCE;
+
+ /*
+ * We could be copying the header into the unused tail of the page.
+ * That would need to be changed in the future when those pages might
+ * be mapped userspace pages or page cache pages. So instead we always
+ * use a second sge and our long-lived ring of mapped headers. We send
+ * the header after the data so that the data payload can be aligned on
+ * the receiver.
+ */
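+ /*
+ * Concretely (see rdsv3_ib_xmit_populate_wr above): wr_sgl[0]
+ * always points into the long-lived ring of mapped headers, and
+ * any data fragments occupy wr_sgl[1] onward.
+ */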
+
+ /* handle a 0-len message */
+ if (ntohl(rm->m_inc.i_hdr.h_len) == 0) {
+ wr = &ic->i_send_wrs[0];
+ rdsv3_ib_xmit_populate_wr(ic, wr, pos, NULL, 0, 0, send_flags);
+ send->s_queued = jiffies;
+ send->s_op = NULL;
+ send->s_opcode = wr->wr_opcode;
+ goto add_header;
+ }
+
+ /* if there's data reference it with a chain of work reqs */
+ for (; i < work_alloc && scat != &rm->m_sg[rm->m_count]; i++) {
+ unsigned int len;
+
+ send = &ic->i_sends[pos];
+
+ wr = &ic->i_send_wrs[i];
+ len = min(RDSV3_FRAG_SIZE,
+ rdsv3_ib_sg_dma_len(dev, scat) - off);
+ rdsv3_ib_xmit_populate_wr(ic, wr, pos, scat, off, len,
+ send_flags);
+ send->s_queued = jiffies;
+ send->s_op = NULL;
+ send->s_opcode = wr->wr_opcode;
+
+ /*
+ * We want to delay signaling completions just enough to get
+ * the batching benefits but not so much that we create
+ * dead time on the wire.
+ */
+ if (ic->i_unsignaled_wrs-- == 0) {
+ ic->i_unsignaled_wrs = rdsv3_ib_sysctl_max_unsig_wrs;
+ wr->wr_flags |=
+ IBT_WR_SEND_SIGNAL | IBT_WR_SEND_SOLICIT;
+ }
+
+ ic->i_unsignaled_bytes -= len;
+ if (ic->i_unsignaled_bytes <= 0) {
+ ic->i_unsignaled_bytes =
+ rdsv3_ib_sysctl_max_unsig_bytes;
+ wr->wr_flags |=
+ IBT_WR_SEND_SIGNAL | IBT_WR_SEND_SOLICIT;
+ }
+
+ /*
+ * Always signal the last one if we're stopping due to flow
+ * control.
+ */
+ if (flow_controlled && i == (work_alloc-1)) {
+ wr->wr_flags |=
+ IBT_WR_SEND_SIGNAL | IBT_WR_SEND_SOLICIT;
+ }
+
+ RDSV3_DPRINTF5("rdsv3_ib_xmit", "send %p wr %p num_sge %u \n",
+ send, wr, wr->wr_nds);
+
+ sent += len;
+ off += len;
+ if (off == rdsv3_ib_sg_dma_len(dev, scat)) {
+ scat++;
+ off = 0;
+ }
+
+add_header:
+ /*
+ * Tack on the header after the data. The header SGE should
+ * already have been set up to point to the right header
+ * buffer.
+ */
+ (void) memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr,
+ sizeof (struct rdsv3_header));
+
+ if (0) {
+ struct rdsv3_header *hdr = &ic->i_send_hdrs[pos];
+
+ RDSV3_DPRINTF0("rdsv3_ib_xmit",
+ "send WR dport=%u flags=0x%x len=%d",
+ ntohs(hdr->h_dport),
+ hdr->h_flags,
+ ntohl(hdr->h_len));
+ }
+ if (adv_credits) {
+ struct rdsv3_header *hdr = &ic->i_send_hdrs[pos];
+
+ /* add credit and redo the header checksum */
+ hdr->h_credit = adv_credits;
+ rdsv3_message_make_checksum(hdr);
+ adv_credits = 0;
+ rdsv3_ib_stats_inc(s_ib_tx_credit_updates);
+ }
+
+ prev = send;
+
+ pos = (pos + 1) % ic->i_send_ring.w_nr;
+ }
+
+ /*
+ * Account the RDS header in the number of bytes we sent, but just once.
+ * The caller has no concept of fragmentation.
+ */
+ if (hdr_off == 0)
+ sent += sizeof (struct rdsv3_header);
+
+ /* if we finished the message then send completion owns it */
+ if (scat == &rm->m_sg[rm->m_count]) {
+ prev->s_rm = ic->i_rm;
+ wr->wr_flags |= IBT_WR_SEND_SIGNAL | IBT_WR_SEND_SOLICIT;
+ ic->i_rm = NULL;
+ }
+
+ if (i < work_alloc) {
+ rdsv3_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i);
+ work_alloc = i;
+ }
+ if (ic->i_flowctl && i < credit_alloc)
+ rdsv3_ib_send_add_credits(conn, credit_alloc - i);
+
+ /* XXX need to worry about failed_wr and partial sends. */
+ ret = ibt_post_send(ib_get_ibt_channel_hdl(ic->i_cm_id),
+ ic->i_send_wrs, i, &posted);
+ if (posted != i) {
+ RDSV3_DPRINTF1("rdsv3_ib_xmit",
+ "ic %p first %p nwr: %d ret %d:%d",
+ ic, first, i, ret, posted);
+ }
+ if (ret) {
+ RDSV3_DPRINTF0("rdsv3_ib_xmit",
+ "RDS/IB: ib_post_send to %u.%u.%u.%u "
+ "returned %d\n", NIPQUAD(conn->c_faddr), ret);
+ rdsv3_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
+ if (prev->s_rm) {
+ ic->i_rm = prev->s_rm;
+ prev->s_rm = NULL;
+ }
+#if 1
+ RDSV3_DPRINTF2("rdsv3_ib_xmit", "ibt_post_send FAIL");
+ ret = -EAGAIN;
+#else
+ /* Finesse this later */
+ RDSV3_PANIC();
+#endif
+ goto out;
+ }
+
+ ret = sent;
+
+ RDSV3_DPRINTF4("rdsv3_ib_xmit", "Return: conn: %p, rm: %p", conn, rm);
+out:
+ ASSERT(!adv_credits);
+ return (ret);
+}
+
+static void
+rdsv3_ib_dma_unmap_sg_rdma(struct ib_device *dev, uint_t num,
+ struct rdsv3_rdma_sg scat[])
+{
+ ibt_hca_hdl_t hca_hdl;
+ int i;
+ int num_sgl;
+
+ RDSV3_DPRINTF4("rdsv3_ib_dma_unmap_sg", "rdma_sg: %p", scat);
+
+ if (dev) {
+ hca_hdl = ib_get_ibt_hca_hdl(dev);
+ } else {
+ hca_hdl = scat[0].hca_hdl;
+ RDSV3_DPRINTF2("rdsv3_ib_dma_unmap_sg_rdma",
+ "NULL dev use cached hca_hdl %p", hca_hdl);
+ }
+
+ if (hca_hdl == NULL)
+ return;
+ scat[0].hca_hdl = NULL;
+
+ for (i = 0; i < num; i++) {
+ if (scat[i].mihdl != NULL) {
+ num_sgl = (scat[i].iovec.bytes / PAGESIZE) + 2;
+ kmem_free(scat[i].swr.wr_sgl,
+ (num_sgl * sizeof (ibt_wr_ds_t)));
+ scat[i].swr.wr_sgl = NULL;
+ (void) ibt_unmap_mem_iov(hca_hdl, scat[i].mihdl);
+ scat[i].mihdl = NULL;
+ } else
+ break;
+ }
+}
+
+/* ARGSUSED */
+uint_t
+rdsv3_ib_dma_map_sg_rdma(struct ib_device *dev, struct rdsv3_rdma_sg scat[],
+ uint_t num, struct rdsv3_scatterlist **scatl)
+{
+ ibt_hca_hdl_t hca_hdl;
+ ibt_iov_attr_t iov_attr;
+ struct buf *bp;
+ uint_t i, j, k;
+ uint_t count;
+ struct rdsv3_scatterlist *sg;
+ int ret;
+
+ RDSV3_DPRINTF4("rdsv3_ib_dma_map_sg_rdma", "scat: %p, num: %d",
+ scat, num);
+
+ hca_hdl = ib_get_ibt_hca_hdl(dev);
+ scat[0].hca_hdl = hca_hdl;
+ bzero(&iov_attr, sizeof (ibt_iov_attr_t));
+ iov_attr.iov_flags = IBT_IOV_BUF;
+ iov_attr.iov_lso_hdr_sz = 0;
+
+ for (i = 0, count = 0; i < num; i++) {
+ /* transpose umem_cookie to buf structure */
+ bp = ddi_umem_iosetup(scat[i].umem_cookie,
+ scat[i].iovec.addr & PAGEOFFSET, scat[i].iovec.bytes,
+ B_WRITE, 0, 0, NULL, DDI_UMEM_SLEEP);
+ if (bp == NULL) {
+ /* free resources and return error */
+ goto out;
+ }
+ /* setup ibt_map_mem_iov() attributes */
+ iov_attr.iov_buf = bp;
+ iov_attr.iov_wr_nds = (scat[i].iovec.bytes / PAGESIZE) + 2;
+ scat[i].swr.wr_sgl =
+ kmem_zalloc(iov_attr.iov_wr_nds * sizeof (ibt_wr_ds_t),
+ KM_SLEEP);
+
+ ret = ibt_map_mem_iov(hca_hdl, &iov_attr,
+ (ibt_all_wr_t *)&scat[i].swr, &scat[i].mihdl);
+ freerbuf(bp);
+ if (ret != IBT_SUCCESS) {
+ RDSV3_DPRINTF2("rdsv3_ib_dma_map_sg_rdma",
+ "ibt_map_mem_iov returned: %d", ret);
+ /* free resources and return error */
+ kmem_free(scat[i].swr.wr_sgl,
+ iov_attr.iov_wr_nds * sizeof (ibt_wr_ds_t));
+ goto out;
+ }
+ count += scat[i].swr.wr_nds;
+
+#ifdef DEBUG
+ for (j = 0; j < scat[i].swr.wr_nds; j++) {
+ RDSV3_DPRINTF5("rdsv3_ib_dma_map_sg_rdma",
+ "sgl[%d] va %llx len %x", j,
+ scat[i].swr.wr_sgl[j].ds_va,
+ scat[i].swr.wr_sgl[j].ds_len);
+ }
+#endif
+ RDSV3_DPRINTF4("rdsv3_ib_dma_map_sg_rdma",
+ "iovec.bytes: 0x%x scat[%d]swr.wr_nds: %d",
+ scat[i].iovec.bytes, i, scat[i].swr.wr_nds);
+ }
+
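+ /*
+ * Ceiling division: with RDSV3_IB_MAX_SGE of 8, for example, 17
+ * mapped SGEs need ((17 - 1) / 8) + 1 = 3 work requests.
+ */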
+ count = ((count - 1) / RDSV3_IB_MAX_SGE) + 1;
+ RDSV3_DPRINTF4("rdsv3_ib_dma_map_sg_rdma", "Ret: num: %d", count);
+ return (count);
+
+out:
+ rdsv3_ib_dma_unmap_sg_rdma(dev, num, scat);
+ return (0);
+}
+
+int
+rdsv3_ib_xmit_rdma(struct rdsv3_connection *conn, struct rdsv3_rdma_op *op)
+{
+ struct rdsv3_ib_connection *ic = conn->c_transport_data;
+ struct rdsv3_ib_send_work *send = NULL;
+ struct rdsv3_rdma_sg *scat;
+ uint64_t remote_addr;
+ uint32_t pos;
+ uint32_t work_alloc;
+ uint32_t i, j, k, idx;
+ uint32_t left, count;
+ uint32_t posted;
+ int sent;
+ ibt_status_t status;
+ ibt_send_wr_t *wr;
+ ibt_wr_ds_t *sge;
+
+ RDSV3_DPRINTF4("rdsv3_ib_xmit_rdma", "rdsv3_ib_conn: %p", ic);
+
+ /* map the message the first time we see it */
+ if (!op->r_mapped) {
+ op->r_count = rdsv3_ib_dma_map_sg_rdma(ic->i_cm_id->device,
+ op->r_rdma_sg, op->r_nents, &op->r_sg);
+ RDSV3_DPRINTF5("rdsv3_ib_xmit_rdma", "ic %p mapping op %p: %d",
+ ic, op, op->r_count);
+ if (op->r_count == 0) {
+ rdsv3_ib_stats_inc(s_ib_tx_sg_mapping_failure);
+ RDSV3_DPRINTF2("rdsv3_ib_xmit_rdma",
+ "fail: ic %p mapping op %p: %d",
+ ic, op, op->r_count);
+ return (-ENOMEM); /* XXX ? */
+ }
+ op->r_mapped = 1;
+ }
+
+ /*
+ * Instead of knowing how to return a partial rdma read/write,
+ * we insist that there be enough work requests to send the
+ * entire message.
+ */
+ work_alloc = rdsv3_ib_ring_alloc(&ic->i_send_ring, op->r_count, &pos);
+ if (work_alloc != op->r_count) {
+ rdsv3_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
+ rdsv3_ib_stats_inc(s_ib_tx_ring_full);
+ return (-ENOMEM);
+ }
+
+ /*
+ * take the scatter list and transpose into a list of
+ * send wr's each with a scatter list of RDSV3_IB_MAX_SGE
+ */
+ scat = &op->r_rdma_sg[0];
+ sent = 0;
+ remote_addr = op->r_remote_addr;
+
+ for (i = 0, k = 0; i < op->r_nents; i++) {
+ left = scat[i].swr.wr_nds;
+ for (idx = 0; left > 0; k++) {
+ send = &ic->i_sends[pos];
+ send->s_queued = jiffies;
+ send->s_opcode = op->r_write ? IBT_WRC_RDMAW :
+ IBT_WRC_RDMAR;
+ send->s_op = op;
+
+ wr = &ic->i_send_wrs[k];
+ wr->wr_flags = 0;
+ wr->wr_id = pos;
+ wr->wr_trans = IBT_RC_SRV;
+ wr->wr_opcode = op->r_write ? IBT_WRC_RDMAW :
+ IBT_WRC_RDMAR;
+ wr->wr.rc.rcwr.rdma.rdma_raddr = remote_addr;
+ wr->wr.rc.rcwr.rdma.rdma_rkey = op->r_key;
+
+ if (left > RDSV3_IB_MAX_SGE) {
+ count = RDSV3_IB_MAX_SGE;
+ left -= RDSV3_IB_MAX_SGE;
+ } else {
+ count = left;
+ left = 0;
+ }
+ wr->wr_nds = count;
+
+ for (j = 0; j < count; j++) {
+ sge = &wr->wr_sgl[j];
+ *sge = scat[i].swr.wr_sgl[idx];
+ remote_addr += scat[i].swr.wr_sgl[idx].ds_len;
+ sent += scat[i].swr.wr_sgl[idx].ds_len;
+ idx++;
+ RDSV3_DPRINTF4("xmit_rdma",
+ "send_wrs[%d]sgl[%d] va %llx len %x",
+ k, j, sge->ds_va, sge->ds_len);
+ }
+ RDSV3_DPRINTF4("rdsv3_ib_xmit_rdma",
+ "wr[%d] %p key: %x code: %d tlen: %d",
+ k, wr, wr->wr.rc.rcwr.rdma.rdma_rkey,
+ wr->wr_opcode, sent);
+
+ /*
+ * We want to delay signaling completions just enough
+ * to get the batching benefits but not so much that
+ * we create dead time on the wire.
+ */
+ if (ic->i_unsignaled_wrs-- == 0) {
+ ic->i_unsignaled_wrs =
+ rdsv3_ib_sysctl_max_unsig_wrs;
+ wr->wr_flags = IBT_WR_SEND_SIGNAL;
+ }
+
+ pos = (pos + 1) % ic->i_send_ring.w_nr;
+ }
+ }
+
+ status = ibt_post_send(ib_get_ibt_channel_hdl(ic->i_cm_id),
+ ic->i_send_wrs, k, &posted);
+ if (status != IBT_SUCCESS) {
+ RDSV3_DPRINTF0("rdsv3_ib_xmit_rdma",
+ "RDS/IB: rdma ib_post_send returned %d", status);
+ rdsv3_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
+ }
+ return (status);
+}
+
+void
+rdsv3_ib_xmit_complete(struct rdsv3_connection *conn)
+{
+ struct rdsv3_ib_connection *ic = conn->c_transport_data;
+
+ RDSV3_DPRINTF4("rdsv3_ib_xmit_complete", "conn: %p", conn);
+
+ /*
+ * We may have a pending ACK or window update we were unable
+ * to send previously (due to flow control). Try again.
+ */
+ rdsv3_ib_attempt_ack(ic);
+}
diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/ib_stats.c b/usr/src/uts/common/io/ib/clients/rdsv3/ib_stats.c
new file mode 100644
index 0000000000..2abdc26d49
--- /dev/null
+++ b/usr/src/uts/common/io/ib/clients/rdsv3/ib_stats.c
@@ -0,0 +1,125 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <sys/rds.h>
+
+#include <sys/ib/clients/rdsv3/rdsv3.h>
+#include <sys/ib/clients/rdsv3/ib.h>
+#include <sys/ib/clients/rdsv3/rdsv3_debug.h>
+
+RDSV3_DEFINE_PER_CPU(struct rdsv3_ib_statistics, rdsv3_ib_stats);
+
+static char *rdsv3_ib_stat_names[] = {
+ "ib_connect_raced",
+ "ib_listen_closed_stale",
+ "ib_tx_cq_call",
+ "ib_tx_cq_event",
+ "ib_tx_ring_full",
+ "ib_tx_throttle",
+ "ib_tx_sg_mapping_failure",
+ "ib_tx_stalled",
+ "ib_tx_credit_updates",
+ "ib_rx_cq_call",
+ "ib_rx_cq_event",
+ "ib_rx_ring_empty",
+ "ib_rx_refill_from_cq",
+ "ib_rx_refill_from_thread",
+ "ib_rx_alloc_limit",
+ "ib_rx_credit_updates",
+ "ib_ack_sent",
+ "ib_ack_send_failure",
+ "ib_ack_send_delayed",
+ "ib_ack_send_piggybacked",
+ "ib_ack_received",
+ "ib_rdma_mr_alloc",
+ "ib_rdma_mr_free",
+ "ib_rdma_mr_used",
+ "ib_rdma_mr_pool_flush",
+ "ib_rdma_mr_pool_wait",
+ "ib_rdma_mr_pool_depleted",
+};
+
+unsigned int
+rdsv3_ib_stats_info_copy(struct rdsv3_info_iterator *iter,
+ unsigned int avail)
+{
+ struct rdsv3_ib_statistics stats = {0, };
+ uint64_t *src;
+ uint64_t *sum;
+ size_t i;
+ int cpu;
+
+ RDSV3_DPRINTF4("rdsv3_ib_stats_info_copy", "iter: %p, avail: %d",
+ iter, avail);
+
+ if (avail < ARRAY_SIZE(rdsv3_ib_stat_names))
+ goto out;
+
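+	/*
+	 * Treat each per-CPU statistics struct as a flat array of
+	 * uint64_t counters and sum it element-wise into 'stats'.
+	 */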
+ for (cpu = 0; cpu < NR_CPUS; cpu++) {
+ src = (uint64_t *)&(rdsv3_per_cpu(rdsv3_ib_stats, cpu));
+ sum = (uint64_t *)&stats;
+ for (i = 0; i < sizeof (stats) / sizeof (uint64_t); i++)
+ *(sum++) += *(src++);
+ }
+
+ rdsv3_stats_info_copy(iter, (uint64_t *)&stats, rdsv3_ib_stat_names,
+ ARRAY_SIZE(rdsv3_ib_stat_names));
+
+ RDSV3_DPRINTF4("rdsv3_ib_stats_info_copy",
+ "Return: iter: %p, avail: %d", iter, avail);
+out:
+ return (ARRAY_SIZE(rdsv3_ib_stat_names));
+}
diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/ib_sysctl.c b/usr/src/uts/common/io/ib/clients/rdsv3/ib_sysctl.c
new file mode 100644
index 0000000000..27bceddb48
--- /dev/null
+++ b/usr/src/uts/common/io/ib/clients/rdsv3/ib_sysctl.c
@@ -0,0 +1,90 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <sys/ib/clients/rdsv3/ib.h>
+
+unsigned long rdsv3_ib_sysctl_max_send_wr = RDSV3_IB_DEFAULT_SEND_WR;
+unsigned long rdsv3_ib_sysctl_max_recv_wr = RDSV3_IB_DEFAULT_RECV_WR;
+unsigned long rdsv3_ib_sysctl_max_recv_allocation =
+ (128 * 1024 * 1024) / RDSV3_FRAG_SIZE;
+/* hardware will fail CQ creation long before this */
+
+unsigned long rdsv3_ib_sysctl_max_unsig_wrs = 16;
+
+unsigned long rdsv3_ib_sysctl_max_unsig_bytes = (16 << 20);
+
+/*
+ * This sysctl does nothing.
+ *
+ * Backwards compatibility with RDS 3.0 wire protocol
+ * disables initial FC credit exchange.
+ * If it's ever possible to drop 3.0 support,
+ * setting this to 1 and moving init/refill of send/recv
+ * rings from ib_cm_connect_complete() back into ib_setup_qp()
+ * will cause credits to be added before protocol negotiation.
+ */
+unsigned int rdsv3_ib_sysctl_flow_control = 0;
+
+void
+rdsv3_ib_sysctl_exit(void)
+{
+}
+
+int
+rdsv3_ib_sysctl_init(void)
+{
+ return (0);
+}
diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/info.c b/usr/src/uts/common/io/ib/clients/rdsv3/info.c
new file mode 100644
index 0000000000..88dc4ecb6d
--- /dev/null
+++ b/usr/src/uts/common/io/ib/clients/rdsv3/info.c
@@ -0,0 +1,155 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <sys/rds.h>
+
+#include <sys/ib/clients/rdsv3/rdsv3.h>
+
+/*
+ * This file implements a getsockopt() call which copies a set of fixed
+ * sized structs into a user-specified buffer as a means of providing
+ * read-only information about RDS.
+ *
+ * For a given information source there are a given number of fixed sized
+ * structs at a given time. The structs are only copied if the user-specified
+ * buffer is big enough. The destination pages that make up the buffer
+ * are pinned for the duration of the copy.
+ *
+ * This gives us the following benefits:
+ *
+ * - simple implementation, no copy "position" across multiple calls
+ * - consistent snapshot of an info source
+ * - atomic copy works well with whatever locking info source has
+ * - one portable tool to get rds info across implementations
+ * - long-lived tool can get info without allocating
+ *
+ * at the following costs:
+ *
+ * - info source copy must be pinned, may be "large"
+ */
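+
+/*
+ * A hypothetical caller (sketch only; the option name and SOL_RDS
+ * level are illustrative) first probes for the lengths, then sizes
+ * a buffer and fetches the snapshot:
+ *
+ *	struct rdsv3_info_lengths lens;
+ *	socklen_t len = sizeof (lens);
+ *
+ *	(void) getsockopt(fd, SOL_RDS, RDSV3_INFO_COUNTERS,
+ *	    &lens, &len);
+ *	buf = malloc(lens.nr * lens.each);
+ *	len = lens.nr * lens.each;
+ *	(void) getsockopt(fd, SOL_RDS, RDSV3_INFO_COUNTERS,
+ *	    buf, &len);
+ */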
+
+static kmutex_t rdsv3_info_lock;
+static rdsv3_info_func rdsv3_info_funcs[RDSV3_INFO_LAST - RDSV3_INFO_FIRST + 1];
+
+void
+rdsv3_info_register_func(int optname, rdsv3_info_func func)
+{
+ int offset = optname - RDSV3_INFO_FIRST;
+
+ ASSERT(optname >= RDSV3_INFO_FIRST && optname <= RDSV3_INFO_LAST);
+
+ mutex_enter(&rdsv3_info_lock);
+ rdsv3_info_funcs[offset] = func;
+ mutex_exit(&rdsv3_info_lock);
+}
+
+/* ARGSUSED */
+void
+rdsv3_info_deregister_func(int optname, rdsv3_info_func func)
+{
+ int offset = optname - RDSV3_INFO_FIRST;
+
+ ASSERT(optname >= RDSV3_INFO_FIRST && optname <= RDSV3_INFO_LAST);
+
+ mutex_enter(&rdsv3_info_lock);
+ rdsv3_info_funcs[offset] = NULL;
+ mutex_exit(&rdsv3_info_lock);
+}
+
+/*
+ * @optval points to the userspace buffer that the information snapshot
+ * will be copied into.
+ *
+ * @optlen on input is the size of the buffer in userspace. @optlen
+ * on output is the size of the requested snapshot in bytes.
+ *
+ * This function returns -errno if there is a failure; in particular,
+ * -ENOPROTOOPT if no info source is registered for @optname. On
+ * success it returns 0, with @optlen updated to the size of the
+ * snapshot in bytes.
+ */
+int
+rdsv3_info_getsockopt(struct rsock *sock, int optname, char *optval,
+ socklen_t *optlen)
+{
+ struct rdsv3_info_iterator iter;
+ struct rdsv3_info_lengths lens;
+ rdsv3_info_func func;
+
+ if (optname < RDSV3_INFO_FIRST || optname > RDSV3_INFO_LAST)
+ return (-ENOPROTOOPT);
+
+ func = rdsv3_info_funcs[optname - RDSV3_INFO_FIRST];
+ if (func == NULL) {
+ return (-ENOPROTOOPT);
+ }
+
+ if (*optlen == sizeof (struct rdsv3_info_lengths)) {
+ iter.addr = NULL;
+ } else {
+ iter.addr = optval;
+ }
+
+ iter.offset = 0;
+
+ func(sock, *optlen, &iter, &lens);
+ ASSERT(lens.each != 0);
+
+ if (iter.addr == NULL) {
+ bcopy(&lens, optval, sizeof (struct rdsv3_info_lengths));
+ } else {
+ *optlen = lens.nr * lens.each;
+ }
+
+ return (0);
+}
diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/loop.c b/usr/src/uts/common/io/ib/clients/rdsv3/loop.c
new file mode 100644
index 0000000000..8ae25caae0
--- /dev/null
+++ b/usr/src/uts/common/io/ib/clients/rdsv3/loop.c
@@ -0,0 +1,242 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <sys/rds.h>
+
+#include <sys/ib/clients/rdsv3/rdsv3.h>
+#include <sys/ib/clients/rdsv3/loop.h>
+#include <sys/ib/clients/rdsv3/rdsv3_debug.h>
+
+kmutex_t loop_conns_lock;
+list_t loop_conns;
+
+/*
+ * This 'loopback' transport is a special case for flows that originate
+ * and terminate on the same machine.
+ *
+ * Connection build-up notices if the destination address is thought of
+ * as a local address by a transport. At that time it decides to use the
+ * loopback transport instead of the bound transport of the sending socket.
+ *
+ * The loopback transport's sending path just hands the sent rds_message
+ * straight to the receiving path via an embedded rds_incoming.
+ */
+
+/*
+ * Usually a message transits both the sender and receiver's conns as it
+ * flows to the receiver. In the loopback case, though, the receive path
+ * is handed the sending conn so the sense of the addresses is reversed.
+ */
+static int
+rdsv3_loop_xmit(struct rdsv3_connection *conn, struct rdsv3_message *rm,
+ unsigned int hdr_off, unsigned int sg,
+ unsigned int off)
+{
+ ASSERT(!(hdr_off || sg || off));
+
+ RDSV3_DPRINTF4("rdsv3_loop_xmit", "Enter(conn: %p, rm: %p)", conn, rm);
+
+ rdsv3_inc_init(&rm->m_inc, conn, conn->c_laddr);
+ rdsv3_message_addref(rm); /* for the inc */
+
+ rdsv3_recv_incoming(conn, conn->c_laddr, conn->c_faddr, &rm->m_inc,
+ KM_NOSLEEP);
+
+ rdsv3_send_drop_acked(conn, ntohll(rm->m_inc.i_hdr.h_sequence),
+ NULL);
+
+ rdsv3_inc_put(&rm->m_inc);
+
+ RDSV3_DPRINTF4("rdsv3_loop_xmit", "Return(conn: %p, rm: %p)", conn, rm);
+
+ return (sizeof (struct rdsv3_header) +
+ ntohl(rm->m_inc.i_hdr.h_len));
+}
+
+static int
+rdsv3_loop_xmit_cong_map(struct rdsv3_connection *conn,
+ struct rdsv3_cong_map *map,
+ unsigned long offset)
+{
+ RDSV3_DPRINTF4("rdsv3_loop_xmit_cong_map", "Enter(conn: %p)", conn);
+
+ ASSERT(!offset);
+ ASSERT(map == conn->c_lcong);
+
+ rdsv3_cong_map_updated(conn->c_fcong, ~(uint64_t)0);
+
+ RDSV3_DPRINTF4("rdsv3_loop_xmit_cong_map", "Return(conn: %p)", conn);
+
+ return (sizeof (struct rdsv3_header) + RDSV3_CONG_MAP_BYTES);
+}
+
+/* We need to give the thread at least something that will succeed. */
+/* ARGSUSED */
+static int
+rdsv3_loop_recv(struct rdsv3_connection *conn)
+{
+ return (0);
+}
+
+struct rdsv3_loop_connection
+{
+ struct list_node loop_node;
+ struct rdsv3_connection *conn;
+};
+
+/*
+ * Even the loopback transport needs to keep track of its connections,
+ * so it can call rdsv3_conn_destroy() on them on exit. N.B. there are
+ * 1+ loopback addresses (127.*.*.*) so it's not a bug to have
+ * multiple loopback conns allocated, although rather useless.
+ */
+/* ARGSUSED */
+static int
+rdsv3_loop_conn_alloc(struct rdsv3_connection *conn, int gfp)
+{
+ struct rdsv3_loop_connection *lc;
+
+ RDSV3_DPRINTF4("rdsv3_loop_conn_alloc", "Enter(conn: %p)", conn);
+
+ lc = kmem_zalloc(sizeof (struct rdsv3_loop_connection), KM_NOSLEEP);
+ if (lc == NULL)
+ return (-ENOMEM);
+
+ list_link_init(&lc->loop_node);
+ lc->conn = conn;
+ conn->c_transport_data = lc;
+
+ mutex_enter(&loop_conns_lock);
+ list_insert_tail(&loop_conns, lc);
+ mutex_exit(&loop_conns_lock);
+
+ RDSV3_DPRINTF4("rdsv3_loop_conn_alloc", "Return(conn: %p)", conn);
+
+ return (0);
+}
+
+static void
+rdsv3_loop_conn_free(void *arg)
+{
+ struct rdsv3_loop_connection *lc = arg;
+ RDSV3_DPRINTF5("rdsv3_loop_conn_free", "lc %p\n", lc);
+ list_remove_node(&lc->loop_node);
+ kmem_free(lc, sizeof (struct rdsv3_loop_connection));
+}
+
+static int
+rdsv3_loop_conn_connect(struct rdsv3_connection *conn)
+{
+ rdsv3_connect_complete(conn);
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+rdsv3_loop_conn_shutdown(struct rdsv3_connection *conn)
+{
+}
+
+void
+rdsv3_loop_exit(void)
+{
+ struct rdsv3_loop_connection *lc, *_lc;
+ list_t tmp_list;
+
+ RDSV3_DPRINTF4("rdsv3_loop_exit", "Enter");
+
+ list_create(&tmp_list, sizeof (struct rdsv3_loop_connection),
+ offsetof(struct rdsv3_loop_connection, loop_node));
+
+ /* avoid calling conn_destroy with irqs off */
+ mutex_enter(&loop_conns_lock);
+ list_splice(&loop_conns, &tmp_list);
+ mutex_exit(&loop_conns_lock);
+
+ RDSV3_FOR_EACH_LIST_NODE_SAFE(lc, _lc, &tmp_list, loop_node) {
+ ASSERT(!lc->conn->c_passive);
+ rdsv3_conn_destroy(lc->conn);
+ }
+
+ list_destroy(&loop_conns);
+ mutex_destroy(&loop_conns_lock);
+
+ RDSV3_DPRINTF4("rdsv3_loop_exit", "Return");
+}
+
+/*
+ * This is missing .xmit_* because loop doesn't go through generic
+ * rdsv3_send_xmit() and doesn't call rdsv3_recv_incoming(). .listen_stop and
+ * .laddr_check are missing because transport.c doesn't iterate over
+ * rdsv3_loop_transport.
+ */
+#ifndef __lock_lint
+struct rdsv3_transport rdsv3_loop_transport = {
+ .xmit = rdsv3_loop_xmit,
+ .xmit_cong_map = rdsv3_loop_xmit_cong_map,
+ .recv = rdsv3_loop_recv,
+ .conn_alloc = rdsv3_loop_conn_alloc,
+ .conn_free = rdsv3_loop_conn_free,
+ .conn_connect = rdsv3_loop_conn_connect,
+ .conn_shutdown = rdsv3_loop_conn_shutdown,
+ .inc_copy_to_user = rdsv3_message_inc_copy_to_user,
+ .inc_purge = rdsv3_message_inc_purge,
+ .inc_free = rdsv3_message_inc_free,
+ .t_name = "loopback",
+};
+#else
+struct rdsv3_transport rdsv3_loop_transport;
+#endif
diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/message.c b/usr/src/uts/common/io/ib/clients/rdsv3/message.c
new file mode 100644
index 0000000000..470219c2da
--- /dev/null
+++ b/usr/src/uts/common/io/ib/clients/rdsv3/message.c
@@ -0,0 +1,473 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <sys/rds.h>
+
+#include <sys/ib/clients/rdsv3/rdsv3.h>
+#include <sys/ib/clients/rdsv3/rdma.h>
+#include <sys/ib/clients/rdsv3/rdsv3_debug.h>
+
+static rdsv3_wait_queue_t rdsv3_message_flush_waitq;
+
+#ifndef __lock_lint
+static unsigned int rdsv3_exthdr_size[__RDSV3_EXTHDR_MAX] = {
+[RDSV3_EXTHDR_NONE] = 0,
+[RDSV3_EXTHDR_VERSION] = sizeof (struct rdsv3_ext_header_version),
+[RDSV3_EXTHDR_RDMA] = sizeof (struct rdsv3_ext_header_rdma),
+[RDSV3_EXTHDR_RDMA_DEST] = sizeof (struct rdsv3_ext_header_rdma_dest),
+};
+#else
+static unsigned int rdsv3_exthdr_size[__RDSV3_EXTHDR_MAX] = {
+ 0,
+ sizeof (struct rdsv3_ext_header_version),
+ sizeof (struct rdsv3_ext_header_rdma),
+ sizeof (struct rdsv3_ext_header_rdma_dest),
+};
+#endif
+
+void
+rdsv3_message_addref(struct rdsv3_message *rm)
+{
+ RDSV3_DPRINTF5("rdsv3_message_addref", "addref rm %p ref %d",
+ rm, atomic_get(&rm->m_refcount));
+ atomic_add_32(&rm->m_refcount, 1);
+}
+
+/*
+ * This relies on dma_map_sg() not touching sg[].page during merging.
+ */
+static void
+rdsv3_message_purge(struct rdsv3_message *rm)
+{
+ unsigned long i;
+
+ RDSV3_DPRINTF4("rdsv3_message_purge", "Enter(rm: %p)", rm);
+
+ if (test_bit(RDSV3_MSG_PAGEVEC, &rm->m_flags))
+ return;
+
+ for (i = 0; i < rm->m_nents; i++) {
+ RDSV3_DPRINTF5("rdsv3_message_purge", "putting data page %p\n",
+ (void *)rdsv3_sg_page(&rm->m_sg[i]));
+ /* XXX will have to put_page for page refs */
+ kmem_free(rdsv3_sg_page(&rm->m_sg[i]),
+ rdsv3_sg_len(&rm->m_sg[i]));
+ }
+
+ if (rm->m_rdma_op)
+ rdsv3_rdma_free_op(rm->m_rdma_op);
+ if (rm->m_rdma_mr) {
+ struct rdsv3_mr *mr = rm->m_rdma_mr;
+ if (mr->r_refcount == 0) {
+ RDSV3_DPRINTF4("rdsv3_message_purge ASSERT 0",
+ "rm %p mr %p", rm, mr);
+ return;
+ }
+ if (mr->r_refcount == 0xdeadbeef) {
+ RDSV3_DPRINTF4("rdsv3_message_purge ASSERT deadbeef",
+ "rm %p mr %p", rm, mr);
+ return;
+ }
+ if (atomic_dec_and_test(&mr->r_refcount)) {
+ rm->m_rdma_mr = NULL;
+ __rdsv3_put_mr_final(mr);
+ }
+ }
+
+ RDSV3_DPRINTF4("rdsv3_message_purge", "Return(rm: %p)", rm);
+}
+
+void
+rdsv3_message_inc_purge(struct rdsv3_incoming *inc)
+{
+ struct rdsv3_message *rm =
+ container_of(inc, struct rdsv3_message, m_inc);
+ rdsv3_message_purge(rm);
+}
+
+void
+rdsv3_message_put(struct rdsv3_message *rm)
+{
+ RDSV3_DPRINTF5("rdsv3_message_put",
+ "put rm %p ref %d\n", rm, atomic_get(&rm->m_refcount));
+
+ if (atomic_dec_and_test(&rm->m_refcount)) {
+ ASSERT(!list_link_active(&rm->m_sock_item));
+ ASSERT(!list_link_active(&rm->m_conn_item));
+ rdsv3_message_purge(rm);
+
+ kmem_free(rm, sizeof (struct rdsv3_message) +
+ (rm->m_nents * sizeof (struct rdsv3_scatterlist)));
+ }
+}
+
+void
+rdsv3_message_inc_free(struct rdsv3_incoming *inc)
+{
+ struct rdsv3_message *rm =
+ container_of(inc, struct rdsv3_message, m_inc);
+ rdsv3_message_put(rm);
+}
+
+void
+rdsv3_message_populate_header(struct rdsv3_header *hdr, uint16_be_t sport,
+ uint16_be_t dport, uint64_t seq)
+{
+ hdr->h_flags = 0;
+ hdr->h_sport = sport;
+ hdr->h_dport = dport;
+ hdr->h_sequence = htonll(seq);
+ hdr->h_exthdr[0] = RDSV3_EXTHDR_NONE;
+}
+
+int
+rdsv3_message_add_extension(struct rdsv3_header *hdr,
+ unsigned int type, const void *data, unsigned int len)
+{
+ unsigned int ext_len = sizeof (uint8_t) + len;
+ unsigned char *dst;
+
+ RDSV3_DPRINTF4("rdsv3_message_add_extension", "Enter");
+
+ /* For now, refuse to add more than one extension header */
+ if (hdr->h_exthdr[0] != RDSV3_EXTHDR_NONE)
+ return (0);
+
+ if (type >= __RDSV3_EXTHDR_MAX ||
+ len != rdsv3_exthdr_size[type])
+ return (0);
+
+ if (ext_len >= RDSV3_HEADER_EXT_SPACE)
+ return (0);
+ dst = hdr->h_exthdr;
+
+ *dst++ = type;
+ (void) memcpy(dst, data, len);
+
+ dst[len] = RDSV3_EXTHDR_NONE;
+
+ RDSV3_DPRINTF4("rdsv3_message_add_extension", "Return");
+ return (1);
+}
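+
+/*
+ * Illustrative layout (an editor's sketch, not part of the original code):
+ * after a successful rdsv3_message_add_extension(hdr, RDSV3_EXTHDR_VERSION,
+ * &v, sizeof (v)), h_exthdr holds the type byte, the payload, and a
+ * terminating RDSV3_EXTHDR_NONE byte:
+ *
+ *	h_exthdr: [VERSION][v0 v1 v2 v3][NONE][ ...unused... ]
+ *
+ * No length byte is stored; each extension's length is implied by its type
+ * via rdsv3_exthdr_size[].
+ */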
+
+/*
+ * If a message has extension headers, retrieve them here.
+ * Call like this:
+ *
+ * unsigned int pos = 0;
+ *
+ * while (1) {
+ * buflen = sizeof(buffer);
+ * type = rdsv3_message_next_extension(hdr, &pos, buffer, &buflen);
+ * if (type == RDSV3_EXTHDR_NONE)
+ * break;
+ * ...
+ * }
+ */
+int
+rdsv3_message_next_extension(struct rdsv3_header *hdr,
+ unsigned int *pos, void *buf, unsigned int *buflen)
+{
+ unsigned int offset, ext_type, ext_len;
+ uint8_t *src = hdr->h_exthdr;
+
+ RDSV3_DPRINTF4("rdsv3_message_next_extension", "Enter");
+
+ offset = *pos;
+ if (offset >= RDSV3_HEADER_EXT_SPACE)
+ goto none;
+
+ /*
+ * Get the extension type and length. For now, the
+ * length is implied by the extension type.
+ */
+ ext_type = src[offset++];
+
+ if (ext_type == RDSV3_EXTHDR_NONE || ext_type >= __RDSV3_EXTHDR_MAX)
+ goto none;
+ ext_len = rdsv3_exthdr_size[ext_type];
+ if (offset + ext_len > RDSV3_HEADER_EXT_SPACE)
+ goto none;
+
+ *pos = offset + ext_len;
+ if (ext_len < *buflen)
+ *buflen = ext_len;
+ (void) memcpy(buf, src + offset, *buflen);
+ return (ext_type);
+
+none:
+ *pos = RDSV3_HEADER_EXT_SPACE;
+ *buflen = 0;
+ return (RDSV3_EXTHDR_NONE);
+}
+
+int
+rdsv3_message_add_version_extension(struct rdsv3_header *hdr,
+ unsigned int version)
+{
+ struct rdsv3_ext_header_version ext_hdr;
+
+ ext_hdr.h_version = htonl(version);
+ return (rdsv3_message_add_extension(hdr, RDSV3_EXTHDR_VERSION,
+ &ext_hdr, sizeof (ext_hdr)));
+}
+
+int
+rdsv3_message_get_version_extension(struct rdsv3_header *hdr,
+ unsigned int *version)
+{
+ struct rdsv3_ext_header_version ext_hdr;
+ unsigned int pos = 0, len = sizeof (ext_hdr);
+
+ RDSV3_DPRINTF4("rdsv3_message_get_version_extension", "Enter");
+
+ /*
+ * We assume the version extension is the only one present
+ */
+ if (rdsv3_message_next_extension(hdr, &pos, &ext_hdr, &len) !=
+ RDSV3_EXTHDR_VERSION)
+ return (0);
+ *version = ntohl(ext_hdr.h_version);
+ return (1);
+}
+
+int
+rdsv3_message_add_rdma_dest_extension(struct rdsv3_header *hdr, uint32_t r_key,
+ uint32_t offset)
+{
+ struct rdsv3_ext_header_rdma_dest ext_hdr;
+
+ ext_hdr.h_rdma_rkey = htonl(r_key);
+ ext_hdr.h_rdma_offset = htonl(offset);
+ return (rdsv3_message_add_extension(hdr, RDSV3_EXTHDR_RDMA_DEST,
+ &ext_hdr, sizeof (ext_hdr)));
+}
+
+struct rdsv3_message *
+rdsv3_message_alloc(unsigned int nents, int gfp)
+{
+ struct rdsv3_message *rm;
+
+ RDSV3_DPRINTF4("rdsv3_message_alloc", "Enter(nents: %d)", nents);
+
+ rm = kmem_zalloc(sizeof (struct rdsv3_message) +
+ (nents * sizeof (struct rdsv3_scatterlist)), gfp);
+ if (!rm)
+ goto out;
+
+ rm->m_refcount = 1;
+ list_link_init(&rm->m_sock_item);
+ list_link_init(&rm->m_conn_item);
+ mutex_init(&rm->m_rs_lock, NULL, MUTEX_DRIVER, NULL);
+
+ RDSV3_DPRINTF4("rdsv3_message_alloc", "Return(rm: %p)", rm);
+out:
+ return (rm);
+}
+
+struct rdsv3_message *
+rdsv3_message_map_pages(unsigned long *page_addrs, unsigned int total_len)
+{
+ struct rdsv3_message *rm;
+ unsigned int i;
+
+ RDSV3_DPRINTF4("rdsv3_message_map_pages", "Enter(len: %d)", total_len);
+
+#ifndef __lock_lint
+ rm = rdsv3_message_alloc(ceil(total_len, PAGE_SIZE), KM_NOSLEEP);
+#else
+ rm = NULL;
+#endif
+ if (rm == NULL)
+ return (ERR_PTR(-ENOMEM));
+
+ set_bit(RDSV3_MSG_PAGEVEC, &rm->m_flags);
+ rm->m_inc.i_hdr.h_len = htonl(total_len);
+#ifndef __lock_lint
+ rm->m_nents = ceil(total_len, PAGE_SIZE);
+#else
+ rm->m_nents = 0;
+#endif
+
+ for (i = 0; i < rm->m_nents; ++i) {
+ rdsv3_sg_set_page(&rm->m_sg[i],
+ page_addrs[i],
+ PAGE_SIZE, 0);
+ }
+
+ return (rm);
+}
+
+struct rdsv3_message *
+rdsv3_message_copy_from_user(struct uio *uiop,
+ size_t total_len)
+{
+ struct rdsv3_message *rm;
+ struct rdsv3_scatterlist *sg;
+ int ret;
+
+ RDSV3_DPRINTF4("rdsv3_message_copy_from_user", "Enter: %d", total_len);
+
+#ifndef __lock_lint
+ rm = rdsv3_message_alloc(ceil(total_len, PAGE_SIZE), KM_NOSLEEP);
+#else
+ rm = NULL;
+#endif
+ if (rm == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ rm->m_inc.i_hdr.h_len = htonl(total_len);
+
+ /*
+ * now allocate and copy in the data payload.
+ */
+ sg = rm->m_sg;
+
+ while (total_len) {
+ if (rdsv3_sg_page(sg) == NULL) {
+ ret = rdsv3_page_remainder_alloc(sg, total_len, 0);
+ if (ret)
+ goto out;
+ rm->m_nents++;
+ }
+
+ ret = uiomove(rdsv3_sg_page(sg), rdsv3_sg_len(sg), UIO_WRITE,
+ uiop);
+ if (ret)
+ goto out;
+
+ total_len -= rdsv3_sg_len(sg);
+ sg++;
+ }
+
+ ret = 0;
+out:
+ if (ret) {
+ if (rm)
+ rdsv3_message_put(rm);
+ rm = ERR_PTR(ret);
+ }
+ return (rm);
+}
+
+int
+rdsv3_message_inc_copy_to_user(struct rdsv3_incoming *inc,
+ uio_t *uiop, size_t size)
+{
+ struct rdsv3_message *rm;
+ struct rdsv3_scatterlist *sg;
+ unsigned long to_copy;
+ unsigned long vec_off;
+ int copied;
+ int ret;
+ uint32_t len;
+
+ rm = container_of(inc, struct rdsv3_message, m_inc);
+ len = ntohl(rm->m_inc.i_hdr.h_len);
+
+ RDSV3_DPRINTF4("rdsv3_message_inc_copy_to_user",
+ "Enter(rm: %p, len: %d)", rm, len);
+
+ sg = rm->m_sg;
+ vec_off = 0;
+ copied = 0;
+
+ while (copied < size && copied < len) {
+
+ to_copy = min(len - copied, sg->length - vec_off);
+ to_copy = min(size - copied, to_copy);
+
+ RDSV3_DPRINTF5("rdsv3_message_inc_copy_to_user",
+ "copying %lu bytes to user iov %p from sg [%p, %u] + %lu\n",
+ to_copy, uiop,
+ rdsv3_sg_page(sg), sg->length, vec_off);
+
+ ret = uiomove(rdsv3_sg_page(sg), to_copy, UIO_READ, uiop);
+ if (ret)
+ break;
+
+ vec_off += to_copy;
+ copied += to_copy;
+
+ if (vec_off == sg->length) {
+ vec_off = 0;
+ sg++;
+ }
+ }
+
+ return (copied);
+}
+
+/*
+ * If the message is still on the send queue, wait until the transport
+ * is done with it. This is particularly important for RDMA operations.
+ */
+void
+rdsv3_message_wait(struct rdsv3_message *rm)
+{
+ rdsv3_wait_event(rdsv3_message_flush_waitq,
+ !test_bit(RDSV3_MSG_MAPPED, &rm->m_flags));
+}
+
+void
+rdsv3_message_unmapped(struct rdsv3_message *rm)
+{
+ clear_bit(RDSV3_MSG_MAPPED, &rm->m_flags);
+ rdsv3_wake_up_all(&rdsv3_message_flush_waitq);
+}
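+
+/*
+ * A hedged sketch of the expected pairing (an assumption on this editor's
+ * part; the transport specifics live in the IB send path): the transport
+ * sets RDSV3_MSG_MAPPED before handing the message's buffers to the HCA and
+ * calls rdsv3_message_unmapped() from its completion path, which is what
+ * releases rdsv3_message_wait() callers:
+ *
+ *	set_bit(RDSV3_MSG_MAPPED, &rm->m_flags);   before posting the send
+ *	... DMA in flight ...
+ *	rdsv3_message_unmapped(rm);                in the completion handler
+ */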
diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/page.c b/usr/src/uts/common/io/ib/clients/rdsv3/page.c
new file mode 100644
index 0000000000..356917c711
--- /dev/null
+++ b/usr/src/uts/common/io/ib/clients/rdsv3/page.c
@@ -0,0 +1,102 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <sys/rds.h>
+
+#include <sys/ib/clients/rdsv3/rdsv3.h>
+#include <sys/ib/clients/rdsv3/rdsv3_debug.h>
+
+/*
+ * Allocate a kernel buffer of at most one page to back the scatterlist
+ * entry 'scat'.
+ *
+ * @bytes - the number of bytes needed.
+ *
+ * XXX - This is different from Linux.
+ */
+int
+rdsv3_page_remainder_alloc(struct rdsv3_scatterlist *scat, unsigned long bytes,
+ int gfp)
+{
+ caddr_t page;
+ int ret;
+
+ ASSERT(rdsv3_sg_page(scat) == NULL);
+
+ if (bytes >= PAGE_SIZE) {
+ page = kmem_alloc(PAGE_SIZE, gfp);
+ if (page == NULL) {
+ ret = -ENOMEM;
+ } else {
+ rdsv3_sg_set_page(scat, page, PAGE_SIZE, 0);
+ ret = 0;
+ }
+ goto out;
+ }
+
+ /*
+	 * XXX - This is not the same as Linux.
+ */
+ page = kmem_alloc(bytes, KM_NOSLEEP);
+ if (page == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ rdsv3_sg_set_page(scat, page, bytes, 0);
+ ret = 0;
+out:
+ RDSV3_DPRINTF5("rdsv3_page_remainder_alloc", "bytes %lu %p %u",
+ bytes, rdsv3_sg_page(scat), scat->length);
+ return (ret);
+}
diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/rdma.c b/usr/src/uts/common/io/ib/clients/rdsv3/rdma.c
new file mode 100644
index 0000000000..8f795120cd
--- /dev/null
+++ b/usr/src/uts/common/io/ib/clients/rdsv3/rdma.c
@@ -0,0 +1,672 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+/*
+ * Copyright (c) 2007 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <sys/ib/clients/of/rdma/ib_verbs.h>
+#include <sys/ib/clients/of/rdma/ib_addr.h>
+#include <sys/ib/clients/of/rdma/rdma_cm.h>
+
+#include <sys/ib/clients/rdsv3/ib.h>
+#include <sys/ib/clients/rdsv3/rdma.h>
+#include <sys/ib/clients/rdsv3/rdsv3_debug.h>
+
+#define DMA_TO_DEVICE 0
+#define DMA_FROM_DEVICE 1
+#define RB_CLEAR_NODE(nodep) AVL_SETPARENT(nodep, nodep)
+
+/*
+ * XXX
+ * - build with sparse
+ * - should we limit the size of a mr region? let transport return failure?
+ * - should we detect duplicate keys on a socket? hmm.
+ * - an rdma is an mlock, apply rlimit?
+ */
+
+/*
+ * get the number of pages by looking at the page indices that the start and
+ * end addresses fall in.
+ *
+ * Returns 0 if the vec is invalid. It is invalid if the number of bytes
+ * causes the address to wrap or overflows an unsigned int. This comes
+ * from being stored in the 'length' member of 'struct rdsv3_scatterlist'.
+ */
+static unsigned int
+rdsv3_pages_in_vec(struct rdsv3_iovec *vec)
+{
+ if ((vec->addr + vec->bytes <= vec->addr) ||
+ (vec->bytes > (uint64_t)UINT_MAX)) {
+ return (0);
+ }
+
+ return (((vec->addr + vec->bytes + PAGESIZE - 1) >>
+ PAGESHIFT) - (vec->addr >> PAGESHIFT));
+}
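+
+/*
+ * Worked example (editor's illustration): with 4K pages (PAGESHIFT 12),
+ * addr = 0x1ffc and bytes = 8 end at 0x2004, so the vec touches pages
+ * 1 and 2: ((0x2004 + 0xfff) >> 12) - (0x1ffc >> 12) = 3 - 1 = 2.
+ */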
+
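+/*
+ * Find the MR with 'key' in the AVL tree rooted at 'root'. If it is not
+ * present and 'insert' is non-NULL, insert that MR (taking a reference on
+ * it) and return NULL; otherwise return the MR that was found.
+ */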
+static struct rdsv3_mr *
+rdsv3_mr_tree_walk(struct avl_tree *root, uint32_t key,
+ struct rdsv3_mr *insert)
+{
+ struct rdsv3_mr *mr;
+ avl_index_t where;
+
+ mr = avl_find(root, &key, &where);
+ if ((mr == NULL) && (insert != NULL)) {
+ avl_insert(root, (void *)insert, where);
+ atomic_add_32(&insert->r_refcount, 1);
+ return (NULL);
+ }
+
+ return (mr);
+}
+
+/*
+ * Destroy the transport-specific part of a MR.
+ */
+static void
+rdsv3_destroy_mr(struct rdsv3_mr *mr)
+{
+ struct rdsv3_sock *rs = mr->r_sock;
+ void *trans_private = NULL;
+ avl_node_t *np;
+
+ RDSV3_DPRINTF5("rdsv3_destroy_mr",
+ "RDS: destroy mr key is %x refcnt %u",
+ mr->r_key, atomic_get(&mr->r_refcount));
+
+ if (test_and_set_bit(RDSV3_MR_DEAD, &mr->r_state))
+ return;
+
+ mutex_enter(&rs->rs_rdma_lock);
+ np = &mr->r_rb_node;
+ if (AVL_XPARENT(np) != np)
+ avl_remove(&rs->rs_rdma_keys, mr);
+ trans_private = mr->r_trans_private;
+ mr->r_trans_private = NULL;
+ mutex_exit(&rs->rs_rdma_lock);
+
+ if (trans_private)
+ mr->r_trans->free_mr(trans_private, mr->r_invalidate);
+}
+
+void
+__rdsv3_put_mr_final(struct rdsv3_mr *mr)
+{
+ rdsv3_destroy_mr(mr);
+ kmem_free(mr, sizeof (*mr));
+}
+
+/*
+ * By the time this is called we can't have any more ioctls called on
+ * the socket so we don't need to worry about racing with others.
+ */
+void
+rdsv3_rdma_drop_keys(struct rdsv3_sock *rs)
+{
+ struct rdsv3_mr *mr;
+ struct avl_node *node;
+
+ /* Release any MRs associated with this socket */
+ mutex_enter(&rs->rs_rdma_lock);
+ while ((node = avl_first(&rs->rs_rdma_keys))) {
+ mr = container_of(node, struct rdsv3_mr, r_rb_node);
+ if (mr->r_trans == rs->rs_transport)
+ mr->r_invalidate = 0;
+ avl_remove(&rs->rs_rdma_keys, &mr->r_rb_node);
+		RB_CLEAR_NODE(&mr->r_rb_node);
+ mutex_exit(&rs->rs_rdma_lock);
+ rdsv3_destroy_mr(mr);
+ rdsv3_mr_put(mr);
+ mutex_enter(&rs->rs_rdma_lock);
+ }
+ mutex_exit(&rs->rs_rdma_lock);
+
+ if (rs->rs_transport && rs->rs_transport->flush_mrs)
+ rs->rs_transport->flush_mrs();
+}
+
+/*
+ * Helper function to pin user pages.
+ */
+#if 0
+static int
+rds_pin_pages(unsigned long user_addr, unsigned int nr_pages,
+ struct page **pages, int write)
+{
+ unsigned long l_user_addr = user_addr;
+ unsigned int l_nr_pages = nr_pages;
+ struct page **l_pages = pages;
+ int l_write = write;
+
+ /* memory pin in rds_ib_get_mr() */
+ return (0);
+}
+#endif
+
+static int
+__rdsv3_rdma_map(struct rdsv3_sock *rs, struct rdsv3_get_mr_args *args,
+ uint64_t *cookie_ret, struct rdsv3_mr **mr_ret)
+{
+ struct rdsv3_mr *mr = NULL, *found;
+ void *trans_private;
+ rdsv3_rdma_cookie_t cookie;
+ unsigned int nents = 0;
+ int ret;
+
+ if (rs->rs_bound_addr == 0) {
+ ret = -ENOTCONN; /* XXX not a great errno */
+ goto out;
+ }
+
+ if (rs->rs_transport->get_mr == NULL) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ mr = kmem_zalloc(sizeof (struct rdsv3_mr), KM_NOSLEEP);
+ if (mr == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ mr->r_refcount = 1;
+ RB_CLEAR_NODE(&mr->r_rb_node);
+ mr->r_trans = rs->rs_transport;
+ mr->r_sock = rs;
+
+ if (args->flags & RDSV3_RDMA_USE_ONCE)
+ mr->r_use_once = 1;
+ if (args->flags & RDSV3_RDMA_INVALIDATE)
+ mr->r_invalidate = 1;
+ if (args->flags & RDSV3_RDMA_READWRITE)
+ mr->r_write = 1;
+
+ /*
+ * Obtain a transport specific MR. If this succeeds, the
+ * s/g list is now owned by the MR.
+ * Note that dma_map() implies that pending writes are
+ * flushed to RAM, so no dma_sync is needed here.
+ */
+ trans_private = rs->rs_transport->get_mr(&args->vec, nents, rs,
+ &mr->r_key);
+
+ if (IS_ERR(trans_private)) {
+ ret = PTR_ERR(trans_private);
+ goto out;
+ }
+
+ mr->r_trans_private = trans_private;
+
+ /*
+ * The user may pass us an unaligned address, but we can only
+ * map page aligned regions. So we keep the offset, and build
+ * a 64bit cookie containing <R_Key, offset> and pass that
+ * around.
+ */
+ cookie = rdsv3_rdma_make_cookie(mr->r_key, args->vec.addr & ~PAGEMASK);
+ if (cookie_ret)
+ *cookie_ret = cookie;
+
+ /*
+ * copy value of cookie to user address at args->cookie_addr
+ */
+ if (args->cookie_addr) {
+ ret = ddi_copyout((void *)&cookie,
+ (void *)((intptr_t)args->cookie_addr),
+ sizeof (rdsv3_rdma_cookie_t), 0);
+ if (ret != 0) {
+ ret = -EFAULT;
+ goto out;
+ }
+ }
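+
+	/*
+	 * Editor's sketch of one plausible cookie packing (an assumption;
+	 * the authoritative layout is rdsv3_rdma_make_cookie(),
+	 * rdsv3_rdma_cookie_key() and rdsv3_rdma_cookie_offset() in rdma.h):
+	 *
+	 *	cookie = r_key | ((uint64_t)page_offset << 32);
+	 *	r_key  = (uint32_t)cookie;
+	 *	offset = (uint32_t)(cookie >> 32);
+	 */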
+
+ RDSV3_DPRINTF5("__rdsv3_rdma_map",
+ "RDS: get_mr mr 0x%p addr 0x%llx key 0x%x",
+ mr, args->vec.addr, mr->r_key);
+ /*
+ * Inserting the new MR into the rbtree bumps its
+ * reference count.
+ */
+ mutex_enter(&rs->rs_rdma_lock);
+ found = rdsv3_mr_tree_walk(&rs->rs_rdma_keys, mr->r_key, mr);
+ mutex_exit(&rs->rs_rdma_lock);
+
+ ASSERT(!(found && found != mr));
+
+ if (mr_ret) {
+ atomic_add_32(&mr->r_refcount, 1);
+ *mr_ret = mr;
+ }
+
+ ret = 0;
+out:
+ if (mr)
+ rdsv3_mr_put(mr);
+ return (ret);
+}
+
+int
+rdsv3_get_mr(struct rdsv3_sock *rs, const void *optval, int optlen)
+{
+ struct rdsv3_get_mr_args args;
+
+ if (optlen != sizeof (struct rdsv3_get_mr_args))
+ return (-EINVAL);
+
+#if 1
+ bcopy((struct rdsv3_get_mr_args *)optval, &args,
+ sizeof (struct rdsv3_get_mr_args));
+#else
+ if (ddi_copyin(optval, &args, optlen, 0))
+ return (-EFAULT);
+#endif
+
+ return (__rdsv3_rdma_map(rs, &args, NULL, NULL));
+}
+
+/*
+ * Free the MR indicated by the given R_Key
+ */
+int
+rdsv3_free_mr(struct rdsv3_sock *rs, const void *optval, int optlen)
+{
+ struct rdsv3_free_mr_args args;
+ struct rdsv3_mr *mr;
+
+ if (optlen != sizeof (struct rdsv3_free_mr_args))
+ return (-EINVAL);
+
+#if 1
+ bcopy((struct rdsv3_free_mr_args *)optval, &args,
+ sizeof (struct rdsv3_free_mr_args));
+#else
+ if (ddi_copyin((struct rdsv3_free_mr_args *)optval, &args,
+ sizeof (struct rdsv3_free_mr_args), 0))
+ return (-EFAULT);
+#endif
+
+ /* Special case - a null cookie means flush all unused MRs */
+ if (args.cookie == 0) {
+ if (!rs->rs_transport || !rs->rs_transport->flush_mrs)
+ return (-EINVAL);
+ rs->rs_transport->flush_mrs();
+ return (0);
+ }
+
+ /*
+ * Look up the MR given its R_key and remove it from the rbtree
+ * so nobody else finds it.
+ * This should also prevent races with rdsv3_rdma_unuse.
+ */
+ mutex_enter(&rs->rs_rdma_lock);
+ mr = rdsv3_mr_tree_walk(&rs->rs_rdma_keys,
+ rdsv3_rdma_cookie_key(args.cookie), NULL);
+ if (mr) {
+ avl_remove(&rs->rs_rdma_keys, &mr->r_rb_node);
+ RB_CLEAR_NODE(&mr->r_rb_node);
+ if (args.flags & RDSV3_RDMA_INVALIDATE)
+ mr->r_invalidate = 1;
+ }
+ mutex_exit(&rs->rs_rdma_lock);
+
+ if (!mr)
+ return (-EINVAL);
+
+ /*
+	 * Call rdsv3_destroy_mr() ourselves so that we're sure it's done
+	 * by the time we return. If we let rdsv3_mr_put() do it, it might
+	 * not happen until someone else drops their ref.
+ */
+ rdsv3_destroy_mr(mr);
+ rdsv3_mr_put(mr);
+ return (0);
+}
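+
+/*
+ * Hedged usage sketch (editor's illustration; the option name RDS_FREE_MR
+ * is an assumption, the real constant lives in sys/rds.h): applications
+ * reach rdsv3_get_mr()/rdsv3_free_mr() through setsockopt() on an RDS
+ * socket, e.g.
+ *
+ *	struct rdsv3_free_mr_args fargs = { 0 };	cookie = 0, flags = 0
+ *	(void) setsockopt(fd, SOL_RDS, RDS_FREE_MR, &fargs, sizeof (fargs));
+ *
+ * where a zero cookie flushes all unused MRs, as above.
+ */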
+
+/*
+ * This is called when we receive an extension header that
+ * tells us this MR was used. It allows us to implement
+ * use_once semantics
+ */
+void
+rdsv3_rdma_unuse(struct rdsv3_sock *rs, uint32_t r_key, int force)
+{
+ struct rdsv3_mr *mr;
+ int zot_me = 0;
+
+ RDSV3_DPRINTF4("rdsv3_rdma_unuse", "Enter rkey: 0x%x", r_key);
+
+ mutex_enter(&rs->rs_rdma_lock);
+ mr = rdsv3_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL);
+ if (mr && (mr->r_use_once || force)) {
+ avl_remove(&rs->rs_rdma_keys, &mr->r_rb_node);
+ RB_CLEAR_NODE(&mr->r_rb_node);
+ zot_me = 1;
+ } else if (mr)
+ atomic_add_32(&mr->r_refcount, 1);
+ mutex_exit(&rs->rs_rdma_lock);
+
+ /*
+ * May have to issue a dma_sync on this memory region.
+ * Note we could avoid this if the operation was a RDMA READ,
+ * but at this point we can't tell.
+ */
+ if (mr != NULL) {
+ RDSV3_DPRINTF4("rdsv3_rdma_unuse", "mr: %p zot_me %d",
+ mr, zot_me);
+ if (mr->r_trans->sync_mr)
+ mr->r_trans->sync_mr(mr->r_trans_private,
+ DMA_FROM_DEVICE);
+
+ /*
+ * If the MR was marked as invalidate, this will
+ * trigger an async flush.
+ */
+ if (zot_me)
+ rdsv3_destroy_mr(mr);
+ rdsv3_mr_put(mr);
+ }
+ RDSV3_DPRINTF4("rdsv3_rdma_unuse", "Return");
+}
+
+void
+rdsv3_rdma_free_op(struct rdsv3_rdma_op *ro)
+{
+ unsigned int i;
+
+ /* deallocate RDMA resources on rdsv3_message */
+
+ for (i = 0; i < ro->r_nents; i++) {
+ ddi_umem_unlock(ro->r_rdma_sg[i].umem_cookie);
+ }
+
+ if (ro->r_notifier)
+ kmem_free(ro->r_notifier, sizeof (*ro->r_notifier));
+ kmem_free(ro, sizeof (*ro));
+}
+
+extern struct umem_callback_ops rdsv3_umem_cbops;
+/*
+ * args is a pointer to an in-kernel copy in the sendmsg cmsg.
+ */
+static struct rdsv3_rdma_op *
+rdsv3_rdma_prepare(struct rdsv3_sock *rs, struct rdsv3_rdma_args *args)
+{
+ struct rdsv3_iovec vec;
+ struct rdsv3_rdma_op *op = NULL;
+ unsigned int nr_bytes;
+ struct rdsv3_iovec *local_vec;
+ unsigned int nr;
+ unsigned int i;
+ ddi_umem_cookie_t umem_cookie;
+ size_t umem_len;
+ caddr_t umem_addr;
+ int umem_flags;
+ int ret;
+
+ if (rs->rs_bound_addr == 0) {
+ ret = -ENOTCONN; /* XXX not a great errno */
+ goto out;
+ }
+
+ if (args->nr_local > (uint64_t)UINT_MAX) {
+ ret = -EMSGSIZE;
+ goto out;
+ }
+
+ op = kmem_zalloc(offsetof(struct rdsv3_rdma_op,
+ r_rdma_sg[args->nr_local]), KM_NOSLEEP);
+ if (op == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ op->r_write = !!(args->flags & RDSV3_RDMA_READWRITE);
+ op->r_fence = !!(args->flags & RDSV3_RDMA_FENCE);
+ op->r_notify = !!(args->flags & RDSV3_RDMA_NOTIFY_ME);
+ op->r_recverr = rs->rs_recverr;
+
+ if (op->r_notify || op->r_recverr) {
+ /*
+ * We allocate an uninitialized notifier here, because
+ * we don't want to do that in the completion handler. We
+ * would have to use GFP_ATOMIC there, and don't want to deal
+ * with failed allocations.
+ */
+ op->r_notifier = kmem_alloc(sizeof (struct rdsv3_notifier),
+ KM_NOSLEEP);
+ if (!op->r_notifier) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ op->r_notifier->n_user_token = args->user_token;
+ op->r_notifier->n_status = RDSV3_RDMA_SUCCESS;
+ }
+
+ /*
+ * The cookie contains the R_Key of the remote memory region, and
+ * optionally an offset into it. This is how we implement RDMA into
+ * unaligned memory.
+ * When setting up the RDMA, we need to add that offset to the
+ * destination address (which is really an offset into the MR)
+ * FIXME: We may want to move this into ib_rdma.c
+ */
+ op->r_key = rdsv3_rdma_cookie_key(args->cookie);
+ op->r_remote_addr = args->remote_vec.addr +
+ rdsv3_rdma_cookie_offset(args->cookie);
+
+ nr_bytes = 0;
+
+ RDSV3_DPRINTF5("rdsv3_rdma_prepare",
+ "RDS: rdma prepare nr_local %llu rva %llx rkey %x",
+ (unsigned long long)args->nr_local,
+ (unsigned long long)args->remote_vec.addr,
+ op->r_key);
+
+ local_vec = (struct rdsv3_iovec *)(unsigned long) args->local_vec_addr;
+
+ /* pin the scatter list of user buffers */
+ for (i = 0; i < args->nr_local; i++) {
+ if (ddi_copyin(&local_vec[i], &vec,
+ sizeof (struct rdsv3_iovec), 0)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ nr = rdsv3_pages_in_vec(&vec);
+ if (nr == 0) {
+ RDSV3_DPRINTF2("rdsv3_rdma_prepare",
+ "rdsv3_pages_in_vec returned 0");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ rs->rs_user_addr = vec.addr;
+ rs->rs_user_bytes = vec.bytes;
+
+ /* pin user memory pages */
+ umem_len = ptob(btopr(vec.bytes +
+ ((uintptr_t)vec.addr & PAGEOFFSET)));
+ umem_addr = (caddr_t)((uintptr_t)vec.addr & ~PAGEOFFSET);
+ umem_flags = (DDI_UMEMLOCK_WRITE | DDI_UMEMLOCK_READ |
+ DDI_UMEMLOCK_LONGTERM);
+ ret = umem_lockmemory(umem_addr, umem_len, umem_flags,
+ &umem_cookie, &rdsv3_umem_cbops, NULL);
+ if (ret != 0) {
+ RDSV3_DPRINTF2("rdsv3_rdma_prepare",
+ "umem_lockmemory() returned %d", ret);
+ ret = -EFAULT;
+ goto out;
+ }
+ op->r_rdma_sg[i].umem_cookie = umem_cookie;
+ op->r_rdma_sg[i].iovec = vec;
+ nr_bytes += vec.bytes;
+
+ RDSV3_DPRINTF5("rdsv3_rdma_prepare",
+ "RDS: nr_bytes %u nr %u vec.bytes %llu vec.addr %llx",
+ nr_bytes, nr, vec.bytes, vec.addr);
+ }
+ op->r_nents = i;
+
+ if (nr_bytes > args->remote_vec.bytes) {
+ RDSV3_DPRINTF2("rdsv3_rdma_prepare",
+ "RDS nr_bytes %u remote_bytes %u do not match",
+ nr_bytes, (unsigned int) args->remote_vec.bytes);
+ ret = -EINVAL;
+ goto out;
+ }
+ op->r_bytes = nr_bytes;
+
+ ret = 0;
+out:
+ if (ret) {
+ if (op)
+ rdsv3_rdma_free_op(op);
+ op = ERR_PTR(ret);
+ }
+ return (op);
+}
+
+/*
+ * The application asks for a RDMA transfer.
+ * Extract all arguments and set up the rdma_op
+ */
+int
+rdsv3_cmsg_rdma_args(struct rdsv3_sock *rs, struct rdsv3_message *rm,
+ struct cmsghdr *cmsg)
+{
+ struct rdsv3_rdma_op *op;
+ struct rdsv3_rdma_args *ap;
+
+ if (cmsg->cmsg_len < CMSG_LEN(sizeof (struct rdsv3_rdma_args)) ||
+ rm->m_rdma_op != NULL)
+ return (-EINVAL);
+
+	/* uint64_t alignment on struct rdsv3_rdma_args */
+ ap = (struct rdsv3_rdma_args *)kmem_alloc(cmsg->cmsg_len, KM_SLEEP);
+ bcopy(CMSG_DATA(cmsg), ap, cmsg->cmsg_len);
+ op = rdsv3_rdma_prepare(rs, ap);
+ kmem_free(ap, cmsg->cmsg_len);
+ if (IS_ERR(op))
+ return (PTR_ERR(op));
+ rdsv3_stats_inc(s_send_rdma);
+ rm->m_rdma_op = op;
+ return (0);
+}
+
+/*
+ * The application wants us to pass an RDMA destination (aka MR)
+ * to the remote
+ */
+int
+rdsv3_cmsg_rdma_dest(struct rdsv3_sock *rs, struct rdsv3_message *rm,
+ struct cmsghdr *cmsg)
+{
+ struct rdsv3_mr *mr;
+ uint32_t r_key;
+ int err = 0;
+
+ if (cmsg->cmsg_len < CMSG_LEN(sizeof (rdsv3_rdma_cookie_t)) ||
+ rm->m_rdma_cookie != 0)
+ return (-EINVAL);
+
+ (void) memcpy(&rm->m_rdma_cookie, CMSG_DATA(cmsg),
+ sizeof (rm->m_rdma_cookie));
+
+ /*
+ * We are reusing a previously mapped MR here. Most likely, the
+ * application has written to the buffer, so we need to explicitly
+ * flush those writes to RAM. Otherwise the HCA may not see them
+ * when doing a DMA from that buffer.
+ */
+ r_key = rdsv3_rdma_cookie_key(rm->m_rdma_cookie);
+
+ mutex_enter(&rs->rs_rdma_lock);
+ mr = rdsv3_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL);
+ if (mr == NULL)
+ err = -EINVAL; /* invalid r_key */
+ else
+ atomic_add_32(&mr->r_refcount, 1);
+ mutex_exit(&rs->rs_rdma_lock);
+
+ if (mr) {
+ mr->r_trans->sync_mr(mr->r_trans_private, DMA_TO_DEVICE);
+ rm->m_rdma_mr = mr;
+ }
+ return (err);
+}
+
+/*
+ * The application passes us an address range it wants to enable RDMA
+ * to/from. We map the area, and save the <R_Key,offset> pair
+ * in rm->m_rdma_cookie. This causes it to be sent along to the peer
+ * in an extension header.
+ */
+int
+rdsv3_cmsg_rdma_map(struct rdsv3_sock *rs, struct rdsv3_message *rm,
+ struct cmsghdr *cmsg)
+{
+ struct rdsv3_get_mr_args *mrp;
+ int status;
+
+ if (cmsg->cmsg_len < CMSG_LEN(sizeof (struct rdsv3_get_mr_args)) ||
+ rm->m_rdma_cookie != 0)
+ return (-EINVAL);
+
+ /* uint64_t alignment on struct rdsv3_get_mr_args */
+ mrp = (struct rdsv3_get_mr_args *)kmem_alloc(cmsg->cmsg_len, KM_SLEEP);
+ bcopy(CMSG_DATA(cmsg), mrp, cmsg->cmsg_len);
+ status = __rdsv3_rdma_map(rs, mrp, &rm->m_rdma_cookie, &rm->m_rdma_mr);
+ kmem_free(mrp, cmsg->cmsg_len);
+ return (status);
+}
diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/rdma_transport.c b/usr/src/uts/common/io/ib/clients/rdsv3/rdma_transport.c
new file mode 100644
index 0000000000..1c87e52cdf
--- /dev/null
+++ b/usr/src/uts/common/io/ib/clients/rdsv3/rdma_transport.c
@@ -0,0 +1,292 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+/*
+ * Copyright (c) 2009 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <sys/ib/clients/of/rdma/ib_verbs.h>
+#include <sys/ib/clients/of/rdma/ib_addr.h>
+#include <sys/ib/clients/of/rdma/rdma_cm.h>
+
+#include <sys/ib/clients/rdsv3/ib.h>
+#include <sys/ib/clients/rdsv3/rdma_transport.h>
+#include <sys/ib/clients/rdsv3/rdsv3_debug.h>
+
+kmutex_t rdsv3_rdma_listen_id_lock;
+struct rdma_cm_id *rdsv3_rdma_listen_id = NULL;
+
+int
+rdsv3_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
+ struct rdma_cm_event *event)
+{
+ /* this can be null in the listening path */
+ struct rdsv3_connection *conn = cm_id->context;
+ struct rdsv3_transport *trans;
+ int ret = 0;
+
+ RDSV3_DPRINTF2("rdsv3_rdma_cm_event_handler",
+ "conn %p id %p handling event %u", conn, cm_id, event->event);
+
+ trans = &rdsv3_ib_transport;
+
+ /*
+ * Prevent shutdown from tearing down the connection
+ * while we're executing.
+ */
+ if (conn) {
+ mutex_enter(&conn->c_cm_lock);
+
+ /*
+ * If the connection is being shut down, bail out
+ * right away. We return 0 so cm_id doesn't get
+ * destroyed prematurely
+ */
+ if (rdsv3_conn_state(conn) == RDSV3_CONN_DISCONNECTING) {
+ /*
+ * Reject incoming connections while we're tearing
+ * down an existing one.
+ */
+ if (event->event == RDMA_CM_EVENT_CONNECT_REQUEST)
+ ret = 1;
+ RDSV3_DPRINTF2("rdsv3_rdma_cm_event_handler",
+ "conn %p id %p incoming event %u when "
+ "disconnecting", conn, cm_id, event->event);
+ goto out;
+ }
+ }
+
+ switch (event->event) {
+ case RDMA_CM_EVENT_CONNECT_REQUEST:
+ ret = trans->cm_handle_connect(cm_id, event);
+ break;
+
+ case RDMA_CM_EVENT_ADDR_RESOLVED:
+ /* XXX do we need to clean up if this fails? */
+ ret = rdma_resolve_route(cm_id,
+ RDSV3_RDMA_RESOLVE_TIMEOUT_MS);
+ break;
+
+ case RDMA_CM_EVENT_ROUTE_RESOLVED:
+ /* XXX worry about racing with listen acceptance */
+ ret = trans->cm_initiate_connect(cm_id);
+ break;
+
+ case RDMA_CM_EVENT_ESTABLISHED:
+ trans->cm_connect_complete(conn, event);
+ break;
+
+ case RDMA_CM_EVENT_ADDR_ERROR:
+ case RDMA_CM_EVENT_ROUTE_ERROR:
+ case RDMA_CM_EVENT_CONNECT_ERROR:
+ case RDMA_CM_EVENT_UNREACHABLE:
+ case RDMA_CM_EVENT_REJECTED:
+ case RDMA_CM_EVENT_DEVICE_REMOVAL:
+ case RDMA_CM_EVENT_ADDR_CHANGE:
+ if (conn)
+ rdsv3_conn_drop(conn);
+ break;
+
+ case RDMA_CM_EVENT_DISCONNECTED:
+ RDSV3_DPRINTF2("rdsv3_rdma_cm_event_handler",
+ "RDS/RDMA: DISCONNECT event - dropping connection "
+ "cm_id: %p", cm_id);
+ if (conn) {
+ RDSV3_DPRINTF0("rdsv3_rdma_cm_event_handler",
+ "RDS/RDMA: DISCONNECT event - dropping connection "
+ "%u.%u.%u.%u ->%u.%u.%u.%u", NIPQUAD(conn->c_laddr),
+ NIPQUAD(conn->c_faddr));
+ rdsv3_conn_drop(conn);
+ }
+ break;
+
+ default:
+ /* things like device disconnect? */
+ RDSV3_DPRINTF0("rdsv3_rdma_cm_event_handler",
+ "unknown event %u\n", event->event);
+ RDSV3_PANIC();
+ break;
+ }
+
+out:
+ if (conn) {
+#ifndef __lock_lint
+ // struct rds_iw_connection *ic = conn->c_transport_data;
+
+		/* If we return non-zero, we must hang on to the cm_id */
+ // BUG_ON(ic->i_cm_id == cm_id && ret);
+#endif
+
+ mutex_exit(&conn->c_cm_lock);
+ }
+
+ RDSV3_DPRINTF2("rdsv3_rdma_cm_event_handler",
+ "id %p event %u handling ret %d", cm_id, event->event, ret);
+
+ return (ret);
+}
+
+static int
+rdsv3_rdma_listen_init(void)
+{
+ struct sockaddr_in sin;
+ struct rdma_cm_id *cm_id;
+ int ret;
+
+ RDSV3_DPRINTF2("rdsv3_rdma_listen_init", "Enter");
+
+ cm_id = rdma_create_id(rdsv3_rdma_cm_event_handler, NULL, RDMA_PS_TCP);
+ if (IS_ERR(cm_id)) {
+ ret = PTR_ERR(cm_id);
+ RDSV3_DPRINTF0("rdsv3_rdma_listen_init",
+ "RDS/RDMA: failed to setup listener, "
+ "rdma_create_id() returned %d", ret);
+ goto out;
+ }
+
+ sin.sin_family = PF_INET;
+ sin.sin_addr.s_addr = (uint32_t)htonl(INADDR_ANY);
+ sin.sin_port = (uint16_t)htons(RDSV3_PORT);
+
+ /*
+ * XXX I bet this binds the cm_id to a device. If we want to support
+ * fail-over we'll have to take this into consideration.
+ */
+ ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
+ if (ret) {
+ RDSV3_DPRINTF0("rdsv3_rdma_listen_init",
+ "RDS/RDMA: failed to setup listener, "
+ "rdma_bind_addr() returned %d", ret);
+ goto out;
+ }
+
+ ret = rdma_listen(cm_id, 128);
+ if (ret) {
+ RDSV3_DPRINTF0("rdsv3_rdma_listen_init",
+ "RDS/RDMA: failed to setup listener, "
+ "rdma_listen() returned %d", ret);
+ goto out;
+ }
+
+ RDSV3_DPRINTF5("rdsv3_rdma_listen_init",
+ "cm %p listening on port %u", cm_id, RDSV3_PORT);
+
+ rdsv3_rdma_listen_id = cm_id;
+ cm_id = NULL;
+
+ RDSV3_DPRINTF2("rdsv3_rdma_listen_init",
+ "Return: rdsv3_rdma_listen_id: %p", rdsv3_rdma_listen_id);
+out:
+ if (cm_id)
+ rdma_destroy_id(cm_id);
+ return (ret);
+}
+
+static void
+rdsv3_rdma_listen_stop(void)
+{
+ RDSV3_DPRINTF2("rdsv3_rdma_listen_stop", "cm %p", rdsv3_rdma_listen_id);
+ rdma_destroy_id(rdsv3_rdma_listen_id);
+
+ RDSV3_DPRINTF2("rdsv3_rdma_listen_stop", "Return");
+}
+
+/*
+ * This function can be called via two routes.
+ * 1. During attach on a worker thread.
+ * 2. From rdsv3_create() for 1st socket.
+ */
+void
+rdsv3_rdma_init()
+{
+ int ret;
+
+ RDSV3_DPRINTF2("rdsv3_rdma_init", "Enter");
+
+ mutex_enter(&rdsv3_rdma_listen_id_lock);
+ if (rdsv3_rdma_listen_id != NULL) {
+ RDSV3_DPRINTF2("rdsv3_rdma_init",
+ "rdsv3_rdma_listen_id is already initialized: %p",
+ rdsv3_rdma_listen_id);
+ mutex_exit(&rdsv3_rdma_listen_id_lock);
+ return;
+ }
+
+ ret = rdsv3_rdma_listen_init();
+ if (ret) {
+ mutex_exit(&rdsv3_rdma_listen_id_lock);
+ return;
+ }
+
+ ret = rdsv3_ib_init();
+ if (ret) {
+ rdsv3_rdma_listen_stop();
+ }
+ mutex_exit(&rdsv3_rdma_listen_id_lock);
+
+ RDSV3_DPRINTF2("rdsv3_rdma_init", "Return");
+}
+
+/*ARGSUSED*/
+void
+rdsv3_rdma_exit(void *arg)
+{
+ RDSV3_DPRINTF2("rdsv3_rdma_exit", "Enter");
+
+ /* stop listening first to ensure no new connections are attempted */
+ if (rdsv3_rdma_listen_id) {
+ rdsv3_rdma_listen_stop();
+ rdsv3_ib_exit();
+ rdsv3_rdma_listen_id = NULL;
+ }
+
+ RDSV3_DPRINTF2("rdsv3_rdma_exit", "Return");
+}
diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/rds_recv.c b/usr/src/uts/common/io/ib/clients/rdsv3/rds_recv.c
new file mode 100644
index 0000000000..59ffba52a8
--- /dev/null
+++ b/usr/src/uts/common/io/ib/clients/rdsv3/rds_recv.c
@@ -0,0 +1,677 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <sys/rds.h>
+
+#include <sys/ib/clients/rdsv3/rdsv3.h>
+#include <sys/ib/clients/rdsv3/rdma.h>
+#include <sys/ib/clients/rdsv3/rdsv3_debug.h>
+
+void
+rdsv3_inc_init(struct rdsv3_incoming *inc, struct rdsv3_connection *conn,
+ uint32_be_t saddr)
+{
+ RDSV3_DPRINTF5("rdsv3_inc_init", "Enter(inc: %p, conn: %p)", inc, conn);
+ inc->i_refcount = 1;
+ list_link_init(&inc->i_item);
+ inc->i_conn = conn;
+ inc->i_saddr = saddr;
+ inc->i_rdma_cookie = 0;
+}
+
+void
+rdsv3_inc_addref(struct rdsv3_incoming *inc)
+{
+ RDSV3_DPRINTF4("rdsv3_inc_addref",
+ "addref inc %p ref %d", inc, atomic_get(&inc->i_refcount));
+ atomic_add_32(&inc->i_refcount, 1);
+}
+
+void
+rdsv3_inc_put(struct rdsv3_incoming *inc)
+{
+ RDSV3_DPRINTF4("rdsv3_inc_put", "put inc %p ref %d",
+ inc, atomic_get(&inc->i_refcount));
+ if (atomic_dec_and_test(&inc->i_refcount)) {
+ ASSERT(!list_link_active(&inc->i_item));
+
+ inc->i_conn->c_trans->inc_free(inc);
+ }
+}
+
+/*ARGSUSED*/
+static void
+rdsv3_recv_rcvbuf_delta(struct rdsv3_sock *rs, struct rsock *sk,
+ struct rdsv3_cong_map *map,
+ int delta, uint16_be_t port)
+{
+ int now_congested;
+
+ RDSV3_DPRINTF4("rdsv3_recv_rcvbuf_delta",
+ "Enter(rs: %p, map: %p, delta: %d, port: %d)",
+ rs, map, delta, port);
+
+ if (delta == 0)
+ return;
+
+ rs->rs_rcv_bytes += delta;
+ now_congested = rs->rs_rcv_bytes > rdsv3_sk_rcvbuf(rs);
+
+ RDSV3_DPRINTF5("rdsv3_recv_rcvbuf_delta",
+ "rs %p (%u.%u.%u.%u:%u) recv bytes %d buf %d "
+ "now_cong %d delta %d",
+ rs, NIPQUAD(rs->rs_bound_addr),
+ (int)ntohs(rs->rs_bound_port), rs->rs_rcv_bytes,
+ rdsv3_sk_rcvbuf(rs), now_congested, delta);
+
+ /* wasn't -> am congested */
+ if (!rs->rs_congested && now_congested) {
+ rs->rs_congested = 1;
+ rdsv3_cong_set_bit(map, port);
+ rdsv3_cong_queue_updates(map);
+ }
+	/*
+	 * was -> aren't congested
+	 * Require more free space before reporting uncongested to prevent
+	 * bouncing cong/uncong state too often.
+	 */
+ else if (rs->rs_congested &&
+ (rs->rs_rcv_bytes < (rdsv3_sk_rcvbuf(rs)/2))) {
+ rs->rs_congested = 0;
+ rdsv3_cong_clear_bit(map, port);
+ rdsv3_cong_queue_updates(map);
+ }
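+	/*
+	 * Worked example (editor's note): with rdsv3_sk_rcvbuf(rs) at 256KB,
+	 * the socket is marked congested once rs_rcv_bytes exceeds 256KB and
+	 * is not marked uncongested again until it drops below 128KB.
+	 */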
+
+ /* do nothing if no change in cong state */
+
+ RDSV3_DPRINTF4("rdsv3_recv_rcvbuf_delta", "Return(rs: %p)", rs);
+}
+
+/*
+ * Process all extension headers that come with this message.
+ */
+static void
+rdsv3_recv_incoming_exthdrs(struct rdsv3_incoming *inc, struct rdsv3_sock *rs)
+{
+ struct rdsv3_header *hdr = &inc->i_hdr;
+ unsigned int pos = 0, type, len;
+ union {
+ struct rdsv3_ext_header_version version;
+ struct rdsv3_ext_header_rdma rdma;
+ struct rdsv3_ext_header_rdma_dest rdma_dest;
+ } buffer;
+
+ RDSV3_DPRINTF4("rdsv3_recv_incoming_exthdrs", "Enter");
+ while (1) {
+ len = sizeof (buffer);
+ type = rdsv3_message_next_extension(hdr, &pos, &buffer, &len);
+ if (type == RDSV3_EXTHDR_NONE)
+ break;
+ RDSV3_DPRINTF4("recv_incoming_exthdrs", "type %d", type);
+ /* Process extension header here */
+ switch (type) {
+ case RDSV3_EXTHDR_RDMA:
+ rdsv3_rdma_unuse(rs, ntohl(buffer.rdma.h_rdma_rkey),
+ 0);
+ break;
+
+ case RDSV3_EXTHDR_RDMA_DEST:
+ /*
+ * We ignore the size for now. We could stash it
+ * somewhere and use it for error checking.
+ */
+ inc->i_rdma_cookie = rdsv3_rdma_make_cookie(
+ ntohl(buffer.rdma_dest.h_rdma_rkey),
+ ntohl(buffer.rdma_dest.h_rdma_offset));
+
+ break;
+ }
+ }
+ RDSV3_DPRINTF4("rdsv3_recv_incoming_exthdrs", "Return");
+}
+
+/*
+ * The transport must make sure that this is serialized against other
+ * rx and conn reset on this specific conn.
+ *
+ * We currently assert that only one fragmented message will be sent
+ * down a connection at a time. This lets us reassemble in the conn
+ * instead of per-flow which means that we don't have to go digging through
+ * flows to tear down partial reassembly progress on conn failure and
+ * we save flow lookup and locking for each frag arrival. It does mean
+ * that small messages will wait behind large ones. Fragmenting at all
+ * is only to reduce the memory consumption of pre-posted buffers.
+ *
+ * The caller passes in saddr and daddr instead of us getting it from the
+ * conn. This lets loopback, who only has one conn for both directions,
+ * tell us which roles the addrs in the conn are playing for this message.
+ */
+/* ARGSUSED */
+void
+rdsv3_recv_incoming(struct rdsv3_connection *conn, uint32_be_t saddr,
+ uint32_be_t daddr, struct rdsv3_incoming *inc, int gfp)
+{
+ struct rdsv3_sock *rs = NULL;
+ struct rsock *sk;
+
+ inc->i_conn = conn;
+ inc->i_rx_jiffies = jiffies;
+
+ RDSV3_DPRINTF5("rdsv3_recv_incoming",
+ "conn %p next %llu inc %p seq %llu len %u sport %u dport %u "
+ "flags 0x%x rx_jiffies %lu", conn,
+ (unsigned long long)conn->c_next_rx_seq,
+ inc,
+ (unsigned long long)ntohll(inc->i_hdr.h_sequence),
+ ntohl(inc->i_hdr.h_len),
+ ntohs(inc->i_hdr.h_sport),
+ ntohs(inc->i_hdr.h_dport),
+ inc->i_hdr.h_flags,
+ inc->i_rx_jiffies);
+
+ /*
+ * Sequence numbers should only increase. Messages get their
+ * sequence number as they're queued in a sending conn. They
+ * can be dropped, though, if the sending socket is closed before
+ * they hit the wire. So sequence numbers can skip forward
+ * under normal operation. They can also drop back in the conn
+ * failover case as previously sent messages are resent down the
+ * new instance of a conn. We drop those, otherwise we have
+ * to assume that the next valid seq does not come after a
+ * hole in the fragment stream.
+ *
+ * The headers don't give us a way to realize if fragments of
+ * a message have been dropped. We assume that frags that arrive
+ * to a flow are part of the current message on the flow that is
+ * being reassembled. This means that senders can't drop messages
+ * from the sending conn until all their frags are sent.
+ *
+ * XXX we could spend more on the wire to get more robust failure
+ * detection, arguably worth it to avoid data corruption.
+ */
+ if (ntohll(inc->i_hdr.h_sequence) < conn->c_next_rx_seq &&
+ (inc->i_hdr.h_flags & RDSV3_FLAG_RETRANSMITTED)) {
+ rdsv3_stats_inc(s_recv_drop_old_seq);
+ goto out;
+ }
+ conn->c_next_rx_seq = ntohll(inc->i_hdr.h_sequence) + 1;
+
+ if (rdsv3_sysctl_ping_enable && inc->i_hdr.h_dport == 0) {
+ rdsv3_stats_inc(s_recv_ping);
+ (void) rdsv3_send_pong(conn, inc->i_hdr.h_sport);
+ goto out;
+ }
+
+ rs = rdsv3_find_bound(daddr, inc->i_hdr.h_dport);
+ if (rs == NULL) {
+ rdsv3_stats_inc(s_recv_drop_no_sock);
+ goto out;
+ }
+
+ /* Process extension headers */
+ rdsv3_recv_incoming_exthdrs(inc, rs);
+
+ /* We can be racing with rdsv3_release() which marks the socket dead. */
+ sk = rdsv3_rs_to_sk(rs);
+
+ /* serialize with rdsv3_release -> sock_orphan */
+ rw_enter(&rs->rs_recv_lock, RW_WRITER);
+ if (!rdsv3_sk_sock_flag(sk, SOCK_DEAD)) {
+ int error, bytes;
+ RDSV3_DPRINTF5("rdsv3_recv_incoming",
+ "adding inc %p to rs %p's recv queue", inc, rs);
+ rdsv3_stats_inc(s_recv_queued);
+ rdsv3_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong,
+ ntohl(inc->i_hdr.h_len),
+ inc->i_hdr.h_dport);
+ rdsv3_inc_addref(inc);
+ list_insert_tail(&rs->rs_recv_queue, inc);
+ bytes = rs->rs_rcv_bytes;
+ rw_exit(&rs->rs_recv_lock);
+
+ __rdsv3_wake_sk_sleep(sk);
+
+ /* wake up anyone waiting in poll */
+ sk->sk_upcalls->su_recv(sk->sk_upper_handle, NULL,
+ bytes, 0, &error, NULL);
+ if (error != 0) {
+ RDSV3_DPRINTF2("rdsv3_recv_incoming",
+ "su_recv returned: %d", error);
+ }
+ } else {
+ rdsv3_stats_inc(s_recv_drop_dead_sock);
+ rw_exit(&rs->rs_recv_lock);
+ }
+
+out:
+ if (rs)
+ rdsv3_sock_put(rs);
+}
+
+/*
+ * Be very careful here. This is called as the condition in
+ * wait_event_*() and so needs to cope with being called many times.
+ */
+static int
+rdsv3_next_incoming(struct rdsv3_sock *rs, struct rdsv3_incoming **inc)
+{
+ if (*inc == NULL) {
+ rw_enter(&rs->rs_recv_lock, RW_READER);
+ if (!list_is_empty(&rs->rs_recv_queue)) {
+ *inc = list_head(&rs->rs_recv_queue);
+ rdsv3_inc_addref(*inc);
+ }
+ rw_exit(&rs->rs_recv_lock);
+ }
+
+ return (*inc != NULL);
+}
+
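+/*
+ * Return nonzero if 'inc' is still on the socket's receive queue. When
+ * 'drop' is set and it is still queued, also remove it, adjust the rcvbuf
+ * accounting, and drop the queue's reference (the non-MSG_PEEK case).
+ */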
+static int
+rdsv3_still_queued(struct rdsv3_sock *rs, struct rdsv3_incoming *inc,
+ int drop)
+{
+ struct rsock *sk = rdsv3_rs_to_sk(rs);
+ int ret = 0;
+
+ RDSV3_DPRINTF4("rdsv3_still_queued", "Enter rs: %p inc: %p drop: %d",
+ rs, inc, drop);
+
+ rw_enter(&rs->rs_recv_lock, RW_WRITER);
+ if (list_link_active(&inc->i_item)) {
+ ret = 1;
+ if (drop) {
+ /* XXX make sure this i_conn is reliable */
+ rdsv3_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong,
+ -ntohl(inc->i_hdr.h_len),
+ inc->i_hdr.h_dport);
+ list_remove_node(&inc->i_item);
+ rdsv3_inc_put(inc);
+ }
+ }
+ rw_exit(&rs->rs_recv_lock);
+
+ RDSV3_DPRINTF5("rdsv3_still_queued",
+ "inc %p rs %p still %d dropped %d", inc, rs, ret, drop);
+ return (ret);
+}
+
+/*
+ * Pull errors off the error queue.
+ * If msghdr is NULL, we will just purge the error queue.
+ */
+int
+rdsv3_notify_queue_get(struct rdsv3_sock *rs, struct msghdr *msghdr)
+{
+ struct rdsv3_notifier *notifier;
+ struct rdsv3_rdma_notify cmsg;
+ unsigned int count = 0, max_messages = ~0U;
+ list_t copy;
+ int err = 0;
+
+ RDSV3_DPRINTF4("rdsv3_notify_queue_get", "Enter(rs: %p)", rs);
+
+ list_create(&copy, sizeof (struct rdsv3_notifier),
+ offsetof(struct rdsv3_notifier, n_list));
+
+	/*
+	 * put_cmsg copies to user space and thus may sleep. We can't do this
+	 * with rs_lock held, so first grab as many notifications as we can
+	 * stuff in the user provided cmsg buffer. We don't try to copy more,
+	 * to avoid losing notifications - except when the buffer is so small
+	 * that it wouldn't even hold a single notification. Then we give the
+	 * caller as much of this single msg as we can squeeze in, and set
+	 * MSG_CTRUNC.
+	 */
+ if (msghdr) {
+ max_messages =
+ msghdr->msg_controllen / CMSG_SPACE(sizeof (cmsg));
+ if (!max_messages)
+ max_messages = 1;
+ }
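+
+	/*
+	 * Example (editor's note): with msg_controllen sized for two
+	 * CMSG_SPACE(sizeof (cmsg)) slots, max_messages is 2 and at most two
+	 * notifications are dequeued below; a buffer smaller than one slot
+	 * still yields max_messages = 1, with the truncated-single-message
+	 * case handled as the comment above describes.
+	 */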
+
+ mutex_enter(&rs->rs_lock);
+ while (!list_is_empty(&rs->rs_notify_queue) && count < max_messages) {
+ notifier = list_remove_head(&rs->rs_notify_queue);
+ list_insert_tail(&copy, notifier);
+ count++;
+ }
+ mutex_exit(&rs->rs_lock);
+
+ if (!count)
+ return (0);
+
+ while (!list_is_empty(&copy)) {
+ notifier = list_remove_head(&copy);
+
+ if (msghdr) {
+ cmsg.user_token = notifier->n_user_token;
+ cmsg.status = notifier->n_status;
+
+ err = rdsv3_put_cmsg(msghdr, SOL_RDS,
+ RDSV3_CMSG_RDMA_STATUS, sizeof (cmsg), &cmsg);
+ if (err)
+ break;
+ }
+
+ kmem_free(notifier, sizeof (struct rdsv3_notifier));
+ }
+
+ /*
+ * If we bailed out because of an error in put_cmsg,
+ * we may be left with one or more notifications that we
+ * didn't process. Return them to the head of the list.
+ */
+ if (!list_is_empty(&copy)) {
+ mutex_enter(&rs->rs_lock);
+ list_splice(&copy, &rs->rs_notify_queue);
+ mutex_exit(&rs->rs_lock);
+ }
+
+ RDSV3_DPRINTF4("rdsv3_notify_queue_get", "Return(rs: %p)", rs);
+
+ return (err);
+}
+
+/*
+ * Deliver a pending congestion update to the caller via a cmsg
+ */
+static int
+rdsv3_notify_cong(struct rdsv3_sock *rs, struct msghdr *msghdr)
+{
+ uint64_t notify = rs->rs_cong_notify;
+ int err;
+
+ err = rdsv3_put_cmsg(msghdr, SOL_RDS, RDSV3_CMSG_CONG_UPDATE,
+ sizeof (notify), &notify);
+ if (err)
+ return (err);
+
+ mutex_enter(&rs->rs_lock);
+ rs->rs_cong_notify &= ~notify;
+ mutex_exit(&rs->rs_lock);
+
+ return (0);
+}
+
+/*
+ * Receive any control messages.
+ */
+static int
+rdsv3_cmsg_recv(struct rdsv3_incoming *inc, struct msghdr *msg)
+{
+ return (rdsv3_put_cmsg(msg, SOL_RDS, RDSV3_CMSG_RDMA_DEST,
+ sizeof (inc->i_rdma_cookie), &inc->i_rdma_cookie));
+}
+
+int
+rdsv3_recvmsg(struct rdsv3_sock *rs, uio_t *uio,
+ struct nmsghdr *msg, size_t size, int msg_flags)
+{
+ struct rsock *sk = rdsv3_rs_to_sk(rs);
+ long timeo;
+ int ret = 0, nonblock = msg_flags & MSG_DONTWAIT;
+ struct sockaddr_in *sin = NULL;
+ struct rdsv3_incoming *inc = NULL;
+
+ RDSV3_DPRINTF4("rdsv3_recvmsg",
+ "Enter(rs: %p size: %d msg_flags: 0x%x)", rs, size, msg_flags);
+
+	/* udp_recvmsg()->sock_recvtimeo() gets away without locking too. */
+ timeo = rdsv3_rcvtimeo(sk, nonblock);
+
+ if (msg_flags & MSG_OOB)
+ goto out;
+
+ /* mark the first cmsg position */
+ if (msg) {
+ msg->msg_control = NULL;
+ }
+
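+	/*
+	 * Service the socket in priority order: pending RDMA notifications
+	 * first, then congestion updates, then queued data messages.
+	 * Unless MSG_DONTWAIT is set, block until one of the three is
+	 * ready.
+	 */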
+ while (1) {
+ /*
+ * If there are pending notifications, do those -
+ * and nothing else
+ */
+ if (!list_is_empty(&rs->rs_notify_queue)) {
+ ret = rdsv3_notify_queue_get(rs, msg);
+
+ if (msg && msg->msg_namelen) {
+ sin = kmem_zalloc(sizeof (struct sockaddr_in),
+ KM_SLEEP);
+ sin->sin_family = AF_INET_OFFLOAD;
+ if (inc) {
+ sin->sin_port = inc->i_hdr.h_sport;
+ sin->sin_addr.s_addr = inc->i_saddr;
+ }
+ msg->msg_namelen = sizeof (struct sockaddr_in);
+ msg->msg_name = sin;
+ }
+ break;
+ }
+
+ if (rs->rs_cong_notify) {
+ ret = rdsv3_notify_cong(rs, msg);
+ goto out;
+ }
+
+ if (!rdsv3_next_incoming(rs, &inc)) {
+ if (nonblock) {
+ ret = -EAGAIN;
+ break;
+ }
+
+ RDSV3_DPRINTF3("rdsv3_recvmsg",
+ "Before wait (rs: %p)", rs);
+
+ mutex_enter(&sk->sk_sleep->waitq_mutex);
+ while ((list_is_empty(&rs->rs_notify_queue) &&
+ !rs->rs_cong_notify &&
+ !rdsv3_next_incoming(rs, &inc))) {
+#if 0
+ ret = cv_timedwait_sig(&sk->sk_sleep->waitq_cv,
+ &sk->sk_sleep->waitq_mutex,
+ timeo * drv_usectohz(1000000) +
+ ddi_get_lbolt());
+ if (ret <= 0) {
+ /* signal/timeout pending */
+ RDSV3_DPRINTF2("rdsv3_recvmsg",
+ "woke due to signal/timeout: %d",
+ ret);
+ ret = (ret == 0) ? -ERESTART :
+ -ETIMEDOUT;
+ break;
+ }
+#else
+ ret = cv_wait_sig(&sk->sk_sleep->waitq_cv,
+ &sk->sk_sleep->waitq_mutex);
+ if (ret == 0) {
+ /* signal/timeout pending */
+ RDSV3_DPRINTF2("rdsv3_recvmsg",
+ "woke due to signal");
+ ret = -ERESTART;
+ break;
+ }
+#endif
+ }
+ mutex_exit(&sk->sk_sleep->waitq_mutex);
+
+ RDSV3_DPRINTF5("rdsv3_recvmsg",
+ "recvmsg woke rs: %p inc %p ret %d",
+ rs, inc, -ret);
+
+ if (ret < 0)
+ break;
+
+ /*
+ * if the wakeup was due to rs_notify_queue or
+ * rs_cong_notify then we need to handle those first.
+ */
+ continue;
+ }
+
+ RDSV3_DPRINTF5("rdsv3_recvmsg",
+ "copying inc %p from %u.%u.%u.%u:%u to user", inc,
+ NIPQUAD(inc->i_conn->c_faddr),
+ ntohs(inc->i_hdr.h_sport));
+ ret = inc->i_conn->c_trans->inc_copy_to_user(inc, uio, size);
+ if (ret < 0)
+ break;
+
+ /*
+ * if the message we just copied isn't at the head of the
+ * recv queue then someone else raced us to return it, try
+ * to get the next message.
+ */
+ if (!rdsv3_still_queued(rs, inc, !(msg_flags & MSG_PEEK))) {
+ rdsv3_inc_put(inc);
+ inc = NULL;
+ rdsv3_stats_inc(s_recv_deliver_raced);
+ continue;
+ }
+
+ if (ret < ntohl(inc->i_hdr.h_len)) {
+ if (msg_flags & MSG_TRUNC)
+ ret = ntohl(inc->i_hdr.h_len);
+ msg->msg_flags |= MSG_TRUNC;
+ }
+
+ if (rdsv3_cmsg_recv(inc, msg)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ rdsv3_stats_inc(s_recv_delivered);
+
+ if (msg->msg_namelen) {
+ sin = kmem_alloc(sizeof (struct sockaddr_in), KM_SLEEP);
+ sin->sin_family = AF_INET_OFFLOAD;
+ sin->sin_port = inc->i_hdr.h_sport;
+ sin->sin_addr.s_addr = inc->i_saddr;
+ (void) memset(sin->sin_zero, 0,
+ sizeof (sin->sin_zero));
+ msg->msg_namelen = sizeof (struct sockaddr_in);
+ msg->msg_name = sin;
+ }
+ break;
+ }
+
+ if (inc)
+ rdsv3_inc_put(inc);
+
+out:
+ RDSV3_DPRINTF4("rdsv3_recvmsg", "Return(rs: %p, ret: %d)", rs, ret);
+
+ return (ret);
+}
+
+/*
+ * The socket is being shut down and we're asked to drop messages that were
+ * queued for recvmsg. The caller has unbound the socket so the receive path
+ * won't queue any more incoming fragments or messages on the socket.
+ */
+void
+rdsv3_clear_recv_queue(struct rdsv3_sock *rs)
+{
+ struct rsock *sk = rdsv3_rs_to_sk(rs);
+ struct rdsv3_incoming *inc, *tmp;
+
+ RDSV3_DPRINTF4("rdsv3_clear_recv_queue", "Enter(rs: %p)", rs);
+
+ rw_enter(&rs->rs_recv_lock, RW_WRITER);
+ RDSV3_FOR_EACH_LIST_NODE_SAFE(inc, tmp, &rs->rs_recv_queue, i_item) {
+ rdsv3_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong,
+ -ntohl(inc->i_hdr.h_len),
+ inc->i_hdr.h_dport);
+ list_remove_node(&inc->i_item);
+ rdsv3_inc_put(inc);
+ }
+ rw_exit(&rs->rs_recv_lock);
+
+ RDSV3_DPRINTF4("rdsv3_clear_recv_queue", "Return(rs: %p)", rs);
+}
+
+/*
+ * inc->i_saddr isn't used here because it is only set in the receive
+ * path.
+ */
+void
+rdsv3_inc_info_copy(struct rdsv3_incoming *inc,
+ struct rdsv3_info_iterator *iter,
+ uint32_be_t saddr, uint32_be_t daddr, int flip)
+{
+ struct rdsv3_info_message minfo;
+
+ minfo.seq = ntohll(inc->i_hdr.h_sequence);
+ minfo.len = ntohl(inc->i_hdr.h_len);
+
+ if (flip) {
+ minfo.laddr = daddr;
+ minfo.faddr = saddr;
+ minfo.lport = inc->i_hdr.h_dport;
+ minfo.fport = inc->i_hdr.h_sport;
+ } else {
+ minfo.laddr = saddr;
+ minfo.faddr = daddr;
+ minfo.lport = inc->i_hdr.h_sport;
+ minfo.fport = inc->i_hdr.h_dport;
+ }
+
+ rdsv3_info_copy(iter, &minfo, sizeof (minfo));
+}
diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/rdsv3.conf b/usr/src/uts/common/io/ib/clients/rdsv3/rdsv3.conf
new file mode 100644
index 0000000000..c17689cf40
--- /dev/null
+++ b/usr/src/uts/common/io/ib/clients/rdsv3/rdsv3.conf
@@ -0,0 +1,25 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+#
+name="rdsv3" parent="ib" unit-address="0";
+ddi-forceattach=1;
diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/rdsv3_ddi.c b/usr/src/uts/common/io/ib/clients/rdsv3/rdsv3_ddi.c
new file mode 100644
index 0000000000..82417cba04
--- /dev/null
+++ b/usr/src/uts/common/io/ib/clients/rdsv3/rdsv3_ddi.c
@@ -0,0 +1,303 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/conf.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/modctl.h>
+#include <sys/strsubr.h>
+#include <sys/socketvar.h>
+#include <sys/rds.h>
+
+#include <sys/ib/ibtl/ibti.h>
+#include <sys/ib/clients/rdsv3/rdsv3.h>
+#include <sys/ib/clients/rdsv3/rdsv3_debug.h>
+
+extern int rdsv3_init(void);
+extern void rdsv3_exit(void);
+extern void rdsv3_cong_init(void);
+extern void rdsv3_cong_exit(void);
+extern void rdsv3_trans_init(void);
+extern void rdsv3_trans_exit(void);
+extern int rdsv3_sock_init(void);
+extern void rdsv3_sock_exit(void);
+
+/* global */
+dev_info_t *rdsv3_dev_info = NULL;
+kmem_cache_t *rdsv3_alloc_cache = NULL;
+
+extern kmutex_t rdsv3_rdma_listen_id_lock;
+extern struct rdma_cm_id *rdsv3_rdma_listen_id;
+
+extern kmutex_t rdsv3_sock_lock;
+extern list_t rdsv3_sock_list;
+
+extern void rdsv3_bind_tree_init();
+extern void rdsv3_bind_tree_exit();
+
+int
+rdsv3_sock_init()
+{
+ RDSV3_DPRINTF4("rdsv3_sock_init", "Enter");
+
+ rdsv3_alloc_cache = kmem_cache_create("rdsv3_alloc_cache",
+ sizeof (struct rsock) + sizeof (struct rdsv3_sock), 0, NULL,
+ NULL, NULL, NULL, NULL, 0);
+ if (rdsv3_alloc_cache == NULL) {
+ RDSV3_DPRINTF1("rdsv3_alloc_cache",
+ "kmem_cache_create(rdsv3_alloc_cache) failed");
+ return (-1);
+ }
+ rdsv3_bind_tree_init();
+
+ mutex_init(&rdsv3_sock_lock, NULL, MUTEX_DRIVER, NULL);
+ list_create(&rdsv3_sock_list, sizeof (struct rdsv3_sock),
+ offsetof(struct rdsv3_sock, rs_item));
+
+ RDSV3_DPRINTF4("rdsv3_sock_init", "Return");
+
+ return (0);
+}
+
+void
+rdsv3_sock_exit()
+{
+ RDSV3_DPRINTF2("rdsv3_sock_exit", "Enter");
+
+ rdsv3_bind_tree_exit();
+
+ kmem_cache_destroy(rdsv3_alloc_cache);
+
+ list_destroy(&rdsv3_sock_list);
+ mutex_destroy(&rdsv3_sock_lock);
+
+ RDSV3_DPRINTF2("rdsv3_sock_exit", "Return");
+}
+
+static int
+rdsv3_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
+{
+ int ret;
+
+ RDSV3_DPRINTF2("rdsv3_attach", "Enter (dip: %p)", dip);
+
+ if (cmd != DDI_ATTACH)
+ return (DDI_FAILURE);
+
+ if (rdsv3_dev_info != NULL) {
+ RDSV3_DPRINTF1("rdsv3_attach", "Multiple RDS instances are"
+ " not supported (rdsv3_dev_info: 0x%p)", rdsv3_dev_info);
+ return (DDI_FAILURE);
+ }
+ rdsv3_dev_info = dip;
+
+ mutex_init(&rdsv3_rdma_listen_id_lock, NULL, MUTEX_DRIVER, NULL);
+ rdsv3_rdma_listen_id = NULL;
+
+ rdsv3_trans_init();
+ ret = rdsv3_init();
+ if (ret) {
+ RDSV3_DPRINTF1("rdsv3_attach", "rdsv3_init failed: %d", ret);
+ rdsv3_trans_exit();
+ mutex_destroy(&rdsv3_rdma_listen_id_lock);
+ rdsv3_dev_info = NULL;
+ return (DDI_FAILURE);
+ }
+
+ ret = rdsv3_sock_init();
+ if (ret) {
+ rdsv3_exit();
+ rdsv3_trans_exit();
+ mutex_destroy(&rdsv3_rdma_listen_id_lock);
+ rdsv3_dev_info = NULL;
+ return (DDI_FAILURE);
+ }
+
+ ret = ddi_create_minor_node(dip, "rdsv3", S_IFCHR, 0, DDI_PSEUDO, 0);
+ if (ret != DDI_SUCCESS) {
+ cmn_err(CE_CONT, "ddi_create_minor_node failed: %d", ret);
+ rdsv3_sock_exit();
+ rdsv3_exit();
+ rdsv3_trans_exit();
+ mutex_destroy(&rdsv3_rdma_listen_id_lock);
+ rdsv3_dev_info = NULL;
+ return (DDI_FAILURE);
+ }
+
+ RDSV3_DPRINTF2("rdsv3_attach", "Return");
+
+ return (DDI_SUCCESS);
+}
+
+static int
+rdsv3_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
+{
+ RDSV3_DPRINTF2("rdsv3_detach", "Enter (dip: %p)", dip);
+
+ if (cmd != DDI_DETACH)
+ return (DDI_FAILURE);
+
+ rdsv3_sock_exit();
+ rdsv3_exit();
+ rdsv3_trans_exit();
+ ddi_remove_minor_node(dip, "rdsv3");
+ rdsv3_dev_info = NULL;
+
+ RDSV3_DPRINTF2("rdsv3_detach", "Return");
+
+ return (DDI_SUCCESS);
+}
+
+/* ARGSUSED */
+static int
+rdsv3_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
+{
+ int ret = DDI_FAILURE;
+
+ RDSV3_DPRINTF2("rdsv3_info", "Enter (dip: %p, cmd: %d)", dip, cmd);
+
+ switch (cmd) {
+ case DDI_INFO_DEVT2DEVINFO:
+ if (rdsv3_dev_info != NULL) {
+ *result = (void *)rdsv3_dev_info;
+ ret = DDI_SUCCESS;
+ }
+ break;
+
+ case DDI_INFO_DEVT2INSTANCE:
+ *result = NULL;
+ ret = DDI_SUCCESS;
+ break;
+
+ default:
+ break;
+ }
+
+ RDSV3_DPRINTF4("rdsv3_info", "Return");
+
+ return (ret);
+}
+
+/* Driver entry points */
+static struct cb_ops rdsv3_cb_ops = {
+ nulldev, /* open */
+ nulldev, /* close */
+ nodev, /* strategy */
+ nodev, /* print */
+ nodev, /* dump */
+ nodev, /* read */
+ nodev, /* write */
+ nodev, /* ioctl */
+ nodev, /* devmap */
+ nodev, /* mmap */
+ nodev, /* segmap */
+ nochpoll, /* poll */
+ ddi_prop_op, /* prop_op */
+ NULL, /* stream */
+ D_MP, /* cb_flag */
+ CB_REV, /* rev */
+ nodev, /* int (*cb_aread)() */
+ nodev, /* int (*cb_awrite)() */
+};
+
+/* Device options */
+static struct dev_ops rdsv3_ops = {
+ DEVO_REV, /* devo_rev, */
+ 0, /* refcnt */
+ rdsv3_info, /* info */
+ nulldev, /* identify */
+ nulldev, /* probe */
+ rdsv3_attach, /* attach */
+ rdsv3_detach, /* detach */
+ nodev, /* reset */
+ &rdsv3_cb_ops, /* driver ops - devctl interfaces */
+ NULL, /* bus operations */
+ NULL, /* power */
+ ddi_quiesce_not_needed /* quiesce */
+};
+
+/*
+ * Module linkage information.
+ */
+#define RDSV3_DEVDESC "RDSv3 IB transport driver"
+static struct modldrv rdsv3_modldrv = {
+ &mod_driverops, /* Driver module */
+ RDSV3_DEVDESC, /* Driver name and version */
+ &rdsv3_ops, /* Driver ops */
+};
+
+static struct modlinkage rdsv3_modlinkage = {
+ MODREV_1,
+ (void *)&rdsv3_modldrv,
+ NULL
+};
+
+int
+_init(void)
+{
+ int ret;
+
+ if (ibt_hw_is_present() == 0) {
+ return (ENODEV);
+ }
+
+ /* Initialize logging */
+ rdsv3_logging_initialization();
+
+ ret = mod_install(&rdsv3_modlinkage);
+ if (ret != 0) {
+ /*
+ * Could not load module
+ */
+ rdsv3_logging_destroy();
+ return (ret);
+ }
+
+ return (0);
+}
+
+int
+_fini()
+{
+ int ret;
+
+ /*
+ * Remove module
+ */
+ if ((ret = mod_remove(&rdsv3_modlinkage)) != 0) {
+ return (ret);
+ }
+
+ /* Stop logging */
+ rdsv3_logging_destroy();
+
+ return (0);
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+ return (mod_info(&rdsv3_modlinkage, modinfop));
+}
diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/rdsv3_debug.c b/usr/src/uts/common/io/ib/clients/rdsv3/rdsv3_debug.c
new file mode 100644
index 0000000000..8327b5b866
--- /dev/null
+++ b/usr/src/uts/common/io/ib/clients/rdsv3/rdsv3_debug.c
@@ -0,0 +1,348 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+#include <sys/types.h>
+#include <sys/varargs.h>
+#include <sys/cmn_err.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/ib/clients/rdsv3/rdsv3_debug.h>
+
+/*
+ * This file contains the debug defines and routines.
+ * Debugging information is collected in a circular kernel buffer. Debug
+ * messages with a level above rdsv3dbglvl are ignored. The size of the
+ * debug buffer can be changed by setting 'rdsv3_debug_buf_size' (in
+ * bytes) in /etc/system.
+ *
+ * The debug buffer can be cleared by setting 'rdsv3_clear_debug_buf_flag = 1'
+ * on a running system.
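+ *
+ * For example, to use a 4M debug buffer, add the following line to
+ * /etc/system and reboot:
+ *
+ *	set rdsv3:rdsv3_debug_buf_size = 0x400000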
+ */
+
+#define RDSV3_DEBUG_SIZE_EXTRA_ALLOC 8
+#define RDSV3_MIN_DEBUG_BUF_SIZE 0x1000
+#define RDSV3_FUNCNAME_LEN 40
+#ifdef DEBUG
+#define RDSV3_DEBUG_BUF_SIZE 0x200000 /* 2M size */
+#else
+#define RDSV3_DEBUG_BUF_SIZE 0x2000
+#endif /* DEBUG */
+
+/* Max length of a debug statement */
+#define RDSV3_PRINT_BUF_LEN 4096
+
+static int rdsv3_suppress_dprintf; /* Suppress debug printing */
+static int rdsv3_buffer_dprintf = 1; /* Use debug buffer (0 == console) */
+static int rdsv3_debug_buf_size = RDSV3_DEBUG_BUF_SIZE; /* Sz of Debug buf */
+static int rdsv3_allow_intr_msgs = 0; /* log "intr" messages */
+char *rdsv3_debug_buf = NULL; /* The Debug Buf */
+char *rdsv3_buf_sptr, *rdsv3_buf_eptr; /* debug buffer temp pointer */
+int rdsv3_clear_debug_buf_flag = 0; /* Clear debug buffer */
+uint_t rdsv3dbglvl = RDSV3_LOG_L4;
+
+/*
+ * Print buffer protected by a mutex; the mutex also serializes the
+ * debug messages themselves.
+ */
+static kmutex_t rdsv3_debug_mutex;
+static char rdsv3_print_buf[RDSV3_PRINT_BUF_LEN];
+
+/* Function Prototypes */
+static void rdsv3_clear_print_buf();
+
+/* RDS logging init */
+void
+rdsv3_logging_initialization()
+{
+ boolean_t flag = B_FALSE;
+
+ mutex_init(&rdsv3_debug_mutex, NULL, MUTEX_DRIVER, NULL);
+ mutex_enter(&rdsv3_debug_mutex);
+
+ if (rdsv3_debug_buf_size <= RDSV3_DEBUG_SIZE_EXTRA_ALLOC) {
+ rdsv3_debug_buf_size = RDSV3_MIN_DEBUG_BUF_SIZE;
+ flag = B_TRUE;
+ }
+
+	/* if it is less than RDSV3_MIN_DEBUG_BUF_SIZE, adjust it */
+ rdsv3_debug_buf_size = max(RDSV3_MIN_DEBUG_BUF_SIZE,
+ rdsv3_debug_buf_size);
+
+ rdsv3_debug_buf = (char *)kmem_alloc(rdsv3_debug_buf_size, KM_SLEEP);
+ rdsv3_clear_print_buf();
+ mutex_exit(&rdsv3_debug_mutex);
+
+ if (flag == B_TRUE) {
+ RDSV3_DPRINTF2("RDS", "rdsv3_debug_buf_size was too small, "
+ "adjusted to %x", rdsv3_debug_buf_size);
+ }
+}
+
+
+/* RDS logging destroy */
+void
+rdsv3_logging_destroy()
+{
+ mutex_enter(&rdsv3_debug_mutex);
+ if (rdsv3_debug_buf) {
+ kmem_free(rdsv3_debug_buf, rdsv3_debug_buf_size);
+ rdsv3_debug_buf = NULL;
+ }
+ mutex_exit(&rdsv3_debug_mutex);
+ mutex_destroy(&rdsv3_debug_mutex);
+}
+
+
+/*
+ * debug, log, and console message handling
+ */
+
+/*
+ * clear the RDS debug buffer
+ */
+static void
+rdsv3_clear_print_buf()
+{
+ ASSERT(MUTEX_HELD(&rdsv3_debug_mutex));
+ if (rdsv3_debug_buf) {
+ rdsv3_buf_sptr = rdsv3_debug_buf;
+ rdsv3_buf_eptr = rdsv3_debug_buf + rdsv3_debug_buf_size -
+ RDSV3_DEBUG_SIZE_EXTRA_ALLOC;
+
+ bzero(rdsv3_debug_buf, rdsv3_debug_buf_size);
+ }
+}
+
+
+static void
+rdsv3_vlog(char *name, uint_t level, char *fmt, va_list ap)
+{
+ char *label = (name == NULL) ? "rds" : name;
+ char *msg_ptr;
+ size_t len;
+
+ mutex_enter(&rdsv3_debug_mutex);
+
+	/* if not using the logging scheme, quit */
+ if (rdsv3_suppress_dprintf || (rdsv3_debug_buf == NULL)) {
+ mutex_exit(&rdsv3_debug_mutex);
+ return;
+ }
+
+ /* If user requests to clear debug buffer, go ahead */
+ if (rdsv3_clear_debug_buf_flag != 0) {
+ rdsv3_clear_print_buf();
+ rdsv3_clear_debug_buf_flag = 0;
+ }
+
+ /*
+ * put "label" into the buffer
+ */
+ len = snprintf(rdsv3_print_buf, RDSV3_FUNCNAME_LEN, "%s:\t", label);
+
+ msg_ptr = rdsv3_print_buf + len;
+ len += vsnprintf(msg_ptr, RDSV3_PRINT_BUF_LEN - len - 2, fmt, ap);
+
+ len = min(len, RDSV3_PRINT_BUF_LEN - 2);
+ ASSERT(len == strlen(rdsv3_print_buf));
+ rdsv3_print_buf[len++] = '\n';
+ rdsv3_print_buf[len] = '\0';
+
+ /*
+ * stuff the message in the debug buf
+ */
+ if (rdsv3_buffer_dprintf) {
+
+ /*
+		 * overwrite the ">>>>" marker that may have been left
+		 * at the current position in the buffer
+ */
+ *rdsv3_buf_sptr = '\0';
+
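+		/*
+		 * If the message would run past the end of the circular
+		 * buffer, split it: copy what fits at the tail, wrap the
+		 * remainder to the start, and leave the write pointer just
+		 * past the wrapped portion.
+		 */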
+ if (rdsv3_buf_sptr + len > rdsv3_buf_eptr) {
+ size_t left = (uintptr_t)rdsv3_buf_eptr -
+ (uintptr_t)rdsv3_buf_sptr;
+
+ bcopy((caddr_t)rdsv3_print_buf,
+ (caddr_t)rdsv3_buf_sptr, left);
+ bcopy((caddr_t)rdsv3_print_buf + left,
+ (caddr_t)rdsv3_debug_buf, len - left);
+ rdsv3_buf_sptr = rdsv3_debug_buf + len - left;
+ } else {
+ bcopy((caddr_t)rdsv3_print_buf, rdsv3_buf_sptr, len);
+ rdsv3_buf_sptr += len;
+ }
+
+ /* add marker */
+ (void) sprintf(rdsv3_buf_sptr, ">>>>");
+ }
+
+ /*
+	 * LINTR and L5-L2 messages may go to the rdsv3_debug_buf.
+	 * L1 messages will go to /var/adm/messages (debug & non-debug).
+	 * L0 messages will go to the console (debug & non-debug).
+ */
+ switch (level) {
+ case RDSV3_LOG_LINTR:
+ case RDSV3_LOG_L5:
+ case RDSV3_LOG_L4:
+ case RDSV3_LOG_L3:
+ case RDSV3_LOG_L2:
+ if (!rdsv3_buffer_dprintf) {
+ cmn_err(CE_CONT, "^%s", rdsv3_print_buf);
+ }
+ break;
+ case RDSV3_LOG_L1:
+ if (!rdsv3_buffer_dprintf) {
+ cmn_err(CE_CONT, "^%s", rdsv3_print_buf);
+ } else {
+ /* go to messages file */
+ cmn_err(CE_CONT, "!%s", rdsv3_print_buf);
+ }
+ break;
+ case RDSV3_LOG_L0:
+		/*
+		 * Strip the "\n" added earlier; msg_ptr points into
+		 * rdsv3_print_buf, so this check covers it as well.
+		 */
+		if (rdsv3_print_buf[len - 1] == '\n') {
+			rdsv3_print_buf[len - 1] = '\0';
+		}
+ /* go to console */
+ cmn_err(CE_CONT, "^%s", rdsv3_print_buf);
+ break;
+ }
+
+ mutex_exit(&rdsv3_debug_mutex);
+}
+
+void
+rdsv3_dprintf_intr(char *name, char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ rdsv3_vlog(name, RDSV3_LOG_LINTR, fmt, ap);
+ va_end(ap);
+}
+
+/*
+ * Check individual subsystem err levels
+ */
+#define RDSV3_CHECK_ERR_LEVEL(level) \
+ if (rdsv3dbglvl < level) \
+		return;
+
+void
+rdsv3_dprintf5(char *name, char *fmt, ...)
+{
+ va_list ap;
+
+ RDSV3_CHECK_ERR_LEVEL(RDSV3_LOG_L5);
+
+ va_start(ap, fmt);
+ rdsv3_vlog(name, RDSV3_LOG_L5, fmt, ap);
+ va_end(ap);
+}
+
+void
+rdsv3_dprintf4(char *name, char *fmt, ...)
+{
+ va_list ap;
+
+ RDSV3_CHECK_ERR_LEVEL(RDSV3_LOG_L4);
+
+ va_start(ap, fmt);
+ rdsv3_vlog(name, RDSV3_LOG_L4, fmt, ap);
+ va_end(ap);
+}
+
+void
+rdsv3_dprintf3(char *name, char *fmt, ...)
+{
+ va_list ap;
+
+ RDSV3_CHECK_ERR_LEVEL(RDSV3_LOG_L3);
+
+ va_start(ap, fmt);
+ rdsv3_vlog(name, RDSV3_LOG_L3, fmt, ap);
+ va_end(ap);
+}
+
+void
+rdsv3_dprintf2(char *name, char *fmt, ...)
+{
+ va_list ap;
+
+ RDSV3_CHECK_ERR_LEVEL(RDSV3_LOG_L2);
+
+ va_start(ap, fmt);
+ rdsv3_vlog(name, RDSV3_LOG_L2, fmt, ap);
+ va_end(ap);
+}
+
+void
+rdsv3_dprintf1(char *name, char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ rdsv3_vlog(name, RDSV3_LOG_L1, fmt, ap);
+ va_end(ap);
+}
+
+
+/*
+ * Function:
+ * rdsv3_dprintf0
+ * Input:
+ * name - Name of the function generating the debug message
+ * fmt - The message to be displayed.
+ * Output:
+ * none
+ * Returns:
+ * none
+ * Description:
+ * A generic log function to display RDS debug messages.
+ */
+void
+rdsv3_dprintf0(char *name, char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ rdsv3_vlog(name, RDSV3_LOG_L0, fmt, ap);
+ va_end(ap);
+}
+
+/* For ofed rdstrace */
+void
+rdsv3_trace(char *name, uint8_t lvl, char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ rdsv3_vlog(name, lvl, fmt, ap);
+ va_end(ap);
+}
diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/rdsv3_impl.c b/usr/src/uts/common/io/ib/clients/rdsv3/rdsv3_impl.c
new file mode 100644
index 0000000000..9ea3b28bec
--- /dev/null
+++ b/usr/src/uts/common/io/ib/clients/rdsv3/rdsv3_impl.c
@@ -0,0 +1,1294 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+#include <sys/types.h>
+#include <sys/stream.h>
+#include <sys/dlpi.h>
+#include <sys/stropts.h>
+#include <sys/strsun.h>
+#include <sys/sysmacros.h>
+#include <sys/strlog.h>
+#include <sys/ddi.h>
+#include <sys/cmn_err.h>
+#include <sys/socket.h>
+#include <net/if.h>
+#include <net/if_types.h>
+#include <netinet/in.h>
+#include <sys/ethernet.h>
+#include <inet/arp.h>
+#include <inet/ip.h>
+#include <inet/ip6.h>
+#include <inet/ip_ire.h>
+#include <inet/ip_if.h>
+#include <inet/ip_ftable.h>
+
+#include <sys/sunddi.h>
+#include <sys/ksynch.h>
+
+#include <sys/rds.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sockio.h>
+#include <sys/sysmacros.h>
+#include <inet/common.h>
+#include <inet/ip.h>
+#include <net/if_types.h>
+
+#include <sys/ib/clients/rdsv3/rdsv3.h>
+#include <sys/ib/clients/rdsv3/rdma.h>
+#include <sys/ib/clients/rdsv3/ib.h>
+#include <sys/ib/clients/rdsv3/rdsv3_impl.h>
+#include <sys/ib/clients/rdsv3/rdsv3_debug.h>
+
+#include <sys/dls.h>
+#include <sys/mac.h>
+#include <sys/mac_client.h>
+#include <sys/mac_provider.h>
+#include <sys/mac_client_priv.h>
+
+ddi_taskq_t *rdsv3_taskq = NULL;
+extern kmem_cache_t *rdsv3_alloc_cache;
+
+extern unsigned int ip_ocsum(ushort_t *address, int halfword_count,
+ unsigned int sum);
+
+/*
+ * Check if the IP interface named by `lifrp' is RDS-capable.
+ */
+boolean_t
+rdsv3_capable_interface(struct lifreq *lifrp)
+{
+ char ifname[LIFNAMSIZ];
+ char drv[MAXLINKNAMELEN];
+ uint_t ppa;
+ char *cp;
+
+ RDSV3_DPRINTF4("rdsv3_capable_interface", "Enter");
+
+ if (lifrp->lifr_type == IFT_IB)
+ return (B_TRUE);
+
+ /*
+ * Strip off the logical interface portion before getting
+ * intimate with the name.
+ */
+ (void) strlcpy(ifname, lifrp->lifr_name, LIFNAMSIZ);
+ if ((cp = strchr(ifname, ':')) != NULL)
+ *cp = '\0';
+
+ if (strcmp("lo0", ifname) == 0) {
+ /*
+ * loopback is considered RDS-capable
+ */
+ return (B_TRUE);
+ }
+
+ return (ddi_parse(ifname, drv, &ppa) == DDI_SUCCESS &&
+ rdsv3_if_lookup_by_name(drv));
+}
+
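+/*
+ * Build a snapshot of the IP interfaces usable by RDS.  This makes two
+ * passes over a SIOCGLIFCONF snapshot: the first pass counts the
+ * RDS-capable interfaces, the second copies their lifreqs into a buffer
+ * that is returned to the caller, who must free it.
+ */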
+int
+rdsv3_do_ip_ioctl(ksocket_t so4, void **ipaddrs, int *size, int *nifs)
+{
+ struct lifnum lifn;
+ struct lifconf lifc;
+ struct lifreq *lp, *rlp, lifr;
+ int rval = 0;
+ int numifs;
+ int bufsize, rbufsize;
+ void *buf, *rbuf;
+ int i, j, n, rc;
+
+ *ipaddrs = NULL;
+ *size = 0;
+ *nifs = 0;
+
+ RDSV3_DPRINTF4("rdsv3_do_ip_ioctl", "Enter");
+
+retry_count:
+ /* snapshot the current number of interfaces */
+ lifn.lifn_family = PF_UNSPEC;
+ lifn.lifn_flags = LIFC_NOXMIT | LIFC_TEMPORARY | LIFC_ALLZONES;
+ lifn.lifn_count = 0;
+ rval = ksocket_ioctl(so4, SIOCGLIFNUM, (intptr_t)&lifn, &rval,
+ CRED());
+ if (rval != 0) {
+ RDSV3_DPRINTF2("rdsv3_do_ip_ioctl",
+ "ksocket_ioctl returned: %d", rval);
+ return (rval);
+ }
+
+ numifs = lifn.lifn_count;
+ if (numifs <= 0) {
+ RDSV3_DPRINTF2("rdsv3_do_ip_ioctl", "No interfaces found");
+ return (0);
+ }
+
+ /* allocate extra room in case more interfaces appear */
+ numifs += 10;
+
+ /* get the interface names and ip addresses */
+ bufsize = numifs * sizeof (struct lifreq);
+ buf = kmem_alloc(bufsize, KM_SLEEP);
+
+ lifc.lifc_family = AF_UNSPEC;
+ lifc.lifc_flags = LIFC_NOXMIT | LIFC_TEMPORARY | LIFC_ALLZONES;
+ lifc.lifc_len = bufsize;
+ lifc.lifc_buf = buf;
+ rc = ksocket_ioctl(so4, SIOCGLIFCONF, (intptr_t)&lifc, &rval, CRED());
+ if (rc != 0) {
+ RDSV3_DPRINTF2("rdsv3_do_ip_ioctl", "SIOCGLIFCONF failed");
+ kmem_free(buf, bufsize);
+ return (rc);
+ }
+ /* if our extra room is used up, try again */
+ if (bufsize <= lifc.lifc_len) {
+ kmem_free(buf, bufsize);
+ buf = NULL;
+ goto retry_count;
+ }
+ /* calc actual number of ifconfs */
+ n = lifc.lifc_len / sizeof (struct lifreq);
+
+ /*
+ * Count the RDS interfaces
+ */
+ for (i = 0, j = 0, lp = lifc.lifc_req; i < n; i++, lp++) {
+
+ /*
+ * Copy as the SIOCGLIFFLAGS ioctl is destructive
+ */
+ bcopy(lp, &lifr, sizeof (struct lifreq));
+ /*
+ * fetch the flags using the socket of the correct family
+ */
+ switch (lifr.lifr_addr.ss_family) {
+ case AF_INET:
+ rc = ksocket_ioctl(so4, SIOCGLIFFLAGS, (intptr_t)&lifr,
+ &rval, CRED());
+ break;
+ default:
+ continue;
+ }
+
+		if (rc != 0)
+			continue;
+
+ /*
+ * If we got the flags, skip uninteresting
+ * interfaces based on flags
+ */
+ if ((lifr.lifr_flags & IFF_UP) != IFF_UP)
+ continue;
+ if (lifr.lifr_flags &
+ (IFF_ANYCAST|IFF_NOLOCAL|IFF_DEPRECATED))
+ continue;
+ if (!rdsv3_capable_interface(&lifr))
+ continue;
+ j++;
+ }
+
+ if (j <= 0) {
+ RDSV3_DPRINTF2("rdsv3_do_ip_ioctl", "No RDS interfaces");
+ kmem_free(buf, bufsize);
+ return (rval);
+ }
+
+ numifs = j;
+
+ /* This is the buffer we pass back */
+ rbufsize = numifs * sizeof (struct lifreq);
+ rbuf = kmem_alloc(rbufsize, KM_SLEEP);
+ rlp = (struct lifreq *)rbuf;
+
+ /*
+ * Examine the array of interfaces and filter uninteresting ones
+ */
+ for (i = 0, lp = lifc.lifc_req; i < n; i++, lp++) {
+
+ /*
+ * Copy the address as the SIOCGLIFFLAGS ioctl is destructive
+ */
+ bcopy(lp, &lifr, sizeof (struct lifreq));
+ /*
+ * fetch the flags using the socket of the correct family
+ */
+ switch (lifr.lifr_addr.ss_family) {
+ case AF_INET:
+ rc = ksocket_ioctl(so4, SIOCGLIFFLAGS, (intptr_t)&lifr,
+ &rval, CRED());
+ break;
+ default:
+ continue;
+ }
+
+ if (rc != 0) {
+ RDSV3_DPRINTF2("rdsv3_do_ip_ioctl",
+			    "ksocket_ioctl failed for %s", lifr.lifr_name);
+ continue;
+ }
+
+ /*
+ * If we got the flags, skip uninteresting
+ * interfaces based on flags
+ */
+ if ((lifr.lifr_flags & IFF_UP) != IFF_UP)
+ continue;
+ if (lifr.lifr_flags &
+ (IFF_ANYCAST|IFF_NOLOCAL|IFF_DEPRECATED))
+ continue;
+ if (!rdsv3_capable_interface(&lifr))
+ continue;
+
+ /* save the record */
+ bcopy(lp, rlp, sizeof (struct lifreq));
+ rlp++;
+ }
+
+ kmem_free(buf, bufsize);
+
+ *ipaddrs = rbuf;
+ *size = rbufsize;
+ *nifs = numifs;
+
+ RDSV3_DPRINTF4("rdsv3_do_ip_ioctl", "Return");
+
+ return (rval);
+}
+
+/*
+ * Check if the IP interface named by `ifrp' is RDS-capable.
+ */
+boolean_t
+rdsv3_capable_interface_old(struct ifreq *ifrp)
+{
+ char ifname[IFNAMSIZ];
+ char drv[MAXLINKNAMELEN];
+ uint_t ppa;
+ char *cp;
+
+ RDSV3_DPRINTF4("rdsv3_capable_interface_old", "Enter");
+
+ /*
+ * Strip off the logical interface portion before getting
+ * intimate with the name.
+ */
+ (void) strlcpy(ifname, ifrp->ifr_name, IFNAMSIZ);
+ if ((cp = strchr(ifname, ':')) != NULL)
+ *cp = '\0';
+
+ RDSV3_DPRINTF4("rdsv3_capable_interface_old", "ifname: %s", ifname);
+
+ if ((strcmp("lo0", ifname) == 0) ||
+ (strncmp("ibd", ifname, 3) == 0)) {
+ /*
+ * loopback and IB are considered RDS-capable
+ */
+ return (B_TRUE);
+ }
+
+ return (ddi_parse(ifname, drv, &ppa) == DDI_SUCCESS &&
+ rdsv3_if_lookup_by_name(drv));
+}
+
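+/*
+ * The same two-pass interface snapshot as rdsv3_do_ip_ioctl(), but
+ * using the older SIOCGIFCONF/ifreq interface.
+ */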
+int
+rdsv3_do_ip_ioctl_old(ksocket_t so4, void **ipaddrs, int *size, int *nifs)
+{
+ uint_t ifn;
+ struct ifconf ifc;
+ struct ifreq *lp, *rlp, ifr;
+ int rval = 0;
+ int numifs;
+ int bufsize, rbufsize;
+ void *buf, *rbuf;
+ int i, j, n, rc;
+
+ *ipaddrs = NULL;
+ *size = 0;
+ *nifs = 0;
+
+ RDSV3_DPRINTF4("rdsv3_do_ip_ioctl_old", "Enter");
+
+retry_count:
+ rval = ksocket_ioctl(so4, SIOCGIFNUM, (intptr_t)&ifn, &rval,
+ CRED());
+ if (rval != 0) {
+ RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old",
+ "ksocket_ioctl(SIOCGIFNUM) returned: %d", rval);
+ return (rval);
+ }
+
+ numifs = ifn;
+ if (numifs <= 0) {
+ RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old", "No interfaces found");
+ return (0);
+ }
+
+ /* allocate extra room in case more interfaces appear */
+ numifs += 10;
+
+ /* get the interface names and ip addresses */
+ bufsize = numifs * sizeof (struct ifreq);
+ buf = kmem_alloc(bufsize, KM_SLEEP);
+
+ ifc.ifc_len = bufsize;
+ ifc.ifc_buf = buf;
+ rc = ksocket_ioctl(so4, SIOCGIFCONF, (intptr_t)&ifc, &rval, CRED());
+ if (rc != 0) {
+ RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old",
+		    "SIOCGIFCONF failed: %d", rc);
+ kmem_free(buf, bufsize);
+ return (rc);
+ }
+ /* if our extra room is used up, try again */
+ if (bufsize <= ifc.ifc_len) {
+ kmem_free(buf, bufsize);
+ buf = NULL;
+ goto retry_count;
+ }
+ /* calc actual number of ifconfs */
+ n = ifc.ifc_len / sizeof (struct ifreq);
+
+ /*
+ * Count the RDS interfaces
+ */
+ for (i = 0, j = 0, lp = ifc.ifc_req; i < n; i++, lp++) {
+
+ /*
+ * Copy as the SIOCGIFFLAGS ioctl is destructive
+ */
+ bcopy(lp, &ifr, sizeof (struct ifreq));
+ /*
+ * fetch the flags using the socket of the correct family
+ */
+ switch (ifr.ifr_addr.sa_family) {
+ case AF_INET:
+ rc = ksocket_ioctl(so4, SIOCGIFFLAGS, (intptr_t)&ifr,
+ &rval, CRED());
+ break;
+ default:
+ continue;
+ }
+
+		if (rc != 0)
+			continue;
+
+ RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old",
+ "1. ifr_name: %s, flags: %d", ifr.ifr_name,
+ (ushort_t)ifr.ifr_flags);
+
+ /*
+ * If we got the flags, skip uninteresting
+ * interfaces based on flags
+ */
+ if ((((ushort_t)ifr.ifr_flags) & IFF_UP) != IFF_UP)
+ continue;
+ RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old",
+ "2. ifr_name: %s, flags: %d", ifr.ifr_name,
+ (ushort_t)ifr.ifr_flags);
+ if (((ushort_t)ifr.ifr_flags) &
+ (IFF_ANYCAST|IFF_NOLOCAL|IFF_DEPRECATED))
+ continue;
+ RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old",
+ "3. ifr_name: %s, flags: %d", ifr.ifr_name,
+ (ushort_t)ifr.ifr_flags);
+ if (!rdsv3_capable_interface_old(&ifr))
+ continue;
+ RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old",
+ "4. ifr_name: %s, flags: %d", ifr.ifr_name,
+ (ushort_t)ifr.ifr_flags);
+ j++;
+ }
+
+ if (j <= 0) {
+ RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old", "No RDS interfaces");
+ kmem_free(buf, bufsize);
+ return (rval);
+ }
+
+ numifs = j;
+
+ /* This is the buffer we pass back */
+ rbufsize = numifs * sizeof (struct ifreq);
+ rbuf = kmem_alloc(rbufsize, KM_SLEEP);
+ rlp = (struct ifreq *)rbuf;
+
+ /*
+ * Examine the array of interfaces and filter uninteresting ones
+ */
+ for (i = 0, lp = ifc.ifc_req; i < n; i++, lp++) {
+
+ /*
+ * Copy the address as the SIOCGIFFLAGS ioctl is destructive
+ */
+ bcopy(lp, &ifr, sizeof (struct ifreq));
+ /*
+ * fetch the flags using the socket of the correct family
+ */
+ switch (ifr.ifr_addr.sa_family) {
+ case AF_INET:
+ rc = ksocket_ioctl(so4, SIOCGIFFLAGS, (intptr_t)&ifr,
+ &rval, CRED());
+ break;
+ default:
+ continue;
+ }
+
+ if (rc != 0) {
+ RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old",
+ "ksocket_ioctl failed: %d for %s",
+ rc, ifr.ifr_name);
+ continue;
+ }
+
+ /*
+ * If we got the flags, skip uninteresting
+ * interfaces based on flags
+ */
+ if ((((ushort_t)ifr.ifr_flags) & IFF_UP) != IFF_UP)
+ continue;
+ if (((ushort_t)ifr.ifr_flags) &
+ (IFF_ANYCAST|IFF_NOLOCAL|IFF_DEPRECATED))
+ continue;
+ if (!rdsv3_capable_interface_old(&ifr))
+ continue;
+
+ /* save the record */
+ bcopy(lp, rlp, sizeof (struct ifreq));
+ rlp++;
+ }
+
+ kmem_free(buf, bufsize);
+
+ *ipaddrs = rbuf;
+ *size = rbufsize;
+ *nifs = numifs;
+
+ RDSV3_DPRINTF4("rdsv3_do_ip_ioctl_old", "Return");
+
+ return (rval);
+}
+
+boolean_t
+rdsv3_isloopback(ipaddr_t addr)
+{
+ ip_stack_t *ipst;
+
+ ipst = netstack_find_by_zoneid(GLOBAL_ZONEID)->netstack_ip;
+ ASSERT(ipst != NULL);
+ if (ip_type_v4(addr, ipst) != IRE_LOOPBACK) {
+ netstack_rele(ipst->ips_netstack);
+ return (B_FALSE);
+ }
+ netstack_rele(ipst->ips_netstack);
+ return (B_TRUE);
+}
+
+/*
+ * Work Queue Implementation
+ */
+
+#define RDSV3_WQ_THREAD_IDLE 0
+#define RDSV3_WQ_THREAD_RUNNING 1
+#define RDSV3_WQ_THREAD_FLUSHING 2
+#define RDSV3_WQ_THREAD_EXITING 3
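+
+/*
+ * State transitions: IDLE -> RUNNING when work is queued and a taskq
+ * thread is dispatched; RUNNING -> FLUSHING while a flush waits for the
+ * queue to drain; the worker drops back to IDLE once the queue is
+ * empty.  EXITING is terminal and is set only by
+ * rdsv3_destroy_task_workqueue().
+ */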
+
+/* worker thread */
+void
+rdsv3_worker_thread(void *arg)
+{
+ rdsv3_workqueue_struct_t *wq = arg;
+ rdsv3_work_t *work;
+
+ RDSV3_DPRINTF4("rdsv3_worker_thread", "Enter(wq: 0x%p)", wq);
+
+ mutex_enter(&wq->wq_lock);
+ work = list_remove_head(&wq->wq_queue);
+ while (work) {
+ mutex_exit(&wq->wq_lock);
+
+ /* process work */
+ work->func(work);
+
+ mutex_enter(&wq->wq_lock);
+ work = list_remove_head(&wq->wq_queue);
+ }
+
+ /* No more work, go home, until called again */
+ if (wq->wq_state != RDSV3_WQ_THREAD_EXITING) {
+ wq->wq_state = RDSV3_WQ_THREAD_IDLE;
+ }
+ mutex_exit(&wq->wq_lock);
+
+ RDSV3_DPRINTF4("rdsv3_worker_thread", "Return(wq: 0x%p)", wq);
+}
+
+/* XXX */
+void
+rdsv3_flush_workqueue(rdsv3_workqueue_struct_t *wq)
+{
+ RDSV3_DPRINTF4("rdsv3_flush_workqueue", "Enter(wq: %p)", wq);
+
+ mutex_enter(&wq->wq_lock);
+ switch (wq->wq_state) {
+ case RDSV3_WQ_THREAD_IDLE:
+ /* nothing to do */
+ ASSERT(list_is_empty(&wq->wq_queue));
+ break;
+
+ case RDSV3_WQ_THREAD_RUNNING:
+ wq->wq_state = RDSV3_WQ_THREAD_FLUSHING;
+ /* FALLTHRU */
+ case RDSV3_WQ_THREAD_FLUSHING:
+ /* already flushing, wait until the flushing is complete */
+ do {
+ mutex_exit(&wq->wq_lock);
+ delay(drv_usectohz(1000000));
+ mutex_enter(&wq->wq_lock);
+ } while (wq->wq_state == RDSV3_WQ_THREAD_FLUSHING);
+ break;
+ case RDSV3_WQ_THREAD_EXITING:
+ mutex_exit(&wq->wq_lock);
+ rdsv3_worker_thread(wq);
+ return;
+ }
+ mutex_exit(&wq->wq_lock);
+
+ RDSV3_DPRINTF4("rdsv3_flush_workqueue", "Return(wq: %p)", wq);
+}
+
+void
+rdsv3_queue_work(rdsv3_workqueue_struct_t *wq, rdsv3_work_t *wp)
+{
+ RDSV3_DPRINTF4("rdsv3_queue_work", "Enter(wq: %p, wp: %p)", wq, wp);
+
+ mutex_enter(&wq->wq_lock);
+
+ if (list_link_active(&wp->work_item)) {
+ /* This is already in the queue, ignore this call */
+ mutex_exit(&wq->wq_lock);
+ RDSV3_DPRINTF3("rdsv3_queue_work", "already queued: %p", wp);
+ return;
+ }
+
+ switch (wq->wq_state) {
+ case RDSV3_WQ_THREAD_RUNNING:
+ list_insert_tail(&wq->wq_queue, wp);
+ mutex_exit(&wq->wq_lock);
+ break;
+
+ case RDSV3_WQ_THREAD_FLUSHING:
+ do {
+ mutex_exit(&wq->wq_lock);
+ delay(drv_usectohz(1000000));
+ mutex_enter(&wq->wq_lock);
+ } while (wq->wq_state == RDSV3_WQ_THREAD_FLUSHING);
+
+ if (wq->wq_state == RDSV3_WQ_THREAD_RUNNING) {
+ list_insert_tail(&wq->wq_queue, wp);
+ mutex_exit(&wq->wq_lock);
+ break;
+ }
+ /* FALLTHRU */
+
+ case RDSV3_WQ_THREAD_IDLE:
+ list_insert_tail(&wq->wq_queue, wp);
+ wq->wq_state = RDSV3_WQ_THREAD_RUNNING;
+ mutex_exit(&wq->wq_lock);
+
+ (void) ddi_taskq_dispatch(rdsv3_taskq, rdsv3_worker_thread, wq,
+ DDI_SLEEP);
+ break;
+
+ case RDSV3_WQ_THREAD_EXITING:
+ mutex_exit(&wq->wq_lock);
+ break;
+ }
+
+ RDSV3_DPRINTF4("rdsv3_queue_work", "Return(wq: %p, wp: %p)", wq, wp);
+}
+
+/* timeout handler for delayed work queuing */
+void
+rdsv3_work_timeout_handler(void *arg)
+{
+ rdsv3_delayed_work_t *dwp = (rdsv3_delayed_work_t *)arg;
+
+ RDSV3_DPRINTF4("rdsv3_work_timeout_handler",
+ "Enter(wq: %p, wp: %p)", dwp->wq, &dwp->work);
+
+ mutex_enter(&dwp->lock);
+ dwp->timeid = 0;
+ mutex_exit(&dwp->lock);
+
+ mutex_enter(&dwp->wq->wq_lock);
+ dwp->wq->wq_pending--;
+ if (dwp->wq->wq_state == RDSV3_WQ_THREAD_EXITING) {
+ mutex_exit(&dwp->wq->wq_lock);
+ return;
+ }
+ mutex_exit(&dwp->wq->wq_lock);
+
+ rdsv3_queue_work(dwp->wq, &dwp->work);
+
+ RDSV3_DPRINTF4("rdsv3_work_timeout_handler",
+ "Return(wq: %p, wp: %p)", dwp->wq, &dwp->work);
+}
+
+void
+rdsv3_queue_delayed_work(rdsv3_workqueue_struct_t *wq,
+ rdsv3_delayed_work_t *dwp, uint_t delay)
+{
+ RDSV3_DPRINTF4("rdsv3_queue_delayed_work",
+ "Enter(wq: %p, wp: %p)", wq, dwp);
+
+ if (delay == 0) {
+ rdsv3_queue_work(wq, &dwp->work);
+ return;
+ }
+
+ mutex_enter(&wq->wq_lock);
+ if (wq->wq_state == RDSV3_WQ_THREAD_EXITING) {
+ mutex_exit(&wq->wq_lock);
+ RDSV3_DPRINTF4("rdsv3_queue_delayed_work",
+ "WQ exiting - don't queue (wq: %p, wp: %p)", wq, dwp);
+ return;
+ }
+ wq->wq_pending++;
+ mutex_exit(&wq->wq_lock);
+
+ mutex_enter(&dwp->lock);
+ if (dwp->timeid == 0) {
+ dwp->wq = wq;
+ dwp->timeid = timeout(rdsv3_work_timeout_handler, dwp,
+ jiffies + (delay * rdsv3_one_sec_in_hz));
+ mutex_exit(&dwp->lock);
+ } else {
+ mutex_exit(&dwp->lock);
+ RDSV3_DPRINTF4("rdsv3_queue_delayed_work", "Already queued: %p",
+ dwp);
+ mutex_enter(&wq->wq_lock);
+ wq->wq_pending--;
+ mutex_exit(&wq->wq_lock);
+ }
+
+ RDSV3_DPRINTF4("rdsv3_queue_delayed_work",
+ "Return(wq: %p, wp: %p)", wq, dwp);
+}
+
+void
+rdsv3_cancel_delayed_work(rdsv3_delayed_work_t *dwp)
+{
+ RDSV3_DPRINTF4("rdsv3_cancel_delayed_work",
+ "Enter(wq: %p, dwp: %p)", dwp->wq, dwp);
+
+ mutex_enter(&dwp->lock);
+ if (dwp->timeid != 0) {
+ (void) untimeout(dwp->timeid);
+ dwp->timeid = 0;
+ } else {
+ RDSV3_DPRINTF4("rdsv3_cancel_delayed_work",
+ "Nothing to cancel (wq: %p, dwp: %p)", dwp->wq, dwp);
+ mutex_exit(&dwp->lock);
+ return;
+ }
+ mutex_exit(&dwp->lock);
+
+ mutex_enter(&dwp->wq->wq_lock);
+ dwp->wq->wq_pending--;
+ mutex_exit(&dwp->wq->wq_lock);
+
+ RDSV3_DPRINTF4("rdsv3_cancel_delayed_work",
+ "Return(wq: %p, dwp: %p)", dwp->wq, dwp);
+}
+
+void
+rdsv3_destroy_task_workqueue(rdsv3_workqueue_struct_t *wq)
+{
+	RDSV3_DPRINTF2("rdsv3_destroy_task_workqueue", "Enter");
+
+ ASSERT(wq);
+
+ mutex_enter(&wq->wq_lock);
+ wq->wq_state = RDSV3_WQ_THREAD_EXITING;
+
+ while (wq->wq_pending > 0) {
+ mutex_exit(&wq->wq_lock);
+ delay(drv_usectohz(1000000));
+ mutex_enter(&wq->wq_lock);
+	}
+ mutex_exit(&wq->wq_lock);
+
+ rdsv3_flush_workqueue(wq);
+
+ list_destroy(&wq->wq_queue);
+ mutex_destroy(&wq->wq_lock);
+ kmem_free(wq, sizeof (rdsv3_workqueue_struct_t));
+
+ ASSERT(rdsv3_taskq);
+ ddi_taskq_destroy(rdsv3_taskq);
+
+ wq = NULL;
+ rdsv3_taskq = NULL;
+
+	RDSV3_DPRINTF2("rdsv3_destroy_task_workqueue", "Return");
+}
+
+/* ARGSUSED */
+void
+rdsv3_rdma_init_worker(struct rdsv3_work_s *work)
+{
+ rdsv3_rdma_init();
+}
+
+#define RDSV3_NUM_TASKQ_THREADS 4
+rdsv3_workqueue_struct_t *
+rdsv3_create_task_workqueue(char *name)
+{
+ rdsv3_workqueue_struct_t *wq;
+
+	RDSV3_DPRINTF2("rdsv3_create_task_workqueue", "Enter (dip: %p)",
+ rdsv3_dev_info);
+
+ rdsv3_taskq = ddi_taskq_create(rdsv3_dev_info, name,
+ RDSV3_NUM_TASKQ_THREADS, TASKQ_DEFAULTPRI, 0);
+ if (rdsv3_taskq == NULL) {
+ RDSV3_DPRINTF1(__FILE__,
+ "ddi_taskq_create failed for rdsv3_taskq");
+ return (NULL);
+ }
+
+ wq = kmem_zalloc(sizeof (rdsv3_workqueue_struct_t), KM_NOSLEEP);
+ if (wq == NULL) {
+ RDSV3_DPRINTF1(__FILE__, "kmem_zalloc failed for wq");
+ ddi_taskq_destroy(rdsv3_taskq);
+ return (NULL);
+ }
+
+ list_create(&wq->wq_queue, sizeof (struct rdsv3_work_s),
+ offsetof(struct rdsv3_work_s, work_item));
+ mutex_init(&wq->wq_lock, NULL, MUTEX_DRIVER, NULL);
+ wq->wq_state = RDSV3_WQ_THREAD_IDLE;
+ wq->wq_pending = 0;
+ rdsv3_one_sec_in_hz = drv_usectohz(1000000);
+
+	RDSV3_DPRINTF2("rdsv3_create_task_workqueue", "Return");
+
+ return (wq);
+}
+
+/*
+ * Implementation for struct sock
+ */
+
+void
+rdsv3_sock_exit_data(struct rsock *sk)
+{
+ struct rdsv3_sock *rs = sk->sk_protinfo;
+
+ RDSV3_DPRINTF4("rdsv3_sock_exit_data", "rs: %p sk: %p", rs, sk);
+
+ ASSERT(rs != NULL);
+ ASSERT(rdsv3_sk_sock_flag(sk, SOCK_DEAD));
+
+ rs->rs_sk = NULL;
+
+ list_destroy(&rs->rs_send_queue);
+ list_destroy(&rs->rs_notify_queue);
+ list_destroy(&rs->rs_recv_queue);
+
+ rw_destroy(&rs->rs_recv_lock);
+ mutex_destroy(&rs->rs_lock);
+
+ mutex_destroy(&rs->rs_rdma_lock);
+ avl_destroy(&rs->rs_rdma_keys);
+
+ rdsv3_exit_waitqueue(sk->sk_sleep);
+ kmem_free(sk->sk_sleep, sizeof (rdsv3_wait_queue_t));
+ mutex_destroy(&sk->sk_lock);
+
+ kmem_cache_free(rdsv3_alloc_cache, sk);
+ RDSV3_DPRINTF4("rdsv3_sock_exit_data", "rs: %p sk: %p", rs, sk);
+}
+
+/* XXX - figure out right values */
+#define RDSV3_RECV_HIWATER (256 * 1024)
+#define RDSV3_RECV_LOWATER 128
+#define RDSV3_XMIT_HIWATER (256 * 1024)
+#define RDSV3_XMIT_LOWATER 1024
+
+struct rsock *
+rdsv3_sk_alloc()
+{
+ struct rsock *sk;
+
+ sk = kmem_cache_alloc(rdsv3_alloc_cache, KM_SLEEP);
+ if (sk == NULL) {
+ RDSV3_DPRINTF2("rdsv3_create", "kmem_cache_alloc failed");
+ return (NULL);
+ }
+
+ bzero(sk, sizeof (struct rsock) + sizeof (struct rdsv3_sock));
+ return (sk);
+}
+
+void
+rdsv3_sock_init_data(struct rsock *sk)
+{
+ sk->sk_sleep = kmem_zalloc(sizeof (rdsv3_wait_queue_t), KM_SLEEP);
+ rdsv3_init_waitqueue(sk->sk_sleep);
+
+ mutex_init(&sk->sk_lock, NULL, MUTEX_DRIVER, NULL);
+ sk->sk_refcount = 1;
+ sk->sk_protinfo = (struct rdsv3_sock *)(sk + 1);
+ sk->sk_sndbuf = RDSV3_XMIT_HIWATER;
+ sk->sk_rcvbuf = RDSV3_RECV_HIWATER;
+}
+
+/* XXX - not complete */
+void
+rdsv3_poll_wait(struct rsock *sk, rdsv3_wait_queue_t *waitq, short events)
+{
+ struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk);
+
+ if (events & POLLIN) {
+ rw_enter(&rs->rs_recv_lock, RW_READER);
+ while (list_is_empty(&rs->rs_recv_queue) &&
+ list_is_empty(&rs->rs_notify_queue)) {
+ rw_exit(&rs->rs_recv_lock);
+ mutex_enter(&waitq->waitq_mutex);
+ (void) cv_wait_sig(&waitq->waitq_cv,
+ &waitq->waitq_mutex);
+ mutex_exit(&waitq->waitq_mutex);
+ rw_enter(&rs->rs_recv_lock, RW_READER);
+ }
+ rw_exit(&rs->rs_recv_lock);
+ }
+}
+
+/*
+ * Connection cache
+ */
+/* ARGSUSED */
+int
+rdsv3_conn_constructor(void *buf, void *arg, int kmflags)
+{
+ struct rdsv3_connection *conn = buf;
+
+ bzero(conn, sizeof (struct rdsv3_connection));
+
+ conn->c_next_tx_seq = 1;
+ mutex_init(&conn->c_lock, NULL, MUTEX_DRIVER, NULL);
+ mutex_init(&conn->c_send_lock, NULL, MUTEX_DRIVER, NULL);
+ list_create(&conn->c_send_queue, sizeof (struct rdsv3_message),
+ offsetof(struct rdsv3_message, m_conn_item));
+ list_create(&conn->c_retrans, sizeof (struct rdsv3_message),
+ offsetof(struct rdsv3_message, m_conn_item));
+ return (0);
+}
+
+/* ARGSUSED */
+void
+rdsv3_conn_destructor(void *buf, void *arg)
+{
+ struct rdsv3_connection *conn = buf;
+
+ ASSERT(list_is_empty(&conn->c_send_queue));
+ ASSERT(list_is_empty(&conn->c_retrans));
+ list_destroy(&conn->c_send_queue);
+ list_destroy(&conn->c_retrans);
+ mutex_destroy(&conn->c_send_lock);
+ mutex_destroy(&conn->c_lock);
+}
+
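+/*
+ * AVL comparator for the connection cache: order by local address,
+ * then by foreign address.  The lookup key is an rdsv3_conn_info_t;
+ * the tree nodes are struct rdsv3_connection.
+ */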
+int
+rdsv3_conn_compare(const void *conn1, const void *conn2)
+{
+ uint32_be_t laddr1, faddr1, laddr2, faddr2;
+
+ laddr1 = ((rdsv3_conn_info_t *)conn1)->c_laddr;
+ laddr2 = ((struct rdsv3_connection *)conn2)->c_laddr;
+
+ if (laddr1 == laddr2) {
+ faddr1 = ((rdsv3_conn_info_t *)conn1)->c_faddr;
+ faddr2 = ((struct rdsv3_connection *)conn2)->c_faddr;
+ if (faddr1 == faddr2)
+ return (0);
+ if (faddr1 < faddr2)
+ return (-1);
+ return (1);
+ }
+
+ if (laddr1 < laddr2)
+ return (-1);
+
+ return (1);
+}
+
+/* loop.c */
+extern kmutex_t loop_conns_lock;
+extern list_t loop_conns;
+
+struct rdsv3_loop_connection
+{
+ struct list_node loop_node;
+ struct rdsv3_connection *conn;
+};
+
+void
+rdsv3_loop_init(void)
+{
+ list_create(&loop_conns, sizeof (struct rdsv3_loop_connection),
+ offsetof(struct rdsv3_loop_connection, loop_node));
+ mutex_init(&loop_conns_lock, NULL, MUTEX_DRIVER, NULL);
+}
+
+/* rdma.c */
+/* IB Rkey is used here for comparison */
+int
+rdsv3_mr_compare(const void *mr1, const void *mr2)
+{
+ uint32_t key1 = *(uint32_t *)mr1;
+ uint32_t key2 = ((struct rdsv3_mr *)mr2)->r_key;
+
+ if (key1 < key2)
+ return (-1);
+ if (key1 > key2)
+ return (1);
+ return (0);
+}
+
+/* transport.c */
+extern list_t transports;
+extern krwlock_t trans_sem;
+
+void
+rdsv3_trans_exit(void)
+{
+ struct rdsv3_transport *trans;
+
+ RDSV3_DPRINTF2("rdsv3_trans_exit", "Enter");
+
+ /* currently, only IB transport */
+ rw_enter(&trans_sem, RW_READER);
+ if (!list_is_empty(&transports))
+ trans = list_head(&transports);
+ else
+ trans = NULL;
+ rw_exit(&trans_sem);
+
+ /* trans->exit() will remove the trans from the list */
+ if (trans)
+ trans->exit();
+
+ list_destroy(&transports);
+ rw_destroy(&trans_sem);
+
+ RDSV3_DPRINTF2("rdsv3_trans_exit", "Return");
+}
+
+void
+rdsv3_trans_init()
+{
+ RDSV3_DPRINTF2("rdsv3_trans_init", "Enter");
+
+ list_create(&transports, sizeof (struct rdsv3_transport),
+ offsetof(struct rdsv3_transport, t_item));
+ rw_init(&trans_sem, NULL, RW_DRIVER, NULL);
+
+ RDSV3_DPRINTF2("rdsv3_trans_init", "Return");
+}
+
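+/*
+ * Append one control message to msg->msg_control.  The kernel-allocated
+ * control buffer is reallocated and copied on every call, which is
+ * cheap enough for the few cmsgs RDS attaches to a single message.
+ */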
+int
+rdsv3_put_cmsg(struct nmsghdr *msg, int level, int type, size_t size,
+ void *payload)
+{
+ struct cmsghdr *cp;
+ char *bp;
+ size_t cmlen;
+ size_t cmspace;
+ size_t bufsz;
+
+ RDSV3_DPRINTF4("rdsv3_put_cmsg",
+ "Enter(msg: %p level: %d type: %d sz: %d)",
+ msg, level, type, size);
+
+ if (msg == NULL || msg->msg_controllen == 0 || payload == NULL) {
+ return (0);
+ }
+ /* check for first cmsg or this is another cmsg to be appended */
+ if (msg->msg_control == NULL)
+ msg->msg_controllen = 0;
+
+ cmlen = CMSG_LEN(size);
+ cmspace = CMSG_SPACE(size);
+ bufsz = msg->msg_controllen + cmspace;
+
+ /* extend the existing cmsg to append the next cmsg */
+ bp = kmem_alloc(bufsz, KM_SLEEP);
+ if (msg->msg_control) {
+ bcopy(msg->msg_control, bp, msg->msg_controllen);
+ kmem_free(msg->msg_control, (size_t)msg->msg_controllen);
+ }
+
+ /* assign payload the proper cmsg location */
+ cp = (struct cmsghdr *)(bp + msg->msg_controllen);
+ cp->cmsg_len = cmlen;
+ cp->cmsg_level = level;
+ cp->cmsg_type = type;
+
+ bcopy(payload, CMSG_DATA(cp), cmlen -
+ (unsigned int)_CMSG_DATA_ALIGN(sizeof (struct cmsghdr)));
+
+ msg->msg_control = bp;
+ msg->msg_controllen = bufsz;
+
+ RDSV3_DPRINTF4("rdsv3_put_cmsg", "Return(cmsg_len: %d)", cp->cmsg_len);
+
+ return (0);
+}
+
+/* bind.c */
+extern kmutex_t rdsv3_bind_lock;
+extern avl_tree_t rdsv3_bind_tree;
+
+/* ARGSUSED */
+int
+rdsv3_verify_bind_address(ipaddr_t addr)
+{
+ return (1);
+}
+
+/* XXX - need to enhance to compare IP address and port */
+int
+rdsv3_bind_node_compare(const void *a, const void *b)
+{
+ uint16_be_t port = *(in_port_t *)a;
+ struct rdsv3_sock *rs = (struct rdsv3_sock *)b;
+
+ RDSV3_DPRINTF5("rdsv3_bind_node_compare", "Enter (%x %x)", port,
+ rs->rs_bound_port);
+
+ if (port > rs->rs_bound_port)
+ return (+1);
+ else if (port < rs->rs_bound_port)
+ return (-1);
+
+ return (0);
+}
+
+void
+rdsv3_bind_tree_init()
+{
+ RDSV3_DPRINTF4("rdsv3_bind_tree_init", "Enter");
+
+ mutex_init(&rdsv3_bind_lock, NULL, MUTEX_DRIVER, NULL);
+ avl_create(&rdsv3_bind_tree, rdsv3_bind_node_compare,
+ sizeof (struct rdsv3_sock),
+ offsetof(struct rdsv3_sock, rs_bound_node));
+
+ RDSV3_DPRINTF4("rdsv3_bind_tree_init", "Return");
+}
+
+void
+rdsv3_bind_tree_exit()
+{
+ RDSV3_DPRINTF2("rdsv3_bind_tree_exit", "Enter");
+
+ ASSERT(avl_is_empty(&rdsv3_bind_tree));
+ avl_destroy(&rdsv3_bind_tree);
+ mutex_destroy(&rdsv3_bind_lock);
+
+ RDSV3_DPRINTF2("rdsv3_bind_tree_exit", "Return");
+}
+
+/* checksum */
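+/*
+ * Note: as in Linux ip_fast_csum(), 'length' here is assumed to be the
+ * IP header length in 32-bit words; ip_ocsum() takes a halfword count,
+ * hence the << 1.
+ */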
+uint16_t
+rdsv3_ip_fast_csum(void *hdr, size_t length)
+{
+ return (0xffff &
+	    (uint16_t)(~ip_ocsum((ushort_t *)hdr, (int)length << 1, 0)));
+}
+
+/* scatterlist implementation */
+/* ARGSUSED */
+caddr_t
+rdsv3_ib_sg_dma_address(ib_device_t *dev, struct rdsv3_scatterlist *scat,
+ uint_t offset)
+{
+ return (0);
+}
+
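+/*
+ * Map a scatterlist for DMA with a single ibt_map_mem_iov() call.  The
+ * one mapping handle is stored in the first entry; each entry is then
+ * pointed at its slice of the returned work-request SGL.
+ */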
+uint_t
+rdsv3_ib_dma_map_sg(struct ib_device *dev, struct rdsv3_scatterlist *scat,
+ uint_t num)
+{
+ struct rdsv3_scatterlist *s, *first;
+ ibt_iov_t *iov;
+ ibt_wr_ds_t *sgl;
+ ibt_iov_attr_t iov_attr;
+ ibt_send_wr_t swr;
+ uint_t i;
+
+ RDSV3_DPRINTF4("rdsv3_ib_dma_map_sg", "scat %p, num: %d", scat, num);
+
+ s = first = &scat[0];
+ ASSERT(first->mihdl == NULL);
+
+ iov = kmem_alloc(num * sizeof (ibt_iov_t), KM_SLEEP);
+ sgl = kmem_zalloc((num * 2) * sizeof (ibt_wr_ds_t), KM_SLEEP);
+
+ for (i = 0; i < num; i++, s++) {
+ iov[i].iov_addr = s->vaddr;
+ iov[i].iov_len = s->length;
+ }
+
+ iov_attr.iov_as = NULL;
+ iov_attr.iov = iov;
+ iov_attr.iov_buf = NULL;
+ iov_attr.iov_list_len = num;
+ iov_attr.iov_wr_nds = num * 2;
+ iov_attr.iov_lso_hdr_sz = 0;
+ iov_attr.iov_flags = IBT_IOV_SLEEP;
+
+ swr.wr_sgl = sgl;
+
+ i = ibt_map_mem_iov(ib_get_ibt_hca_hdl(dev),
+ &iov_attr, (ibt_all_wr_t *)&swr, &first->mihdl);
+ kmem_free(iov, num * sizeof (ibt_iov_t));
+ if (i != IBT_SUCCESS) {
+ RDSV3_DPRINTF2("rdsv3_ib_dma_map_sg",
+ "ibt_map_mem_iov returned: %d", i);
+ return (0);
+ }
+
+ s = first;
+ for (i = 0; i < num; i++, s++, sgl++) {
+ s->sgl = sgl;
+ }
+
+ return (num);
+}
+
+void
+rdsv3_ib_dma_unmap_sg(ib_device_t *dev, struct rdsv3_scatterlist *scat,
+ uint_t num)
+{
+ /* Zero length messages have no scatter gather entries */
+ if (num != 0) {
+ ASSERT(scat->mihdl != NULL);
+ ASSERT(scat->sgl != NULL);
+
+ (void) ibt_unmap_mem_iov(ib_get_ibt_hca_hdl(dev), scat->mihdl);
+
+ kmem_free(scat->sgl, (num * 2) * sizeof (ibt_wr_ds_t));
+ scat->sgl = NULL;
+ scat->mihdl = NULL;
+ }
+}
+
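+/*
+ * Allocate and register one memory region that holds the send headers,
+ * the receive headers, and the single ack header back to back, so one
+ * lkey covers all header DMA for this connection.
+ */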
+int
+rdsv3_ib_alloc_hdrs(ib_device_t *dev, struct rdsv3_ib_connection *ic)
+{
+ caddr_t addr;
+ size_t size;
+ ibt_mr_attr_t mr_attr;
+ ibt_mr_desc_t mr_desc;
+ ibt_mr_hdl_t mr_hdl;
+ int ret;
+
+ RDSV3_DPRINTF4("rdsv3_ib_alloc_hdrs", "Enter(dev: %p)", dev);
+
+ ASSERT(ic->i_mr == NULL);
+
+ size = (ic->i_send_ring.w_nr + ic->i_recv_ring.w_nr + 1) *
+ sizeof (struct rdsv3_header);
+
+ addr = kmem_zalloc(size, KM_NOSLEEP);
+ if (addr == NULL)
+ return (-1);
+
+ mr_attr.mr_vaddr = (ib_vaddr_t)(uintptr_t)addr;
+ mr_attr.mr_len = size;
+ mr_attr.mr_as = NULL;
+ mr_attr.mr_flags = IBT_MR_ENABLE_LOCAL_WRITE;
+ ret = ibt_register_mr(ib_get_ibt_hca_hdl(dev), RDSV3_PD2PDHDL(ic->i_pd),
+ &mr_attr, &mr_hdl, &mr_desc);
+	if (ret != IBT_SUCCESS) {
+		RDSV3_DPRINTF2("rdsv3_ib_alloc_hdrs",
+		    "ibt_register_mr returned: %d", ret);
+		kmem_free(addr, size);
+		return (-1);
+	}
+
+ ic->i_mr =
+ (struct rdsv3_hdrs_mr *)kmem_alloc(sizeof (struct rdsv3_hdrs_mr),
+ KM_SLEEP);
+ ic->i_mr->addr = addr;
+ ic->i_mr->size = size;
+ ic->i_mr->hdl = mr_hdl;
+ ic->i_mr->lkey = mr_desc.md_lkey;
+
+ ic->i_send_hdrs = (struct rdsv3_header *)addr;
+ ic->i_send_hdrs_dma = (uint64_t)(uintptr_t)addr;
+
+ ic->i_recv_hdrs = (struct rdsv3_header *)(addr +
+ (ic->i_send_ring.w_nr * sizeof (struct rdsv3_header)));
+ ic->i_recv_hdrs_dma = (uint64_t)(uintptr_t)(addr +
+ (ic->i_send_ring.w_nr * sizeof (struct rdsv3_header)));
+ ic->i_recv_tasklet_cpuid = -1;
+
+ ic->i_ack = (struct rdsv3_header *)(addr +
+ ((ic->i_send_ring.w_nr + ic->i_recv_ring.w_nr) *
+ sizeof (struct rdsv3_header)));
+ ic->i_ack_dma = (uint64_t)(uintptr_t)(addr +
+ ((ic->i_send_ring.w_nr + ic->i_recv_ring.w_nr) *
+ sizeof (struct rdsv3_header)));
+
+ RDSV3_DPRINTF4("rdsv3_ib_alloc_hdrs", "Return(dev: %p)", dev);
+
+ return (0);
+}
+
+void
+rdsv3_ib_free_hdrs(ib_device_t *dev, struct rdsv3_ib_connection *ic)
+{
+ RDSV3_DPRINTF4("rdsv3_ib_free_hdrs", "Enter(dev: %p)", dev);
+ ASSERT(ic->i_mr != NULL);
+
+	ic->i_send_hdrs = NULL;
+	ic->i_send_hdrs_dma = 0;
+
+	ic->i_recv_hdrs = NULL;
+	ic->i_recv_hdrs_dma = 0;
+
+	ic->i_ack = NULL;
+	ic->i_ack_dma = 0;
+
+ (void) ibt_deregister_mr(ib_get_ibt_hca_hdl(dev), ic->i_mr->hdl);
+
+ kmem_free(ic->i_mr->addr, ic->i_mr->size);
+ kmem_free(ic->i_mr, sizeof (struct rdsv3_hdrs_mr));
+
+ ic->i_mr = NULL;
+ RDSV3_DPRINTF4("rdsv3_ib_free_hdrs", "Return(dev: %p)", dev);
+}
diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/rdsv3_sc.c b/usr/src/uts/common/io/ib/clients/rdsv3/rdsv3_sc.c
new file mode 100644
index 0000000000..8510746b9e
--- /dev/null
+++ b/usr/src/uts/common/io/ib/clients/rdsv3/rdsv3_sc.c
@@ -0,0 +1,395 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+#include <sys/types.h>
+#include <sys/sunddi.h>
+#include <sys/dlpi.h>
+#include <sys/ib/clients/rdsv3/rdsv3_sc.h>
+#include <sys/ib/clients/rdsv3/rdsv3_debug.h>
+
+/*
+ * RDS Path MAP
+ *
+ * N - Node record, P - Path record
+ *
+ * rds_path_map -
+ * |
+ * v
+ * --------- --------- ---------
+ * | N |------>| N |------>| N |------> NULL
+ * NULL <-------| |<------| |<------| |
+ * --------- --------- ---------
+ * | | |
+ * | | |
+ * v v v
+ * -------- --------- ---------
+ * | P | | P | | P |
+ * -------- --------- ---------
+ * | ^ | ^ | ^
+ * | | | | | |
+ * v | v | v |
+ * -------- -------- ---------
+ * | P | | P | | P |
+ * -------- -------- ---------
+ * o o o
+ * o o o
+ * o o o
+ */
+
+typedef struct rds_path_record_s {
+ ipaddr_t libd_ip;
+ ipaddr_t ribd_ip;
+ struct rds_path_record_s *up;
+ struct rds_path_record_s *downp;
+ char lifname[MAXNAMELEN];
+ char rifname[MAXNAMELEN];
+} rds_path_record_t;
+
+typedef struct rds_node_record_s {
+ struct rds_node_record_s *nextp;
+ ipaddr_t lnode_ip; /* local ip */
+ ipaddr_t rnode_ip; /* remote ip */
+ struct rds_path_record_s *downp;
+ struct rds_node_record_s *prevp;
+} rds_node_record_t;
+
+static char sc_device_name[MAXNAMELEN] = "NotInitialized";
+static kmutex_t rdsv3_pathmap_lock;
+static rds_node_record_t *rdsv3_pathmap = NULL;
+
+#define RDS_VALIDATE_PATH(p) \
+ if ((p->local.iftype != DL_IB) || (p->remote.iftype != DL_IB)) \
+ return
+
+#define isalpha(ch) (((ch) >= 'a' && (ch) <= 'z') || \
+ ((ch) >= 'A' && (ch) <= 'Z'))
+
+/*
+ * Called by SC to register the Sun Cluster device name
+ */
+void
+rdsv3_clif_name(char *name)
+{
+ int i;
+
+ ASSERT(name != NULL);
+
+ mutex_enter(&rdsv3_pathmap_lock);
+
+ /* extract the device name from the interface name */
+ i = strlen(name) - 1;
+ while ((i >= 0) && (!isalpha(name[i]))) i--;
+ if (i >= 0) {
+ (void) strncpy(sc_device_name, name, i + 1);
+ sc_device_name[i + 1] = '\0';
+ }
+
+ mutex_exit(&rdsv3_pathmap_lock);
+}
+
+/*
+ * Called by SC on discovering a new path
+ */
+void
+rdsv3_path_up(rds_path_t *path)
+{
+ rds_node_record_t *p;
+ rds_path_record_t *p1;
+
+ ASSERT(path != NULL);
+
+ /* ignore if the end points are not of type DL_IB */
+ RDS_VALIDATE_PATH(path);
+
+ mutex_enter(&rdsv3_pathmap_lock);
+
+ p = rdsv3_pathmap;
+ while ((p) && ((p->lnode_ip != path->local.node_ipaddr) ||
+ (p->rnode_ip != path->remote.node_ipaddr))) {
+ p = p->nextp;
+ }
+
+ if (p == NULL) {
+ p = (rds_node_record_t *)kmem_alloc(sizeof (rds_node_record_t),
+ KM_SLEEP);
+ p1 = (rds_path_record_t *)kmem_alloc(
+ sizeof (rds_path_record_t), KM_SLEEP);
+
+ p->nextp = NULL;
+ p->lnode_ip = path->local.node_ipaddr;
+ p->rnode_ip = path->remote.node_ipaddr;
+ p->downp = p1;
+ p->prevp = NULL;
+
+ p1->libd_ip = path->local.ipaddr;
+ p1->ribd_ip = path->remote.ipaddr;
+ p1->up = NULL;
+ p1->downp = NULL;
+ (void) strcpy(p1->lifname, path->local.ifname);
+ (void) strcpy(p1->rifname, path->remote.ifname);
+
+ if (rdsv3_pathmap == NULL) {
+ rdsv3_pathmap = p;
+ } else {
+ /* insert this node at the head */
+ rdsv3_pathmap->prevp = p;
+ p->nextp = rdsv3_pathmap;
+ rdsv3_pathmap = p;
+ }
+ } else {
+ /* we found a match */
+ p1 = (rds_path_record_t *)kmem_alloc(
+ sizeof (rds_path_record_t), KM_SLEEP);
+
+ p1->libd_ip = path->local.ipaddr;
+ p1->ribd_ip = path->remote.ipaddr;
+ p1->downp = p->downp;
+ p->downp->up = p1;
+ p1->up = NULL;
+ p->downp = p1;
+ (void) strcpy(p1->lifname, path->local.ifname);
+ (void) strcpy(p1->rifname, path->remote.ifname);
+ }
+
+ mutex_exit(&rdsv3_pathmap_lock);
+}
+
+/*
+ * Called by SC to delete a path
+ */
+void
+rdsv3_path_down(rds_path_t *path)
+{
+ rds_node_record_t *p;
+ rds_path_record_t *p1, *p1up, *p1downp;
+
+ ASSERT(path != NULL);
+
+ /* ignore if the end points are not of type DL_IB */
+ RDS_VALIDATE_PATH(path);
+
+ mutex_enter(&rdsv3_pathmap_lock);
+
+ p = rdsv3_pathmap;
+ while ((p) && ((p->lnode_ip != path->local.node_ipaddr) ||
+ (p->rnode_ip != path->remote.node_ipaddr))) {
+ p = p->nextp;
+ }
+
+ if (p == NULL) {
+ /* no match */
+ RDSV3_DPRINTF2("rdsv3_path_down", "Node record not found "
+ "(0x%x <-> 0x%x)", path->local.node_ipaddr,
+ path->remote.node_ipaddr);
+ mutex_exit(&rdsv3_pathmap_lock);
+ return;
+ }
+
+ p1 = p->downp;
+ while ((p1) && ((p1->libd_ip != path->local.ipaddr) ||
+ (p1->ribd_ip != path->remote.ipaddr))) {
+ p1 = p1->downp;
+ }
+
+ if (p1 == NULL) {
+ /* no match */
+ RDSV3_DPRINTF2("rdsv3_path_down", "Path record not found "
+ "(0x%x <-> 0x%x)", path->local.ipaddr, path->remote.ipaddr);
+ mutex_exit(&rdsv3_pathmap_lock);
+ return;
+ }
+
+ /* we found the record, remove it */
+ p1up = p1->up;
+ p1downp = p1->downp;
+
+ if (p1up) {
+ p1up->downp = p1downp;
+ } else {
+ /* this is the first path record */
+ p->downp = p1downp;
+ }
+
+ if (p1downp) {
+ p1downp->up = p1up;
+ }
+
+ kmem_free(p1, sizeof (rds_path_record_t));
+
+ /* remove the node record if there are no path records */
+ if (p->downp == NULL) {
+ if (p->prevp) {
+ p->prevp->nextp = p->nextp;
+ } else {
+ /* this is the first node record */
+ ASSERT(p == rdsv3_pathmap);
+ rdsv3_pathmap = p->nextp;
+ }
+
+ if (p->nextp) {
+ p->nextp->prevp = p->prevp;
+ }
+
+ kmem_free(p, sizeof (rds_node_record_t));
+ }
+
+ mutex_exit(&rdsv3_pathmap_lock);
+}
+
+int
+rdsv3_sc_path_lookup(ipaddr_t *localip, ipaddr_t *remip)
+{
+ rds_node_record_t *p;
+ rds_path_record_t *p1, *p1downp;
+
+ mutex_enter(&rdsv3_pathmap_lock);
+
+ p = rdsv3_pathmap;
+ while ((p) && ((p->lnode_ip != *localip) || (p->rnode_ip != *remip))) {
+ p = p->nextp;
+ }
+
+ if (p == NULL) {
+ /* no match */
+ RDSV3_DPRINTF2("rdsv3_sc_path_lookup", "Node record not found "
+ "(0x%x <-> 0x%x)", *localip, *remip);
+ mutex_exit(&rdsv3_pathmap_lock);
+ return (0);
+ }
+
+ /* found a path */
+ p1 = p->downp;
+ *localip = p1->libd_ip;
+ *remip = p1->ribd_ip;
+
+ /*
+ * But next time, we want to use a different path record so move this
+ * path record to the end.
+ */
+ p1downp = p1->downp;
+ if (p1downp != NULL) {
+ p->downp = p1downp;
+ p1downp->up = NULL;
+
+ /* walk down to the last path record */
+ while (p1downp->downp != NULL) {
+ p1downp = p1downp->downp;
+ }
+
+ /* Attach the first path record to the end */
+ p1downp->downp = p1;
+ p1->up = p1downp;
+ p1->downp = NULL;
+ }
+
+ mutex_exit(&rdsv3_pathmap_lock);
+
+ return (1);
+}
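+
+/*
+ * Illustrative note (not in the original source): with path records
+ * stacked A -> B -> C under a node record, a lookup returns A's
+ * addresses and leaves the stack as B -> C -> A, so successive
+ * lookups round-robin across the available IB paths.
+ */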
+
+boolean_t
+rdsv3_if_lookup_by_name(char *devname)
+{
+ mutex_enter(&rdsv3_pathmap_lock);
+
+	/*
+	 * Sun Cluster always names its interconnect virtual network
+	 * interface clprivnetN, so return B_TRUE if the device name
+	 * matches the name that SC registered via rdsv3_clif_name().
+	 */
+ if (strcmp(devname, sc_device_name) == 0) {
+ /* clprivnet address */
+ mutex_exit(&rdsv3_pathmap_lock);
+ return (B_TRUE);
+ }
+
+ mutex_exit(&rdsv3_pathmap_lock);
+ return (B_FALSE);
+}
+
+boolean_t
+rdsv3_if_lookup_by_addr(ipaddr_t addr)
+{
+ rds_node_record_t *p;
+ rds_path_record_t *p1;
+
+ mutex_enter(&rdsv3_pathmap_lock);
+
+ p = rdsv3_pathmap;
+ while ((p) && (p->lnode_ip != addr)) {
+ p1 = p->downp;
+ while ((p1) && (p1->libd_ip != addr)) {
+ p1 = p1->downp;
+ }
+
+ /* we found a match */
+ if (p1 != NULL)
+ break;
+
+ /* go to the next node record */
+ p = p->nextp;
+ }
+
+ mutex_exit(&rdsv3_pathmap_lock);
+ if (p == NULL) {
+ /* no match */
+ RDSV3_DPRINTF2("rds_if_lookup_by_addr",
+ "Addr: 0x%x not found", addr);
+ return (B_FALSE);
+ }
+
+ /* Found a matching node record */
+ return (B_TRUE);
+}
+
+/*
+ * If SC is configured then addr would be a clprivnet address. Find the
+ * node record and return the first IB address. If the node record is not
+ * found, then return addr as-is.
+ */
+ipaddr_t
+rdsv3_scaddr_to_ibaddr(ipaddr_t addr)
+{
+ rds_node_record_t *p;
+ rds_path_record_t *p1;
+ ipaddr_t ret = addr;
+
+ mutex_enter(&rdsv3_pathmap_lock);
+
+ p = rdsv3_pathmap;
+ while ((p) && (p->lnode_ip != addr)) {
+ /* go to the next node record */
+ p = p->nextp;
+ }
+
+ if (p != NULL) {
+ p1 = p->downp;
+ ret = p1->libd_ip;
+ RDSV3_DPRINTF3("rds_scaddr_to_ibaddr",
+ "Addr: 0x%x found: 0x%x", addr, p1->libd_ip);
+ }
+ mutex_exit(&rdsv3_pathmap_lock);
+
+	/* Return the IB address if a node record was found, else addr */
+ return (ret);
+}
diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/send.c b/usr/src/uts/common/io/ib/clients/rdsv3/send.c
new file mode 100644
index 0000000000..8d5d0f7fa4
--- /dev/null
+++ b/usr/src/uts/common/io/ib/clients/rdsv3/send.c
@@ -0,0 +1,1178 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <sys/stropts.h>
+#include <sys/systm.h>
+
+#include <sys/rds.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+
+#include <sys/ib/clients/rdsv3/rdsv3.h>
+#include <sys/ib/clients/rdsv3/rdma.h>
+#include <sys/ib/clients/rdsv3/rdsv3_debug.h>
+
+/*
+ * When transmitting messages in rdsv3_send_xmit, we need to emerge from
+ * time to time and briefly release the CPU. Otherwise the softlock watchdog
+ * will kick our shin.
+ * Also, it seems fairer to not let one busy connection stall all the
+ * others.
+ *
+ * send_batch_count is the number of times we'll loop in send_xmit. Setting
+ * it to 0 will restore the old behavior (where we looped until we had
+ * drained the queue).
+ */
+static int send_batch_count = 64;
+
+extern void rdsv3_ib_send_unmap_rdma(void *ic, struct rdsv3_rdma_op *op);
+/*
+ * Reset the send state. Caller must hold c_send_lock when calling here.
+ */
+void
+rdsv3_send_reset(struct rdsv3_connection *conn)
+{
+ struct rdsv3_message *rm, *tmp;
+ struct rdsv3_rdma_op *ro;
+
+ RDSV3_DPRINTF4("rdsv3_send_reset", "Enter(conn: %p)", conn);
+
+ if (conn->c_xmit_rm) {
+ rm = conn->c_xmit_rm;
+ ro = rm->m_rdma_op;
+ if (ro && ro->r_mapped) {
+ RDSV3_DPRINTF2("rdsv3_send_reset",
+ "rm %p mflg 0x%x map %d mihdl %p sgl %p",
+ rm, rm->m_flags, ro->r_mapped,
+ ro->r_rdma_sg[0].mihdl,
+ ro->r_rdma_sg[0].swr.wr_sgl);
+ rdsv3_ib_send_unmap_rdma(conn->c_transport_data, ro);
+ }
+ /*
+ * Tell the user the RDMA op is no longer mapped by the
+ * transport. This isn't entirely true (it's flushed out
+ * independently) but as the connection is down, there's
+ * no ongoing RDMA to/from that memory
+ */
+ rdsv3_message_unmapped(conn->c_xmit_rm);
+ rdsv3_message_put(conn->c_xmit_rm);
+ conn->c_xmit_rm = NULL;
+ }
+ conn->c_xmit_sg = 0;
+ conn->c_xmit_hdr_off = 0;
+ conn->c_xmit_data_off = 0;
+ conn->c_xmit_rdma_sent = 0;
+
+ conn->c_map_queued = 0;
+
+ conn->c_unacked_packets = rdsv3_sysctl_max_unacked_packets;
+ conn->c_unacked_bytes = rdsv3_sysctl_max_unacked_bytes;
+
+ /* Mark messages as retransmissions, and move them to the send q */
+ mutex_enter(&conn->c_lock);
+ RDSV3_FOR_EACH_LIST_NODE_SAFE(rm, tmp, &conn->c_retrans, m_conn_item) {
+ set_bit(RDSV3_MSG_ACK_REQUIRED, &rm->m_flags);
+ set_bit(RDSV3_MSG_RETRANSMITTED, &rm->m_flags);
+ if (rm->m_rdma_op && rm->m_rdma_op->r_mapped) {
+ RDSV3_DPRINTF4("_send_reset",
+ "RT rm %p mflg 0x%x sgl %p",
+ rm, rm->m_flags,
+ rm->m_rdma_op->r_rdma_sg[0].swr.wr_sgl);
+ }
+ }
+ list_move_tail(&conn->c_send_queue, &conn->c_retrans);
+ mutex_exit(&conn->c_lock);
+
+ RDSV3_DPRINTF4("rdsv3_send_reset", "Return(conn: %p)", conn);
+}
+
+/*
+ * We're making the conscious trade-off here to only send one message
+ * down the connection at a time.
+ * Pro:
+ * - tx queueing is a simple fifo list
+ * - reassembly is optional and easily done by transports per conn
+ * - no per flow rx lookup at all, straight to the socket
+ * - less per-frag memory and wire overhead
+ * Con:
+ * - queued acks can be delayed behind large messages
+ * Depends:
+ * - small message latency is higher behind queued large messages
+ * - large message latency isn't starved by intervening small sends
+ */
+int
+rdsv3_send_xmit(struct rdsv3_connection *conn)
+{
+ struct rdsv3_message *rm;
+ unsigned int tmp;
+ unsigned int send_quota = send_batch_count;
+ struct rdsv3_scatterlist *sg;
+ int ret = 0;
+ int was_empty = 0;
+ list_t to_be_dropped;
+
+ RDSV3_DPRINTF4("rdsv3_send_xmit", "Enter(conn: %p)", conn);
+
+ list_create(&to_be_dropped, sizeof (struct rdsv3_message),
+ offsetof(struct rdsv3_message, m_conn_item));
+
+ /*
+ * sendmsg calls here after having queued its message on the send
+ * queue. We only have one task feeding the connection at a time. If
+ * another thread is already feeding the queue then we back off. This
+ * avoids blocking the caller and trading per-connection data between
+ * caches per message.
+ *
+ * The sem holder will issue a retry if they notice that someone queued
+ * a message after they stopped walking the send queue but before they
+ * dropped the sem.
+ */
+ if (!mutex_tryenter(&conn->c_send_lock)) {
+ RDSV3_DPRINTF4("rdsv3_send_xmit",
+ "Another thread running(conn: %p)", conn);
+ rdsv3_stats_inc(s_send_sem_contention);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (conn->c_trans->xmit_prepare)
+ conn->c_trans->xmit_prepare(conn);
+
+ /*
+ * spin trying to push headers and data down the connection until
+	 * the connection doesn't make forward progress.
+ */
+ while (--send_quota) {
+		/*
+		 * See if we need to send a congestion map update if we're
+		 * between sending messages. The send_sem protects our sole
+		 * use of c_map_offset and _bytes.
+		 * Note this is used only by transports that define a special
+		 * xmit_cong_map function. For all others, we allocate a
+		 * cong_map message and treat it just like any other send.
+		 */
+ if (conn->c_map_bytes) {
+ ret = conn->c_trans->xmit_cong_map(conn, conn->c_lcong,
+ conn->c_map_offset);
+ if (ret <= 0)
+ break;
+
+ conn->c_map_offset += ret;
+ conn->c_map_bytes -= ret;
+ if (conn->c_map_bytes)
+ continue;
+ }
+
+ /*
+ * If we're done sending the current message, clear the
+ * offset and S/G temporaries.
+ */
+ rm = conn->c_xmit_rm;
+ if (rm != NULL &&
+ conn->c_xmit_hdr_off == sizeof (struct rdsv3_header) &&
+ conn->c_xmit_sg == rm->m_nents) {
+ conn->c_xmit_rm = NULL;
+ conn->c_xmit_sg = 0;
+ conn->c_xmit_hdr_off = 0;
+ conn->c_xmit_data_off = 0;
+ conn->c_xmit_rdma_sent = 0;
+
+ /* Release the reference to the previous message. */
+ rdsv3_message_put(rm);
+ rm = NULL;
+ }
+
+ /* If we're asked to send a cong map update, do so. */
+ if (rm == NULL && test_and_clear_bit(0, &conn->c_map_queued)) {
+ if (conn->c_trans->xmit_cong_map != NULL) {
+ conn->c_map_offset = 0;
+ conn->c_map_bytes =
+ sizeof (struct rdsv3_header) +
+ RDSV3_CONG_MAP_BYTES;
+ continue;
+ }
+
+ rm = rdsv3_cong_update_alloc(conn);
+ if (IS_ERR(rm)) {
+ ret = PTR_ERR(rm);
+ break;
+ }
+
+ conn->c_xmit_rm = rm;
+ }
+
+ /*
+ * Grab the next message from the send queue, if there is one.
+ *
+ * c_xmit_rm holds a ref while we're sending this message down
+		 * the connection. We can use this ref while holding the
+		 * send_sem; rdsv3_send_reset() is serialized with it.
+ */
+ if (rm == NULL) {
+ unsigned int len;
+
+ mutex_enter(&conn->c_lock);
+
+ if (!list_is_empty(&conn->c_send_queue)) {
+ rm = list_remove_head(&conn->c_send_queue);
+ rdsv3_message_addref(rm);
+
+				/*
+				 * Move the message from the send queue to
+				 * the retransmit list right away.
+				 */
+ list_insert_tail(&conn->c_retrans, rm);
+ }
+
+ mutex_exit(&conn->c_lock);
+
+ if (rm == NULL) {
+ was_empty = 1;
+ break;
+ }
+
+ /*
+			 * Unfortunately, the way InfiniBand deals with
+			 * RDMA to a bad MR key is by moving the entire
+			 * queue pair to error state. We could possibly
+ * recover from that, but right now we drop the
+ * connection.
+ * Therefore, we never retransmit messages with
+ * RDMA ops.
+ */
+ if (rm->m_rdma_op &&
+ test_bit(RDSV3_MSG_RETRANSMITTED, &rm->m_flags)) {
+ mutex_enter(&conn->c_lock);
+ if (test_and_clear_bit(RDSV3_MSG_ON_CONN,
+ &rm->m_flags))
+ list_remove_node(&rm->m_conn_item);
+ list_insert_tail(&to_be_dropped, rm);
+ mutex_exit(&conn->c_lock);
+ rdsv3_message_put(rm);
+ continue;
+ }
+
+ /* Require an ACK every once in a while */
+ len = ntohl(rm->m_inc.i_hdr.h_len);
+ if (conn->c_unacked_packets == 0 ||
+ conn->c_unacked_bytes < len) {
+ set_bit(RDSV3_MSG_ACK_REQUIRED, &rm->m_flags);
+
+ conn->c_unacked_packets =
+ rdsv3_sysctl_max_unacked_packets;
+ conn->c_unacked_bytes =
+ rdsv3_sysctl_max_unacked_bytes;
+ rdsv3_stats_inc(s_send_ack_required);
+ } else {
+ conn->c_unacked_bytes -= len;
+ conn->c_unacked_packets--;
+ }
+
+ conn->c_xmit_rm = rm;
+ }
+
+ /*
+ * Try and send an rdma message. Let's see if we can
+ * keep this simple and require that the transport either
+ * send the whole rdma or none of it.
+ */
+ if (rm->m_rdma_op && !conn->c_xmit_rdma_sent) {
+ ret = conn->c_trans->xmit_rdma(conn, rm->m_rdma_op);
+ if (ret)
+ break;
+ conn->c_xmit_rdma_sent = 1;
+ /*
+ * The transport owns the mapped memory for now.
+ * You can't unmap it while it's on the send queue
+ */
+ set_bit(RDSV3_MSG_MAPPED, &rm->m_flags);
+ }
+
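+		/*
+		 * The transport's xmit() returns the number of bytes it
+		 * consumed; charge them against the unsent header bytes
+		 * first, then walk them off the scatterlist entries.
+		 */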
+ if (conn->c_xmit_hdr_off < sizeof (struct rdsv3_header) ||
+ conn->c_xmit_sg < rm->m_nents) {
+ ret = conn->c_trans->xmit(conn, rm,
+ conn->c_xmit_hdr_off,
+ conn->c_xmit_sg,
+ conn->c_xmit_data_off);
+ if (ret <= 0)
+ break;
+
+ if (conn->c_xmit_hdr_off <
+ sizeof (struct rdsv3_header)) {
+ tmp = min(ret,
+ sizeof (struct rdsv3_header) -
+ conn->c_xmit_hdr_off);
+ conn->c_xmit_hdr_off += tmp;
+ ret -= tmp;
+ }
+
+ sg = &rm->m_sg[conn->c_xmit_sg];
+ while (ret) {
+ tmp = min(ret, rdsv3_sg_len(sg) -
+ conn->c_xmit_data_off);
+ conn->c_xmit_data_off += tmp;
+ ret -= tmp;
+ if (conn->c_xmit_data_off == rdsv3_sg_len(sg)) {
+ conn->c_xmit_data_off = 0;
+ sg++;
+ conn->c_xmit_sg++;
+ ASSERT(!(ret != 0 &&
+ conn->c_xmit_sg == rm->m_nents));
+ }
+ }
+ }
+ }
+
+ /* Nuke any messages we decided not to retransmit. */
+ if (!list_is_empty(&to_be_dropped))
+ rdsv3_send_remove_from_sock(&to_be_dropped, RDSV3_RDMA_DROPPED);
+
+ if (conn->c_trans->xmit_complete)
+ conn->c_trans->xmit_complete(conn);
+
+ /*
+ * We might be racing with another sender who queued a message but
+ * backed off on noticing that we held the c_send_lock. If we check
+ * for queued messages after dropping the sem then either we'll
+ * see the queued message or the queuer will get the sem. If we
+ * notice the queued message then we trigger an immediate retry.
+ *
+ * We need to be careful only to do this when we stopped processing
+ * the send queue because it was empty. It's the only way we
+ * stop processing the loop when the transport hasn't taken
+ * responsibility for forward progress.
+ */
+ mutex_exit(&conn->c_send_lock);
+
+ if (conn->c_map_bytes || (send_quota == 0 && !was_empty)) {
+ /*
+ * We exhausted the send quota, but there's work left to
+ * do. Return and (re-)schedule the send worker.
+ */
+ ret = -EAGAIN;
+ }
+
+ if (ret == 0 && was_empty) {
+ /*
+ * A simple bit test would be way faster than taking the
+ * spin lock
+ */
+ mutex_enter(&conn->c_lock);
+ if (!list_is_empty(&conn->c_send_queue)) {
+ rdsv3_stats_inc(s_send_sem_queue_raced);
+ ret = -EAGAIN;
+ }
+ mutex_exit(&conn->c_lock);
+ }
+
+out:
+ RDSV3_DPRINTF4("rdsv3_send_xmit", "Return(conn: %p, ret: %d)",
+ conn, ret);
+ return (ret);
+}
+
+static void
+rdsv3_send_sndbuf_remove(struct rdsv3_sock *rs, struct rdsv3_message *rm)
+{
+ uint32_t len = ntohl(rm->m_inc.i_hdr.h_len);
+
+ ASSERT(mutex_owned(&rs->rs_lock));
+
+ ASSERT(rs->rs_snd_bytes >= len);
+ rs->rs_snd_bytes -= len;
+
+ if (rs->rs_snd_bytes == 0)
+ rdsv3_stats_inc(s_send_queue_empty);
+}
+
+static inline int
+rdsv3_send_is_acked(struct rdsv3_message *rm, uint64_t ack,
+ is_acked_func is_acked)
+{
+ if (is_acked)
+ return (is_acked(rm, ack));
+ return (ntohll(rm->m_inc.i_hdr.h_sequence) <= ack);
+}
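+
+/*
+ * With no transport-specific hook, a message counts as acked once its
+ * header sequence number is <= the peer's cumulative ack.
+ */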
+
+/*
+ * Returns true if there are no messages on the send and retransmit queues
+ * with a sequence number smaller than the given sequence number, i.e.
+ * everything before 'seq' has been acked and removed.
+ */
+int
+rdsv3_send_acked_before(struct rdsv3_connection *conn, uint64_t seq)
+{
+ struct rdsv3_message *rm;
+ int ret = 1;
+
+ RDSV3_DPRINTF4("rdsv3_send_acked_before", "Enter(conn: %p)", conn);
+
+ mutex_enter(&conn->c_lock);
+
+	/* XXX - original code spits out warning */
+	rm = list_head(&conn->c_retrans);
+	if (rm != NULL && ntohll(rm->m_inc.i_hdr.h_sequence) < seq)
+		ret = 0;
+
+	/* XXX - original code spits out warning */
+	rm = list_head(&conn->c_send_queue);
+	if (rm != NULL && ntohll(rm->m_inc.i_hdr.h_sequence) < seq)
+		ret = 0;
+
+ mutex_exit(&conn->c_lock);
+
+ RDSV3_DPRINTF4("rdsv3_send_acked_before", "Return(conn: %p)", conn);
+
+ return (ret);
+}
+
+/*
+ * This is pretty similar to what happens below in the ACK
+ * handling code - except that we call here as soon as we get
+ * the IB send completion on the RDMA op and the accompanying
+ * message.
+ */
+void
+rdsv3_rdma_send_complete(struct rdsv3_message *rm, int status)
+{
+ struct rdsv3_sock *rs = NULL;
+ struct rdsv3_rdma_op *ro;
+ struct rdsv3_notifier *notifier;
+
+ RDSV3_DPRINTF4("rdsv3_rdma_send_complete", "Enter(rm: %p)", rm);
+
+ mutex_enter(&rm->m_rs_lock);
+
+ ro = rm->m_rdma_op;
+ if (test_bit(RDSV3_MSG_ON_SOCK, &rm->m_flags) &&
+ ro && ro->r_notify &&
+ (notifier = ro->r_notifier) != NULL) {
+ ro->r_notifier = NULL;
+ rs = rm->m_rs;
+ rdsv3_sk_sock_hold(rdsv3_rs_to_sk(rs));
+
+ notifier->n_status = status;
+ mutex_enter(&rs->rs_lock);
+ list_insert_tail(&rs->rs_notify_queue, notifier);
+ mutex_exit(&rs->rs_lock);
+ }
+
+ mutex_exit(&rm->m_rs_lock);
+
+ if (rs) {
+ rdsv3_wake_sk_sleep(rs);
+ rdsv3_sk_sock_put(rdsv3_rs_to_sk(rs));
+ }
+
+ RDSV3_DPRINTF4("rdsv3_rdma_send_complete", "Return(rm: %p)", rm);
+}
+
+/*
+ * This is the same as rdsv3_rdma_send_complete except we
+ * don't do any locking - we have all the ingredients (message,
+ * socket, socket lock) and can just move the notifier.
+ */
+static inline void
+__rdsv3_rdma_send_complete(struct rdsv3_sock *rs, struct rdsv3_message *rm,
+ int status)
+{
+	struct rdsv3_rdma_op *ro;
+
+ RDSV3_DPRINTF4("__rdsv3_rdma_send_complete",
+ "Enter(rs: %p, rm: %p)", rs, rm);
+
+ ro = rm->m_rdma_op;
+ if (ro && ro->r_notify && ro->r_notifier) {
+ ro->r_notifier->n_status = status;
+ list_insert_tail(&rs->rs_notify_queue, ro->r_notifier);
+ ro->r_notifier = NULL;
+ }
+
+ /* No need to wake the app - caller does this */
+}
+
+/*
+ * This is called from the IB send completion when we detect
+ * a RDMA operation that failed with remote access error.
+ * So speed is not an issue here.
+ */
+struct rdsv3_message *
+rdsv3_send_get_message(struct rdsv3_connection *conn,
+ struct rdsv3_rdma_op *op)
+{
+ struct rdsv3_message *rm, *tmp, *found = NULL;
+
+ RDSV3_DPRINTF4("rdsv3_send_get_message", "Enter(conn: %p)", conn);
+
+ mutex_enter(&conn->c_lock);
+
+ RDSV3_FOR_EACH_LIST_NODE_SAFE(rm, tmp, &conn->c_retrans, m_conn_item) {
+ if (rm->m_rdma_op == op) {
+ atomic_add_32(&rm->m_refcount, 1);
+ found = rm;
+ goto out;
+ }
+ }
+
+ RDSV3_FOR_EACH_LIST_NODE_SAFE(rm, tmp, &conn->c_send_queue,
+ m_conn_item) {
+ if (rm->m_rdma_op == op) {
+ atomic_add_32(&rm->m_refcount, 1);
+ found = rm;
+ break;
+ }
+ }
+
+out:
+ mutex_exit(&conn->c_lock);
+
+ return (found);
+}
+
+/*
+ * This removes messages from the socket's list if they're on it. The list
+ * argument must be private to the caller; we must be able to modify it
+ * without locks. The messages must have a reference held for their
+ * position on the list. This function will drop that reference after
+ * removing the messages from the 'messages' list regardless of whether it
+ * found the messages on the socket list or not.
+ */
+void
+rdsv3_send_remove_from_sock(struct list *messages, int status)
+{
+ struct rdsv3_sock *rs = NULL;
+ struct rdsv3_message *rm;
+
+ RDSV3_DPRINTF4("rdsv3_send_remove_from_sock", "Enter");
+
+ while (!list_is_empty(messages)) {
+ rm = list_remove_head(messages);
+
+ /*
+ * If we see this flag cleared then we're *sure* that someone
+ * else beat us to removing it from the sock. If we race
+ * with their flag update we'll get the lock and then really
+ * see that the flag has been cleared.
+ *
+ * The message spinlock makes sure nobody clears rm->m_rs
+ * while we're messing with it. It does not prevent the
+ * message from being removed from the socket, though.
+ */
+ mutex_enter(&rm->m_rs_lock);
+ if (!test_bit(RDSV3_MSG_ON_SOCK, &rm->m_flags))
+ goto unlock_and_drop;
+
+ if (rs != rm->m_rs) {
+ if (rs) {
+ rdsv3_wake_sk_sleep(rs);
+ rdsv3_sk_sock_put(rdsv3_rs_to_sk(rs));
+ }
+ rs = rm->m_rs;
+ rdsv3_sk_sock_hold(rdsv3_rs_to_sk(rs));
+ }
+
+ mutex_enter(&rs->rs_lock);
+ if (test_and_clear_bit(RDSV3_MSG_ON_SOCK, &rm->m_flags)) {
+ struct rdsv3_rdma_op *ro = rm->m_rdma_op;
+ struct rdsv3_notifier *notifier;
+
+ list_remove_node(&rm->m_sock_item);
+ rdsv3_send_sndbuf_remove(rs, rm);
+
+ if (ro &&
+ (notifier = ro->r_notifier) != NULL &&
+ (status || ro->r_notify)) {
+ list_insert_tail(&rs->rs_notify_queue,
+ notifier);
+ if (!notifier->n_status)
+ notifier->n_status = status;
+ rm->m_rdma_op->r_notifier = NULL;
+ }
+ rdsv3_message_put(rm);
+ rm->m_rs = NULL;
+ }
+ mutex_exit(&rs->rs_lock);
+
+unlock_and_drop:
+ mutex_exit(&rm->m_rs_lock);
+ rdsv3_message_put(rm);
+ }
+
+ if (rs) {
+ rdsv3_wake_sk_sleep(rs);
+ rdsv3_sk_sock_put(rdsv3_rs_to_sk(rs));
+ }
+
+ RDSV3_DPRINTF4("rdsv3_send_remove_from_sock", "Return");
+}
+
+/*
+ * Transports call here when they've determined that the receiver queued
+ * messages up to, and including, the given sequence number. Messages are
+ * moved to the retrans queue when rdsv3_send_xmit picks them off the send
+ * queue. This means that in the TCP case, the message may not have been
+ * assigned the m_ack_seq yet - but that's fine as long as tcp_is_acked
+ * checks the RDSV3_MSG_HAS_ACK_SEQ bit.
+ *
+ * XXX It's not clear to me how this is safely serialized with socket
+ * destruction. Maybe it should bail if it sees SOCK_DEAD.
+ */
+void
+rdsv3_send_drop_acked(struct rdsv3_connection *conn, uint64_t ack,
+ is_acked_func is_acked)
+{
+ struct rdsv3_message *rm, *tmp;
+ list_t list;
+
+ RDSV3_DPRINTF4("rdsv3_send_drop_acked", "Enter(conn: %p)", conn);
+
+ list_create(&list, sizeof (struct rdsv3_message),
+ offsetof(struct rdsv3_message, m_conn_item));
+
+ mutex_enter(&conn->c_lock);
+
+ RDSV3_FOR_EACH_LIST_NODE_SAFE(rm, tmp, &conn->c_retrans, m_conn_item) {
+ if (!rdsv3_send_is_acked(rm, ack, is_acked))
+ break;
+
+ list_remove_node(&rm->m_conn_item);
+ list_insert_tail(&list, rm);
+ clear_bit(RDSV3_MSG_ON_CONN, &rm->m_flags);
+ }
+
+#if 0
+XXX
+ /* order flag updates with spin locks */
+ if (!list_is_empty(&list))
+ smp_mb__after_clear_bit();
+#endif
+
+ mutex_exit(&conn->c_lock);
+
+ /* now remove the messages from the sock list as needed */
+ rdsv3_send_remove_from_sock(&list, RDSV3_RDMA_SUCCESS);
+
+ RDSV3_DPRINTF4("rdsv3_send_drop_acked", "Return(conn: %p)", conn);
+}
+
+void
+rdsv3_send_drop_to(struct rdsv3_sock *rs, struct sockaddr_in *dest)
+{
+ struct rdsv3_message *rm, *tmp;
+ struct rdsv3_connection *conn;
+ list_t list;
+ int wake = 0;
+
+ RDSV3_DPRINTF4("rdsv3_send_drop_to", "Enter(rs: %p)", rs);
+
+ list_create(&list, sizeof (struct rdsv3_message),
+ offsetof(struct rdsv3_message, m_sock_item));
+
+ /* get all the messages we're dropping under the rs lock */
+ mutex_enter(&rs->rs_lock);
+
+ RDSV3_FOR_EACH_LIST_NODE_SAFE(rm, tmp, &rs->rs_send_queue,
+ m_sock_item) {
+ if (dest && (dest->sin_addr.s_addr != rm->m_daddr ||
+ dest->sin_port != rm->m_inc.i_hdr.h_dport))
+ continue;
+
+ wake = 1;
+ list_remove(&rs->rs_send_queue, rm);
+ list_insert_tail(&list, rm);
+ rdsv3_send_sndbuf_remove(rs, rm);
+ clear_bit(RDSV3_MSG_ON_SOCK, &rm->m_flags);
+ }
+
+ mutex_exit(&rs->rs_lock);
+
+ conn = NULL;
+
+ /* now remove the messages from the conn list as needed */
+ RDSV3_FOR_EACH_LIST_NODE(rm, &list, m_sock_item) {
+ /*
+ * We do this here rather than in the loop above, so that
+ * we don't have to nest m_rs_lock under rs->rs_lock
+ */
+ mutex_enter(&rm->m_rs_lock);
+ /* If this is a RDMA operation, notify the app. */
+ __rdsv3_rdma_send_complete(rs, rm, RDSV3_RDMA_CANCELED);
+ rm->m_rs = NULL;
+ mutex_exit(&rm->m_rs_lock);
+
+ /*
+ * If we see this flag cleared then we're *sure* that someone
+ * else beat us to removing it from the conn. If we race
+ * with their flag update we'll get the lock and then really
+ * see that the flag has been cleared.
+ */
+ if (!test_bit(RDSV3_MSG_ON_CONN, &rm->m_flags))
+ continue;
+
+ if (conn != rm->m_inc.i_conn) {
+ if (conn)
+ mutex_exit(&conn->c_lock);
+ conn = rm->m_inc.i_conn;
+ mutex_enter(&conn->c_lock);
+ }
+
+ if (test_and_clear_bit(RDSV3_MSG_ON_CONN, &rm->m_flags)) {
+ list_remove_node(&rm->m_conn_item);
+ rdsv3_message_put(rm);
+ }
+ }
+
+ if (conn)
+ mutex_exit(&conn->c_lock);
+
+ if (wake)
+ rdsv3_wake_sk_sleep(rs);
+
+ while (!list_is_empty(&list)) {
+ rm = list_remove_head(&list);
+
+ rdsv3_message_wait(rm);
+ rdsv3_message_put(rm);
+ }
+
+ RDSV3_DPRINTF4("rdsv3_send_drop_to", "Return(rs: %p)", rs);
+}
+
+/*
+ * We only want this to fire once, so we use the caller's 'queued'. It's
+ * possible that another thread can race with us and remove the
+ * message from the flow with RDSV3_CANCEL_SENT_TO.
+ */
+static int
+rdsv3_send_queue_rm(struct rdsv3_sock *rs, struct rdsv3_connection *conn,
+ struct rdsv3_message *rm, uint16_be_t sport,
+ uint16_be_t dport, int *queued)
+{
+ uint32_t len;
+
+ RDSV3_DPRINTF4("rdsv3_send_queue_rm", "Enter(rs: %p, rm: %p)", rs, rm);
+
+ if (*queued)
+ goto out;
+
+ len = ntohl(rm->m_inc.i_hdr.h_len);
+
+ /*
+ * this is the only place which holds both the socket's rs_lock
+ * and the connection's c_lock
+ */
+ mutex_enter(&rs->rs_lock);
+
+ /*
+ * If there is a little space in sndbuf, we don't queue anything,
+ * and userspace gets -EAGAIN. But poll() indicates there's send
+ * room. This can lead to bad behavior (spinning) if snd_bytes isn't
+ * freed up by incoming acks. So we check the *old* value of
+ * rs_snd_bytes here to allow the last msg to exceed the buffer,
+ * and poll() now knows no more data can be sent.
+ */
+ if (rs->rs_snd_bytes < rdsv3_sk_sndbuf(rs)) {
+ rs->rs_snd_bytes += len;
+
+ /*
+ * let recv side know we are close to send space exhaustion.
+ * This is probably not the optimal way to do it, as this
+ * means we set the flag on *all* messages as soon as our
+ * throughput hits a certain threshold.
+ */
+ if (rs->rs_snd_bytes >= rdsv3_sk_sndbuf(rs) / 2)
+ set_bit(RDSV3_MSG_ACK_REQUIRED, &rm->m_flags);
+
+ list_insert_tail(&rs->rs_send_queue, rm);
+ set_bit(RDSV3_MSG_ON_SOCK, &rm->m_flags);
+
+ rdsv3_message_addref(rm);
+ rm->m_rs = rs;
+
+ /*
+ * The code ordering is a little weird, but we're
+ * trying to minimize the time we hold c_lock
+ */
+ rdsv3_message_populate_header(&rm->m_inc.i_hdr, sport,
+ dport, 0);
+ rm->m_inc.i_conn = conn;
+ rdsv3_message_addref(rm); /* XXX - called twice */
+
+ mutex_enter(&conn->c_lock);
+ rm->m_inc.i_hdr.h_sequence = htonll(conn->c_next_tx_seq++);
+ list_insert_tail(&conn->c_send_queue, rm);
+ set_bit(RDSV3_MSG_ON_CONN, &rm->m_flags);
+ mutex_exit(&conn->c_lock);
+
+ RDSV3_DPRINTF5("rdsv3_send_queue_rm",
+ "queued msg %p len %d, rs %p bytes %d seq %llu",
+ rm, len, rs, rs->rs_snd_bytes,
+ (unsigned long long)ntohll(
+ rm->m_inc.i_hdr.h_sequence));
+
+ *queued = 1;
+ }
+
+ mutex_exit(&rs->rs_lock);
+
+ RDSV3_DPRINTF4("rdsv3_send_queue_rm", "Return(rs: %p)", rs);
+out:
+ return (*queued);
+}
+
+static int
+rdsv3_cmsg_send(struct rdsv3_sock *rs, struct rdsv3_message *rm,
+ struct msghdr *msg, int *allocated_mr)
+{
+ struct cmsghdr *cmsg;
+ int ret = 0;
+
+ RDSV3_DPRINTF4("rdsv3_cmsg_send", "Enter(rs: %p)", rs);
+
+ for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
+
+ if (cmsg->cmsg_level != SOL_RDS)
+ continue;
+
+ RDSV3_DPRINTF4("rdsv3_cmsg_send", "cmsg(%p, %p) type %d",
+ cmsg, rm, cmsg->cmsg_type);
+ /*
+ * As a side effect, RDMA_DEST and RDMA_MAP will set
+ * rm->m_rdma_cookie and rm->m_rdma_mr.
+ */
+ switch (cmsg->cmsg_type) {
+ case RDSV3_CMSG_RDMA_ARGS:
+ ret = rdsv3_cmsg_rdma_args(rs, rm, cmsg);
+ break;
+
+ case RDSV3_CMSG_RDMA_DEST:
+ ret = rdsv3_cmsg_rdma_dest(rs, rm, cmsg);
+ break;
+
+ case RDSV3_CMSG_RDMA_MAP:
+ ret = rdsv3_cmsg_rdma_map(rs, rm, cmsg);
+			if (ret == 0)
+				*allocated_mr = 1;
+ break;
+
+ default:
+ return (-EINVAL);
+ }
+
+ if (ret)
+ break;
+ }
+
+ RDSV3_DPRINTF4("rdsv3_cmsg_send", "Return(rs: %p)", rs);
+
+ return (ret);
+}
+
+int
+rdsv3_sendmsg(struct rdsv3_sock *rs, uio_t *uio, struct nmsghdr *msg,
+ size_t payload_len)
+{
+ struct rsock *sk = rdsv3_rs_to_sk(rs);
+ struct sockaddr_in *usin = (struct sockaddr_in *)msg->msg_name;
+ uint32_be_t daddr;
+ uint16_be_t dport;
+ struct rdsv3_message *rm = NULL;
+ struct rdsv3_connection *conn;
+ int ret = 0;
+ int queued = 0, allocated_mr = 0;
+ int nonblock = msg->msg_flags & MSG_DONTWAIT;
+ long timeo = rdsv3_rcvtimeo(sk, nonblock);
+
+ RDSV3_DPRINTF4("rdsv3_sendmsg", "Enter(rs: %p)", rs);
+
+ if (msg->msg_namelen) {
+ /* XXX fail non-unicast destination IPs? */
+ if (msg->msg_namelen < sizeof (*usin) ||
+ usin->sin_family != AF_INET_OFFLOAD) {
+ ret = -EINVAL;
+ RDSV3_DPRINTF2("rdsv3_sendmsg", "returning: %d", -ret);
+ goto out;
+ }
+ daddr = usin->sin_addr.s_addr;
+ dport = usin->sin_port;
+ } else {
+ /* We only care about consistency with ->connect() */
+ mutex_enter(&sk->sk_lock);
+ daddr = rs->rs_conn_addr;
+ dport = rs->rs_conn_port;
+ mutex_exit(&sk->sk_lock);
+ }
+
+ /* racing with another thread binding seems ok here */
+ if (daddr == 0 || rs->rs_bound_addr == 0) {
+ ret = -ENOTCONN; /* XXX not a great errno */
+ RDSV3_DPRINTF2("rdsv3_sendmsg", "returning: %d", -ret);
+ goto out;
+ }
+
+ rm = rdsv3_message_copy_from_user(uio, payload_len);
+ if (IS_ERR(rm)) {
+ ret = PTR_ERR(rm);
+ RDSV3_DPRINTF2("rdsv3_sendmsg",
+ "rdsv3_message_copy_from_user failed %d", -ret);
+ rm = NULL;
+ goto out;
+ }
+
+ rm->m_daddr = daddr;
+
+ /*
+ * rdsv3_conn_create has a spinlock that runs with IRQ off.
+ * Caching the conn in the socket helps a lot.
+ */
+ mutex_enter(&rs->rs_conn_lock);
+ if (rs->rs_conn && rs->rs_conn->c_faddr == daddr) {
+ conn = rs->rs_conn;
+ } else {
+ conn = rdsv3_conn_create_outgoing(rs->rs_bound_addr,
+ daddr, rs->rs_transport, KM_NOSLEEP);
+ if (IS_ERR(conn)) {
+ mutex_exit(&rs->rs_conn_lock);
+ ret = PTR_ERR(conn);
+ RDSV3_DPRINTF2("rdsv3_sendmsg",
+ "rdsv3_conn_create_outgoing failed %d",
+ -ret);
+ goto out;
+ }
+ rs->rs_conn = conn;
+ }
+ mutex_exit(&rs->rs_conn_lock);
+
+ /* Parse any control messages the user may have included. */
+ ret = rdsv3_cmsg_send(rs, rm, msg, &allocated_mr);
+ if (ret) {
+ RDSV3_DPRINTF2("rdsv3_sendmsg",
+ "rdsv3_cmsg_send(rs: %p rm: %p msg: %p) returned: %d",
+ rs, rm, msg, ret);
+ goto out;
+ }
+
+ if ((rm->m_rdma_cookie || rm->m_rdma_op) &&
+ conn->c_trans->xmit_rdma == NULL) {
+ RDSV3_DPRINTF0("rdsv3_sendmsg", "rdma_op %p conn xmit_rdma %p",
+ rm->m_rdma_op, conn->c_trans->xmit_rdma);
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ /*
+ * If the connection is down, trigger a connect. We may
+ * have scheduled a delayed reconnect however - in this case
+ * we should not interfere.
+ */
+ if (rdsv3_conn_state(conn) == RDSV3_CONN_DOWN &&
+ !test_and_set_bit(RDSV3_RECONNECT_PENDING, &conn->c_flags))
+ rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_conn_w, 0);
+
+ ret = rdsv3_cong_wait(conn->c_fcong, dport, nonblock, rs);
+ if (ret) {
+ RDSV3_DPRINTF2("rdsv3_sendmsg",
+ "rdsv3_cong_wait (dport: %d) returned: %d", dport, ret);
+ goto out;
+ }
+
+ (void) rdsv3_send_queue_rm(rs, conn, rm, rs->rs_bound_port, dport,
+ &queued);
+ if (!queued) {
+ /* rdsv3_stats_inc(s_send_queue_full); */
+ /* XXX make sure this is reasonable */
+ if (payload_len > rdsv3_sk_sndbuf(rs)) {
+ ret = -EMSGSIZE;
+ RDSV3_DPRINTF2("rdsv3_sendmsg",
+ "msgsize(%d) too big, returning: %d",
+ payload_len, -ret);
+ goto out;
+ }
+ if (nonblock) {
+ ret = -EAGAIN;
+ RDSV3_DPRINTF3("rdsv3_sendmsg",
+ "send queue full (%d), returning: %d",
+ payload_len, -ret);
+ goto out;
+ }
+
+ mutex_enter(&sk->sk_sleep->waitq_mutex);
+ while (!rdsv3_send_queue_rm(rs, conn, rm, rs->rs_bound_port,
+ dport, &queued)) {
+#if 0
+ ret = cv_timedwait_sig(&sk->sk_sleep->waitq_cv,
+ &sk->sk_sleep->waitq_mutex,
+ timeo * drv_usectohz(1000000) + ddi_get_lbolt());
+ if (ret <= 0) {
+ /* signal/timeout pending */
+ RDSV3_DPRINTF2("rdsv3_sendmsg",
+ "woke due to signal/timeout: %d",
+ ret);
+ ret = (ret == 0) ? -ERESTART : -ETIMEDOUT;
+ mutex_exit(&sk->sk_sleep->waitq_mutex);
+ goto out;
+ }
+#else
+ ret = cv_wait_sig(&sk->sk_sleep->waitq_cv,
+ &sk->sk_sleep->waitq_mutex);
+ if (ret == 0) {
+ /* signal/timeout pending */
+ RDSV3_DPRINTF2("rdsv3_sendmsg",
+ "woke due to signal: %d",
+ ret);
+ ret = -ERESTART;
+ mutex_exit(&sk->sk_sleep->waitq_mutex);
+ goto out;
+ }
+#endif
+ }
+ mutex_exit(&sk->sk_sleep->waitq_mutex);
+
+ RDSV3_DPRINTF5("rdsv3_sendmsg", "sendmsg woke queued %d",
+ queued);
+
+ ASSERT(queued);
+ ret = 0;
+ }
+
+ /*
+ * By now we've committed to the send. We reuse rdsv3_send_worker()
+ * to retry sends in the rds thread if the transport asks us to.
+ */
+ rdsv3_stats_inc(s_send_queued);
+
+ if (!test_bit(RDSV3_LL_SEND_FULL, &conn->c_flags))
+ rdsv3_send_worker(&conn->c_send_w.work);
+
+ rdsv3_message_put(rm);
+ RDSV3_DPRINTF4("rdsv3_sendmsg", "Return(rs: %p, len: %d)",
+ rs, payload_len);
+ return (payload_len);
+
+out:
+ /*
+ * If the user included a RDMA_MAP cmsg, we allocated a MR on the fly.
+ * If the sendmsg goes through, we keep the MR. If it fails with EAGAIN
+ * or in any other way, we need to destroy the MR again
+ */
+ if (allocated_mr)
+ rdsv3_rdma_unuse(rs, rdsv3_rdma_cookie_key(rm->m_rdma_cookie),
+ 1);
+
+ if (rm)
+ rdsv3_message_put(rm);
+ return (ret);
+}
+
+/*
+ * Reply to a ping packet.
+ */
+int
+rdsv3_send_pong(struct rdsv3_connection *conn, uint16_be_t dport)
+{
+ struct rdsv3_message *rm;
+ int ret = 0;
+
+ RDSV3_DPRINTF4("rdsv3_send_pong", "Enter(conn: %p)", conn);
+
+ rm = rdsv3_message_alloc(0, KM_NOSLEEP);
+ if (rm == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ rm->m_daddr = conn->c_faddr;
+
+ /*
+ * If the connection is down, trigger a connect. We may
+ * have scheduled a delayed reconnect however - in this case
+ * we should not interfere.
+ */
+ if (rdsv3_conn_state(conn) == RDSV3_CONN_DOWN &&
+ !test_and_set_bit(RDSV3_RECONNECT_PENDING, &conn->c_flags))
+ rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_conn_w, 0);
+
+ ret = rdsv3_cong_wait(conn->c_fcong, dport, 1, NULL);
+ if (ret)
+ goto out;
+
+ mutex_enter(&conn->c_lock);
+ list_insert_tail(&conn->c_send_queue, rm);
+ set_bit(RDSV3_MSG_ON_CONN, &rm->m_flags);
+ rdsv3_message_addref(rm);
+ rm->m_inc.i_conn = conn;
+
+ rdsv3_message_populate_header(&rm->m_inc.i_hdr, 0, dport,
+ conn->c_next_tx_seq);
+ conn->c_next_tx_seq++;
+ mutex_exit(&conn->c_lock);
+
+ rdsv3_stats_inc(s_send_queued);
+ rdsv3_stats_inc(s_send_pong);
+
+ rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_send_w, 0);
+ rdsv3_message_put(rm);
+
+ RDSV3_DPRINTF4("rdsv3_send_pong", "Return(conn: %p)", conn);
+ return (0);
+
+out:
+ if (rm)
+ rdsv3_message_put(rm);
+ return (ret);
+}
diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/stats.c b/usr/src/uts/common/io/ib/clients/rdsv3/stats.c
new file mode 100644
index 0000000000..0082657127
--- /dev/null
+++ b/usr/src/uts/common/io/ib/clients/rdsv3/stats.c
@@ -0,0 +1,174 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <sys/rds.h>
+
+#include <sys/ib/clients/rdsv3/rdsv3.h>
+
+RDSV3_DEFINE_PER_CPU(struct rdsv3_statistics, rdsv3_stats);
+
+static char *rdsv3_stat_names[] = {
+ "conn_reset",
+ "recv_drop_bad_checksum",
+ "recv_drop_old_seq",
+ "recv_drop_no_sock",
+ "recv_drop_dead_sock",
+ "recv_deliver_raced",
+ "recv_delivered",
+ "recv_queued",
+ "recv_immediate_retry",
+ "recv_delayed_retry",
+ "recv_ack_required",
+ "recv_rdma_bytes",
+ "recv_ping",
+ "send_queue_empty",
+ "send_queue_full",
+ "send_sem_contention",
+ "send_sem_queue_raced",
+ "send_immediate_retry",
+ "send_delayed_retry",
+ "send_drop_acked",
+ "send_ack_required",
+ "send_queued",
+ "send_rdma",
+ "send_rdma_bytes",
+ "send_pong",
+ "page_remainder_hit",
+ "page_remainder_miss",
+ "copy_to_user",
+ "copy_from_user",
+ "cong_update_queued",
+ "cong_update_received",
+ "cong_send_error",
+ "cong_send_blocked",
+};
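+
+/*
+ * Note: this name table is expected to match the field order of
+ * struct rdsv3_statistics, since rdsv3_stats_info() below walks that
+ * struct as a flat array of uint64_t counters.
+ */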
+
+void
+rdsv3_stats_info_copy(struct rdsv3_info_iterator *iter,
+ uint64_t *values, char **names, size_t nr)
+{
+ struct rdsv3_info_counter ctr;
+ size_t i;
+
+ for (i = 0; i < nr; i++) {
+		ASSERT(strlen(names[i]) < sizeof (ctr.name));
+		(void) strncpy((char *)ctr.name, names[i],
+		    sizeof (ctr.name) - 1);
+		ctr.name[sizeof (ctr.name) - 1] = '\0';
+ ctr.value = values[i];
+
+ rdsv3_info_copy(iter, &ctr, sizeof (ctr));
+ }
+}
+
+/*
+ * This gives global counters across all the transports. The strings
+ * are copied in so that the tool doesn't need knowledge of the specific
+ * stats that we're exporting. Some are pretty implementation dependent
+ * and may change over time. That doesn't stop them from being useful.
+ *
+ * This is the only function in the chain that knows about the byte-granular
+ * length in userspace. It converts it to the number of stat entries that
+ * the rest of the functions operate on.
+ */
+/* ARGSUSED */
+static void
+rdsv3_stats_info(struct rsock *sock, unsigned int len,
+ struct rdsv3_info_iterator *iter,
+ struct rdsv3_info_lengths *lens)
+{
+ struct rdsv3_statistics stats = {0, };
+ uint64_t *src;
+ uint64_t *sum;
+ size_t i;
+ int cpu;
+ unsigned int avail;
+
+ avail = len / sizeof (struct rdsv3_info_counter);
+
+ if (avail < ARRAY_SIZE(rdsv3_stat_names)) {
+ avail = 0;
+ goto trans;
+ }
+
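+	/*
+	 * Sum the per-CPU copies into one struct. Treating
+	 * struct rdsv3_statistics as a flat uint64_t array is only
+	 * valid while every field in it is a uint64_t counter.
+	 */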
+ for (cpu = 0; cpu < NR_CPUS; cpu++) {
+ src = (uint64_t *)&(rdsv3_per_cpu(rdsv3_stats, cpu));
+ sum = (uint64_t *)&stats;
+ for (i = 0; i < sizeof (stats) / sizeof (uint64_t); i++)
+ *(sum++) += *(src++);
+ }
+
+ rdsv3_stats_info_copy(iter, (uint64_t *)&stats, rdsv3_stat_names,
+ ARRAY_SIZE(rdsv3_stat_names));
+ avail -= ARRAY_SIZE(rdsv3_stat_names);
+
+trans:
+ lens->each = sizeof (struct rdsv3_info_counter);
+ lens->nr = rdsv3_trans_stats_info_copy(iter, avail) +
+ ARRAY_SIZE(rdsv3_stat_names);
+}
+
+void
+rdsv3_stats_exit(void)
+{
+ rdsv3_info_deregister_func(RDSV3_INFO_COUNTERS, rdsv3_stats_info);
+}
+
+int
+rdsv3_stats_init(void)
+{
+ rdsv3_info_register_func(RDSV3_INFO_COUNTERS, rdsv3_stats_info);
+ return (0);
+}
diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/sysctl.c b/usr/src/uts/common/io/ib/clients/rdsv3/sysctl.c
new file mode 100644
index 0000000000..3115394d0e
--- /dev/null
+++ b/usr/src/uts/common/io/ib/clients/rdsv3/sysctl.c
@@ -0,0 +1,86 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#define HZ 100
+#define msecs_to_jiffies(a) a
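+
+/*
+ * Porting shims (assumption: this port treats a "jiffy" as one
+ * delayed-work tick, so msecs_to_jiffies() is an identity mapping
+ * and HZ above caps the reconnect backoff in threads.c).
+ */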
+
+static unsigned long rdsv3_sysctl_reconnect_min = 1;
+
+unsigned long rdsv3_sysctl_reconnect_min_jiffies;
+unsigned long rdsv3_sysctl_reconnect_max_jiffies = HZ;
+
+unsigned int rdsv3_sysctl_max_unacked_packets = 8;
+unsigned int rdsv3_sysctl_max_unacked_bytes = (16 << 20);
+
+unsigned int rdsv3_sysctl_ping_enable = 1;
+
+unsigned long rdsv3_sysctl_trace_flags = 0;
+unsigned int rdsv3_sysctl_trace_level = 0;
+
+void
+rdsv3_sysctl_exit(void)
+{
+}
+
+int
+rdsv3_sysctl_init(void)
+{
+ rdsv3_sysctl_reconnect_min = msecs_to_jiffies(1);
+ rdsv3_sysctl_reconnect_min_jiffies = rdsv3_sysctl_reconnect_min;
+
+ return (0);
+}
diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/threads.c b/usr/src/uts/common/io/ib/clients/rdsv3/threads.c
new file mode 100644
index 0000000000..3b3bceee96
--- /dev/null
+++ b/usr/src/uts/common/io/ib/clients/rdsv3/threads.c
@@ -0,0 +1,356 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <sys/rds.h>
+#include <sys/sunddi.h>
+
+#include <sys/ib/clients/rdsv3/rdsv3.h>
+#include <sys/ib/clients/rdsv3/rdsv3_debug.h>
+
+/*
+ * All of connection management is simplified by serializing it through
+ * work queues that execute in a connection managing thread.
+ *
+ * TCP wants to send acks through sendpage() in response to data_ready(),
+ * but it needs a process context to do so.
+ *
+ * The receive paths need to allocate but can't drop packets (!) so we have
+ * a thread around to block allocating if the receive fast path sees an
+ * allocation failure.
+ */
+
+/*
+ * Grand Unified Theory of connection life cycle:
+ * At any point in time, the connection can be in one of these states:
+ * DOWN, CONNECTING, UP, DISCONNECTING, ERROR
+ *
+ * The following transitions are possible:
+ * ANY -> ERROR
+ * UP -> DISCONNECTING
+ * ERROR -> DISCONNECTING
+ * DISCONNECTING -> DOWN
+ * DOWN -> CONNECTING
+ * CONNECTING -> UP
+ *
+ * Transition to state DISCONNECTING/DOWN:
+ * - Inside the shutdown worker; synchronizes with xmit path
+ * through c_send_lock, and with connection management callbacks
+ * via c_cm_lock.
+ *
+ * For receive callbacks, we rely on the underlying transport
+ * (TCP, IB/RDMA) to provide the necessary synchronisation.
+ */
+struct rdsv3_workqueue_struct_s *rdsv3_wq;
+
+void
+rdsv3_connect_complete(struct rdsv3_connection *conn)
+{
+ RDSV3_DPRINTF4("rdsv3_connect_complete", "Enter(conn: %p)", conn);
+
+ if (!rdsv3_conn_transition(conn, RDSV3_CONN_CONNECTING,
+ RDSV3_CONN_UP)) {
+#ifndef __lock_lint
+ RDSV3_DPRINTF0("rdsv3_connect_complete",
+ "%s: Cannot transition to state UP, "
+ "current state is %d",
+ __func__,
+ atomic_get(&conn->c_state));
+#endif
+ conn->c_state = RDSV3_CONN_ERROR;
+ rdsv3_queue_work(rdsv3_wq, &conn->c_down_w);
+ return;
+ }
+
+ RDSV3_DPRINTF2("rdsv3_connect_complete",
+ "conn %p for %u.%u.%u.%u to %u.%u.%u.%u complete",
+ conn, NIPQUAD(conn->c_laddr), NIPQUAD(conn->c_faddr));
+
+ conn->c_reconnect_jiffies = 0;
+ set_bit(0, &conn->c_map_queued);
+ rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_send_w, 0);
+ rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_recv_w, 0);
+
+ RDSV3_DPRINTF4("rdsv3_connect_complete", "Return(conn: %p)", conn);
+}
+
+/*
+ * This random exponential backoff is relied on to eventually resolve racing
+ * connects.
+ *
+ * If connect attempts race then both parties drop both connections and come
+ * here to wait for a random amount of time before trying again. Eventually
+ * the backoff range will be so much greater than the time it takes to
+ * establish a connection that one of the pair will establish the connection
+ * before the other's random delay fires.
+ *
+ * Connection attempts that arrive while a connection is already established
+ * are also considered to be racing connects. This lets a connection from
+ * a rebooted machine replace an existing stale connection before the transport
+ * notices that the connection has failed.
+ *
+ * We should *always* start with a random backoff; otherwise a broken connection
+ * will always take several iterations to be re-established.
+ */
+static void
+rdsv3_queue_reconnect(struct rdsv3_connection *conn)
+{
+ unsigned long rand;
+
+ RDSV3_DPRINTF2("rdsv3_queue_reconnect",
+ "conn %p for %u.%u.%u.%u to %u.%u.%u.%u reconnect jiffies %lu",
+ conn, NIPQUAD(conn->c_laddr), NIPQUAD(conn->c_faddr),
+ conn->c_reconnect_jiffies);
+
+ set_bit(RDSV3_RECONNECT_PENDING, &conn->c_flags);
+ if (conn->c_reconnect_jiffies == 0) {
+ conn->c_reconnect_jiffies = rdsv3_sysctl_reconnect_min_jiffies;
+ rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_conn_w, 0);
+ return;
+ }
+
+ (void) random_get_pseudo_bytes((uint8_t *)&rand, sizeof (rand));
+ RDSV3_DPRINTF5("rdsv3",
+ "%lu delay %lu ceil conn %p for %u.%u.%u.%u -> %u.%u.%u.%u",
+ rand % conn->c_reconnect_jiffies, conn->c_reconnect_jiffies,
+ conn, NIPQUAD(conn->c_laddr), NIPQUAD(conn->c_faddr));
+ rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_conn_w,
+ rand % conn->c_reconnect_jiffies);
+
+ conn->c_reconnect_jiffies = min(conn->c_reconnect_jiffies * 2,
+ rdsv3_sysctl_reconnect_max_jiffies);
+}
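+
+/*
+ * Illustrative note (not in the original source): with the defaults
+ * reconnect_min_jiffies = 1 and reconnect_max_jiffies = HZ, the first
+ * two attempts fire immediately and later ones wait rand % 2,
+ * rand % 4, rand % 8, ... jiffies, capped at rand % HZ, so racing
+ * peers quickly diverge in their retry times.
+ */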
+
+void
+rdsv3_connect_worker(struct rdsv3_work_s *work)
+{
+ struct rdsv3_connection *conn = container_of(work,
+ struct rdsv3_connection, c_conn_w.work);
+ int ret;
+
+ RDSV3_DPRINTF2("rdsv3_connect_worker", "Enter(work: %p)", work);
+
+ clear_bit(RDSV3_RECONNECT_PENDING, &conn->c_flags);
+ if (rdsv3_conn_transition(conn, RDSV3_CONN_DOWN,
+ RDSV3_CONN_CONNECTING)) {
+ ret = conn->c_trans->conn_connect(conn);
+ RDSV3_DPRINTF5("rdsv3",
+ "connect conn %p for %u.%u.%u.%u -> %u.%u.%u.%u "
+ "ret %d", conn, NIPQUAD(conn->c_laddr),
+ NIPQUAD(conn->c_faddr), ret);
+ RDSV3_DPRINTF2("rdsv3_connect_worker",
+ "conn %p for %u.%u.%u.%u to %u.%u.%u.%u dispatched, ret %d",
+ conn, NIPQUAD(conn->c_laddr), NIPQUAD(conn->c_faddr), ret);
+
+ if (ret) {
+ if (rdsv3_conn_transition(conn, RDSV3_CONN_CONNECTING,
+ RDSV3_CONN_DOWN))
+ rdsv3_queue_reconnect(conn);
+ else {
+ RDSV3_DPRINTF2("rdsv3_connect_worker",
+ "RDS: connect failed: %p", conn);
+ rdsv3_conn_drop(conn);
+ }
+ }
+ }
+
+ RDSV3_DPRINTF2("rdsv3_connect_worker", "Return(work: %p)", work);
+}
+
+extern struct avl_tree rdsv3_conn_hash;
+
+void
+rdsv3_shutdown_worker(struct rdsv3_work_s *work)
+{
+ struct rdsv3_connection *conn = container_of(work,
+ struct rdsv3_connection, c_down_w);
+ struct rdsv3_conn_info_s conn_info;
+
+ RDSV3_DPRINTF2("rdsv3_shutdown_worker", "Enter(work: %p)", work);
+
+ /* shut it down unless it's down already */
+ if (!rdsv3_conn_transition(conn, RDSV3_CONN_DOWN, RDSV3_CONN_DOWN)) {
+ /*
+ * Quiesce the connection mgmt handlers before we start tearing
+ * things down. We don't hold the mutex for the entire
+ * duration of the shutdown operation, else we may be
+		 * duration of the shutdown operation, else we may deadlock
+		 * with the CM handler. Instead, the CM event handler is
+		 * supposed to check for state DISCONNECTING.
+ mutex_enter(&conn->c_cm_lock);
+ if (!rdsv3_conn_transition(conn, RDSV3_CONN_UP,
+ RDSV3_CONN_DISCONNECTING) &&
+ !rdsv3_conn_transition(conn, RDSV3_CONN_ERROR,
+ RDSV3_CONN_DISCONNECTING)) {
+ RDSV3_DPRINTF2("rdsv3_shutdown_worker",
+		    "RDS: shutdown failed: conn: %p, state: %d",
+ conn, atomic_get(&conn->c_state));
+ rdsv3_conn_drop(conn);
+ mutex_exit(&conn->c_cm_lock);
+ return;
+ }
+ mutex_exit(&conn->c_cm_lock);
+
+ mutex_enter(&conn->c_send_lock);
+ conn->c_trans->conn_shutdown(conn);
+ rdsv3_conn_reset(conn);
+ mutex_exit(&conn->c_send_lock);
+
+ if (!rdsv3_conn_transition(conn, RDSV3_CONN_DISCONNECTING,
+ RDSV3_CONN_DOWN)) {
+ /*
+ * This can happen - e.g. when we're in the middle of
+ * tearing down the connection, and someone unloads
+ * the rds module. Quite reproducible with loopback
+ * connections. Mostly harmless.
+ */
+#ifndef __lock_lint
+ RDSV3_DPRINTF2("rdsv3_shutdown_worker",
+ "failed to transition to state DOWN, "
+ "current statis is: %d conn: %p",
+ atomic_get(&conn->c_state), conn);
+ rdsv3_conn_drop(conn);
+#endif
+ return;
+ }
+ }
+
+ /*
+ * Then reconnect if it's still live.
+ * The passive side of an IB loopback connection is never added
+ * to the conn hash, so we never trigger a reconnect on this
+ * conn - the reconnect is always triggered by the active peer.
+ */
+ rdsv3_cancel_delayed_work(&conn->c_conn_w);
+
+ conn_info.c_laddr = conn->c_laddr;
+ conn_info.c_faddr = conn->c_faddr;
+ if (avl_find(&rdsv3_conn_hash, &conn_info, NULL) == conn)
+ rdsv3_queue_reconnect(conn);
+
+ RDSV3_DPRINTF2("rdsv3_shutdown_worker", "Return(work: %p)", work);
+}
+
+void
+rdsv3_send_worker(struct rdsv3_work_s *work)
+{
+ struct rdsv3_connection *conn = container_of(work,
+ struct rdsv3_connection, c_send_w.work);
+ int ret;
+
+ RDSV3_DPRINTF4("rdsv3_send_worker", "Enter(work: %p)", work);
+
+ if (rdsv3_conn_state(conn) == RDSV3_CONN_UP) {
+ ret = rdsv3_send_xmit(conn);
+ RDSV3_DPRINTF5("rdsv3", "conn %p ret %d", conn, ret);
+ switch (ret) {
+ case -EAGAIN:
+ rdsv3_stats_inc(s_send_immediate_retry);
+ rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_send_w, 0);
+ break;
+ case -ENOMEM:
+ rdsv3_stats_inc(s_send_delayed_retry);
+ rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_send_w, 2);
+ /* FALLTHROUGH */
+ default:
+ break;
+ }
+ }
+
+ RDSV3_DPRINTF4("rdsv3_send_worker", "Return(work: %p)", work);
+}
+
+void
+rdsv3_recv_worker(struct rdsv3_work_s *work)
+{
+ struct rdsv3_connection *conn = container_of(work,
+ struct rdsv3_connection, c_recv_w.work);
+ int ret;
+
+ RDSV3_DPRINTF4("rdsv3_recv_worker", "Enter(work: %p)", work);
+
+ if (rdsv3_conn_state(conn) == RDSV3_CONN_UP) {
+ ret = conn->c_trans->recv(conn);
+ RDSV3_DPRINTF5("rdsv3", "conn %p ret %d", conn, ret);
+ switch (ret) {
+ case -EAGAIN:
+ rdsv3_stats_inc(s_recv_immediate_retry);
+ rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_recv_w, 0);
+ break;
+ case -ENOMEM:
+ rdsv3_stats_inc(s_recv_delayed_retry);
+ rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_recv_w, 2);
+ /* FALLTHROUGH */
+ default:
+ break;
+ }
+ }
+
+ RDSV3_DPRINTF4("rdsv3_recv_worker", "Return(work: %p)", work);
+}
+
+void
+rdsv3_threads_exit(void)
+{
+ rdsv3_destroy_task_workqueue(rdsv3_wq);
+}
+
+int
+rdsv3_threads_init(void)
+{
+ rdsv3_wq = rdsv3_create_task_workqueue("krdsd");
+ if (rdsv3_wq == NULL)
+ return (-ENOMEM);
+
+ return (0);
+}
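+
+/*
+ * Hedged usage sketch: everything above is driven by (re)queueing the
+ * per-connection delayed work onto this single "krdsd" queue, e.g.
+ *
+ *   rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_send_w, 0);
+ *
+ * A delay of 0 runs the worker as soon as the queue drains; non-zero
+ * delays are in jiffies, as in rdsv3_queue_reconnect() above.
+ */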
diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/transport.c b/usr/src/uts/common/io/ib/clients/rdsv3/transport.c
new file mode 100644
index 0000000000..519442a392
--- /dev/null
+++ b/usr/src/uts/common/io/ib/clients/rdsv3/transport.c
@@ -0,0 +1,142 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <sys/ksynch.h>
+#include <sys/list.h>
+#include <sys/rds.h>
+#include <sys/sysmacros.h>
+
+#include <sys/ib/clients/rdsv3/rdsv3.h>
+#include <sys/ib/clients/rdsv3/loop.h>
+#include <sys/ib/clients/rdsv3/rdsv3_impl.h>
+#include <sys/ib/clients/rdsv3/rdsv3_debug.h>
+
+list_t transports;
+krwlock_t trans_sem; /* this was a semaphore */
+
+int
+rdsv3_trans_register(struct rdsv3_transport *trans)
+{
+ RDSV3_DPRINTF4("rdsv3_trans_register", "Enter(trans: %p)", trans);
+
+ rw_enter(&trans_sem, RW_WRITER);
+
+ list_insert_tail(&transports, trans);
+
+ rw_exit(&trans_sem);
+
+ RDSV3_DPRINTF4("rdsv3_trans_register", "Return(trans: %p)", trans);
+
+ return (0);
+}
+
+void
+rdsv3_trans_unregister(struct rdsv3_transport *trans)
+{
+ RDSV3_DPRINTF4("rdsv3_trans_register", "Enter(trans: %p)", trans);
+
+ rw_enter(&trans_sem, RW_WRITER);
+
+ list_remove(&transports, trans);
+
+ rw_exit(&trans_sem);
+
+ RDSV3_DPRINTF4("rdsv3_trans_register", "Return(trans: %p)", trans);
+}
+
+struct rdsv3_transport *
+rdsv3_trans_get_preferred(uint32_be_t addr)
+{
+ struct rdsv3_transport *trans;
+ struct rdsv3_transport *ret = NULL;
+
+ RDSV3_DPRINTF4("rdsv3_trans_get_preferred", "Enter(addr: %x)",
+ ntohl(addr));
+
+ if (rdsv3_isloopback(addr))
+ return (&rdsv3_loop_transport);
+
+ rw_enter(&trans_sem, RW_READER);
+ RDSV3_FOR_EACH_LIST_NODE(trans, &transports, t_item) {
+ if (trans->laddr_check(addr) == 0) {
+ ret = trans;
+ break;
+ }
+ }
+ rw_exit(&trans_sem);
+
+ RDSV3_DPRINTF4("rdsv3_trans_get_preferred",
+ "Return(addr: %x, ret: %p)", ntohl(addr), ret);
+
+ return (ret);
+}
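+
+/*
+ * Hedged usage sketch: connection setup is expected to resolve its
+ * transport once at creation time (variable names here are illustrative):
+ *
+ *   struct rdsv3_transport *trans;
+ *
+ *   trans = rdsv3_trans_get_preferred(conn->c_laddr);
+ *   if (trans == NULL)
+ *           fail the connect - no transport claims this local address
+ */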
+
+/*
+ * This returns the number of stats entries in the snapshot and only
+ * copies them using the iter if there is enough space for them. The
+ * caller passes in the global stats so that we can size and copy while
+ * holding the lock.
+ */
+/* ARGSUSED */
+unsigned int
+rdsv3_trans_stats_info_copy(struct rdsv3_info_iterator *iter,
+ unsigned int avail)
+{
+ /*
+ * XXX - Add this when we port info (info.c)
+ */
+ return (0);
+}
diff --git a/usr/src/uts/common/io/warlock/rdsv3.wlcmd b/usr/src/uts/common/io/warlock/rdsv3.wlcmd
new file mode 100644
index 0000000000..8b50fb6de5
--- /dev/null
+++ b/usr/src/uts/common/io/warlock/rdsv3.wlcmd
@@ -0,0 +1,365 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+# Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+#
+
+# entry points
+root _init
+root _info
+root _fini
+root __rdsv3_conn_create
+root __rdsv3_conn_error
+root __rdsv3_ib_conn_error
+root __rdsv3_ib_destroy_conns
+root __rdsv3_ib_ring_empty
+root __rdsv3_ib_ring_used
+root __rdsv3_ib_teardown_mr
+root __rdsv3_put_mr_final
+root __rdsv3_rdma_map
+root __rdsv3_rdma_send_complete
+root __rdsv3_wake_sk_sleep
+root ib_addr_get_dgid
+root ib_addr_get_mgid
+root ib_addr_get_pkey
+root ib_addr_get_sgid
+root ib_addr_set_dgid
+root ib_addr_set_pkey
+root ib_addr_set_sgid
+root ib_width_enum_to_int
+root init_genlist
+root ip_addr_size
+root rdsv3_activate
+root rdsv3_add_bound
+root rdsv3_attach
+root rdsv3_bind
+root rdsv3_bind_node_compare
+root rdsv3_bind_tree_exit
+root rdsv3_bind_tree_init
+root rdsv3_bind_tree_walk
+root rdsv3_cancel_delayed_work
+root rdsv3_capable_interface
+root rdsv3_clear_print_buf
+root rdsv3_clear_recv_queue
+root rds_clif_name
+root rdsv3_clrflowctrl
+root rdsv3_cmsg_rdma_args
+root rdsv3_cmsg_rdma_dest
+root rdsv3_cmsg_rdma_map
+root rdsv3_cmsg_recv
+root rdsv3_cmsg_send
+root rdsv3_cong_add_conn
+root rdsv3_cong_add_socket
+root rdsv3_cong_clear_bit
+root rdsv3_cong_compare
+root rdsv3_cong_exit
+root rdsv3_cong_from_addr
+root rdsv3_cong_get_maps
+root rdsv3_cong_init
+root rdsv3_cong_map_updated
+root rdsv3_cong_monitor
+root rdsv3_cong_queue_updates
+root rdsv3_cong_remove_conn
+root rdsv3_cong_remove_socket
+root rdsv3_cong_set_bit
+root rdsv3_cong_test_bit
+root rdsv3_cong_tree_walk
+root rdsv3_cong_update_alloc
+root rdsv3_cong_updated_since
+root rdsv3_cong_wait
+root rdsv3_conn_compare
+root rdsv3_conn_constructor
+root rdsv3_conn_create
+root rdsv3_conn_create_outgoing
+root rdsv3_conn_destroy
+root rdsv3_conn_destructor
+root rdsv3_conn_drop
+root rdsv3_conn_exit
+root rdsv3_conn_info
+root rdsv3_conn_info_visitor
+root rdsv3_conn_init
+root rdsv3_conn_is_sending
+root rdsv3_conn_lookup
+root rdsv3_conn_message_info
+root rdsv3_conn_message_info_retrans
+root rdsv3_conn_message_info_send
+root rdsv3_conn_transition
+root rdsv3_connect
+root rdsv3_connect_complete
+root rdsv3_connect_worker
+root rdsv3_create_singlethread_workqueue
+root rdsv3_destroy_mr
+root rdsv3_destroy_workqueue
+root rdsv3_detach
+root rdsv3_do_ip_ioctl
+root rdsv3_dprintf_intr
+root rdsv3_dprintf0
+root rdsv3_dprintf1
+root rdsv3_dprintf2
+root rdsv3_dprintf3
+root rdsv3_dprintf4
+root rdsv3_dprintf5
+root rdsv3_exit
+root rdsv3_fast_ip_csum
+root rdsv3_find_bound
+root rdsv3_flush_workqueue
+root rdsv3_for_each_conn_info
+root rdsv3_free_mr
+root rdsv3_get_mr
+root rdsv3_getname
+root rdsv3_getpeername
+root rdsv3_getsockopt
+root rdsv3_ib_ack_send_complete
+root rdsv3_ib_add_conn
+root rdsv3_ib_add_ipaddr
+root rdsv3_ib_add_one
+root rdsv3_ib_advertise_credits
+root rdsv3_ib_alloc_fmr
+root rdsv3_ib_alloc_hdrs
+root rdsv3_ib_attempt_ack
+root rdsv3_ib_cm_connect_complete
+root rdsv3_ib_cm_fill_conn_param
+root rdsv3_ib_cm_handle_connect
+root rdsv3_ib_cm_initiate_connect
+root rdsv3_ib_cong_recv
+root rdsv3_ib_conn_alloc
+root rdsv3_ib_conn_connect
+root rdsv3_ib_conn_free
+root rdsv3_ib_conn_info_visitor
+root rdsv3_ib_conn_shutdown
+root rdsv3_ib_cq_event_handler
+root rdsv3_ib_destroy_conns
+root rdsv3_ib_destroy_nodev_conns
+root rdsv3_ib_dma_map_sg
+root rdsv3_ib_dma_map_sg_rdma
+root rdsv3_ib_dma_unmap_sg
+root rdsv3_ib_dma_unmap_sg_rdma
+root rdsv3_ib_exit
+root rdsv3_ib_flush_mr_pool
+root rdsv3_ib_flush_mrs
+root rdsv3_ib_frag_drop_page
+root rdsv3_ib_frag_free
+root rdsv3_ib_free_hdrs
+root rdsv3_ib_free_mr
+root rdsv3_ib_get_ack
+root rdsv3_ib_get_device
+root rdsv3_ib_get_header
+root rdsv3_ib_get_mr
+root rdsv3_ib_get_mr_info
+root rdsv3_ib_ic_info
+root rdsv3_ib_inc_copy_to_user
+root rdsv3_ib_inc_free
+root rdsv3_ib_inc_purge
+root rdsv3_ib_init
+root rdsv3_ib_laddr_check
+root rdsv3_ib_map_fmr
+root rdsv3_ib_piggyb_ack
+root rdsv3_ib_process_recv
+root rdsv3_ib_protocol_compatible
+root rdsv3_ib_qp_event_handler
+root rdsv3_ib_recv
+root rdsv3_ib_recv_clear_one
+root rdsv3_ib_recv_clear_ring
+root rdsv3_ib_recv_cq_comp_handler
+root rdsv3_ib_recv_exit
+root rdsv3_ib_recv_init
+root rdsv3_ib_recv_init_ack
+root rdsv3_ib_recv_init_ring
+root rdsv3_ib_recv_refill
+root rdsv3_ib_recv_refill_one
+root rdsv3_ib_recv_unmap_page
+root rdsv3_ib_remove_conn
+root rdsv3_ib_remove_ipaddr
+root rdsv3_ib_remove_one
+root rdsv3_ib_ring_alloc
+root rdsv3_ib_ring_completed
+root rdsv3_ib_ring_empty
+root rdsv3_ib_ring_free
+root rdsv3_ib_ring_init
+root rdsv3_ib_ring_low
+root rdsv3_ib_ring_oldest
+root rdsv3_ib_ring_resize
+root rdsv3_ib_ring_unalloc
+root rdsv3_ib_send_ack
+root rdsv3_ib_send_add_credits
+root rdsv3_ib_send_clear_ring
+root rdsv3_ib_send_cq_comp_handler
+root rdsv3_ib_send_grab_credits
+root rdsv3_ib_send_init_ring
+root rdsv3_ib_send_rdma_complete
+root rdsv3_ib_send_unmap_rdma
+root rdsv3_ib_send_unmap_rm
+root rdsv3_ib_set_ack
+root rdsv3_ib_set_flow_control
+root rdsv3_ib_set_protocol
+root rdsv3_ib_setup_qp
+root rdsv3_ib_sg_dma_address
+root rdsv3_ib_stats_info_copy
+root rdsv3_ib_sync_mr
+root rdsv3_ib_sysctl_exit
+root rdsv3_ib_sysctl_init
+root rdsv3_ib_tune_rnr
+root rdsv3_ib_update_ipaddr
+root rdsv3_ib_xmit
+root rdsv3_ib_xmit_complete
+root rdsv3_ib_xmit_populate_wr
+root rdsv3_ib_xmit_rdma
+root rdsv3_if_lookup_by_addr
+root rdsv3_if_lookup_by_name
+root rdsv3_inc_addref
+root rdsv3_inc_info_copy
+root rdsv3_inc_init
+root rdsv3_inc_put
+root rdsv3_info
+root rdsv3_info_deregister_func
+root rdsv3_info_getsockopt
+root rdsv3_info_register_func
+root rdsv3_init
+root rdsv3_ioctl
+root rdsv3_logging_destroy
+root rdsv3_logging_initialization
+root rdsv3_loop_conn_alloc
+root rdsv3_loop_conn_connect
+root rdsv3_loop_conn_free
+root rdsv3_loop_conn_shutdown
+root rdsv3_loop_exit
+root rdsv3_loop_init
+root rdsv3_loop_recv
+root rdsv3_loop_xmit
+root rdsv3_loop_xmit_cong_map
+root rdsv3_message_add_extension
+root rdsv3_message_add_rdma_dest_extension
+root rdsv3_message_add_version_extension
+root rdsv3_message_addref
+root rdsv3_message_alloc
+root rdsv3_message_copy_from_user
+root rdsv3_message_get_version_extension
+root rdsv3_message_inc_copy_to_user
+root rdsv3_message_inc_free
+root rdsv3_message_inc_purge
+root rdsv3_message_map_pages
+root rdsv3_message_next_extension
+root rdsv3_message_populate_header
+root rdsv3_message_purge
+root rdsv3_message_put
+root rdsv3_message_unmapped
+root rdsv3_message_wait
+root rdsv3_mr_compare
+root rdsv3_mr_put
+root rdsv3_mr_tree_walk
+root rdsv3_next_incoming
+root rdsv3_notify_cong
+root rdsv3_notify_queue_get
+root rdsv3_ntop
+root rdsv3_page_remainder_alloc
+root rdsv3_pages_in_vec
+root rds_path_down
+root rds_path_up
+root rdsv3_poll
+root rdsv3_poll_wait
+root rdsv3_put_cmsg
+root rdsv3_queue_delayed_work
+root rdsv3_queue_reconnect
+root rdsv3_queue_work
+root rdsv3_rdma_cm_event_handler
+root rdsv3_rdma_drop_keys
+root rdsv3_rdma_exit
+root rdsv3_rdma_free_op
+root rdsv3_rdma_init
+root rdsv3_rdma_listen_init
+root rdsv3_rdma_listen_stop
+root rdsv3_rdma_prepare
+root rdsv3_rdma_send_complete
+root rdsv3_rdma_unuse
+root rdsv3_recv_incoming
+root rdsv3_recv_incoming_exthdrs
+root rdsv3_recv_rcvbuf_delta
+root rdsv3_recv_uio
+root rdsv3_recv_worker
+root rdsv3_recvmsg
+root rdsv3_release
+root rdsv3_remove_bound
+root rds_sc_path_lookup
+root rdsv3_scaddr_to_ibaddr
+root rdsv3_send_acked_before
+root rdsv3_send_drop_acked
+root rdsv3_send_drop_to
+root rdsv3_send_get_message
+root rdsv3_send_is_acked
+root rdsv3_send_pong
+root rdsv3_send_queue_rm
+root rdsv3_send_remove_from_sock
+root rdsv3_send_reset
+root rdsv3_send_sndbuf_remove
+root rdsv3_send_uio
+root rdsv3_send_worker
+root rdsv3_send_xmit
+root rdsv3_sendmsg
+root rdsv3_set_bool_option
+root rdsv3_setsockopt
+root rdsv3_shutdown
+root rdsv3_shutdown_worker
+root rdsv3_sk_alloc
+root rdsv3_sock_addref
+root rdsv3_sock_exit
+root rdsv3_sock_exit_data
+root rdsv3_sock_inc_info
+root rdsv3_sock_info
+root rdsv3_sock_init
+root rdsv3_sock_init_data
+root rdsv3_sock_put
+root rdsv3_stats_exit
+root rdsv3_stats_info
+root rdsv3_stats_info_copy
+root rdsv3_stats_init
+root rdsv3_still_queued
+root rdsv3_sysctl_exit
+root rdsv3_sysctl_init
+root rdsv3_threads_exit
+root rdsv3_threads_init
+root rdsv3_trace
+root rdsv3_trans_exit
+root rdsv3_trans_get_preferred
+root rdsv3_trans_init
+root rdsv3_trans_register
+root rdsv3_trans_stats_info_copy
+root rdsv3_trans_unregister
+root rdsv3_umem_cb
+root rdsv3_verify_bind_address
+root rdsv3_vlog
+root rdsv3_vprintk
+root rdsv3_wake_sk_sleep
+root rdsv3_work_timeout_handler
+root rdsv3_worker_thread
+root rdsv3_create
+root rdsv3_isloopback
+
+add bus_ops::bus_add_eventcall targets warlock_dummy
+add bus_ops::bus_config targets warlock_dummy
+add bus_ops::bus_get_eventcookie targets warlock_dummy
+add bus_ops::bus_intr_ctl targets warlock_dummy
+add bus_ops::bus_post_event targets warlock_dummy
+add bus_ops::bus_remove_eventcall targets warlock_dummy
+add bus_ops::bus_unconfig targets warlock_dummy
diff --git a/usr/src/uts/common/sys/Makefile b/usr/src/uts/common/sys/Makefile
index 677a328a49..a130c54ac0 100644
--- a/usr/src/uts/common/sys/Makefile
+++ b/usr/src/uts/common/sys/Makefile
@@ -465,6 +465,7 @@ CHKHDRS= \
random.h \
rctl.h \
rctl_impl.h \
+ rds.h \
reboot.h \
refstr.h \
refstr_impl.h \
@@ -754,7 +755,8 @@ SOL_UCMAHDRS= \
SOL_OFSHDRS= \
sol_cma.h \
sol_ib_cma.h \
- sol_ofs_common.h
+ sol_ofs_common.h \
+ sol_kverb_impl.h
TAVORHDRS= \
tavor_ioctl.h
diff --git a/usr/src/uts/common/sys/ib/clients/of/rdma/ib_verbs.h b/usr/src/uts/common/sys/ib/clients/of/rdma/ib_verbs.h
index 4f6c5f6829..0106d80848 100644
--- a/usr/src/uts/common/sys/ib/clients/of/rdma/ib_verbs.h
+++ b/usr/src/uts/common/sys/ib/clients/of/rdma/ib_verbs.h
@@ -649,6 +649,135 @@ typedef struct ib_client {
} state;
} ib_client_t;
+int ib_register_client(struct ib_client *client);
+void ib_unregister_client(struct ib_client *client);
+
+void *ib_get_client_data(struct ib_device *device, struct ib_client *client);
+void ib_set_client_data(struct ib_device *device, struct ib_client *client,
+ void *data);
+
+int ib_query_device(struct ib_device *device,
+ struct ib_device_attr *device_attr);
+
+/*
+ * ib_alloc_pd - Allocates an unused protection domain.
+ * @device: The device on which to allocate the protection domain.
+ *
+ * A protection domain object provides an association between QPs, shared
+ * receive queues, address handles, memory regions, and memory windows.
+ */
+struct ib_pd *ib_alloc_pd(struct ib_device *device);
+
+/*
+ * ib_dealloc_pd - Deallocates a protection domain.
+ * @pd: The protection domain to deallocate.
+ */
+int ib_dealloc_pd(struct ib_pd *pd);
+
+/*
+ * ib_create_qp - Creates a QP associated with the specified protection
+ * domain.
+ * @pd: The protection domain associated with the QP.
+ * @qp_init_attr: A list of initial attributes required to create the
+ * QP. If QP creation succeeds, then the attributes are updated to
+ * the actual capabilities of the created QP.
+ */
+struct ib_qp *ib_create_qp(struct ib_pd *pd,
+ struct ib_qp_init_attr *qp_init_attr);
+
+/*
+ * ib_modify_qp - Modifies the attributes for the specified QP and then
+ * transitions the QP to the given state.
+ * @qp: The QP to modify.
+ * @qp_attr: On input, specifies the QP attributes to modify. On output,
+ * the current values of selected QP attributes are returned.
+ * @qp_attr_mask: A bit-mask used to specify which attributes of the QP
+ * are being modified.
+ */
+int ib_modify_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr,
+ int qp_attr_mask);
+
+/*
+ * ib_destroy_qp - Destroys the specified QP.
+ * @qp: The QP to destroy.
+ */
+int ib_destroy_qp(struct ib_qp *qp);
+
+/*
+ * ib_create_cq - Creates a CQ on the specified device.
+ * @device: The device on which to create the CQ.
+ * @comp_handler: A user-specified callback that is invoked when a
+ * completion event occurs on the CQ.
+ * @event_handler: A user-specified callback that is invoked when an
+ * asynchronous event not associated with a completion occurs on the CQ.
+ * @cq_context: Context associated with the CQ returned to the user via
+ * the associated completion and event handlers.
+ * @cqe: The minimum size of the CQ.
+ * @comp_vector - Completion vector used to signal completion events.
+ * Must be >= 0 and < context->num_comp_vectors.
+ *
+ * Users can examine the cq structure to determine the actual CQ size.
+ */
+struct ib_cq *ib_create_cq(struct ib_device *device,
+ ib_comp_handler comp_handler,
+ void (*event_handler)(struct ib_event *, void *),
+ void *cq_context, int cqe, int comp_vector);
+
+/*
+ * ib_destroy_cq - Destroys the specified CQ.
+ * @cq: The CQ to destroy.
+ */
+int ib_destroy_cq(struct ib_cq *cq);
+
+/*
+ * ib_poll_cq - poll a CQ for completion(s)
+ * @cq:the CQ being polled
+ * @num_entries:maximum number of completions to return
+ * @wc:array of at least @num_entries &struct ib_wc where completions
+ * will be returned
+ *
+ * Poll a CQ for (possibly multiple) completions. If the return value
+ * is < 0, an error occurred. If the return value is >= 0, it is the
+ * number of completions returned. If the return value is
+ * non-negative and < num_entries, then the CQ was emptied.
+ */
+int ib_poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc);
+
+/*
+ * ib_req_notify_cq - Request completion notification on a CQ.
+ * @cq: The CQ to generate an event for.
+ * @flags:
+ * Must contain exactly one of %IB_CQ_SOLICITED or %IB_CQ_NEXT_COMP
+ * to request an event on the next solicited event or next work
+ * completion of any type, respectively. %IB_CQ_REPORT_MISSED_EVENTS
+ * may also be |ed in to request a hint about missed events, as
+ * described below.
+ *
+ * Return Value:
+ * < 0 means an error occurred while requesting notification
+ * == 0 means notification was requested successfully, and if
+ * IB_CQ_REPORT_MISSED_EVENTS was passed in, then no events
+ * were missed and it is safe to wait for another event. In
+ * this case it is guaranteed that any work completions added
+ * to the CQ since the last CQ poll will trigger a completion
+ * notification event.
+ * > 0 is only returned if IB_CQ_REPORT_MISSED_EVENTS was passed
+ * in. It means that the consumer must poll the CQ again to
+ * make sure it is empty to avoid missing an event because of a
+ * race between requesting notification and an entry being
+ * added to the CQ. This return value means it is possible
+ * (but not guaranteed) that a work completion has been added
+ * to the CQ since the last poll without triggering a
+ * completion notification event.
+ */
+int ib_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags);
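+
+/*
+ * A hedged sketch of the drain/re-arm pattern the return values above
+ * support; handle_wc() stands in for a consumer-supplied completion
+ * routine and is not part of this API:
+ *
+ *   do {
+ *           while (ib_poll_cq(cq, 1, &wc) > 0)
+ *                   handle_wc(&wc);
+ *   } while (ib_req_notify_cq(cq,
+ *       IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS) > 0);
+ */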
+
+struct rdma_cm_id;
+ibt_hca_hdl_t ib_get_ibt_hca_hdl(struct ib_device *device);
+
+ibt_channel_hdl_t
+ib_get_ibt_channel_hdl(struct rdma_cm_id *cm);
+
#ifdef __cplusplus
}
#endif
diff --git a/usr/src/uts/common/sys/ib/clients/of/sol_ofs/sol_cma.h b/usr/src/uts/common/sys/ib/clients/of/sol_ofs/sol_cma.h
index b18d8405ce..d58e31cda2 100644
--- a/usr/src/uts/common/sys/ib/clients/of/sol_ofs/sol_cma.h
+++ b/usr/src/uts/common/sys/ib/clients/of/sol_ofs/sol_cma.h
@@ -140,8 +140,19 @@ typedef enum {
#define SOL_CMA_DISCONNECT_OK(chanp) (((chanp)->chan_connect_flag == \
SOL_CMA_CONNECT_INITIATED) || SOL_CMAID_IS_CONNECTED(chanp))
+/*
+ * CMID_DESTROYED - Flag to indicate rdma_destroy_id has been
+ * called for this CMID
+ *
+ * EVENT_PROGRESS - RDMACM Event for this CMID has been passed to
+ * the sol_ofs client.
+ *
+ * API_PROGRESS - rdma_resolve_addr() / rdma_resolve_route() /
+ * rdma_listen() is in progress.
+ */
#define SOL_CMA_CALLER_CMID_DESTROYED 0x01
#define SOL_CMA_CALLER_EVENT_PROGRESS 0x02
+#define SOL_CMA_CALLER_API_PROGRESS 0x04
typedef enum {
REQ_CMID_NONE = 0,
@@ -211,6 +222,9 @@ typedef struct {
/* Session ID for completion */
void *chan_session_id;
+ uint32_t chan_qp_num;
+ uint8_t chan_is_srq;
+
union {
ibcma_chan_t chan_ib_xport;
} un_xport; /* Transport specific fields */
@@ -308,10 +322,8 @@ cma_get_acpt_idp(struct rdma_cm_id *root_idp, void *qp_hdl)
sol_cma_chan_t *root_chanp;
root_chanp = (sol_cma_chan_t *)root_idp;
- mutex_enter(&root_chanp->chan_mutex);
acpt_idp = (struct rdma_cm_id *)avl_find(
&root_chanp->chan_acpt_avl_tree, (void *)qp_hdl, NULL);
- mutex_exit(&root_chanp->chan_mutex);
return (acpt_idp);
}
#ifdef __cplusplus
diff --git a/usr/src/uts/common/sys/ib/clients/of/sol_ofs/sol_kverb_impl.h b/usr/src/uts/common/sys/ib/clients/of/sol_ofs/sol_kverb_impl.h
new file mode 100644
index 0000000000..07168b0a48
--- /dev/null
+++ b/usr/src/uts/common/sys/ib/clients/of/sol_ofs/sol_kverb_impl.h
@@ -0,0 +1,89 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_IB_CLIENTS_OF_SOL_OFS_SOL_KVERB_IMPL_H
+#define _SYS_IB_CLIENTS_OF_SOL_OFS_SOL_KVERB_IMPL_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/ib/ibtl/ibvti.h>
+
+/*
+ * If an async event is still active, hdl is not freed. However, if the
+ * device state is IB_DEV_CLOSE, the device is about to be closed, so the
+ * event should be discarded.
+ */
+#define FIRE_QP_EVENT(clnt, hdl, ib_event, qpp, type) \
+ rw_enter(&clnt->lock, RW_READER); \
+ if (qpp && qpp->event_handler && \
+ qpp->device->reg_state == IB_DEV_OPEN) { \
+ ib_event.device = qpp->device; \
+ ib_event.event = type; \
+ ib_event.element.qp = qpp; \
+ qpp->event_handler(&ib_event, qpp->qp_context); \
+ } \
+ rw_exit(&clnt->lock)
+
+#define FIRE_CQ_EVENT(clnt, hdl, ib_event, cqp, type) \
+ rw_enter(&clnt->lock, RW_READER); \
+ if (cqp && cqp->event_handler && \
+ cqp->device->reg_state == IB_DEV_OPEN) { \
+ ib_event.device = cqp->device; \
+ ib_event.event = type; \
+ ib_event.element.cq = cqp; \
+ cqp->event_handler(&ib_event, cqp->cq_context); \
+ } \
+ rw_exit(&clnt->lock)
+
+#define IBTF2OF_PGSZ(hca_page_sz) ((hca_page_sz) << 10)
+#define OF2IBTF_STATE(s) ((enum ibt_cep_state_e)(s))
+#define OF2IBTF_SRATE(r) ((enum ibt_srate_e)(r))
+#define OF2IBTF_PATH_MIG_STATE(s) ((ibt_cep_cmstate_t)((s)+1))
+#define OF2IBTF_PATH_MTU(m) ((ib_mtu_t)(m))
+
+typedef unsigned int gfp_t;
+
+typedef struct sol_ofs_client_s {
+ ib_client_t *ib_client;
+ ibt_clnt_modinfo_t ibt_client;
+ ibt_clnt_hdl_t ibt_hdl;
+ uint_t hca_num;
+ uint_t hca_open_num;
+ llist_head_t device_list;
+ llist_head_t client_list;
+ krwlock_t lock;
+ enum {
+ IB_OFS_CLNT_UNINITIALIZED,
+ IB_OFS_CLNT_INITIALIZED
+ } state;
+} ofs_client_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_IB_CLIENTS_OF_SOL_OFS_SOL_KVERB_IMPL_H */
diff --git a/usr/src/uts/common/sys/ib/clients/rdsv3/ib.h b/usr/src/uts/common/sys/ib/clients/rdsv3/ib.h
new file mode 100644
index 0000000000..ff52bb29e2
--- /dev/null
+++ b/usr/src/uts/common/sys/ib/clients/rdsv3/ib.h
@@ -0,0 +1,359 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _RDSV3_IB_H
+#define _RDSV3_IB_H
+
+#include <sys/rds.h>
+
+#include <sys/ib/clients/rdsv3/rdsv3.h>
+#include <sys/ib/clients/rdsv3/rdma_transport.h>
+
+#define RDSV3_FMR_SIZE 256
+#define RDSV3_FMR_POOL_SIZE (12 * 1024)
+
+#define RDSV3_IB_SEND_WRS 64
+
+#define RDSV3_IB_MAX_SGE 8
+#define RDSV3_IB_RECV_SGE 2
+
+#define RDSV3_IB_DEFAULT_RECV_WR 1024
+#define RDSV3_IB_DEFAULT_SEND_WR 256
+
+#define RDSV3_IB_DEFAULT_RETRY_COUNT 2
+
+/* minor versions supported */
+#define RDSV3_IB_SUPPORTED_PROTOCOLS 0x00000003
+
+extern struct list rdsv3_ib_devices;
+
+/*
+ * IB posts RDSV3_FRAG_SIZE fragments of pages to the receive queues to
+ * try and minimize the amount of memory tied up in both the device and
+ * socket receive queues.
+ */
+/* page offset of the final full frag that fits in the page */
+#define RDSV3_PAGE_LAST_OFF \
+ (((PAGE_SIZE / RDSV3_FRAG_SIZE) - 1) * RDSV3_FRAG_SIZE)
+struct rdsv3_page_frag {
+ struct list_node f_item;
+ caddr_t f_page;
+ unsigned long f_offset;
+ ibt_mi_hdl_t f_mapped;
+};
+
+struct rdsv3_ib_incoming {
+ struct list ii_frags;
+ struct rdsv3_incoming ii_inc;
+};
+
+struct rdsv3_ib_connect_private {
+ /* Add new fields at the end, and don't permute existing fields. */
+ uint32_be_t dp_saddr;
+ uint32_be_t dp_daddr;
+ uint8_t dp_protocol_major;
+ uint8_t dp_protocol_minor;
+ uint16_be_t dp_protocol_minor_mask; /* bitmask */
+ uint32_be_t dp_reserved1;
+ uint32_be_t dp_ack_seq;
+ uint32_be_t dp_credit; /* non-zero enables flow ctl */
+};
+
+struct rdsv3_ib_send_work {
+ struct rdsv3_message *s_rm;
+ struct rdsv3_rdma_op *s_op;
+ ibt_wrc_opcode_t s_opcode;
+ unsigned long s_queued;
+};
+
+struct rdsv3_ib_recv_work {
+ struct rdsv3_ib_incoming *r_ibinc;
+ struct rdsv3_page_frag *r_frag;
+ ibt_all_wr_t r_wr;
+ ibt_wr_ds_t r_sge[2];
+};
+
+struct rdsv3_ib_work_ring {
+ uint32_t w_nr;
+ uint32_t w_alloc_ptr;
+ uint32_t w_alloc_ctr;
+ uint32_t w_free_ptr;
+ atomic_t w_free_ctr;
+};
+
+struct rdsv3_ib_device;
+
+struct rdsv3_ib_connection {
+
+ struct list_node ib_node;
+ struct rdsv3_ib_device *rds_ibdev;
+ struct rdsv3_connection *conn;
+
+ /* alphabet soup, IBTA style */
+ struct rdma_cm_id *i_cm_id;
+ struct ib_pd *i_pd;
+ struct rdsv3_hdrs_mr *i_mr;
+ struct ib_cq *i_send_cq;
+ struct ib_cq *i_recv_cq;
+
+ /* tx */
+ struct rdsv3_ib_work_ring i_send_ring;
+ struct rdsv3_message *i_rm;
+ struct rdsv3_header *i_send_hdrs;
+ uint64_t i_send_hdrs_dma;
+ struct rdsv3_ib_send_work *i_sends;
+ ibt_send_wr_t *i_send_wrs;
+
+ /* rx */
+ ddi_taskq_t *i_recv_tasklet;
+ struct mutex i_recv_mutex;
+ struct rdsv3_ib_work_ring i_recv_ring;
+ struct rdsv3_ib_incoming *i_ibinc;
+ uint32_t i_recv_data_rem;
+ struct rdsv3_header *i_recv_hdrs;
+ uint64_t i_recv_hdrs_dma;
+ struct rdsv3_ib_recv_work *i_recvs;
+ struct rdsv3_page_frag i_frag;
+ uint64_t i_ack_recv; /* last ACK received */
+ processorid_t i_recv_tasklet_cpuid;
+ /* CPU to which the tasklet taskq should be bound */
+
+ /* sending acks */
+ unsigned long i_ack_flags;
+#ifndef KERNEL_HAS_ATOMIC64
+ kmutex_t i_ack_lock; /* protect i_ack_next */
+ uint64_t i_ack_next; /* next ACK to send */
+#else
+ atomic64_t i_ack_next; /* next ACK to send */
+#endif
+ struct rdsv3_header *i_ack;
+ ibt_send_wr_t i_ack_wr;
+ ibt_wr_ds_t i_ack_sge;
+ uint64_t i_ack_dma;
+ unsigned long i_ack_queued;
+
+ /*
+ * Flow control related information
+ *
+ * Our algorithm uses a pair of variables that we need to access
+ * atomically - one for the send credits, and one for the posted
+ * recv credits we need to transfer to the remote.
+ * Rather than protect them using a slow spinlock, we put both into
+ * a single atomic_t and update it using cmpxchg.
+ */
+ atomic_t i_credits;
+
+ /* Protocol version specific information */
+ unsigned int i_flowctl:1; /* enable/disable flow ctl */
+
+ /* Batched completions */
+ unsigned int i_unsignaled_wrs;
+ long i_unsignaled_bytes;
+};
+
+/* This assumes that atomic_t is at least 32 bits */
+#define IB_GET_SEND_CREDITS(v) ((v) & 0xffff)
+#define IB_GET_POST_CREDITS(v) ((v) >> 16)
+#define IB_SET_SEND_CREDITS(v) ((v) & 0xffff)
+#define IB_SET_POST_CREDITS(v) ((v) << 16)
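+
+/*
+ * Illustrative only: if i_credits holds 0x00050003, IB_GET_SEND_CREDITS()
+ * yields 3 and IB_GET_POST_CREDITS() yields 5; advertising four newly
+ * posted recv buffers adds IB_SET_POST_CREDITS(4), i.e. 0x00040000,
+ * to i_credits atomically.
+ */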
+
+struct rdsv3_ib_ipaddr {
+ struct list_node list;
+ uint32_be_t ipaddr;
+};
+
+struct rdsv3_ib_device {
+ struct list_node list;
+ struct list ipaddr_list;
+ struct list conn_list;
+ ib_device_t *dev;
+ struct ib_pd *pd;
+ ibt_lkey_t local_dma_lkey;
+ struct rds_ib_mr_pool *mr_pool;
+ unsigned int fmr_max_remaps;
+ unsigned int max_fmrs;
+ unsigned int fmr_message_size;
+ int max_sge;
+ unsigned int max_wrs;
+ ibt_fmr_pool_hdl_t fmr_pool_hdl;
+ kmutex_t spinlock; /* protect the above */
+ ibt_hca_attr_t hca_attr;
+};
+
+/* bits for i_ack_flags */
+#define IB_ACK_IN_FLIGHT 0
+#define IB_ACK_REQUESTED 1
+
+/* Magic WR_ID for ACKs */
+#define RDSV3_IB_ACK_WR_ID (~(uint64_t)0)
+
+struct rdsv3_ib_statistics {
+ uint64_t s_ib_connect_raced;
+ uint64_t s_ib_listen_closed_stale;
+ uint64_t s_ib_tx_cq_call;
+ uint64_t s_ib_tx_cq_event;
+ uint64_t s_ib_tx_ring_full;
+ uint64_t s_ib_tx_throttle;
+ uint64_t s_ib_tx_sg_mapping_failure;
+ uint64_t s_ib_tx_stalled;
+ uint64_t s_ib_tx_credit_updates;
+ uint64_t s_ib_rx_cq_call;
+ uint64_t s_ib_rx_cq_event;
+ uint64_t s_ib_rx_ring_empty;
+ uint64_t s_ib_rx_refill_from_cq;
+ uint64_t s_ib_rx_refill_from_thread;
+ uint64_t s_ib_rx_alloc_limit;
+ uint64_t s_ib_rx_credit_updates;
+ uint64_t s_ib_ack_sent;
+ uint64_t s_ib_ack_send_failure;
+ uint64_t s_ib_ack_send_delayed;
+ uint64_t s_ib_ack_send_piggybacked;
+ uint64_t s_ib_ack_received;
+ uint64_t s_ib_rdma_mr_alloc;
+ uint64_t s_ib_rdma_mr_free;
+ uint64_t s_ib_rdma_mr_used;
+ uint64_t s_ib_rdma_mr_pool_flush;
+ uint64_t s_ib_rdma_mr_pool_wait;
+ uint64_t s_ib_rdma_mr_pool_depleted;
+};
+
+extern struct rdsv3_workqueue_struct_s *rds_ib_wq;
+
+/* ib.c */
+extern struct rdsv3_transport rdsv3_ib_transport;
+extern void rdsv3_ib_add_one(ib_device_t *device);
+extern void rdsv3_ib_remove_one(ib_device_t *device);
+extern struct ib_client rdsv3_ib_client;
+
+extern unsigned int fmr_pool_size;
+extern unsigned int fmr_message_size;
+extern unsigned int rdsv3_ib_retry_count;
+
+extern kmutex_t ib_nodev_conns_lock;
+extern struct list ib_nodev_conns;
+
+/* ib_cm.c */
+int rdsv3_ib_conn_alloc(struct rdsv3_connection *conn, int gfp);
+void rdsv3_ib_conn_free(void *arg);
+int rdsv3_ib_conn_connect(struct rdsv3_connection *conn);
+void rdsv3_ib_conn_shutdown(struct rdsv3_connection *conn);
+void rdsv3_conn_drop(struct rdsv3_connection *conn);
+int rdsv3_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
+ struct rdma_cm_event *event);
+int rdsv3_ib_cm_initiate_connect(struct rdma_cm_id *cm_id);
+void rdsv3_ib_cm_connect_complete(struct rdsv3_connection *conn,
+ struct rdma_cm_event *event);
+
+/* ib_rdma.c */
+int rdsv3_ib_update_ipaddr(struct rdsv3_ib_device *rds_ibdev,
+ uint32_be_t ipaddr);
+void rdsv3_ib_add_conn(struct rdsv3_ib_device *rds_ibdev,
+ struct rdsv3_connection *conn);
+void rdsv3_ib_remove_conn(struct rdsv3_ib_device *rds_ibdev,
+ struct rdsv3_connection *conn);
+void __rdsv3_ib_destroy_conns(struct list *list, kmutex_t *list_lock);
+static inline void rdsv3_ib_destroy_nodev_conns(void)
+{
+ __rdsv3_ib_destroy_conns(&ib_nodev_conns, &ib_nodev_conns_lock);
+}
+static inline void rdsv3_ib_destroy_conns(struct rdsv3_ib_device *rds_ibdev)
+{
+ __rdsv3_ib_destroy_conns(&rds_ibdev->conn_list, &rds_ibdev->spinlock);
+}
+
+int rdsv3_ib_create_mr_pool(struct rdsv3_ib_device *);
+void rdsv3_ib_destroy_mr_pool(struct rdsv3_ib_device *);
+void rdsv3_ib_get_mr_info(struct rdsv3_ib_device *rds_ibdev,
+ struct rdsv3_info_rdma_connection *iinfo);
+void *rdsv3_ib_get_mr(struct rdsv3_iovec *args, unsigned long nents,
+ struct rdsv3_sock *rs, uint32_t *key_ret);
+void rdsv3_ib_sync_mr(void *trans_private, int dir);
+void rdsv3_ib_free_mr(void *trans_private, int invalidate);
+void rdsv3_ib_flush_mrs(void);
+
+/* ib_recv.c */
+int rdsv3_ib_recv_init(void);
+void rdsv3_ib_recv_exit(void);
+int rdsv3_ib_recv(struct rdsv3_connection *conn);
+int rdsv3_ib_recv_refill(struct rdsv3_connection *conn, int kptr_gfp,
+ int page_gfp, int prefill);
+void rdsv3_ib_inc_purge(struct rdsv3_incoming *inc);
+void rdsv3_ib_inc_free(struct rdsv3_incoming *inc);
+int rdsv3_ib_inc_copy_to_user(struct rdsv3_incoming *inc, uio_t *uiop,
+ size_t size);
+void rdsv3_ib_recv_cq_comp_handler(struct ib_cq *cq, void *context);
+void rdsv3_ib_recv_tasklet_fn(void *data);
+void rdsv3_ib_recv_init_ring(struct rdsv3_ib_connection *ic);
+void rdsv3_ib_recv_clear_ring(struct rdsv3_ib_connection *ic);
+void rdsv3_ib_recv_init_ack(struct rdsv3_ib_connection *ic);
+void rdsv3_ib_attempt_ack(struct rdsv3_ib_connection *ic);
+void rdsv3_ib_ack_send_complete(struct rdsv3_ib_connection *ic);
+uint64_t rdsv3_ib_piggyb_ack(struct rdsv3_ib_connection *ic);
+
+/* ib_ring.c */
+void rdsv3_ib_ring_init(struct rdsv3_ib_work_ring *ring, uint32_t nr);
+void rdsv3_ib_ring_resize(struct rdsv3_ib_work_ring *ring, uint32_t nr);
+uint32_t rdsv3_ib_ring_alloc(struct rdsv3_ib_work_ring *ring, uint32_t val,
+ uint32_t *pos);
+void rdsv3_ib_ring_free(struct rdsv3_ib_work_ring *ring, uint32_t val);
+void rdsv3_ib_ring_unalloc(struct rdsv3_ib_work_ring *ring, uint32_t val);
+int rdsv3_ib_ring_empty(struct rdsv3_ib_work_ring *ring);
+int rdsv3_ib_ring_low(struct rdsv3_ib_work_ring *ring);
+uint32_t rdsv3_ib_ring_oldest(struct rdsv3_ib_work_ring *ring);
+uint32_t rdsv3_ib_ring_completed(struct rdsv3_ib_work_ring *ring,
+ uint32_t wr_id, uint32_t oldest);
+extern rdsv3_wait_queue_t rdsv3_ib_ring_empty_wait;
+
+/* ib_send.c */
+void rdsv3_ib_xmit_complete(struct rdsv3_connection *conn);
+int rdsv3_ib_xmit(struct rdsv3_connection *conn, struct rdsv3_message *rm,
+ unsigned int hdr_off, unsigned int sg, unsigned int off);
+void rdsv3_ib_send_cq_comp_handler(struct ib_cq *cq, void *context);
+void rdsv3_ib_send_init_ring(struct rdsv3_ib_connection *ic);
+void rdsv3_ib_send_clear_ring(struct rdsv3_ib_connection *ic);
+int rdsv3_ib_xmit_rdma(struct rdsv3_connection *conn, struct rdsv3_rdma_op *op);
+void rdsv3_ib_send_add_credits(struct rdsv3_connection *conn,
+ unsigned int credits);
+void rdsv3_ib_advertise_credits(struct rdsv3_connection *conn,
+ unsigned int posted);
+int rdsv3_ib_send_grab_credits(struct rdsv3_ib_connection *ic, uint32_t wanted,
+ uint32_t *adv_credits, int need_posted, int max_posted);
+
+/* ib_stats.c */
+RDSV3_DECLARE_PER_CPU(struct rdsv3_ib_statistics, rdsv3_ib_stats);
+#define rdsv3_ib_stats_inc(member) rdsv3_stats_inc_which(rdsv3_ib_stats, member)
+unsigned int rdsv3_ib_stats_info_copy(struct rdsv3_info_iterator *iter,
+ unsigned int avail);
+
+/* ib_sysctl.c */
+int rdsv3_ib_sysctl_init(void);
+void rdsv3_ib_sysctl_exit(void);
+extern unsigned long rdsv3_ib_sysctl_max_send_wr;
+extern unsigned long rdsv3_ib_sysctl_max_recv_wr;
+extern unsigned long rdsv3_ib_sysctl_max_unsig_wrs;
+extern unsigned long rdsv3_ib_sysctl_max_unsig_bytes;
+extern unsigned long rdsv3_ib_sysctl_max_recv_allocation;
+extern unsigned int rdsv3_ib_sysctl_flow_control;
+
+#endif /* _RDSV3_IB_H */
diff --git a/usr/src/uts/common/sys/ib/clients/rdsv3/info.h b/usr/src/uts/common/sys/ib/clients/rdsv3/info.h
new file mode 100644
index 0000000000..cda82e0b0f
--- /dev/null
+++ b/usr/src/uts/common/sys/ib/clients/rdsv3/info.h
@@ -0,0 +1,59 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _RDSV3_INFO_H
+#define _RDSV3_INFO_H
+
+struct rdsv3_info_iterator {
+ char *addr;
+ unsigned long offset;
+};
+
+struct rdsv3_info_lengths {
+ unsigned int nr;
+ unsigned int each;
+};
+
+struct rdsv3_sock;
+
+/*
+ * These functions must fill in the fields of @lens to reflect the size
+ * of the available info source. If the snapshot fits in @len then it
+ * should be copied using @iter. The caller will deduce if it was copied
+ * or not by comparing the lengths.
+ */
+typedef void (*rdsv3_info_func)(struct rsock *sock, unsigned int len,
+ struct rdsv3_info_iterator *iter,
+ struct rdsv3_info_lengths *lens);
+
+#define rdsv3_info_copy(iter, data, bytes) \
+ do { \
+ bcopy(data, (iter)->addr + (iter)->offset, bytes); \
+ (iter)->offset += bytes; \
+ } while (0)
+
+void rdsv3_info_register_func(int optname, rdsv3_info_func func);
+void rdsv3_info_deregister_func(int optname, rdsv3_info_func func);
+int rdsv3_info_getsockopt(struct rsock *sock, int optname, char *optval,
+ socklen_t *optlen);
+
+#endif /* _RDSV3_INFO_H */
diff --git a/usr/src/uts/common/sys/ib/clients/rdsv3/loop.h b/usr/src/uts/common/sys/ib/clients/rdsv3/loop.h
new file mode 100644
index 0000000000..240a57aed5
--- /dev/null
+++ b/usr/src/uts/common/sys/ib/clients/rdsv3/loop.h
@@ -0,0 +1,33 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _RDSV3_LOOP_H
+#define _RDSV3_LOOP_H
+
+/* loop.c */
+extern struct rdsv3_transport rdsv3_loop_transport;
+
+void rdsv3_loop_exit(void);
+
+#endif /* _RDSV3_LOOP_H */
diff --git a/usr/src/uts/common/sys/ib/clients/rdsv3/rdma.h b/usr/src/uts/common/sys/ib/clients/rdsv3/rdma.h
new file mode 100644
index 0000000000..b2e6322808
--- /dev/null
+++ b/usr/src/uts/common/sys/ib/clients/rdsv3/rdma.h
@@ -0,0 +1,120 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _RDSV3_RDMA_H
+#define _RDSV3_RDMA_H
+
+#include <sys/rds.h>
+#include <sys/uio.h>
+
+#include <sys/ib/clients/rdsv3/rdsv3.h>
+
+struct rdsv3_mr {
+ /* for AVL tree */
+ avl_node_t r_rb_node;
+ atomic_t r_refcount;
+ uint32_t r_key;
+
+ /* A copy of the creation flags */
+ unsigned int r_use_once:1;
+ unsigned int r_invalidate:1;
+ unsigned int r_write:1;
+
+ /*
+ * This is for RDS_MR_DEAD.
+ * It would be nice & consistent to make this part of the above
+ * bit field here, but we need to use test_and_set_bit.
+ */
+ unsigned long r_state;
+ /* back pointer to the socket that owns us */
+ struct rdsv3_sock *r_sock;
+ struct rdsv3_transport *r_trans;
+ void *r_trans_private;
+};
+
+/* Flags for mr->r_state */
+#define RDSV3_MR_DEAD 0
+
+struct rdsv3_rdma_sg {
+ ddi_umem_cookie_t umem_cookie;
+ struct rdsv3_iovec iovec;
+ ibt_send_wr_t swr;
+ ibt_mi_hdl_t mihdl;
+ ibt_hca_hdl_t hca_hdl;
+};
+
+struct rdsv3_rdma_op {
+ uint32_t r_key;
+ uint64_t r_remote_addr;
+ unsigned int r_write:1;
+ unsigned int r_fence:1;
+ unsigned int r_notify:1;
+ unsigned int r_recverr:1;
+ unsigned int r_mapped:1;
+ struct rdsv3_notifier *r_notifier;
+ unsigned int r_bytes;
+ unsigned int r_nents;
+ unsigned int r_count;
+ struct rdsv3_scatterlist *r_sg;
+ struct rdsv3_rdma_sg r_rdma_sg[1];
+};
+
+static inline rdsv3_rdma_cookie_t
+rdsv3_rdma_make_cookie(uint32_t r_key, uint32_t offset)
+{
+ return (r_key | (((uint64_t)offset) << 32));
+}
+
+static inline uint32_t
+rdsv3_rdma_cookie_key(rdsv3_rdma_cookie_t cookie)
+{
+ return ((uint32_t)cookie);
+}
+
+static inline uint32_t
+rdsv3_rdma_cookie_offset(rdsv3_rdma_cookie_t cookie)
+{
+ return (cookie >> 32);
+}
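+
+/*
+ * Round-trip example with illustrative values: packing r_key 0x1234 with
+ * offset 8 yields the cookie 0x0000000800001234, from which
+ * rdsv3_rdma_cookie_key() recovers 0x1234 and rdsv3_rdma_cookie_offset()
+ * recovers 8.
+ */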
+
+int rdsv3_get_mr(struct rdsv3_sock *rs, const void *optval, int optlen);
+int rdsv3_free_mr(struct rdsv3_sock *rs, const void *optval, int optlen);
+void rdsv3_rdma_drop_keys(struct rdsv3_sock *rs);
+int rdsv3_cmsg_rdma_args(struct rdsv3_sock *rs, struct rdsv3_message *rm,
+ struct cmsghdr *cmsg);
+int rdsv3_cmsg_rdma_dest(struct rdsv3_sock *rs, struct rdsv3_message *rm,
+ struct cmsghdr *cmsg);
+int rdsv3_cmsg_rdma_map(struct rdsv3_sock *rs, struct rdsv3_message *rm,
+ struct cmsghdr *cmsg);
+void rdsv3_rdma_free_op(struct rdsv3_rdma_op *ro);
+void rdsv3_rdma_send_complete(struct rdsv3_message *rm, int);
+
+extern void __rdsv3_put_mr_final(struct rdsv3_mr *mr);
+static inline void rdsv3_mr_put(struct rdsv3_mr *mr)
+{
+ if (atomic_dec_and_test(&mr->r_refcount))
+ __rdsv3_put_mr_final(mr);
+}
+
+#endif /* _RDSV3_RDMA_H */
diff --git a/usr/src/uts/common/sys/ib/clients/rdsv3/rdma_transport.h b/usr/src/uts/common/sys/ib/clients/rdsv3/rdma_transport.h
new file mode 100644
index 0000000000..b3e5283cdb
--- /dev/null
+++ b/usr/src/uts/common/sys/ib/clients/rdsv3/rdma_transport.h
@@ -0,0 +1,44 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _RDSV3_RDMA_TRANSPORT_H
+#define _RDSV3_RDMA_TRANSPORT_H
+
+#include "rdsv3.h"
+
+#define RDSV3_RDMA_RESOLVE_TIMEOUT_MS 5000
+
+int rdsv3_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
+ struct rdma_cm_event *event);
+
+/* from rdma_transport.c */
+void rdsv3_rdma_init(void);
+void rdsv3_rdma_exit(void *);
+
+/* from ib.c */
+extern struct rdsv3_transport rdsv3_ib_transport;
+int rdsv3_ib_init(void);
+void rdsv3_ib_exit(void);
+
+#endif /* _RDSV3_RDMA_TRANSPORT_H */
diff --git a/usr/src/uts/common/sys/ib/clients/rdsv3/rdsv3.h b/usr/src/uts/common/sys/ib/clients/rdsv3/rdsv3.h
new file mode 100644
index 0000000000..498852bc70
--- /dev/null
+++ b/usr/src/uts/common/sys/ib/clients/rdsv3/rdsv3.h
@@ -0,0 +1,790 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _RDSV3_RDSV3_H
+#define _RDSV3_RDSV3_H
+
+/*
+ * The name of this file is rds.h in ofed.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/sunndi.h>
+#include <netinet/in.h>
+#include <sys/synch.h>
+#include <sys/stropts.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <inet/ip.h>
+#include <sys/avl.h>
+#include <sys/param.h>
+#include <sys/rds.h>
+
+#include <sys/ib/ibtl/ibti.h>
+#include <sys/ib/clients/of/rdma/ib_verbs.h>
+#include <sys/ib/clients/of/rdma/ib_addr.h>
+#include <sys/ib/clients/of/rdma/rdma_cm.h>
+#include <sys/ib/clients/rdsv3/rdsv3_impl.h>
+#include <sys/ib/clients/rdsv3/info.h>
+
+#define NIPQUAD(addr) \
+ (unsigned char)((ntohl(addr) >> 24) & 0xFF), \
+ (unsigned char)((ntohl(addr) >> 16) & 0xFF), \
+ (unsigned char)((ntohl(addr) >> 8) & 0xFF), \
+ (unsigned char)(ntohl(addr) & 0xFF)
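+
+/*
+ * For example, with addr holding 192.168.1.1 in network byte order,
+ * NIPQUAD(addr) expands to the four arguments 192, 168, 1, 1, suitable
+ * for a "%u.%u.%u.%u" format string.
+ */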
+
+/*
+ * RDS Network protocol version
+ */
+#define RDS_PROTOCOL_3_0 0x0300
+#define RDS_PROTOCOL_3_1 0x0301
+#define RDS_PROTOCOL_VERSION RDS_PROTOCOL_3_1
+#define RDS_PROTOCOL_MAJOR(v) ((v) >> 8)
+#define RDS_PROTOCOL_MINOR(v) ((v) & 255)
+#define RDS_PROTOCOL(maj, min) (((maj) << 8) | min)
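+
+/*
+ * For example, RDS_PROTOCOL(3, 1) == 0x0301 == RDS_PROTOCOL_3_1, from
+ * which RDS_PROTOCOL_MAJOR() recovers 3 and RDS_PROTOCOL_MINOR() 1.
+ */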
+
+/*
+ * XXX randomly chosen, but at least seems to be unused:
+ * # 18464-18768 Unassigned
+ * We should do better. We want a reserved port to discourage unpriv'ed
+ * userspace from listening.
+ *
+ * port 18633 was the version that had ack frames on the wire.
+ */
+#define RDSV3_PORT 18634
+
+/*
+ * RDS trace facilities
+ */
+enum {
+ RDSV3_BIND = 0,
+ RDSV3_CONG,
+ RDSV3_CONNECTION,
+ RDSV3_RDMA,
+ RDSV3_PAGE,
+ RDSV3_SEND,
+ RDSV3_RECV,
+ RDSV3_THREADS,
+ RDSV3_INFO,
+ RDSV3_MESSAGE,
+ RDSV3_IB,
+ RDSV3_IB_CM,
+ RDSV3_IB_RDMA,
+ RDSV3_IB_RING,
+ RDSV3_IB_RECV,
+ RDSV3_IB_SEND,
+ RDSV3_TCP,
+ RDSV3_TCP_CONNECT,
+ RDSV3_TCP_LISTEN,
+ RDSV3_TCP_RECV,
+ RDSV3_TCP_SEND
+};
+
+enum {
+ RDSV3_ALWAYS = 0,
+ RDSV3_MINIMAL,
+ RDSV3_LOW,
+ RDSV3_MEDIUM,
+ RDSV3_HIGH,
+ RDSV3_VERBOSE
+};
+
+/*
+ * This is unfortunate. Some kernels have a bug in the per_cpu() api which
+ * makes DEFINE_PER_CPU trigger an oops on insmod because the per-cpu section
+ * in the module is not cacheline-aligned. As much as we'd like to tell users
+ * with older kernels to stuff it, that's not reasonable. We'll roll our own
+ * until this doesn't have to build against older kernels.
+ */
+#define RDSV3_DEFINE_PER_CPU(type, var) type var[NR_CPUS]
+#define RDSV3_DECLARE_PER_CPU(type, var) extern type var[NR_CPUS]
+#define rdsv3_per_cpu(var, cpu) var[cpu]
+
+static inline ulong_t
+ceil(ulong_t x, ulong_t y)
+{
+ return ((x + y - 1) / y);
+}
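+
+/* e.g. ceil(10, 4) == 3: integer division rounded up */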
+
+#define RDSV3_FRAG_SHIFT 12
+#define RDSV3_FRAG_SIZE ((unsigned int)(1 << RDSV3_FRAG_SHIFT))
+
+#define RDSV3_CONG_MAP_BYTES (65536 / 8)
+#define RDSV3_CONG_MAP_LONGS (RDSV3_CONG_MAP_BYTES / sizeof (unsigned long))
+#define RDSV3_CONG_MAP_PAGES (RDSV3_CONG_MAP_BYTES / PAGE_SIZE)
+#define RDSV3_CONG_MAP_PAGE_BITS (PAGE_SIZE * 8)
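+
+/*
+ * Illustrative sizing, assuming 4K pages and 64-bit longs: the map covers
+ * 65536 ports (one bit each) in 8192 bytes, i.e. 1024 longs spread across
+ * 2 pages of 32768 bits each.
+ */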
+
+struct rdsv3_cong_map {
+ struct avl_node m_rb_node;
+ uint32_be_t m_addr;
+ rdsv3_wait_queue_t m_waitq;
+ struct list m_conn_list;
+ unsigned long m_page_addrs[RDSV3_CONG_MAP_PAGES];
+};
+
+
+/*
+ * This is how we will track the connection state:
+ * A connection is always in one of the following
+ * states. Updates to the state are atomic and imply
+ * a memory barrier.
+ */
+enum {
+ RDSV3_CONN_DOWN = 0,
+ RDSV3_CONN_CONNECTING,
+ RDSV3_CONN_DISCONNECTING,
+ RDSV3_CONN_UP,
+ RDSV3_CONN_ERROR,
+};
+
+/* Bits for c_flags */
+#define RDSV3_LL_SEND_FULL 0
+#define RDSV3_RECONNECT_PENDING 1
+
+struct rdsv3_connection {
+ struct avl_node c_hash_node;
+ uint32_be_t c_laddr;
+ uint32_be_t c_faddr;
+ unsigned int c_loopback:1;
+ struct rdsv3_connection *c_passive;
+
+ struct rdsv3_cong_map *c_lcong;
+ struct rdsv3_cong_map *c_fcong;
+
+ struct mutex c_send_lock; /* protect send ring */
+ struct rdsv3_message *c_xmit_rm;
+ unsigned long c_xmit_sg;
+ unsigned int c_xmit_hdr_off;
+ unsigned int c_xmit_data_off;
+ unsigned int c_xmit_rdma_sent;
+
+ kmutex_t c_lock; /* protect msg queues */
+ uint64_t c_next_tx_seq;
+ struct list c_send_queue;
+ struct list c_retrans;
+
+ uint64_t c_next_rx_seq;
+
+ struct rdsv3_transport *c_trans;
+ void *c_transport_data;
+
+ atomic_t c_state;
+ unsigned long c_flags;
+ unsigned long c_reconnect_jiffies;
+ struct rdsv3_delayed_work_s c_send_w;
+ struct rdsv3_delayed_work_s c_recv_w;
+ struct rdsv3_delayed_work_s c_conn_w;
+ struct rdsv3_work_s c_down_w;
+ struct mutex c_cm_lock; /* protect conn state & cm */
+
+ struct list_node c_map_item;
+ unsigned long c_map_queued;
+ unsigned long c_map_offset;
+ unsigned long c_map_bytes;
+
+ unsigned int c_unacked_packets;
+ unsigned int c_unacked_bytes;
+
+ /* Protocol version */
+ unsigned int c_version;
+};
+
+#define RDSV3_FLAG_CONG_BITMAP 0x01
+#define RDSV3_FLAG_ACK_REQUIRED 0x02
+#define RDSV3_FLAG_RETRANSMITTED 0x04
+#define RDSV3_MAX_ADV_CREDIT 255
+
+/*
+ * Maximum space available for extension headers.
+ */
+#define RDSV3_HEADER_EXT_SPACE 16
+
+struct rdsv3_header {
+ uint64_be_t h_sequence;
+ uint64_be_t h_ack;
+ uint32_be_t h_len;
+ uint16_be_t h_sport;
+ uint16_be_t h_dport;
+ uint8_t h_flags;
+ uint8_t h_credit;
+ uint8_t h_padding[4];
+ uint16_be_t h_csum;
+
+ uint8_t h_exthdr[RDSV3_HEADER_EXT_SPACE];
+};
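+
+/*
+ * The layout above packs to 48 bytes with no implicit padding: 8 + 8
+ * (sequence, ack), 4 (len), 2 + 2 (ports), 1 + 1 (flags, credit), 4
+ * (padding), 2 (csum), plus the 16-byte extension header area.
+ */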
+
+/* Reserved - indicates end of extensions */
+#define RDSV3_EXTHDR_NONE 0
+
+/*
+ * This extension header is included in the very
+ * first message that is sent on a new connection,
+ * and identifies the protocol level. This will help
+ * rolling updates if a future change requires breaking
+ * the protocol.
+ */
+#define RDSV3_EXTHDR_VERSION 1
+struct rdsv3_ext_header_version {
+ uint32_be_t h_version;
+};
+
+/*
+ * This extension header is included in the RDS message
+ * chasing an RDMA operation.
+ */
+#define RDSV3_EXTHDR_RDMA 2
+struct rdsv3_ext_header_rdma {
+ uint32_be_t h_rdma_rkey;
+};
+
+/*
+ * This extension header tells the peer about the
+ * destination <R_Key,offset> of the requested RDMA
+ * operation.
+ */
+#define RDSV3_EXTHDR_RDMA_DEST 3
+struct rdsv3_ext_header_rdma_dest {
+ uint32_be_t h_rdma_rkey;
+ uint32_be_t h_rdma_offset;
+};
+
+#define __RDSV3_EXTHDR_MAX 16 /* for now */
+
+struct rdsv3_incoming {
+ atomic_t i_refcount;
+ struct list_node i_item;
+ struct rdsv3_connection *i_conn;
+ struct rdsv3_header i_hdr;
+ unsigned long i_rx_jiffies;
+ uint32_be_t i_saddr;
+
+ rdsv3_rdma_cookie_t i_rdma_cookie;
+};
+
+/*
+ * m_sock_item and m_conn_item are on lists that are serialized under
+ * conn->c_lock. m_sock_item has additional meaning in that once it is empty
+ * the message will not be put back on the retransmit list after being sent.
+ * Messages that are canceled while being sent rely on this.
+ *
+ * m_inc is used by loopback so that it can pass an incoming message straight
+ * back up into the rx path. It embeds a wire header which is also used by
+ * the send path, which is kind of awkward.
+ *
+ * m_sock_item indicates the message's presence on a socket's send or receive
+ * queue. m_rs will point to that socket.
+ *
+ * m_daddr is used by cancellation to prune messages to a given destination.
+ *
+ * The RDS_MSG_ON_SOCK and RDS_MSG_ON_CONN flags are used to avoid lock
+ * nesting. As paths iterate over messages on a sock, or conn, they must
+ * also lock the conn, or sock, to remove the message from those lists too.
+ * Testing the flag to determine if the message is still on the lists lets
+ * us avoid testing the list_head directly. That means each path can use
+ * the message's list_head to keep it on a local list while juggling locks
+ * without confusing the other path.
+ *
+ * m_ack_seq is an optional field set by transports who need a different
+ * sequence number range to invalidate. They can use this in a callback
+ * that they pass to rdsv3_send_drop_acked() to see if each message has been
+ * acked. The HAS_ACK_SEQ flag can be used to detect messages which haven't
+ * had ack_seq set yet.
+ */
+#define RDSV3_MSG_ON_SOCK 1
+#define RDSV3_MSG_ON_CONN 2
+#define RDSV3_MSG_HAS_ACK_SEQ 3
+#define RDSV3_MSG_ACK_REQUIRED 4
+#define RDSV3_MSG_RETRANSMITTED 5
+#define RDSV3_MSG_MAPPED 6
+#define RDSV3_MSG_PAGEVEC 7
+
+struct rdsv3_message {
+ atomic_t m_refcount;
+ struct list_node m_sock_item;
+ struct list_node m_conn_item;
+ struct rdsv3_incoming m_inc;
+ uint64_t m_ack_seq;
+ uint32_be_t m_daddr;
+ unsigned long m_flags;
+
+ /*
+ * Never access m_rs without holding m_rs_lock.
+ * Lock nesting is
+ * rm->m_rs_lock
+ * -> rs->rs_lock
+ */
+ kmutex_t m_rs_lock;
+ struct rdsv3_sock *m_rs;
+ struct rdsv3_rdma_op *m_rdma_op;
+ rdsv3_rdma_cookie_t m_rdma_cookie;
+ struct rdsv3_mr *m_rdma_mr;
+ unsigned int m_nents;
+ unsigned int m_count;
+ struct rdsv3_scatterlist m_sg[1];
+};
+
+/*
+ * The RDS notifier is used (optionally) to tell the application about
+ * completed RDMA operations. Rather than keeping the whole rds message
+ * around on the queue, we allocate a small notifier that is put on the
+ * socket's notifier_list. Notifications are delivered to the application
+ * through control messages.
+ */
+struct rdsv3_notifier {
+ list_node_t n_list;
+ uint64_t n_user_token;
+ int n_status;
+};
+
+/*
+ * struct rdsv3_transport - transport specific behavioural hooks
+ *
+ * @xmit: .xmit is called by rdsv3_send_xmit() to tell the transport to send
+ * part of a message. The caller serializes on the send_sem so this
+ * doesn't need to be reentrant for a given conn. The header must be
+ * sent before the data payload. .xmit must be prepared to send a
+ * message with no data payload. .xmit should return the number of
+ * bytes that were sent down the connection, including header bytes.
+ * Returning 0 tells the caller that it doesn't need to perform any
+ * additional work now. This is usually the case when the transport has
+ * filled the sending queue for its connection and will handle
+ * triggering the rds thread to continue the send when space becomes
+ * available. Returning -EAGAIN tells the caller to retry the send
+ * immediately. Returning -ENOMEM tells the caller to retry the send at
+ * some point in the future.
+ *
+ * @conn_shutdown: conn_shutdown stops traffic on the given connection. Once
+ * it returns, the connection can not call rdsv3_recv_incoming().
+ * This will only be called once after conn_connect returns
+ * non-zero success. The caller serializes this with
+ * the send and connecting paths (xmit_* and conn_*). The
+ * transport is responsible for other serialization, including
+ * rdsv3_recv_incoming(). This is called in process context but
+ * should try hard not to block.
+ *
+ * @xmit_cong_map: This asks the transport to send the local bitmap down the
+ * given connection. XXX get a better story about the bitmap
+ * flag and header.
+ */
+
+struct rdsv3_transport {
+ struct list_node t_item;
+ char *t_name;
+ unsigned int t_prefer_loopback:1;
+
+ int (*laddr_check)(uint32_be_t addr);
+ int (*conn_alloc)(struct rdsv3_connection *conn, int gfp);
+ void (*conn_free)(void *data);
+ int (*conn_connect)(struct rdsv3_connection *conn);
+ void (*conn_shutdown)(struct rdsv3_connection *conn);
+ void (*xmit_prepare)(struct rdsv3_connection *conn);
+ void (*xmit_complete)(struct rdsv3_connection *conn);
+ int (*xmit)(struct rdsv3_connection *conn, struct rdsv3_message *rm,
+ unsigned int hdr_off, unsigned int sg, unsigned int off);
+ int (*xmit_cong_map)(struct rdsv3_connection *conn,
+ struct rdsv3_cong_map *map, unsigned long offset);
+ int (*xmit_rdma)(struct rdsv3_connection *conn,
+ struct rdsv3_rdma_op *op);
+ int (*recv)(struct rdsv3_connection *conn);
+ int (*inc_copy_to_user)(struct rdsv3_incoming *inc, uio_t *uio,
+ size_t size);
+ void (*inc_purge)(struct rdsv3_incoming *inc);
+ void (*inc_free)(struct rdsv3_incoming *inc);
+
+ int (*cm_handle_connect)(struct rdma_cm_id *cm_id,
+ struct rdma_cm_event *event);
+ int (*cm_initiate_connect)(struct rdma_cm_id *cm_id);
+ void (*cm_connect_complete)(struct rdsv3_connection *conn,
+ struct rdma_cm_event *event);
+
+ unsigned int (*stats_info_copy)(struct rdsv3_info_iterator *iter,
+ unsigned int avail);
+ void (*exit)(void);
+ void *(*get_mr)(struct rdsv3_iovec *sg, unsigned long nr_sg,
+ struct rdsv3_sock *rs, uint32_t *key_ret);
+ void (*sync_mr)(void *trans_private, int direction);
+ void (*free_mr)(void *trans_private, int invalidate);
+ void (*flush_mrs)(void);
+};
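+
+/*
+ * Registration sketch (illustrative only): a transport fills in the
+ * hooks it supports and registers itself:
+ *
+ *    static struct rdsv3_transport my_trans = {
+ *        .t_name = "mytrans",
+ *        .laddr_check = my_laddr_check,
+ *        .xmit = my_xmit,
+ *        ...
+ *    };
+ *    (void) rdsv3_trans_register(&my_trans);
+ *
+ * my_trans and its callbacks are hypothetical names; see the
+ * transport.c declarations below.
+ */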
+
+struct rdsv3_sock {
+ struct rsock *rs_sk;
+
+ uint64_t rs_user_addr;
+ uint64_t rs_user_bytes;
+
+ /*
+ * bound_addr used for both incoming and outgoing, no INADDR_ANY
+ * support.
+ */
+ struct avl_node rs_bound_node;
+ uint32_be_t rs_bound_addr;
+ uint32_be_t rs_conn_addr;
+ uint16_be_t rs_bound_port;
+ uint16_be_t rs_conn_port;
+
+ /*
+ * This is only used to communicate the transport between bind and
+ * initiating connections. All other trans use is referenced through
+ * the connection.
+ */
+ struct rdsv3_transport *rs_transport;
+
+ /*
+ * rdsv3_sendmsg caches the conn it used the last time around.
+ * This helps avoid costly lookups.
+ */
+ struct rdsv3_connection *rs_conn;
+ kmutex_t rs_conn_lock;
+
+ /* flag indicating we were congested or not */
+ int rs_congested;
+
+ /* rs_lock protects all these adjacent members before the newline */
+ kmutex_t rs_lock;
+ struct list rs_send_queue;
+ uint32_t rs_snd_bytes;
+ int rs_rcv_bytes;
+ /* currently used for failed RDMAs */
+ struct list rs_notify_queue;
+
+ /*
+ * Congestion wake_up. If rs_cong_monitor is set, we use cong_mask
+ * to decide whether the application should be woken up.
+ * If not set, we use rs_cong_track to find out whether a cong map
+ * update arrived.
+ */
+ uint64_t rs_cong_mask;
+ uint64_t rs_cong_notify;
+ struct list_node rs_cong_list;
+ unsigned long rs_cong_track;
+
+ /*
+ * rs_recv_lock protects the receive queue, and is
+ * used to serialize with rdsv3_release.
+ */
+ krwlock_t rs_recv_lock;
+ struct list rs_recv_queue;
+
+ /* just for stats reporting */
+ struct list_node rs_item;
+
+ /* these have their own lock */
+ kmutex_t rs_rdma_lock;
+ struct avl_tree rs_rdma_keys;
+
+ /* Socket options - in case there will be more */
+ unsigned char rs_recverr,
+ rs_cong_monitor;
+
+ cred_t *rs_cred;
+ zoneid_t rs_zoneid;
+};
+
+inline struct rdsv3_sock *
+rdsv3_sk_to_rs(const struct rsock *sk)
+{
+ return ((struct rdsv3_sock *)sk->sk_protinfo);
+}
+
+inline struct rsock *
+rdsv3_rs_to_sk(const struct rdsv3_sock *rs)
+{
+ return ((struct rsock *)rs->rs_sk);
+}
+
+/*
+ * The stack assigns sk_sndbuf and sk_rcvbuf to twice the specified value
+ * to account for overhead. We don't account for overhead; we just apply
+ * the number of payload bytes to the specified value.
+ */
+inline int
+rdsv3_sk_sndbuf(struct rdsv3_sock *rs)
+{
+ /* XXX */
+ return (rdsv3_rs_to_sk(rs)->sk_sndbuf);
+}
+
+inline int
+rdsv3_sk_rcvbuf(struct rdsv3_sock *rs)
+{
+ /* XXX */
+ return (rdsv3_rs_to_sk(rs)->sk_rcvbuf);
+}
+
+struct rdsv3_statistics {
+ uint64_t s_conn_reset;
+ uint64_t s_recv_drop_bad_checksum;
+ uint64_t s_recv_drop_old_seq;
+ uint64_t s_recv_drop_no_sock;
+ uint64_t s_recv_drop_dead_sock;
+ uint64_t s_recv_deliver_raced;
+ uint64_t s_recv_delivered;
+ uint64_t s_recv_queued;
+ uint64_t s_recv_immediate_retry;
+ uint64_t s_recv_delayed_retry;
+ uint64_t s_recv_ack_required;
+ uint64_t s_recv_rdma_bytes;
+ uint64_t s_recv_ping;
+ uint64_t s_send_queue_empty;
+ uint64_t s_send_queue_full;
+ uint64_t s_send_sem_contention;
+ uint64_t s_send_sem_queue_raced;
+ uint64_t s_send_immediate_retry;
+ uint64_t s_send_delayed_retry;
+ uint64_t s_send_drop_acked;
+ uint64_t s_send_ack_required;
+ uint64_t s_send_queued;
+ uint64_t s_send_rdma;
+ uint64_t s_send_rdma_bytes;
+ uint64_t s_send_pong;
+ uint64_t s_page_remainder_hit;
+ uint64_t s_page_remainder_miss;
+ uint64_t s_copy_to_user;
+ uint64_t s_copy_from_user;
+ uint64_t s_cong_update_queued;
+ uint64_t s_cong_update_received;
+ uint64_t s_cong_send_error;
+ uint64_t s_cong_send_blocked;
+};
+
+/* af_rds.c */
+void rdsv3_sock_addref(struct rdsv3_sock *rs);
+void rdsv3_sock_put(struct rdsv3_sock *rs);
+void rdsv3_wake_sk_sleep(struct rdsv3_sock *rs);
+void __rdsv3_wake_sk_sleep(struct rsock *sk);
+
+extern rdsv3_wait_queue_t rdsv3_poll_waitq;
+
+/* bind.c */
+int rdsv3_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa,
+ socklen_t len, cred_t *cr);
+void rdsv3_remove_bound(struct rdsv3_sock *rs);
+struct rdsv3_sock *rdsv3_find_bound(uint32_be_t addr, uint16_be_t port);
+
+/* conn.c */
+int rdsv3_conn_init(void);
+void rdsv3_conn_exit(void);
+struct rdsv3_connection *rdsv3_conn_create(uint32_be_t laddr, uint32_be_t faddr,
+ struct rdsv3_transport *trans, int gfp);
+struct rdsv3_connection *rdsv3_conn_create_outgoing(uint32_be_t laddr,
+ uint32_be_t faddr,
+ struct rdsv3_transport *trans, int gfp);
+void rdsv3_conn_destroy(struct rdsv3_connection *conn);
+void rdsv3_conn_reset(struct rdsv3_connection *conn);
+void rdsv3_conn_drop(struct rdsv3_connection *conn);
+void rdsv3_for_each_conn_info(struct rsock *sock, unsigned int len,
+ struct rdsv3_info_iterator *iter,
+ struct rdsv3_info_lengths *lens,
+ int (*visitor)(struct rdsv3_connection *, void *),
+ size_t item_len);
+
+static inline int
+rdsv3_conn_transition(struct rdsv3_connection *conn, int old, int new)
+{
+ return (atomic_cmpxchg(&conn->c_state, old, new) == old);
+}
+
+inline int
+rdsv3_conn_state(struct rdsv3_connection *conn)
+{
+ return (atomic_get(&conn->c_state));
+}
+
+inline int
+rdsv3_conn_up(struct rdsv3_connection *conn)
+{
+ return (atomic_get(&conn->c_state) == RDSV3_CONN_UP);
+}
+
+inline int
+rdsv3_conn_connecting(struct rdsv3_connection *conn)
+{
+ return (atomic_get(&conn->c_state) == RDSV3_CONN_CONNECTING);
+}
+
+/* recv.c */
+void rdsv3_inc_init(struct rdsv3_incoming *inc, struct rdsv3_connection *conn,
+ uint32_be_t saddr);
+void rdsv3_inc_addref(struct rdsv3_incoming *inc);
+void rdsv3_inc_put(struct rdsv3_incoming *inc);
+void rdsv3_recv_incoming(struct rdsv3_connection *conn, uint32_be_t saddr,
+ uint32_be_t daddr,
+ struct rdsv3_incoming *inc, int gfp);
+int rdsv3_recvmsg(struct rdsv3_sock *rs, uio_t *uio,
+ struct msghdr *msg, size_t size, int msg_flags);
+void rdsv3_clear_recv_queue(struct rdsv3_sock *rs);
+int rdsv3_notify_queue_get(struct rdsv3_sock *rs, struct msghdr *msg);
+void rdsv3_inc_info_copy(struct rdsv3_incoming *inc,
+ struct rdsv3_info_iterator *iter,
+ uint32_be_t saddr, uint32_be_t daddr, int flip);
+
+/* page.c */
+int rdsv3_page_remainder_alloc(struct rdsv3_scatterlist *scat,
+ unsigned long bytes, int gfp);
+
+/* send.c */
+int rdsv3_sendmsg(struct rdsv3_sock *rs, uio_t *uio, struct nmsghdr *msg,
+ size_t payload_len);
+void rdsv3_send_reset(struct rdsv3_connection *conn);
+int rdsv3_send_xmit(struct rdsv3_connection *conn);
+struct sockaddr_in;
+void rdsv3_send_drop_to(struct rdsv3_sock *rs, struct sockaddr_in *dest);
+typedef int (*is_acked_func)(struct rdsv3_message *rm, uint64_t ack);
+void rdsv3_send_drop_acked(struct rdsv3_connection *conn, uint64_t ack,
+ is_acked_func is_acked);
+int rdsv3_send_acked_before(struct rdsv3_connection *conn, uint64_t seq);
+void rdsv3_send_remove_from_sock(struct list *messages, int status);
+int rdsv3_send_pong(struct rdsv3_connection *conn, uint16_be_t dport);
+struct rdsv3_message *rdsv3_send_get_message(struct rdsv3_connection *,
+ struct rdsv3_rdma_op *);
+
+/* rdma.c */
+void rdsv3_rdma_unuse(struct rdsv3_sock *rs, uint32_t r_key, int force);
+
+/* cong.c */
+void rdsv3_cong_init(void);
+int rdsv3_cong_get_maps(struct rdsv3_connection *conn);
+void rdsv3_cong_add_conn(struct rdsv3_connection *conn);
+void rdsv3_cong_remove_conn(struct rdsv3_connection *conn);
+void rdsv3_cong_set_bit(struct rdsv3_cong_map *map, uint16_be_t port);
+void rdsv3_cong_clear_bit(struct rdsv3_cong_map *map, uint16_be_t port);
+int rdsv3_cong_wait(struct rdsv3_cong_map *map, uint16_be_t port, int nonblock,
+ struct rdsv3_sock *rs);
+void rdsv3_cong_queue_updates(struct rdsv3_cong_map *map);
+void rdsv3_cong_map_updated(struct rdsv3_cong_map *map, uint64_t);
+int rdsv3_cong_updated_since(unsigned long *recent);
+void rdsv3_cong_add_socket(struct rdsv3_sock *);
+void rdsv3_cong_remove_socket(struct rdsv3_sock *);
+void rdsv3_cong_exit(void);
+struct rdsv3_message *rdsv3_cong_update_alloc(struct rdsv3_connection *conn);
+
+/* stats.c */
+RDSV3_DECLARE_PER_CPU(struct rdsv3_statistics, rdsv3_stats);
+#define rdsv3_stats_inc_which(which, member) do { \
+ rdsv3_per_cpu(which, get_cpu()).member++; \
+ put_cpu(); \
+} while (0)
+#define rdsv3_stats_inc(member) rdsv3_stats_inc_which(rdsv3_stats, member)
+#define rdsv3_stats_add_which(which, member, count) do { \
+ rdsv3_per_cpu(which, get_cpu()).member += count; \
+ put_cpu(); \
+} while (0)
+#define rdsv3_stats_add(member, count) \
+ rdsv3_stats_add_which(rdsv3_stats, member, count)
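+
+/*
+ * Usage sketch: hot paths bump per-CPU counters with, e.g.
+ *
+ *    rdsv3_stats_inc(s_recv_delivered);
+ *    rdsv3_stats_add(s_copy_to_user, nbytes);
+ *
+ * where nbytes is a placeholder for the byte count just copied.
+ */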
+int rdsv3_stats_init(void);
+void rdsv3_stats_exit(void);
+void rdsv3_stats_info_copy(struct rdsv3_info_iterator *iter,
+ uint64_t *values, char **names, size_t nr);
+
+
+/* sysctl.c */
+int rdsv3_sysctl_init(void);
+void rdsv3_sysctl_exit(void);
+extern unsigned long rdsv3_sysctl_sndbuf_min;
+extern unsigned long rdsv3_sysctl_sndbuf_default;
+extern unsigned long rdsv3_sysctl_sndbuf_max;
+extern unsigned long rdsv3_sysctl_reconnect_min_jiffies;
+extern unsigned long rdsv3_sysctl_reconnect_max_jiffies;
+extern unsigned int rdsv3_sysctl_max_unacked_packets;
+extern unsigned int rdsv3_sysctl_max_unacked_bytes;
+extern unsigned int rdsv3_sysctl_ping_enable;
+extern unsigned long rdsv3_sysctl_trace_flags;
+extern unsigned int rdsv3_sysctl_trace_level;
+
+/* threads.c */
+int rdsv3_threads_init();
+void rdsv3_threads_exit(void);
+extern struct rdsv3_workqueue_struct_s *rdsv3_wq;
+void rdsv3_connect_worker(struct rdsv3_work_s *);
+void rdsv3_shutdown_worker(struct rdsv3_work_s *);
+void rdsv3_send_worker(struct rdsv3_work_s *);
+void rdsv3_recv_worker(struct rdsv3_work_s *);
+void rdsv3_connect_complete(struct rdsv3_connection *conn);
+
+/* transport.c */
+int rdsv3_trans_register(struct rdsv3_transport *trans);
+void rdsv3_trans_unregister(struct rdsv3_transport *trans);
+struct rdsv3_transport *rdsv3_trans_get_preferred(uint32_be_t addr);
+unsigned int rdsv3_trans_stats_info_copy(struct rdsv3_info_iterator *iter,
+ unsigned int avail);
+void rdsv3_trans_exit(void);
+
+/* message.c */
+struct rdsv3_message *rdsv3_message_alloc(unsigned int nents, int gfp);
+struct rdsv3_message *rdsv3_message_copy_from_user(struct uio *uiop,
+ size_t total_len);
+struct rdsv3_message *rdsv3_message_map_pages(unsigned long *page_addrs,
+ unsigned int total_len);
+void rdsv3_message_populate_header(struct rdsv3_header *hdr, uint16_be_t sport,
+ uint16_be_t dport, uint64_t seq);
+int rdsv3_message_add_extension(struct rdsv3_header *hdr,
+ unsigned int type, const void *data, unsigned int len);
+int rdsv3_message_next_extension(struct rdsv3_header *hdr,
+ unsigned int *pos, void *buf, unsigned int *buflen);
+int rdsv3_message_add_version_extension(struct rdsv3_header *hdr,
+ unsigned int version);
+int rdsv3_message_get_version_extension(struct rdsv3_header *hdr,
+ unsigned int *version);
+int rdsv3_message_add_rdma_dest_extension(struct rdsv3_header *hdr,
+ uint32_t r_key, uint32_t offset);
+int rdsv3_message_inc_copy_to_user(struct rdsv3_incoming *inc,
+ uio_t *uio, size_t size);
+void rdsv3_message_inc_purge(struct rdsv3_incoming *inc);
+void rdsv3_message_inc_free(struct rdsv3_incoming *inc);
+void rdsv3_message_addref(struct rdsv3_message *rm);
+void rdsv3_message_put(struct rdsv3_message *rm);
+void rdsv3_message_wait(struct rdsv3_message *rm);
+void rdsv3_message_unmapped(struct rdsv3_message *rm);
+
+inline void
+rdsv3_message_make_checksum(struct rdsv3_header *hdr)
+{
+ hdr->h_csum = 0;
+ hdr->h_csum =
+ rdsv3_ip_fast_csum((void *)hdr, sizeof (*hdr) >> 2);
+}
+
+inline int
+rdsv3_message_verify_checksum(const struct rdsv3_header *hdr)
+{
+ return (!hdr->h_csum ||
+ rdsv3_ip_fast_csum((void *)hdr, sizeof (*hdr) >> 2) == 0);
+}
+
+/* rdsv3_sc.c */
+extern boolean_t rdsv3_if_lookup_by_name(char *if_name);
+extern int rdsv3_sc_path_lookup(ipaddr_t *localip, ipaddr_t *remip);
+extern ipaddr_t rdsv3_scaddr_to_ibaddr(ipaddr_t addr);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RDSV3_RDSV3_H */
diff --git a/usr/src/uts/common/sys/ib/clients/rdsv3/rdsv3_debug.h b/usr/src/uts/common/sys/ib/clients/rdsv3/rdsv3_debug.h
new file mode 100644
index 0000000000..f970d70209
--- /dev/null
+++ b/usr/src/uts/common/sys/ib/clients/rdsv3/rdsv3_debug.h
@@ -0,0 +1,139 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _RDSV3_DEBUG_H
+#define _RDSV3_DEBUG_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define LABEL "RDSV3"
+
+/*
+ * warnings, console & syslog buffer.
+ * For non-recoverable or major errors
+ */
+#define RDSV3_LOG_L0 0
+
+/*
+ * syslog buffer or RDS trace buffer (console if booted with debug)
+ * For additional information on non-recoverable errors and
+ * warnings/informational messages for sys-admin types.
+ */
+#define RDSV3_LOG_L1 1
+
+/*
+ * debug only
+ * for more verbose trace than L1, e.g. recoverable errors,
+ * or interesting trace
+ */
+#define RDSV3_LOG_L2 2
+
+/*
+ * debug only
+ * for more verbose trace than L2, e.g. informational messages
+ */
+#define RDSV3_LOG_L3 3
+
+/*
+ * debug only
+ * for more verbose trace than L3, e.g. printing function entries...
+ */
+#define RDSV3_LOG_L4 4
+
+/*
+ * debug only
+ * most verbose level. Used only for excessive trace, e.g.
+ * printing structures etc.
+ */
+#define RDSV3_LOG_L5 5
+
+/*
+ * debug only
+ * for messages from softints, taskqs, intr handlers, timeout handlers etc.
+ */
+#define RDSV3_LOG_LINTR 6
+
+
+#ifdef DEBUG
+#define RDSV3_DPRINTF_INTR rdsv3_dprintf_intr
+#define RDSV3_DPRINTF5 rdsv3_dprintf5
+#define RDSV3_DPRINTF4 rdsv3_dprintf4
+#define RDSV3_DPRINTF3 rdsv3_dprintf3
+#define RDSV3_DPRINTF2 rdsv3_dprintf2
+#define RDSV3_DPRINTF1 rdsv3_dprintf1
+#define RDSV3_DPRINTF0 rdsv3_dprintf0
+
+void rdsv3_dprintf_intr(
+ char *name,
+ char *fmt, ...);
+void rdsv3_dprintf5(
+ char *name,
+ char *fmt, ...);
+void rdsv3_dprintf4(
+ char *name,
+ char *fmt, ...);
+void rdsv3_dprintf3(
+ char *name,
+ char *fmt, ...);
+void rdsv3_dprintf2(
+ char *name,
+ char *fmt, ...);
+void rdsv3_dprintf1(
+ char *name,
+ char *fmt, ...);
+void rdsv3_dprintf0(
+ char *name,
+ char *fmt, ...);
+#else
+#define RDSV3_DPRINTF_INTR 0 &&
+#define RDSV3_DPRINTF5 0 &&
+#define RDSV3_DPRINTF4 0 &&
+#define RDSV3_DPRINTF3 0 &&
+#define RDSV3_DPRINTF2 0 &&
+#define RDSV3_DPRINTF1 0 &&
+#define RDSV3_DPRINTF0 0 &&
+#endif
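+
+/*
+ * Usage sketch (illustrative): callers pass a tag, usually the
+ * function name, followed by a printf-style format:
+ *
+ *    RDSV3_DPRINTF2("rdsv3_conn_create", "allocated conn %p", conn);
+ *
+ * On non-DEBUG builds the "0 &&" expansion short-circuits and the
+ * arguments are never evaluated.
+ */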
+
+void rdsv3_trace(
+ char *name,
+ uint8_t lvl,
+ char *fmt, ...);
+
+void rdsv3_vprintk(
+ char *name,
+ uint8_t lvl,
+ const char *fmt,
+ va_list ap);
+
+/* defined in rds_debug.c */
+void rdsv3_logging_initialization();
+void rdsv3_logging_destroy();
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RDSV3_DEBUG_H */
diff --git a/usr/src/uts/common/sys/ib/clients/rdsv3/rdsv3_impl.h b/usr/src/uts/common/sys/ib/clients/rdsv3/rdsv3_impl.h
new file mode 100644
index 0000000000..d7a734138f
--- /dev/null
+++ b/usr/src/uts/common/sys/ib/clients/rdsv3/rdsv3_impl.h
@@ -0,0 +1,402 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _RDSV3_IMPL_H
+#define _RDSV3_IMPL_H
+
+#include <sys/atomic.h>
+
+/*
+ * This file is only present in Solaris
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern dev_info_t *rdsv3_dev_info;
+
+#define uint16_be_t uint16_t
+#define uint32_be_t uint32_t
+#define uint64_be_t uint64_t
+
+/*
+ * RDS Well known service id
+ * Format: 0x1h00144Fhhhhhhhh
+ * "00144F" is the Sun OUI
+ * 'h' can be any hexadecimal digit.
+ */
+#define RDS_SERVICE_ID 0x1000144F00000001ULL
+
+/*
+ * Atomic operations
+ */
+typedef unsigned int atomic_t;
+#define ATOMIC_INIT(a) a
+
+#define atomic_get(p) (*(p))
+
+#define atomic_cmpset_long(p, c, n) \
+ ((c == atomic_cas_uint(p, c, n)) ? c : -1)
+
+#define atomic_dec_and_test(a) \
+ (atomic_dec_uint_nv((a)) == 0)
+
+#define atomic_cmpxchg(a, o, n) \
+ atomic_cas_uint(a, o, n)
+
+#ifdef _LP64
+#define set_bit(b, p) \
+ atomic_or_ulong(((volatile ulong_t *)(void *)(p)) + ((b) >> 6), \
+ 1ul << ((b) & 0x3f))
+
+#define clear_bit(b, p) \
+ atomic_and_ulong(((volatile ulong_t *)(void *)(p)) + ((b) >> 6), \
+ ~(1ul << ((b) & 0x3f)))
+
+#define test_bit(b, p) \
+ (((volatile ulong_t *)(void *)(p))[(b) >> 6] & (1ul << ((b) & 0x3f)))
+
+#define test_and_set_bit(b, p) \
+ atomic_set_long_excl(((ulong_t *)(void *)(p)) + \
+ ((b) >> 6), ((b) & 0x3f))
+#define test_and_clear_bit(b, p) \
+ !atomic_clear_long_excl(((ulong_t *)(void *)(p)) + ((b) >> 6), \
+ ((b) & 0x3f))
+#else
+#define set_bit(b, p) \
+ atomic_or_uint(((volatile uint_t *)(void *)p) + (b >> 5), \
+ 1ul << (b & 0x1f))
+
+#define clear_bit(b, p) \
+ atomic_and_uint(((volatile uint_t *)(void *)p) + (b >> 5), \
+ ~(1ul << (b & 0x1f)))
+
+#define test_bit(b, p) \
+ (((volatile uint_t *)(void *)p)[b >> 5] & (1ul << (b & 0x1f)))
+
+#define test_and_set_bit(b, p) \
+ atomic_set_long_excl(((ulong_t *)(void *)p) + (b >> 5), (b & 0x1f))
+#define test_and_clear_bit(b, p) \
+ !atomic_clear_long_excl(((ulong_t *)(void *)p) + (b >> 5), (b & 0x1f))
+#endif
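+
+/*
+ * Usage sketch: flag words such as conn->c_flags are manipulated with
+ * these wrappers; RDSV3_RECONNECT_PENDING is a c_flags bit defined in
+ * rdsv3.h:
+ *
+ *    if (test_and_set_bit(RDSV3_RECONNECT_PENDING, &conn->c_flags))
+ *        return;    (a reconnect is already queued)
+ */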
+
+
+uint_t rdsv3_one_sec_in_hz;
+
+#define jiffies 100
+#define HZ (drv_hztousec(1))
+#define container_of(m, s, name) \
+ (void *)((uintptr_t)(m) - (uintptr_t)offsetof(s, name))
+#define ARRAY_SIZE(x) (sizeof (x) / sizeof (x[0]))
+/* setting this to PAGESIZE throws build errors */
+#define PAGE_SIZE 4096 /* xxx - fix this */
+#define BITS_PER_LONG (sizeof (unsigned long) * 8)
+
+/* debug */
+#define RDSV3_PANIC() cmn_err(CE_PANIC, "Panic forced by RDSV3");
+
+/* ERR */
+#define MAX_ERRNO 4095
+#define ERR_PTR(x) ((void *)(uintptr_t)x)
+#define IS_ERR(ptr) (((uintptr_t)ptr) >= (uintptr_t)-MAX_ERRNO)
+#define PTR_ERR(ptr) (int)(uintptr_t)ptr
+
+/* cpu */
+#define NR_CPUS 1
+#define put_cpu()
+#define get_cpu() 0
+
+#define MAX_SCHEDULE_TIMEOUT (~0UL>>1)
+
+#define RDMA_CM_EVENT_ADDR_CHANGE 14
+
+/* list */
+/* copied and modified list_remove_node */
+#define list_remove_node(node) \
+ if ((node)->list_next != NULL) { \
+ (node)->list_prev->list_next = (node)->list_next; \
+ (node)->list_next->list_prev = (node)->list_prev; \
+ (node)->list_next = (node)->list_prev = NULL; \
+ }
+
+#define list_splice(src, dst) { \
+ list_create(dst, (src)->list_size, (src)->list_offset); \
+ list_move_tail(dst, src); \
+ }
+
+#define RDSV3_FOR_EACH_LIST_NODE(objp, listp, member) \
+ for (objp = list_head(listp); objp; objp = list_next(listp, objp))
+#define RDSV3_FOR_EACH_LIST_NODE_SAFE(objp, tmp, listp, member) \
+ for (objp = list_head(listp), tmp = (objp != NULL) ? \
+ list_next(listp, objp) : NULL; \
+ objp; \
+ objp = tmp, tmp = (objp != NULL) ? \
+ list_next(listp, objp) : NULL)
+
+/* simulate wait_queue_head_t */
+typedef struct rdsv3_wait_queue_s {
+ kmutex_t waitq_mutex;
+ kcondvar_t waitq_cv;
+} rdsv3_wait_queue_t;
+
+#define rdsv3_init_waitqueue(waitqp) \
+ mutex_init(&(waitqp)->waitq_mutex, NULL, MUTEX_DRIVER, NULL); \
+ cv_init(&(waitqp)->waitq_cv, NULL, CV_DRIVER, NULL)
+
+#define rdsv3_exit_waitqueue(waitqp) \
+ mutex_destroy(&(waitqp)->waitq_mutex); \
+ cv_destroy(&(waitqp)->waitq_cv)
+
+#define rdsv3_wake_up(waitqp) { \
+ mutex_enter(&(waitqp)->waitq_mutex); \
+ cv_signal(&(waitqp)->waitq_cv); \
+ mutex_exit(&(waitqp)->waitq_mutex); \
+ }
+
+#define rdsv3_wake_up_all(waitqp) { \
+ mutex_enter(&(waitqp)->waitq_mutex); \
+ cv_broadcast(&(waitqp)->waitq_cv); \
+ mutex_exit(&(waitqp)->waitq_mutex); \
+ }
+
+#define rdsv3_wait_event(waitq, condition) \
+{ \
+ mutex_enter(&(waitq).waitq_mutex); \
+ while (!(condition)) { \
+ cv_wait(&(waitq).waitq_cv, &(waitq).waitq_mutex); \
+ } \
+ mutex_exit(&(waitq).waitq_mutex); \
+} \
+
+#ifndef __lock_lint
+#define rdsv3_wait_event_interruptible_timeout(waitq, condition, timeo) \
+( \
+{ \
+ long cv_return; \
+ mutex_enter(&((waitq).waitq_mutex)); \
+ cv_return = condition; \
+ while (!(cv_return)) { \
+ cv_return = cv_timedwait_sig(&((waitq).waitq_cv), \
+ &((waitq).waitq_mutex), \
+ timeo * drv_usectohz(1000000) + ddi_get_lbolt()); \
+ if (cv_return == 0) { \
+ break; \
+ } \
+ cv_return = condition; \
+ } \
+ mutex_exit(&((waitq).waitq_mutex)); \
+ cv_return; \
+} \
+)
+#else
+#define rdsv3_wait_event_interruptible(waitq, condition) 0
+#define rdsv3_wait_event_interruptible_timeout(waitq, condition, timeo) 0
+#endif
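+
+/*
+ * Usage sketch (illustrative; waitq is a placeholder
+ * rdsv3_wait_queue_t):
+ *
+ *    rdsv3_wait_event(waitq, rdsv3_conn_up(conn));
+ *
+ * The condition is re-tested under waitq_mutex after every cv_wait()
+ * wakeup, so spurious wakeups are harmless.
+ */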
+
+#define SOCK_DEAD 1ul
+
+/* socket */
+typedef struct rsock {
+ sock_upper_handle_t sk_upper_handle;
+ sock_upcalls_t *sk_upcalls;
+
+ kmutex_t sk_lock;
+ ulong_t sk_flag;
+ rdsv3_wait_queue_t *sk_sleep;
+ int sk_sndbuf;
+ int sk_rcvbuf;
+ atomic_t sk_refcount;
+
+ struct rdsv3_sock *sk_protinfo;
+} rsock_t;
+
+typedef struct rdsv3_conn_info_s {
+ uint32_be_t c_laddr;
+ uint32_be_t c_faddr;
+} rdsv3_conn_info_t;
+
+/* WQ */
+typedef struct rdsv3_workqueue_struct_s {
+ kmutex_t wq_lock;
+ uint_t wq_state;
+ int wq_pending;
+ list_t wq_queue;
+} rdsv3_workqueue_struct_t;
+
+struct rdsv3_work_s;
+typedef void (*rdsv3_work_func_t)(struct rdsv3_work_s *);
+typedef struct rdsv3_work_s {
+ list_node_t work_item;
+ rdsv3_work_func_t func;
+} rdsv3_work_t;
+
+/* simulate delayed_work */
+typedef struct rdsv3_delayed_work_s {
+ kmutex_t lock;
+ rdsv3_work_t work;
+ timeout_id_t timeid;
+ rdsv3_workqueue_struct_t *wq;
+} rdsv3_delayed_work_t;
+
+#define RDSV3_INIT_WORK(wp, f) (wp)->func = f
+#define RDSV3_INIT_DELAYED_WORK(dwp, f) \
+ (dwp)->work.func = f; \
+ mutex_init(&(dwp)->lock, NULL, MUTEX_DRIVER, NULL); \
+ (dwp)->timeid = 0
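+
+/*
+ * Usage sketch: a connection wires up its workers once and then
+ * (re)queues them with a delay, e.g.
+ *
+ *    RDSV3_INIT_DELAYED_WORK(&conn->c_send_w, rdsv3_send_worker);
+ *    rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_send_w, 0);
+ *
+ * c_send_w, rdsv3_send_worker and rdsv3_wq are declared in rdsv3.h.
+ */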
+
+/* simulate scatterlist */
+struct rdsv3_scatterlist {
+ caddr_t vaddr;
+ uint_t length;
+ ibt_wr_ds_t *sgl;
+ ibt_mi_hdl_t mihdl;
+};
+#define rdsv3_sg_page(scat) (scat)->vaddr
+#define rdsv3_sg_len(scat) (scat)->length
+#define rdsv3_sg_set_page(scat, pg, len, off) \
+ (scat)->vaddr = (caddr_t)(pg + off); \
+ (scat)->length = len
+#define rdsv3_ib_sg_dma_len(dev, scat) rdsv3_sg_len(scat)
+
+/* copied from sys/socket.h */
+#if defined(__sparc)
+/* To maintain backward compatibility, alignment needs to be 8 on sparc. */
+#define _CMSG_HDR_ALIGNMENT 8
+#else
+/* for __i386 (and other future architectures) */
+#define _CMSG_HDR_ALIGNMENT 4
+#endif /* defined(__sparc) */
+
+/*
+ * The cmsg headers (and macros dealing with them) were made available as
+ * part of UNIX95 and hence need to be protected with a _XPG4_2 define.
+ */
+#define _CMSG_DATA_ALIGNMENT (sizeof (int))
+#define _CMSG_HDR_ALIGN(x) (((uintptr_t)(x) + _CMSG_HDR_ALIGNMENT - 1) & \
+ ~(_CMSG_HDR_ALIGNMENT - 1))
+#define _CMSG_DATA_ALIGN(x) (((uintptr_t)(x) + _CMSG_DATA_ALIGNMENT - 1) & \
+ ~(_CMSG_DATA_ALIGNMENT - 1))
+#define CMSG_DATA(c) \
+ ((unsigned char *)_CMSG_DATA_ALIGN((struct cmsghdr *)(c) + 1))
+
+#define CMSG_FIRSTHDR(m) \
+ (((m)->msg_controllen < sizeof (struct cmsghdr)) ? \
+ (struct cmsghdr *)0 : (struct cmsghdr *)((m)->msg_control))
+
+#define CMSG_NXTHDR(m, c) \
+ (((c) == 0) ? CMSG_FIRSTHDR(m) : \
+ ((((uintptr_t)_CMSG_HDR_ALIGN((char *)(c) + \
+ ((struct cmsghdr *)(c))->cmsg_len) + sizeof (struct cmsghdr)) > \
+ (((uintptr_t)((struct msghdr *)(m))->msg_control) + \
+ ((uintptr_t)((struct msghdr *)(m))->msg_controllen))) ? \
+ ((struct cmsghdr *)0) : \
+ ((struct cmsghdr *)_CMSG_HDR_ALIGN((char *)(c) + \
+ ((struct cmsghdr *)(c))->cmsg_len))))
+
+/* Amount of space + padding needed for a message of length l */
+#define CMSG_SPACE(l) \
+ ((unsigned int)_CMSG_HDR_ALIGN(sizeof (struct cmsghdr) + (l)))
+
+/* Value to be used in cmsg_len, does not include trailing padding */
+#define CMSG_LEN(l) \
+ ((unsigned int)_CMSG_DATA_ALIGN(sizeof (struct cmsghdr)) + (l))
+
+/* OFUV -> IB */
+#define RDSV3_IBDEV2HCAHDL(device) (device)->hca_hdl
+#define RDSV3_QP2CHANHDL(qp) (qp)->ibt_qp
+#define RDSV3_PD2PDHDL(pd) (pd)->ibt_pd
+#define RDSV3_CQ2CQHDL(cq) (cq)->ibt_cq
+
+struct rdsv3_hdrs_mr {
+ ibt_lkey_t lkey;
+ caddr_t addr;
+ size_t size;
+ ibt_mr_hdl_t hdl;
+};
+
+/* rdsv3_impl.c */
+void rdsv3_trans_init();
+boolean_t rdsv3_capable_interface(struct lifreq *lifrp);
+int rdsv3_do_ip_ioctl(ksocket_t so4, void **ipaddrs, int *size, int *nifs);
+int rdsv3_do_ip_ioctl_old(ksocket_t so4, void **ipaddrs, int *size, int *nifs);
+boolean_t rdsv3_isloopback(ipaddr_t addr);
+void rdsv3_cancel_delayed_work(rdsv3_delayed_work_t *dwp);
+void rdsv3_flush_workqueue(rdsv3_workqueue_struct_t *wq);
+void rdsv3_queue_work(rdsv3_workqueue_struct_t *wq, rdsv3_work_t *wp);
+void rdsv3_queue_delayed_work(rdsv3_workqueue_struct_t *wq,
+ rdsv3_delayed_work_t *dwp, uint_t delay);
+struct rsock *rdsv3_sk_alloc();
+void rdsv3_sock_init_data(struct rsock *sk);
+void rdsv3_sock_exit_data(struct rsock *sk);
+void rdsv3_poll_wait(struct rsock *sk, rdsv3_wait_queue_t *waitq, short events);
+void rdsv3_destroy_task_workqueue(rdsv3_workqueue_struct_t *wq);
+rdsv3_workqueue_struct_t *rdsv3_create_task_workqueue(char *name);
+int rdsv3_conn_constructor(void *buf, void *arg, int kmflags);
+void rdsv3_conn_destructor(void *buf, void *arg);
+int rdsv3_conn_compare(const void *conn1, const void *conn2);
+void rdsv3_loop_init();
+int rdsv3_mr_compare(const void *mr1, const void *mr2);
+int rdsv3_put_cmsg(struct nmsghdr *msg, int level, int type, size_t size,
+ void *payload);
+int rdsv3_verify_bind_address(ipaddr_t addr);
+int rdsv3_bind_node_compare(const void *a, const void *b);
+void rdsv3_bind_tree_init();
+void rdsv3_bind_tree_exit();
+uint16_t rdsv3_ip_fast_csum(void *buffer, size_t length);
+uint_t rdsv3_ib_dma_map_sg(struct ib_device *dev, struct rdsv3_scatterlist
+ *scat, uint_t num);
+void rdsv3_ib_dma_unmap_sg(ib_device_t *dev, struct rdsv3_scatterlist *scat,
+ uint_t num);
+inline void
+rdsv3_sk_sock_hold(struct rsock *sk)
+{
+ atomic_add_32(&sk->sk_refcount, 1);
+}
+inline void
+rdsv3_sk_sock_put(struct rsock *sk)
+{
+ if (atomic_dec_and_test(&sk->sk_refcount))
+ rdsv3_sock_exit_data(sk);
+}
+inline int
+rdsv3_sk_sock_flag(struct rsock *sk, uint_t flag)
+{
+ return (test_bit(flag, &sk->sk_flag));
+}
+inline void
+rdsv3_sk_sock_orphan(struct rsock *sk)
+{
+ set_bit(SOCK_DEAD, &sk->sk_flag);
+}
+
+#define rdsv3_rcvtimeo(a, b) 3600 /* check this value on linux */
+
+void rdsv3_ib_free_conn(void *arg);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RDSV3_IMPL_H */
diff --git a/usr/src/uts/common/sys/ib/clients/rdsv3/rdsv3_sc.h b/usr/src/uts/common/sys/ib/clients/rdsv3/rdsv3_sc.h
new file mode 100644
index 0000000000..dc7b7924d7
--- /dev/null
+++ b/usr/src/uts/common/sys/ib/clients/rdsv3/rdsv3_sc.h
@@ -0,0 +1,55 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _RDSV3_SC_H
+#define _RDSV3_SC_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <netinet/in.h>
+#include <net/if.h>
+
+typedef struct rds_path_endpoint_s {
+ uint32_t iftype;
+ ipaddr_t ipaddr;
+ ipaddr_t node_ipaddr;
+ char *ifname;
+} rds_path_endpoint_t;
+
+typedef struct rds_path_s {
+ rds_path_endpoint_t local;
+ rds_path_endpoint_t remote;
+} rds_path_t;
+
+extern void rds_clif_name(char *name);
+extern void rds_path_up(struct rds_path_s *path);
+extern void rds_path_down(struct rds_path_s *path);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RDSV3_SC_H */
diff --git a/usr/src/uts/common/sys/rds.h b/usr/src/uts/common/sys/rds.h
new file mode 100644
index 0000000000..99743e5bbf
--- /dev/null
+++ b/usr/src/uts/common/sys/rds.h
@@ -0,0 +1,375 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+/*
+ * Copyright (c) 2008 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+/*
+ * Include this file if the application uses rdsv3 sockets.
+ */
+
+/*
+ * This file contains definitions from the ofed rds.h and rds_rdma.h
+ * header files.
+ */
+#ifndef _RDSV3_RDS_H
+#define _RDSV3_RDS_H
+
+#include <sys/types.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define RDS_IB_ABI_VERSION 0x301
+
+#define AF_RDS AF_INET_OFFLOAD
+#define PF_RDS AF_INET_OFFLOAD
+
+#define SOL_RDS 272
+
+/*
+ * setsockopt/getsockopt for SOL_RDS
+ */
+#define RDSV3_CANCEL_SENT_TO 1
+#define RDSV3_GET_MR 2
+#define RDSV3_FREE_MR 3
+/* deprecated: RDS_BARRIER 4 */
+#define RDSV3_RECVERR 5
+#define RDSV3_CONG_MONITOR 6
+
+/*
+ * Control message types for SOL_RDS.
+ *
+ * RDSV3_CMSG_RDMA_ARGS (sendmsg)
+ * Request an RDMA transfer to/from the specified
+ * memory ranges.
+ * The cmsg_data is a struct rdsv3_rdma_args.
+ * RDSV3_CMSG_RDMA_DEST (recvmsg, sendmsg)
+ * Kernel informs application about intended
+ * source/destination of an RDMA transfer.
+ * RDSV3_CMSG_RDMA_MAP (sendmsg)
+ * Application asks kernel to map the given
+ * memory range into an IB MR, and send the
+ * R_Key along in an RDS extension header.
+ * The cmsg_data is a struct rdsv3_get_mr_args,
+ * the same as for the RDSV3_GET_MR setsockopt.
+ * RDSV3_CMSG_RDMA_STATUS (recvmsg)
+ * Returns the status of a completed RDMA operation.
+ */
+#define RDSV3_CMSG_RDMA_ARGS 1
+#define RDSV3_CMSG_RDMA_DEST 2
+#define RDSV3_CMSG_RDMA_MAP 3
+#define RDSV3_CMSG_RDMA_STATUS 4
+#define RDSV3_CMSG_CONG_UPDATE 5
+
+/*
+ * RDMA related types
+ */
+
+/*
+ * This encapsulates a remote memory location.
+ * In the current implementation, it contains the R_Key
+ * of the remote memory region, and the offset into it
+ * (so that the application does not have to worry about
+ * alignment).
+ */
+typedef uint64_t rdsv3_rdma_cookie_t;
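+
+/*
+ * Layout sketch (an assumption mirroring the OFED implementation; see
+ * rdma.c for the authoritative encoding): the R_Key occupies the low
+ * 32 bits and the offset the high 32 bits, roughly
+ *
+ *    cookie = (rdsv3_rdma_cookie_t)offset << 32 | r_key;
+ */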
+
+struct rdsv3_iovec {
+ uint64_t addr;
+ uint64_t bytes;
+};
+
+struct rdsv3_get_mr_args {
+ struct rdsv3_iovec vec;
+ uint64_t cookie_addr;
+ uint64_t flags;
+};
+
+struct rdsv3_free_mr_args {
+ rdsv3_rdma_cookie_t cookie;
+ uint64_t flags;
+};
+
+struct rdsv3_rdma_args {
+ rdsv3_rdma_cookie_t cookie;
+ struct rdsv3_iovec remote_vec;
+ uint64_t local_vec_addr;
+ uint64_t nr_local;
+ uint64_t flags;
+ uint64_t user_token;
+};
+
+struct rdsv3_rdma_notify {
+ uint64_t user_token;
+ int32_t status;
+};
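+
+/*
+ * Receive-side sketch (illustrative, user level): completion
+ * notifications arrive as RDSV3_CMSG_RDMA_STATUS control data on a
+ * struct msghdr previously filled in by recvmsg():
+ *
+ *    struct cmsghdr *cm;
+ *    struct rdsv3_rdma_notify rn;
+ *
+ *    for (cm = CMSG_FIRSTHDR(&msg); cm != NULL;
+ *        cm = CMSG_NXTHDR(&msg, cm)) {
+ *            if (cm->cmsg_level == SOL_RDS &&
+ *                cm->cmsg_type == RDSV3_CMSG_RDMA_STATUS) {
+ *                    (void) memcpy(&rn, CMSG_DATA(cm), sizeof (rn));
+ *                    ... rn.user_token names the completed op ...
+ *            }
+ *    }
+ */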
+
+#define RDSV3_RDMA_SUCCESS 0
+#define RDSV3_RDMA_REMOTE_ERROR 1
+#define RDSV3_RDMA_CANCELED 2
+#define RDSV3_RDMA_DROPPED 3
+#define RDSV3_RDMA_OTHER_ERROR 4
+
+/*
+ * Common set of flags for all RDMA related structs
+ */
+#define RDSV3_RDMA_READWRITE 0x0001
+#define RDSV3_RDMA_FENCE 0x0002 /* use FENCE for immediate send */
+#define RDSV3_RDMA_INVALIDATE 0x0004 /* invalidate R_Key after freeing MR */
+#define RDSV3_RDMA_USE_ONCE 0x0008 /* free MR after use */
+#define RDSV3_RDMA_DONTWAIT 0x0010 /* Don't wait in SET_BARRIER */
+#define RDSV3_RDMA_NOTIFY_ME 0x0020 /* Notify when operation completes */
+
+/*
+ * Congestion monitoring.
+ * Congestion control in RDS happens at the host connection
+ * level by exchanging a bitmap marking congested ports.
+ * By default, a process sleeping in poll() is always woken
+ * up when the congestion map is updated.
+ * With explicit monitoring, an application can have more
+ * fine-grained control.
+ * The application installs a 64-bit mask value in the socket,
+ * where each bit corresponds to a group of ports.
+ * When a congestion update arrives, RDS checks the set of
+ * ports that are now uncongested against the bit mask
+ * installed in the socket, and if they overlap, we queue a
+ * cong_notification on the socket.
+ *
+ * To install the congestion monitor bitmask, use RDSV3_CONG_MONITOR
+ * with the 64-bit mask.
+ * Congestion updates are received via RDSV3_CMSG_CONG_UPDATE
+ * control messages.
+ *
+ * The correspondence between bits and ports is
+ * 1 << (portnum % 64)
+ */
+#define RDSV3_CONG_MONITOR_SIZE 64
+#define RDSV3_CONG_MONITOR_BIT(port) \
+ (((unsigned int) port) % RDSV3_CONG_MONITOR_SIZE)
+#define RDSV3_CONG_MONITOR_MASK(port) (1ULL << RDSV3_CONG_MONITOR_BIT(port))
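+
+/*
+ * Usage sketch (illustrative, user level; fd and port are
+ * placeholders for the RDS socket and the port of interest):
+ *
+ *    uint64_t mask = RDSV3_CONG_MONITOR_MASK(ntohs(port));
+ *
+ *    (void) setsockopt(fd, SOL_RDS, RDSV3_CONG_MONITOR,
+ *        &mask, sizeof (mask));
+ */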
+
+/* rds-info related */
+
+#define RDSV3_INFO_FIRST 10000
+#define RDSV3_INFO_COUNTERS 10000
+#define RDSV3_INFO_CONNECTIONS 10001
+/* 10002 aka RDS_INFO_FLOWS is deprecated */
+#define RDSV3_INFO_SEND_MESSAGES 10003
+#define RDSV3_INFO_RETRANS_MESSAGES 10004
+#define RDSV3_INFO_RECV_MESSAGES 10005
+#define RDSV3_INFO_SOCKETS 10006
+#define RDSV3_INFO_TCP_SOCKETS 10007
+#define RDSV3_INFO_IB_CONNECTIONS 10008
+#define RDSV3_INFO_CONNECTION_STATS 10009
+#define RDSV3_INFO_IWARP_CONNECTIONS 10010
+#define RDSV3_INFO_LAST 10010
+
+#ifndef __lock_lint
+#pragma pack(1)
+struct rdsv3_info_counter {
+ uint8_t name[32];
+ uint64_t value;
+} __attribute__((packed));
+#pragma pack()
+#else
+struct rdsv3_info_counter {
+ uint8_t name[32];
+ uint64_t value;
+};
+#endif
+
+#define RDSV3_INFO_CONNECTION_FLAG_SENDING 0x01
+#define RDSV3_INFO_CONNECTION_FLAG_CONNECTING 0x02
+#define RDSV3_INFO_CONNECTION_FLAG_CONNECTED 0x04
+
+#ifndef __lock_lint
+#pragma pack(1)
+struct rdsv3_info_connection {
+ uint64_t next_tx_seq;
+ uint64_t next_rx_seq;
+ uint32_t laddr; /* network order */
+ uint32_t faddr; /* network order */
+ uint8_t transport[15]; /* null term ascii */
+ uint8_t flags;
+} __attribute__((packed));
+#pragma pack()
+#else
+struct rdsv3_info_connection {
+ uint64_t next_tx_seq;
+ uint64_t next_rx_seq;
+ uint32_t laddr; /* network order */
+ uint32_t faddr; /* network order */
+ uint8_t transport[15]; /* null term ascii */
+ uint8_t flags;
+};
+#endif
+
+#ifndef __lock_lint
+#pragma pack(1)
+struct rdsv3_info_flow {
+ uint32_t laddr; /* network order */
+ uint32_t faddr; /* network order */
+ uint32_t bytes;
+ uint16_t lport; /* network order */
+ uint16_t fport; /* network order */
+} __attribute__((packed));
+#pragma pack()
+#else
+struct rdsv3_info_flow {
+ uint32_t laddr; /* network order */
+ uint32_t faddr; /* network order */
+ uint32_t bytes;
+ uint16_t lport; /* network order */
+ uint16_t fport; /* network order */
+};
+#endif
+
+#define RDSV3_INFO_MESSAGE_FLAG_ACK 0x01
+#define RDSV3_INFO_MESSAGE_FLAG_FAST_ACK 0x02
+
+#ifndef __lock_lint
+#pragma pack(1)
+struct rdsv3_info_message {
+ uint64_t seq;
+ uint32_t len;
+ uint32_t laddr; /* network order */
+ uint32_t faddr; /* network order */
+ uint16_t lport; /* network order */
+ uint16_t fport; /* network order */
+ uint8_t flags;
+} __attribute__((packed));
+#pragma pack()
+#else
+struct rdsv3_info_message {
+ uint64_t seq;
+ uint32_t len;
+ uint32_t laddr; /* network order */
+ uint32_t faddr; /* network order */
+ uint16_t lport; /* network order */
+ uint16_t fport; /* network order */
+ uint8_t flags;
+};
+#endif
+
+#ifndef __lock_lint
+#pragma pack(1)
+struct rdsv3_info_socket {
+ uint32_t sndbuf;
+ uint32_t bound_addr; /* network order */
+ uint32_t connected_addr; /* network order */
+ uint16_t bound_port; /* network order */
+ uint16_t connected_port; /* network order */
+ uint32_t rcvbuf;
+ uint64_t inum;
+} __attribute__((packed));
+#pragma pack()
+#else
+struct rdsv3_info_socket {
+ uint32_t sndbuf;
+ uint32_t bound_addr; /* network order */
+ uint32_t connected_addr; /* network order */
+ uint16_t bound_port; /* network order */
+ uint16_t connected_port; /* network order */
+ uint32_t rcvbuf;
+ uint64_t inum;
+};
+#endif
+
+#ifndef __lock_lint
+#pragma pack(1)
+struct rdsv3_info_socket_v1 {
+ uint32_t sndbuf;
+ uint32_t bound_addr; /* network order */
+ uint32_t connected_addr; /* network order */
+ uint16_t bound_port; /* network order */
+ uint16_t connected_port; /* network order */
+ uint32_t rcvbuf;
+} __attribute__((packed));
+#pragma pack()
+#else
+struct rdsv3_info_socket_v1 {
+ uint32_t sndbuf;
+ uint32_t bound_addr; /* network order */
+ uint32_t connected_addr; /* network order */
+ uint16_t bound_port; /* network order */
+ uint16_t connected_port; /* network order */
+ uint32_t rcvbuf;
+};
+#endif
+
+#define RDS_IB_GID_LEN 16
+struct rdsv3_info_rdma_connection {
+ uint32_t src_addr; /* network order */
+ uint32_t dst_addr; /* network order */
+ uint8_t src_gid[RDS_IB_GID_LEN];
+ uint8_t dst_gid[RDS_IB_GID_LEN];
+
+ uint32_t max_send_wr;
+ uint32_t max_recv_wr;
+ uint32_t max_send_sge;
+ uint32_t rdma_mr_max;
+ uint32_t rdma_mr_size;
+};
+
+#define rdsv3_info_ib_connection rdsv3_info_rdma_connection
+#define rdma_fmr_max rdma_mr_max
+#define rdma_fmr_size rdma_mr_size
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RDSV3_RDS_H */
diff --git a/usr/src/uts/intel/Makefile.intel.shared b/usr/src/uts/intel/Makefile.intel.shared
index 3d4ac210ce..3c7e4def72 100644
--- a/usr/src/uts/intel/Makefile.intel.shared
+++ b/usr/src/uts/intel/Makefile.intel.shared
@@ -309,6 +309,7 @@ DRV_KMODS += ral
DRV_KMODS += ramdisk
DRV_KMODS += random
DRV_KMODS += rds
+DRV_KMODS += rdsv3
DRV_KMODS += rpcib
DRV_KMODS += rsm
DRV_KMODS += rts
@@ -732,6 +733,7 @@ MAC_KMODS += mac_ib
SOCKET_KMODS += sockpfp
SOCKET_KMODS += socksctp
SOCKET_KMODS += socksdp
+SOCKET_KMODS += sockrds
#
# kiconv modules (/kernel/kiconv):
diff --git a/usr/src/uts/intel/rdsv3/Makefile b/usr/src/uts/intel/rdsv3/Makefile
new file mode 100644
index 0000000000..238ba56640
--- /dev/null
+++ b/usr/src/uts/intel/rdsv3/Makefile
@@ -0,0 +1,94 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+#
+
+#
+# Path to the base of the uts directory tree (usually /usr/src/uts).
+#
+UTSBASE = ../..
+
+#
+# Define the module and object file sets.
+#
+MODULE = rdsv3
+OBJECTS = $(RDSV3_OBJS:%=$(OBJS_DIR)/%)
+LINTS = $(RDSV3_OBJS:%.o=$(LINTS_DIR)/%.ln)
+ROOTMODULE = $(ROOT_DRV_DIR)/$(MODULE)
+CFLAGS += $(CCVERBOSE) $(_XPG4_2)
+LDFLAGS += -dy -Nfs/sockfs -Nmisc/ksocket -Nmisc/ip -Nmisc/ibtl -Nmisc/ibcm -Nmisc/sol_ofs
+CONF_SRCDIR = $(UTSBASE)/common/io/ib/clients/rdsv3
+#
+# Include common rules.
+#
+include $(UTSBASE)/intel/Makefile.intel
+
+# CFLAGS += -DOFA_SOLARIS
+
+#
+# Disable these lint checks since some errors suppressed here are
+# in the OFED code, but we'd like to keep it as is as much as possible.
+# Note. maintainers should endeavor to investigate and remove these for
+# maximum lint coverage, but please do not carry these forward to new
+# Makefiles blindly.
+#
+LINTTAGS += -erroff=E_STATIC_UNUSED
+LINTTAGS += -erroff=E_CONSTANT_CONDITION
+LINTTAGS += -erroff=E_FUNC_VAR_UNUSED
+LINTTAGS += -erroff=E_SUSPICIOUS_COMPARISON
+LINTTAGS += -erroff=E_BAD_PTR_CAST_ALIGN
+LINTTAGS += -erroff=E_ASSIGN_NARROW_CONV
+LINTTAGS += -erroff=E_FUNC_SET_NOT_USED
+
+#
+# Define targets
+#
+ALL_TARGET = $(BINARY) $(SRC_CONFILE)
+LINT_TARGET = $(MODULE).lint
+INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE)
+
+#
+# Default build targets.
+#
+.KEEP_STATE:
+
+def: $(DEF_DEPS)
+
+all: $(ALL_DEPS)
+
+clean: $(CLEAN_DEPS)
+
+clobber: $(CLOBBER_DEPS)
+
+lint: $(LINT_DEPS)
+
+modlintlib: $(MODLINTLIB_DEPS)
+
+clean.lint: $(CLEAN_LINT_DEPS)
+
+install: $(INSTALL_DEPS)
+
+#
+# Include common targets.
+#
+include $(UTSBASE)/intel/Makefile.targ
+
diff --git a/usr/src/uts/intel/sockrds/Makefile b/usr/src/uts/intel/sockrds/Makefile
new file mode 100644
index 0000000000..64a8ccd3c8
--- /dev/null
+++ b/usr/src/uts/intel/sockrds/Makefile
@@ -0,0 +1,86 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+#
+# This makefile drives the production of the sockrds socket
+# kernel module.
+#
+# intel architecture dependent
+#
+
+#
+# Path to the base of the uts directory tree (usually /usr/src/uts).
+#
+UTSBASE = ../..
+
+#
+# Define the module and object file sets.
+#
+MODULE = sockrds
+OBJECTS = $(RDS_SOCK_MOD_OBJS:%=$(OBJS_DIR)/%)
+LINTS = $(RDS_SOCK_MOD_OBJS:%.o=$(LINTS_DIR)/%.ln)
+ROOTMODULE = $(ROOT_SOCK_DIR)/$(MODULE)
+
+#
+# Include common rules.
+#
+include $(UTSBASE)/intel/Makefile.intel
+
+#
+# Define targets
+#
+ALL_TARGET = $(BINARY)
+LINT_TARGET = $(MODULE).lint
+INSTALL_TARGET = $(BINARY) $(ROOTMODULE)
+
+#
+# lint pass one enforcement and OS version
+#
+CFLAGS += $(CCVERBOSE)
+
+LDFLAGS += -dy -Nfs/sockfs -Ndrv/ip -Ndrv/rdsv3
+
+#
+# Default build targets.
+#
+.KEEP_STATE:
+
+def: $(DEF_DEPS)
+
+all: $(ALL_DEPS)
+
+clean: $(CLEAN_DEPS)
+
+clobber: $(CLOBBER_DEPS)
+
+lint: $(LINT_DEPS)
+
+modlintlib: $(MODLINTLIB_DEPS)
+
+clean.lint: $(CLEAN_LINT_DEPS)
+
+install: $(INSTALL_DEPS)
+
+#
+# Include common targets.
+#
+include $(UTSBASE)/intel/Makefile.targ
diff --git a/usr/src/uts/sparc/Makefile.sparc.shared b/usr/src/uts/sparc/Makefile.sparc.shared
index 5a9b506817..ada0e7d643 100644
--- a/usr/src/uts/sparc/Makefile.sparc.shared
+++ b/usr/src/uts/sparc/Makefile.sparc.shared
@@ -219,6 +219,7 @@ DRV_KMODS += dlpistub
DRV_KMODS += vnic
DRV_KMODS += xge
DRV_KMODS += rds
+DRV_KMODS += rdsv3
DRV_KMODS += chxge
DRV_KMODS += smbsrv
DRV_KMODS += vscan
@@ -504,6 +505,7 @@ MAC_KMODS += mac_ib
SOCKET_KMODS += sockpfp
SOCKET_KMODS += socksctp
SOCKET_KMODS += socksdp
+SOCKET_KMODS += sockrds
#
# kiconv modules (/kernel/kiconv):
diff --git a/usr/src/uts/sparc/rdsv3/Makefile b/usr/src/uts/sparc/rdsv3/Makefile
new file mode 100644
index 0000000000..959610d555
--- /dev/null
+++ b/usr/src/uts/sparc/rdsv3/Makefile
@@ -0,0 +1,143 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+#
+# This makefile drives the production of the rdsv3 driver
+# kernel module.
+#
+# sparc architecture dependent
+#
+
+#
+# Path to the base of the uts directory tree (usually /usr/src/uts).
+#
+UTSBASE = ../..
+
+#
+# Define the module and object file sets.
+#
+MODULE = rdsv3
+OBJECTS = $(RDSV3_OBJS:%=$(OBJS_DIR)/%)
+LINTS = $(RDSV3_OBJS:%.o=$(LINTS_DIR)/%.ln)
+ROOTMODULE = $(ROOT_DRV_DIR)/$(MODULE)
+CONF_SRCDIR = $(UTSBASE)/common/io/ib/clients/rdsv3
+WARLOCK_OUT = $(RDSV3_OBJS:%.o=%.ll)
+WARLOCK_OK = $(MODULE).ok
+WLCMD_DIR = $(UTSBASE)/common/io/warlock
+
+#
+# Include common rules.
+#
+include $(UTSBASE)/sparc/Makefile.sparc
+
+#
+# Define targets
+#
+ALL_TARGET = $(BINARY) $(SRC_CONFILE)
+LINT_TARGET = $(MODULE).lint
+INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE)
+
+#
+# lint pass one enforcement and OS version
+#
+CFLAGS += $(CCVERBOSE)
+
+#
+# Disable these lint checks since some errors suppressed here are
+# in the OFED code, but we'd like to keep it as is as much as possible.
+# Note. maintainers should endeavor to investigate and remove these for
+# maximum lint coverage, but please do not carry these forward to new
+# Makefiles blindly.
+#
+LINTTAGS += -erroff=E_STATIC_UNUSED
+LINTTAGS += -erroff=E_CONSTANT_CONDITION
+LINTTAGS += -erroff=E_FUNC_VAR_UNUSED
+LINTTAGS += -erroff=E_SUSPICIOUS_COMPARISON
+LINTTAGS += -erroff=E_BAD_PTR_CAST_ALIGN
+LINTTAGS += -erroff=E_ASSIGN_NARROW_CONV
+LINTTAGS += -erroff=E_FUNC_SET_NOT_USED
+LINTTAGS += -erroff=E_FUNC_USED_VAR_ARG2
+LINTTAGS += -erroff=E_INCONS_ARG_USED2
+
+LDFLAGS += -dy -Nfs/sockfs -Nmisc/ksocket -Ndrv/ip -Nmisc/ibtl -Nmisc/ibcm -Nmisc/sol_ofs
+
+#
+# Default build targets.
+#
+.KEEP_STATE:
+
+def: $(DEF_DEPS)
+
+all: $(ALL_DEPS)
+
+clean: $(CLEAN_DEPS)
+
+clobber: $(CLOBBER_DEPS)
+
+lint: $(LINT_DEPS)
+
+modlintlib: $(MODLINTLIB_DEPS) lint64
+
+clean.lint: $(CLEAN_LINT_DEPS)
+
+install: $(INSTALL_DEPS)
+
+$(ROOTLINK): $(ROOT_STRMOD_DIR) $(ROOTMODULE)
+ -$(RM) $@; ln $(ROOTMODULE) $@
+
+#
+# Include common targets.
+#
+include $(UTSBASE)/sparc/Makefile.targ
+
+#
+# Defines for local commands.
+#
+WARLOCK = warlock
+WLCC = wlcc
+TOUCH = touch
+TEST = test
+
+warlock: $(WARLOCK_OK) $(WARLOCK_OUT)
+
+$(WARLOCK_OK): $(WARLOCK_OUT) $(WLCMD_DIR)/rdsv3.wlcmd warlock_ddi.files
+ $(WARLOCK) -c $(WLCMD_DIR)/rdsv3.wlcmd $(WARLOCK_OUT) \
+ -l ../warlock/ddi_dki_impl.ll
+ $(TOUCH) $@
+
+%.ll: $(UTSBASE)/common/io/ib/clients/rdsv3/%.c \
+ $(UTSBASE)/common/sys/ib/clients/rdsv3/ib.h \
+ $(UTSBASE)/common/sys/ib/clients/rdsv3/info.h \
+ $(UTSBASE)/common/sys/ib/clients/rdsv3/loop.h \
+ $(UTSBASE)/common/sys/ib/clients/rdsv3/rdma.h \
+ $(UTSBASE)/common/sys/ib/clients/rdsv3/rdma_transport.h \
+ $(UTSBASE)/common/sys/ib/clients/rdsv3/rds.h \
+ $(UTSBASE)/common/sys/ib/clients/rdsv3/rds_rdma.h \
+ $(UTSBASE)/common/sys/ib/clients/rdsv3/rdsv3_atomic.h \
+ $(UTSBASE)/common/sys/ib/clients/rdsv3/rdsv3_debug.h \
+ $(UTSBASE)/common/sys/ib/clients/rdsv3/rdsv3_impl.h \
+ $(UTSBASE)/common/sys/ib/clients/rdsv3/rdsv3_ofed_types.h \
+ $(UTSBASE)/common/sys/ib/clients/rdsv3/rdsv3_sc.h
+ $(WLCC) $(CPPFLAGS) -DDEBUG -Dinline= -o $@ $<
+
+warlock_ddi.files:
+ @cd ../warlock; pwd; $(MAKE) warlock
diff --git a/usr/src/uts/sparc/sockrds/Makefile b/usr/src/uts/sparc/sockrds/Makefile
new file mode 100644
index 0000000000..a8e1a0702f
--- /dev/null
+++ b/usr/src/uts/sparc/sockrds/Makefile
@@ -0,0 +1,88 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+#
+
+#
+# This makefile drives the production of the sockrds socket
+# kernel module.
+#
+# sparc architecture dependent
+#
+
+#
+# Path to the base of the uts directory tree (usually /usr/src/uts).
+#
+UTSBASE = ../..
+
+#
+# Define the module and object file sets.
+#
+MODULE = sockrds
+OBJECTS = $(RDS_SOCK_MOD_OBJS:%=$(OBJS_DIR)/%)
+LINTS = $(RDS_SOCK_MOD_OBJS:%.o=$(LINTS_DIR)/%.ln)
+ROOTMODULE = $(ROOT_SOCK_DIR)/$(MODULE)
+
+#
+# Include common rules.
+#
+include $(UTSBASE)/sparc/Makefile.sparc
+
+#
+# Define targets
+#
+ALL_TARGET = $(BINARY)
+LINT_TARGET = $(MODULE).lint
+INSTALL_TARGET = $(BINARY) $(ROOTMODULE)
+
+#
+# lint pass one enforcement and OS version
+#
+CFLAGS += $(CCVERBOSE)
+
+LDFLAGS += -dy -Nfs/sockfs -Ndrv/ip -Ndrv/rdsv3
+
+#
+# Default build targets.
+#
+.KEEP_STATE:
+
+def: $(DEF_DEPS)
+
+all: $(ALL_DEPS)
+
+clean: $(CLEAN_DEPS)
+
+clobber: $(CLOBBER_DEPS)
+
+lint: $(LINT_DEPS)
+
+modlintlib: $(MODLINTLIB_DEPS)
+
+clean.lint: $(CLEAN_LINT_DEPS)
+
+install: $(INSTALL_DEPS)
+
+#
+# Include common targets.
+#
+include $(UTSBASE)/sparc/Makefile.targ