author | Eiji Ota <Eiji.Ota@Sun.COM> | 2010-04-21 07:25:52 -0700
committer | Eiji Ota <Eiji.Ota@Sun.COM> | 2010-04-21 07:25:52 -0700
commit | c0dd49bdd68c0d758a67d56f07826f3b45cfc664
tree | bd39a182ec430367040aaee8df5c188a3a05399d
parent | b4756084ba1ef238a1e15b1585b853a7c1f85582
PSARC/2010/043 Reliable Datagram Service v3
6850013 RDS driver upgrade to version 3
6902396 su_recv does not call pollwakeup() for zero-len datagrams when protocol uses uio recv
66 files changed, 20306 insertions(+), 274 deletions(-)
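The sock2path change below registers the new socket triple 30/6/0 (family 30, type 6 = SOCK_SEQPACKET, protocol 0) against the sockrds module. For orientation, a minimal consumer of that entry might look like the following sketch; the AF_RDS constant, the bind-to-IP convention, and the address/port values are assumptions drawn from the family-30 entry and the general RDS socket API, not code from this commit:

#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <string.h>

#ifndef AF_RDS
#define	AF_RDS	30	/* assumed: matches the new family-30 sock2path entry */
#endif

int
main(void)
{
	int fd;
	struct sockaddr_in sin;

	/* reliable datagram service: family 30, SOCK_SEQPACKET (6), proto 0 */
	fd = socket(AF_RDS, SOCK_SEQPACKET, 0);
	if (fd < 0)
		return (1);

	/* an RDS endpoint binds to an IP address owned by an IB interface */
	(void) memset(&sin, 0, sizeof (sin));
	sin.sin_family = AF_INET;
	sin.sin_addr.s_addr = inet_addr("192.168.10.1");	/* placeholder */
	sin.sin_port = htons(4000);				/* placeholder */
	if (bind(fd, (struct sockaddr *)&sin, sizeof (sin)) < 0)
		return (1);

	/* datagrams then flow over sendmsg()/recvmsg() as usual */
	return (0);
}

Bug 6902396 in this changeset (the so_queue_msg() hunk in sockcommon_sops.c) concerns pollwakeup() delivery for zero-length datagrams to exactly this kind of consumer when it poll(2)s the descriptor.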
diff --git a/usr/src/cmd/cmd-inet/etc/sock2path b/usr/src/cmd/cmd-inet/etc/sock2path index a56b540b1b..555d5ec340 100644 --- a/usr/src/cmd/cmd-inet/etc/sock2path +++ b/usr/src/cmd/cmd-inet/etc/sock2path @@ -17,8 +17,7 @@ # # CDDL HEADER END # -# Copyright 2009 Sun Microsystems, Inc. All rights reserved. -# Use is subject to license terms. +# Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved. # # socket configuration information # @@ -53,6 +52,8 @@ 28 2 0 /dev/nca 29 4 1 /dev/spdsock + 30 6 0 sockrds + 31 1 0 trill 32 1 0 sockpfp 32 4 0 sockpfp diff --git a/usr/src/cmd/rcm_daemon/Makefile.com b/usr/src/cmd/rcm_daemon/Makefile.com index 6940bbc87e..4fec58013e 100644 --- a/usr/src/cmd/rcm_daemon/Makefile.com +++ b/usr/src/cmd/rcm_daemon/Makefile.com @@ -65,7 +65,8 @@ COMMON_PERL_SCRIPT_SRC = sparc_PERL_SCRIPT_SRC = SUNW,vdevices.pl -COMMON_SHELL_SCRIPT_SRC = SUNW,ibsdpu.sh +COMMON_SHELL_SCRIPT_SRC = SUNW,ibsdpu.sh \ + SUNW,rdsv3u.sh COMMON_MOD_OBJ = \ filesys_rcm.o \ diff --git a/usr/src/cmd/rcm_daemon/common/SUNW,rdsv3u.sh b/usr/src/cmd/rcm_daemon/common/SUNW,rdsv3u.sh new file mode 100644 index 0000000000..c54565f860 --- /dev/null +++ b/usr/src/cmd/rcm_daemon/common/SUNW,rdsv3u.sh @@ -0,0 +1,109 @@ +#!/sbin/sh +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. +# + +# +# RCM script to inform if RDSv3 is currently used +# +rcm_script_version=1 +rcm_script_func_info="RDSv3 (un)configuration rcm script" +rcm_cmd_timeout=10 +rcm_resource_name=/devices/ib/rdsv3@0:rdsv3 + +do_scriptinfo() +{ + printf "rcm_script_version=%d\n" $rcm_script_version; + printf "rcm_script_func_info=$rcm_script_func_info\n"; + printf "rcm_cmd_timeout=%d\n" $rcm_cmd_timeout; + exit 0; +} + +do_register() +{ + printf "rcm_resource_name=%s\n" $rcm_resource_name; + exit 0; +} + +do_resourceinfo() +{ + if [ x"$1" = x"/devices/ib/rdsv3@0:rdsv3" ] + then + printf "rcm_resource_usage_info=RDSv3 IB device 0\n"; + exit 0; + else + printf "rcm_failure_reason=Unknown RDSv3 device\n"; + exit 3; + fi +} + +do_queryremove() +{ + output=`/usr/sbin/fuser $rcm_resource_name 2>&1` + ret=$? + + sockrds=`echo "$output" | grep 'sockrds'` + + if [ $ret -eq 0 ] && [ ! -z "$sockrds" ] + then + printf "rcm_log_warn=RDSv3 is being used currently. " + printf "Please stop processes currently running on it " + printf "before un-configuring IB HCA/RDSv3.\n"; + printf "rcm_failure_reason=RDSv3 is being used on this system\n"; + exit 3; + elif [ $ret -ne 0 ] + then + printf "rcm_log_warn='fuser $rcm_resource_name' command failed." 
+ printf "rcm_failure_reason='fuser $rcm_resource_name' command " + printf "failed.\n"; + exit 1; + fi + exit 0; +} + +do_preremove() +{ + exit 0; +} + +do_undoremove() +{ + exit 0; +} + +do_postremove() +{ + exit 0; +} + +case "$1" in + scriptinfo) do_scriptinfo;; + register) do_register;; + resourceinfo) do_resourceinfo $2;; + queryremove) do_queryremove $2;; + preremove) do_preremove $2;; + undoremove) do_undoremove $2;; + postremove) do_postremove $2;; + *) echo Unknown option $1;; +esac diff --git a/usr/src/pkg/manifests/driver-network-rdsv3.mf b/usr/src/pkg/manifests/driver-network-rdsv3.mf new file mode 100644 index 0000000000..802015fae3 --- /dev/null +++ b/usr/src/pkg/manifests/driver-network-rdsv3.mf @@ -0,0 +1,57 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. +# + +# +# This package will install successfully into any zone, global or +# non-global. The files, directories, links, and hardlinks, however, +# will only be installed into the global zone. +# +<include hollow_zone_pkg> +set name=pkg.fmri value=pkg:/driver/network/rdsv3@$(PKGVERS) +set name=pkg.description value="The RDS driver is an implementation of the Reliable Datagram Sockets API. It provides reliable, in-order datagram and RDMA data delivery between sockets." 
+set name=pkg.summary value="Solaris Reliable Datagram Sockets" +set name=info.classification value=org.opensolaris.category.2008:System/Core +set name=variant.arch value=$(ARCH) +set name=variant.opensolaris.zone value=global value=nonglobal +dir path=usr group=sys +dir path=usr/lib group=bin +dir path=usr/lib/rcm group=bin +dir path=usr/lib/rcm/scripts group=bin +file path=/usr/lib/rcm/scripts/SUNW,rdsv3u.sh group=bin mode=0555 +dir path=kernel group=sys +dir path=kernel/drv group=sys +dir path=kernel/drv/$(ARCH64) group=sys +driver name=rdsv3 perms="* 0644 root sys" +$(i386_ONLY)file path=kernel/drv/rdsv3 group=sys +file path=kernel/drv/$(ARCH64)/rdsv3 group=sys +file path=kernel/drv/rdsv3.conf group=sys preserve=renamenew +dir path=kernel/socketmod group=sys +dir path=kernel/socketmod/$(ARCH64) group=sys +$(i386_ONLY)file path=kernel/socketmod/sockrds mode=0755 group=sys +file path=kernel/socketmod/$(ARCH64)/sockrds mode=0755 group=sys +license cr_Sun license=cr_Sun +license lic_CDDL license=lic_CDDL +license uts/common/io/ib/clients/rdsv3/LICENSE \ + license=uts/common/io/ib/clients/rdsv3/LICENSE diff --git a/usr/src/pkg/manifests/system-header.mf b/usr/src/pkg/manifests/system-header.mf index 5a11fda9e8..e679fb1744 100644 --- a/usr/src/pkg/manifests/system-header.mf +++ b/usr/src/pkg/manifests/system-header.mf @@ -1085,6 +1085,7 @@ file path=usr/include/sys/ib/clients/of/rdma/rdma_user_cm.h file path=usr/include/sys/ib/clients/of/sol_ofs/sol_cma.h file path=usr/include/sys/ib/clients/of/sol_ofs/sol_ib_cma.h file path=usr/include/sys/ib/clients/of/sol_ofs/sol_ofs_common.h +file path=usr/include/sys/ib/clients/of/sol_ofs/sol_kverb_impl.h file path=usr/include/sys/ib/clients/of/sol_ucma/sol_rdma_user_cm.h file path=usr/include/sys/ib/clients/of/sol_ucma/sol_ucma.h file path=usr/include/sys/ib/clients/of/sol_uverbs/sol_uverbs.h @@ -1325,6 +1326,7 @@ file path=usr/include/sys/ramdisk.h file path=usr/include/sys/random.h file path=usr/include/sys/rctl.h file path=usr/include/sys/rctl_impl.h +file path=usr/include/sys/rds.h file path=usr/include/sys/reboot.h file path=usr/include/sys/refstr.h file path=usr/include/sys/refstr_impl.h diff --git a/usr/src/pkg/manifests/system-network.mf b/usr/src/pkg/manifests/system-network.mf index fe17448d86..cdabc70544 100644 --- a/usr/src/pkg/manifests/system-network.mf +++ b/usr/src/pkg/manifests/system-network.mf @@ -20,8 +20,7 @@ # # -# Copyright 2010 Sun Microsystems, Inc. All rights reserved. -# Use is subject to license terms. +# Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 
# set name=pkg.fmri value=pkg:/system/network@$(PKGVERS) @@ -70,7 +69,7 @@ file path=etc/inet/secret/ike.preshared group=sys mode=0600 \ original_name=SUNWcnetr:etc/inet/secret/ike.preshared preserve=true file path=etc/inet/secret/ipseckeys.sample group=sys mode=0600 file path=etc/inet/sock2path group=sys \ - original_name=SUNWcnetr:etc/inet/sock2path preserve=true + original_name=SUNWcnetr:etc/inet/sock2path preserve=renameold file path=etc/ipadm/ipadm.conf group=netadm owner=netadm preserve=true file path=etc/nwam/loc/NoNet/ipf.conf.dfl group=netadm owner=netadm \ preserve=true diff --git a/usr/src/tools/opensolaris/license-list b/usr/src/tools/opensolaris/license-list index e48abfd7fe..0c87b1927d 100644 --- a/usr/src/tools/opensolaris/license-list +++ b/usr/src/tools/opensolaris/license-list @@ -148,6 +148,7 @@ usr/src/uts/common/io/drm/THIRDPARTYLICENSE usr/src/uts/common/io/elxl/THIRDPARTYLICENSE usr/src/uts/common/io/ib/clients/of/lic_of usr/src/uts/common/io/ib/clients/rds/THIRDPARTYLICENSE +usr/src/uts/common/io/ib/clients/rdsv3/LICENSE usr/src/uts/common/io/ipw/THIRDPARTYLICENSE usr/src/uts/common/io/ipw/fw-ipw2100/LICENSE usr/src/uts/common/io/iwh/THIRDPARTYLICENSE diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files index 09560f032d..b514153403 100644 --- a/usr/src/uts/common/Makefile.files +++ b/usr/src/uts/common/Makefile.files @@ -593,11 +593,19 @@ SCTP_SOCK_MOD_OBJS += sockmod_sctp.o socksctp.o socksctpsubr.o PFP_SOCK_MOD_OBJS += sockmod_pfp.o +RDS_SOCK_MOD_OBJS += sockmod_rds.o + RDS_OBJS += rdsddi.o rdssubr.o rds_opt.o rds_ioctl.o RDSIB_OBJS += rdsib.o rdsib_ib.o rdsib_cm.o rdsib_ep.o rdsib_buf.o \ rdsib_debug.o rdsib_sc.o +RDSV3_OBJS += af_rds.o rdsv3_ddi.o bind.o loop.o threads.o connection.o \ + transport.o cong.o sysctl.o message.o rds_recv.o send.o \ + stats.o info.o page.o rdma_transport.o ib_ring.o ib_rdma.o \ + ib_recv.o ib.o ib_send.o ib_sysctl.o ib_stats.o ib_cm.o \ + rdsv3_sc.o rdsv3_debug.o rdsv3_impl.o rdma.o + ISER_OBJS += iser.o iser_cm.o iser_cq.o iser_ib.o iser_idm.o \ iser_resource.o iser_xfer.o @@ -695,7 +703,8 @@ HERMON_OBJS += hermon.o hermon_agents.o hermon_cfg.o hermon_ci.o hermon_cmd.o \ DAPLT_OBJS += daplt.o SOL_OFS_OBJS += sol_cma.o sol_ib_cma.o sol_uobj.o \ - sol_ofs_debug_util.o sol_ofs_gen_util.o + sol_ofs_debug_util.o sol_ofs_gen_util.o \ + sol_kverbs.o SOL_UCMA_OBJS += sol_ucma.o diff --git a/usr/src/uts/common/Makefile.rules b/usr/src/uts/common/Makefile.rules index 8db7e6ef44..fa12fcd9c7 100644 --- a/usr/src/uts/common/Makefile.rules +++ b/usr/src/uts/common/Makefile.rules @@ -738,6 +738,10 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/ib/clients/rds/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/ib/clients/rdsv3/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/ib/clients/iser/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) @@ -2041,6 +2045,9 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/hotplug/pcihp/%.c $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/ib/clients/rds/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) +$(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/ib/clients/rdsv3/%.c + @($(LHEAD) $(LINT.c) $< $(LTAIL)) + $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/ib/clients/iser/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) diff --git a/usr/src/uts/common/fs/sockfs/sockcommon_sops.c b/usr/src/uts/common/fs/sockfs/sockcommon_sops.c index 4521fdd352..64ea59c4b5 100644 --- a/usr/src/uts/common/fs/sockfs/sockcommon_sops.c +++ b/usr/src/uts/common/fs/sockfs/sockcommon_sops.c @@ -20,8 +20,7 @@ */ 
/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. */ #include <sys/types.h> @@ -1156,8 +1155,7 @@ so_queue_msg(sock_upper_handle_t sock_handle, mblk_t *mp, ASSERT(errorp != NULL); *errorp = 0; if (mp == NULL) { - if (msg_size > 0) { - ASSERT(so->so_downcalls->sd_recv_uio != NULL); + if (so->so_downcalls->sd_recv_uio != NULL) { mutex_enter(&so->so_lock); /* the notify functions will drop the lock */ if (flags & MSG_OOB) @@ -1166,6 +1164,7 @@ so_queue_msg(sock_upper_handle_t sock_handle, mblk_t *mp, so_notify_data(so, msg_size); return (0); } + ASSERT(msg_size == 0); /* * recv space check */ diff --git a/usr/src/uts/common/inet/sockmods/sockmod_rds.c b/usr/src/uts/common/inet/sockmods/sockmod_rds.c new file mode 100644 index 0000000000..f8fc2e42d0 --- /dev/null +++ b/usr/src/uts/common/inet/sockmods/sockmod_rds.c @@ -0,0 +1,106 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 
+ */ + +#include <sys/types.h> +#include <sys/conf.h> +#include <sys/modctl.h> +#include <sys/sunldi.h> +#include <inet/common.h> +#include <sys/strsubr.h> +#include <sys/socketvar.h> + +extern sock_lower_handle_t rdsv3_create(int, int, int, sock_downcalls_t **, + uint_t *, int *, int, cred_t *); + +#define INET_NAME "sockrds" +#define INET_DEVMINOR 0 +#define INET_MODMTFLAGS D_MP +#define INET_SOCKDESC "RDSv3 socket module" +#define INET_SOCK_PROTO_CREATE_FUNC (*rdsv3_create) + +#include "../inetddi.c" + +ldi_ident_t sockrds_li; +ldi_handle_t rdsv3_transport_handle = NULL; + +#define RDSV3_DEVICE_NAME "/devices/ib/rdsv3@0:rdsv3" + +int +_init(void) +{ + int ret; + + ret = ldi_ident_from_mod(&modlinkage, &sockrds_li); + if (ret != 0) { + sockrds_li = NULL; + goto done; + } + + ret = ldi_open_by_name(RDSV3_DEVICE_NAME, FREAD | FWRITE, kcred, + &rdsv3_transport_handle, sockrds_li); + if (ret != 0) { + ldi_ident_release(sockrds_li); + sockrds_li = NULL; + rdsv3_transport_handle = NULL; + goto done; + } + + ret = mod_install(&modlinkage); + if (ret != 0) { + (void) ldi_close(rdsv3_transport_handle, FNDELAY, kcred); + ldi_ident_release(sockrds_li); + sockrds_li = NULL; + rdsv3_transport_handle = NULL; + } + +done: + return (ret); +} + +int +_fini(void) +{ + int ret; + + ret = mod_remove(&modlinkage); + if (ret != 0) { + return (ret); + } + + if (rdsv3_transport_handle != NULL) { + (void) ldi_close(rdsv3_transport_handle, FNDELAY, kcred); + rdsv3_transport_handle = NULL; + } + + if (sockrds_li != NULL) + ldi_ident_release(sockrds_li); + + return (0); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} diff --git a/usr/src/uts/common/io/ib/clients/of/sol_ofs/sol_cma.c b/usr/src/uts/common/io/ib/clients/of/sol_ofs/sol_cma.c index 81b1edcb50..2405af462d 100644 --- a/usr/src/uts/common/io/ib/clients/of/sol_ofs/sol_cma.c +++ b/usr/src/uts/common/io/ib/clients/of/sol_ofs/sol_cma.c @@ -49,6 +49,7 @@ #include <sys/ib/clients/of/rdma/rdma_cm.h> #include <sys/ib/clients/of/sol_ofs/sol_cma.h> +#include <sys/ib/clients/of/sol_ofs/sol_kverb_impl.h> /* Modload support */ static struct modlmisc sol_ofs_modmisc = { @@ -62,18 +63,7 @@ struct modlinkage sol_ofs_modlinkage = { NULL }; -static void sol_ofs_ibt_async_hdlr(void *clnt, ibt_hca_hdl_t hdl, - ibt_async_code_t code, ibt_async_event_t *event); - -static ibt_clnt_modinfo_t sol_ofs_ibt_modinfo = { - IBTI_V_CURR, - IBT_GENERIC_MISC, - sol_ofs_ibt_async_hdlr, - NULL, - "sol_ofs" -}; - -ibt_clnt_hdl_t sol_ofs_ibt_hdl; +static ib_client_t *sol_cma_ib_client; sol_cma_glbl_listen_t sol_cma_glbl_listen; avl_tree_t sol_cma_glbl_listen_tree; @@ -106,7 +96,6 @@ static void cma_handle_nomore_events(sol_cma_chan_t *); extern void sol_ofs_dprintf_init(); extern void sol_ofs_dprintf_fini(); -static void ibcma_init_rdma_devs(); cma_chan_state_t cma_get_chan_state(sol_cma_chan_t *); extern int ibcma_init_root_chan(sol_cma_chan_t *, sol_cma_glbl_listen_t *); extern int ibcma_fini_root_chan(sol_cma_chan_t *); @@ -133,7 +122,6 @@ int _init(void) { int err; - ibt_status_t status; sol_ofs_dprintf_init(); SOL_OFS_DPRINTF_L5(sol_ofs_dbg_str, "_init()"); @@ -144,23 +132,39 @@ _init(void) sol_cma_svc_cmp, sizeof (sol_cma_glbl_listen_t), offsetof(sol_cma_glbl_listen_t, cma_listen_node)); + sol_cma_ib_client = kmem_zalloc(sizeof (ib_client_t), KM_NOSLEEP); + if (!sol_cma_ib_client) { + SOL_OFS_DPRINTF_L2(sol_ofs_dbg_str, + "_init() - mem alloc failed"); + avl_destroy(&sol_cma_glbl_listen_tree); + mutex_destroy(&sol_cma_dev_mutex); + 
mutex_destroy(&sol_cma_glob_mutex); + sol_ofs_dprintf_fini(); + return (ENOMEM); + } + + sol_cma_ib_client->name = "sol_ofs"; + sol_cma_ib_client->add = sol_cma_add_dev; + sol_cma_ib_client->remove = sol_cma_rem_dev; + sol_cma_ib_client->dip = NULL; - if ((status = ibt_attach(&sol_ofs_ibt_modinfo, NULL, NULL, - &sol_ofs_ibt_hdl)) != IBT_SUCCESS) { - cmn_err(CE_WARN, "_init: ibt_attach failed"); + if ((err = ib_register_client(sol_cma_ib_client)) != 0) { SOL_OFS_DPRINTF_L2(sol_ofs_dbg_str, - "_init() ibt_attach() failed with status %d", - status); + "_init() ib_register_client() failed with err %d", + err); + kmem_free(sol_cma_ib_client, sizeof (ib_client_t)); avl_destroy(&sol_cma_glbl_listen_tree); mutex_destroy(&sol_cma_dev_mutex); mutex_destroy(&sol_cma_glob_mutex); sol_ofs_dprintf_fini(); - return (ENODEV); + return (err); } if ((err = mod_install(&sol_ofs_modlinkage)) != 0) { - SOL_OFS_DPRINTF_L2(sol_ofs_dbg_str, "_init() failed"); - (void) ibt_detach(sol_ofs_ibt_hdl); + SOL_OFS_DPRINTF_L2(sol_ofs_dbg_str, + "_init() - mod_install() failed"); + ib_unregister_client(sol_cma_ib_client); + kmem_free(sol_cma_ib_client, sizeof (ib_client_t)); avl_destroy(&sol_cma_glbl_listen_tree); mutex_destroy(&sol_cma_dev_mutex); mutex_destroy(&sol_cma_glob_mutex); @@ -168,8 +172,6 @@ _init(void) return (err); } - ibcma_init_rdma_devs(); - SOL_OFS_DPRINTF_L5(sol_ofs_dbg_str, "_init() - ret"); return (err); } @@ -191,7 +193,9 @@ _fini(void) "_fini: mod_remove failed"); return (err); } - (void) ibt_detach(sol_ofs_ibt_hdl); + + ib_unregister_client(sol_cma_ib_client); + kmem_free(sol_cma_ib_client, sizeof (ib_client_t)); avl_destroy(&sol_cma_glbl_listen_tree); mutex_destroy(&sol_cma_dev_mutex); mutex_destroy(&sol_cma_glob_mutex); @@ -234,7 +238,7 @@ sol_cma_add_dev(struct ib_device *dev) init_genlist(&new_device->cma_epchan_list); new_device->cma_device = dev; - dev->data = new_device; + ib_set_client_data(dev, sol_cma_ib_client, new_device); mutex_enter(&sol_cma_dev_mutex); llist_add_tail(&new_device->cma_list, &sol_cma_dev_list); @@ -247,7 +251,9 @@ sol_cma_rem_dev(struct ib_device *dev) cma_device_t *rem_device; genlist_entry_t *entry; - rem_device = (cma_device_t *)dev->data; + SOL_OFS_DPRINTF_L5(sol_ofs_dbg_str, "sol_rem_dev(%p)", dev); + + rem_device = (cma_device_t *)ib_get_client_data(dev, sol_cma_ib_client); if (!rem_device) { SOL_OFS_DPRINTF_L2(sol_ofs_dbg_str, "sol_cma_rem_dev() " "NULL cma_dev!!"); @@ -384,45 +390,6 @@ sol_cma_add_hca_list(sol_cma_chan_t *ep_chanp, ib_guid_t hca_guid) "No matching HCA in list!!", ep_chanp, hca_guid); } -/*ARGSUSED*/ -static void -sol_ofs_ibt_async_hdlr(void *clnt, ibt_hca_hdl_t hdl, - ibt_async_code_t code, ibt_async_event_t *event) -{ - struct ib_device *device; - llist_head_t *entry; - cma_device_t *cma_devp; - - SOL_OFS_DPRINTF_L3(sol_ofs_dbg_str, - "ibt_async_hdlr(%p, %p, %x, %p)", - clnt, hdl, code, event); - - switch (code) { - case IBT_HCA_ATTACH_EVENT: - device = kmem_zalloc(sizeof (struct ib_device), - KM_SLEEP); - device->node_guid = htonll(event->ev_hca_guid); - sol_cma_add_dev(device); - break; - case IBT_HCA_DETACH_EVENT: - mutex_enter(&sol_cma_dev_mutex); - list_for_each(entry, &sol_cma_dev_list) { - cma_devp = (cma_device_t *)entry->ptr; - - if (cma_devp->cma_device->node_guid == - htonll(event->ev_hca_guid)) { - mutex_exit(&sol_cma_dev_mutex); - sol_cma_rem_dev(cma_devp->cma_device); - mutex_enter(&sol_cma_dev_mutex); - break; - } - } - mutex_exit(&sol_cma_dev_mutex); - - break; - } -} - /* * rdma_cm.h API functions. 
*/ @@ -474,6 +441,7 @@ rdma_map_id2qphdl(struct rdma_cm_id *rdma_idp, void *qp_hdl) chanp->chan_qp_hdl = qp_hdl; } + void rdma_destroy_id(struct rdma_cm_id *rdma_idp) { @@ -494,7 +462,15 @@ rdma_destroy_id(struct rdma_cm_id *rdma_idp) rdma_idp, root_chanp); mutex_enter(&chanp->chan_mutex); - chanp->chan_cmid_destroy_state = SOL_CMA_CALLER_CMID_DESTROYED; + chanp->chan_cmid_destroy_state |= SOL_CMA_CALLER_CMID_DESTROYED; + + /* + * Wait in destroy of CMID when rdma_resolve_addr() / rdma_listen() + * rdma_resolve_route() API is in progress. + */ + while (chanp->chan_cmid_destroy_state & SOL_CMA_CALLER_API_PROGRESS) + cv_wait(&chanp->chan_destroy_cv, &chanp->chan_mutex); + /* Wait if Event is been notified to consumer */ while (chanp->chan_cmid_destroy_state & SOL_CMA_CALLER_EVENT_PROGRESS) cv_wait(&chanp->chan_destroy_cv, &chanp->chan_mutex); @@ -541,6 +517,10 @@ rdma_destroy_id(struct rdma_cm_id *rdma_idp) chanp->chan_req_cnt--; chanp->chan_req_total_cnt--; mutex_exit(&chanp->chan_mutex); + mutex_enter(&req_cmid_chan->chan_mutex); + req_cmid_chan->chan_req_state = + REQ_CMID_NONE; + mutex_exit(&req_cmid_chan->chan_mutex); (void) rdma_disconnect( (struct rdma_cm_id *)req_cmid_chan); mutex_enter(&chanp->chan_mutex); @@ -578,14 +558,20 @@ rdma_destroy_id(struct rdma_cm_id *rdma_idp) cv_wait(&chanp->chan_destroy_cv, &chanp->chan_mutex); } + if (root_chanp) + mutex_enter(&root_chanp->chan_mutex); +#ifdef DEBUG SOL_OFS_DPRINTF_L5(sol_rdmacm_dbg_str, "rdma_destroy_id: " "root_idp %p, cnt %x, state %x", root_chanp, root_chanp ? root_chanp->chan_req_total_cnt : 0, root_chanp ? cma_get_chan_state(root_chanp) : 0); +#endif if (root_chanp && root_chanp->chan_req_total_cnt == 1 && cma_get_chan_state(root_chanp) == SOL_CMA_CHAN_DESTROY_PENDING) do_wait = 1; + if (root_chanp) + mutex_exit(&root_chanp->chan_mutex); skip_passive_handling : state = cma_get_chan_state(chanp); @@ -697,7 +683,10 @@ rdma_bind_addr(struct rdma_cm_id *idp, struct sockaddr *addr) * iWARP. 
*/ if (chanp->chan_ib_client_hdl == NULL) { - chanp->chan_ib_client_hdl = sol_ofs_ibt_hdl; + ofs_client_t *ofs_clnt; + + ofs_clnt = (ofs_client_t *)sol_cma_ib_client->clnt_hdl; + chanp->chan_ib_client_hdl = ofs_clnt->ibt_hdl; } if (chanp->chan_ib_client_hdl && rdma_ib_bind_addr(idp, addr) == 0) { SOL_OFS_DPRINTF_L5(sol_rdmacm_dbg_str, @@ -726,8 +715,6 @@ rdma_resolve_addr(struct rdma_cm_id *idp, struct sockaddr *src_addr, sol_cma_chan_t *chanp; struct rdma_addr *addrp; cma_chan_state_t state; - enum rdma_cm_event_type event; - int rc = 0; ASSERT(idp); chanp = (sol_cma_chan_t *)idp; @@ -740,10 +727,18 @@ rdma_resolve_addr(struct rdma_cm_id *idp, struct sockaddr *src_addr, if (state != SOL_CMA_CHAN_IDLE && state != SOL_CMA_CHAN_BOUND) { SOL_OFS_DPRINTF_L2(sol_rdmacm_dbg_str, "rdma_resolve_addr : invalid chan state %x", state); - rc = EINVAL; mutex_exit(&chanp->chan_mutex); - goto resolve_exit; + return (EINVAL); } + if (chanp->chan_cmid_destroy_state & + SOL_CMA_CALLER_CMID_DESTROYED) { + SOL_OFS_DPRINTF_L3(sol_rdmacm_dbg_str, + "rdma_resolve_addr : CMID %p, destroy called", chanp); + mutex_exit(&chanp->chan_mutex); + return (EINVAL); + } + chanp->chan_cmid_destroy_state |= SOL_CMA_CALLER_API_PROGRESS; + if (chanp->chan_xport_type == SOL_CMA_XPORT_NONE) { bcopy((void *)src_addr, (void *)&(addrp->src_addr), sizeof (struct sockaddr)); @@ -757,31 +752,52 @@ rdma_resolve_addr(struct rdma_cm_id *idp, struct sockaddr *src_addr, * if this fails, resolve this as an @ corresponding to iWARP */ if (chanp->chan_ib_client_hdl == NULL) { - chanp->chan_ib_client_hdl = sol_ofs_ibt_hdl; + ofs_client_t *ofs_clnt; + + ofs_clnt = (ofs_client_t *)sol_cma_ib_client->clnt_hdl; + chanp->chan_ib_client_hdl = ofs_clnt->ibt_hdl; } if (chanp->chan_ib_client_hdl && rdma_ib_resolve_addr(idp, src_addr, dst_addr, timeout_ms) == 0) { SOL_OFS_DPRINTF_L4(sol_rdmacm_dbg_str, "rdma_resolve_addr: ret IB @"); - goto resolve_exit; #ifdef IWARP_SUPPORT } else if (chanp->chan_iw_client_hdl && rdma_iw_resolve_addr(idp, src_addr, dst_addr, timeout_ms) == 0) { SOL_OFS_DPRINTF_L2(sol_rdmacm_dbg_str, "rdma_resolve_addr: ret iWARP @"); - goto resolve_exit; #endif /* IWARP_SUPPORT */ } else { SOL_OFS_DPRINTF_L2(sol_rdmacm_dbg_str, "rdma_resolve_addr: Invalid @"); - rc = EINVAL; + return (EINVAL); } + SOL_OFS_DPRINTF_L5(sol_rdmacm_dbg_str, "rdma_resolve_addr: ret 0"); + return (0); +} + +static void cma_generate_event_sync(struct rdma_cm_id *, + enum rdma_cm_event_type, int, struct rdma_conn_param *, + struct rdma_ud_param *); + +void +cma_resolve_addr_callback(sol_cma_chan_t *chanp, int rc) +{ + enum rdma_cm_event_type event; -resolve_exit: + mutex_enter(&chanp->chan_mutex); + if (chanp->chan_cmid_destroy_state & + SOL_CMA_CALLER_CMID_DESTROYED) { + SOL_OFS_DPRINTF_L3(sol_rdmacm_dbg_str, + "cma_resolve_addr : CMID %p, destroy called", chanp); + chanp->chan_cmid_destroy_state &= + ~SOL_CMA_CALLER_API_PROGRESS; + cv_broadcast(&chanp->chan_destroy_cv); + mutex_exit(&chanp->chan_mutex); + return; + } if (rc == 0) { - mutex_enter(&chanp->chan_mutex); cma_set_chan_state(chanp, SOL_CMA_CHAN_ADDR_RESLVD); - mutex_exit(&chanp->chan_mutex); event = RDMA_CM_EVENT_ADDR_RESOLVED; } else event = RDMA_CM_EVENT_ADDR_ERROR; @@ -791,9 +807,16 @@ resolve_exit: * This will result in RDMA_USER_CM_CMD_RESOLVE_ROUTE in * userland. 
*/ - cma_generate_event(idp, event, 0, NULL, NULL); - SOL_OFS_DPRINTF_L5(sol_rdmacm_dbg_str, "rdma_resolve_addr: ret 0"); - return (0); + chanp->chan_cmid_destroy_state |= SOL_CMA_CALLER_EVENT_PROGRESS; + mutex_exit(&chanp->chan_mutex); + cma_generate_event_sync((struct rdma_cm_id *)chanp, event, 0, + NULL, NULL); + + mutex_enter(&chanp->chan_mutex); + chanp->chan_cmid_destroy_state &= ~SOL_CMA_CALLER_API_PROGRESS; + if (chanp->chan_cmid_destroy_state & SOL_CMA_CALLER_CMID_DESTROYED) + cv_broadcast(&chanp->chan_destroy_cv); + mutex_exit(&chanp->chan_mutex); } int @@ -814,6 +837,14 @@ rdma_resolve_route(struct rdma_cm_id *idp, int timeout_ms) "resolve_route: Invalid state"); return (EINVAL); } + if (chanp->chan_cmid_destroy_state & + SOL_CMA_CALLER_CMID_DESTROYED) { + SOL_OFS_DPRINTF_L3(sol_rdmacm_dbg_str, + "rdma_resolve_route : CMID %p, destroy called", chanp); + mutex_exit(&chanp->chan_mutex); + return (EINVAL); + } + chanp->chan_cmid_destroy_state |= SOL_CMA_CALLER_API_PROGRESS; mutex_exit(&chanp->chan_mutex); /* @@ -823,6 +854,13 @@ rdma_resolve_route(struct rdma_cm_id *idp, int timeout_ms) */ cma_generate_event(idp, RDMA_CM_EVENT_ROUTE_RESOLVED, 0, NULL, NULL); + + mutex_enter(&chanp->chan_mutex); + chanp->chan_cmid_destroy_state &= ~SOL_CMA_CALLER_API_PROGRESS; + if (chanp->chan_cmid_destroy_state & SOL_CMA_CALLER_CMID_DESTROYED) + cv_broadcast(&chanp->chan_destroy_cv); + mutex_exit(&chanp->chan_mutex); + SOL_OFS_DPRINTF_L5(sol_rdmacm_dbg_str, "resolve_route: ret 0"); return (0); } @@ -905,11 +943,16 @@ rdma_listen(struct rdma_cm_id *idp, int bklog) } cma_set_chan_state(chanp, SOL_CMA_CHAN_LISTEN); - if (chanp->chan_listenp) { - SOL_OFS_DPRINTF_L4(sol_rdmacm_dbg_str, "rdma_listen: " - "NON NULL listen_list"); - goto listen_from_list; + if (chanp->chan_cmid_destroy_state & + SOL_CMA_CALLER_CMID_DESTROYED) { + SOL_OFS_DPRINTF_L3(sol_rdmacm_dbg_str, + "rdma_listen : CMID %p, destroy called", chanp); + mutex_exit(&chanp->chan_mutex); + return (EINVAL); } + chanp->chan_cmid_destroy_state |= SOL_CMA_CALLER_API_PROGRESS; + + ASSERT(chanp->chan_listenp == NULL); chanp->chan_listenp = kmem_zalloc(sizeof (sol_cma_listen_info_t), KM_SLEEP); @@ -917,12 +960,12 @@ rdma_listen(struct rdma_cm_id *idp, int bklog) (chanp->chan_listenp)->listen_is_root = 1; ret = cma_init_listen_root(chanp); if (ret) { + chanp->chan_listenp = NULL; + mutex_exit(&chanp->chan_mutex); SOL_OFS_DPRINTF_L2(sol_rdmacm_dbg_str, "rdma_listen: " "cma_init_listen_root: failed"); kmem_free(chanp->chan_listenp, sizeof (sol_cma_listen_info_t)); - chanp->chan_listenp = NULL; - mutex_exit(&chanp->chan_mutex); return (EINVAL); } @@ -949,7 +992,13 @@ rdma_listen(struct rdma_cm_id *idp, int bklog) mutex_exit(&chanp->chan_mutex); return (0); } -listen_from_list: + + if (chanp->chan_cmid_destroy_state & SOL_CMA_CALLER_CMID_DESTROYED) { + chanp->chan_cmid_destroy_state &= + ~SOL_CMA_CALLER_API_PROGRESS; + cv_broadcast(&chanp->chan_destroy_cv); + } + genlist_for_each(entry, &(CHAN_LISTEN_LIST(chanp))) { struct rdma_cm_id *ep_idp; sol_cma_chan_t *ep_chanp; @@ -965,6 +1014,10 @@ listen_from_list: if (ret) break; } + + chanp->chan_cmid_destroy_state &= ~SOL_CMA_CALLER_API_PROGRESS; + if (chanp->chan_cmid_destroy_state & SOL_CMA_CALLER_CMID_DESTROYED) + cv_broadcast(&chanp->chan_destroy_cv); mutex_exit(&chanp->chan_mutex); SOL_OFS_DPRINTF_L5(sol_rdmacm_dbg_str, "rdma_listen: ret %x", ret); @@ -1005,6 +1058,28 @@ rdma_accept(struct rdma_cm_id *idp, struct rdma_conn_param *conn_param) "REQ AVL remove %p", root_chanp, idp); 
mutex_enter(&root_chanp->chan_mutex); avl_remove(&root_chanp->chan_req_avl_tree, idp); + + /* For TCP, insert into ACPT_AVL_TREE */ + if (idp->ps == RDMA_PS_TCP) { + void *find_ret; + avl_index_t where; + + SOL_OFS_DPRINTF_L5(sol_rdmacm_dbg_str, + "Add to ACPT AVL of %p IDP, idp %p, qp_hdl %p", + root_idp, idp, chanp->chan_qp_hdl); + find_ret = avl_find(&root_chanp->chan_acpt_avl_tree, + (void *)chanp->chan_qp_hdl, &where); + if (find_ret) { + mutex_exit(&root_chanp->chan_mutex); + SOL_OFS_DPRINTF_L2(sol_rdmacm_dbg_str, + "DUPLICATE ENTRY in ACPT AVL : root %p, " + "idp %p, qp_hdl %p", + root_idp, idp, chanp->chan_qp_hdl); + return (EINVAL); + } + avl_insert(&root_chanp->chan_acpt_avl_tree, + (void *)idp, where); + } mutex_exit(&root_chanp->chan_mutex); mutex_enter(&chanp->chan_mutex); @@ -1013,26 +1088,7 @@ rdma_accept(struct rdma_cm_id *idp, struct rdma_conn_param *conn_param) mutex_exit(&chanp->chan_mutex); } - /* For TCP, insert into ACPT_AVL_TREE */ - if (root_idp && idp->ps == RDMA_PS_TCP) { - void *find_ret; - avl_index_t where; - - SOL_OFS_DPRINTF_L5(sol_rdmacm_dbg_str, - "Add to ACPT AVL of %p IDP, idp %p, qp_hdl %p", - root_idp, idp, chanp->chan_qp_hdl); - mutex_enter(&root_chanp->chan_mutex); - find_ret = avl_find(&root_chanp->chan_acpt_avl_tree, - (void *)chanp->chan_qp_hdl, &where); - if (find_ret) - SOL_OFS_DPRINTF_L2(sol_rdmacm_dbg_str, - "DUPLICATE ENTRY in ACPT AVL : root %p, " - "idp %p, qp_hdl %p", - root_idp, idp, chanp->chan_qp_hdl); - avl_insert(&root_chanp->chan_acpt_avl_tree, - (void *)idp, where); - mutex_exit(&root_chanp->chan_mutex); - } else if (root_idp && IS_UDP_CMID(root_idp)) { + if (root_idp && IS_UDP_CMID(root_idp)) { cma_chan_state_t chan_state; /* @@ -1062,6 +1118,9 @@ rdma_accept(struct rdma_cm_id *idp, struct rdma_conn_param *conn_param) mutex_enter(&root_chanp->chan_mutex); avl_remove(&root_chanp->chan_acpt_avl_tree, idp); mutex_exit(&root_chanp->chan_mutex); + mutex_enter(&chanp->chan_mutex); + chanp->chan_req_state = REQ_CMID_NONE; + mutex_exit(&chanp->chan_mutex); } SOL_OFS_DPRINTF_L5(sol_rdmacm_dbg_str, "rdma_accept: ret %x", ret); @@ -1123,10 +1182,6 @@ rdma_reject(struct rdma_cm_id *idp, const void *priv_data, ret = rdma_iw_reject(idp, priv_data, priv_data_len); #endif /* IWARP_SUPPORT */ - mutex_enter(&chanp->chan_mutex); - if (!ret) - chanp->chan_connect_flag = SOL_CMA_CONNECT_NONE; - mutex_exit(&chanp->chan_mutex); if (!ret && root_idp) { cma_chan_state_t chan_state; @@ -1303,41 +1358,6 @@ rdma_leave_multicast(struct rdma_cm_id *idp, struct sockaddr *addr) SOL_OFS_DPRINTF_L5(sol_rdmacm_dbg_str, "rdma_join_multicast: ret"); } -/*ARGSUSED*/ -int -rdma_create_qp(struct rdma_cm_id *idp, struct ib_pd *pd, - struct ib_qp_init_attr *qp_init_attr) -{ - return (-EINVAL); -} - -/*ARGSUSED*/ -void -rdma_destroy_qp(struct rdma_cm_id *idp) -{ -} - -void -ibcma_init_rdma_devs() -{ - uint_t i, nhcas; - ib_guid_t *guidp; - struct ib_device *device; - - if ((nhcas = ibt_get_hca_list(&guidp)) == 0) { - SOL_OFS_DPRINTF_L3(sol_rdmacm_dbg_str, - "ibcma_init_rdma_devs() - NO HCAs"); - return; - } - - for (i = 0; i < nhcas; i++) { - device = kmem_zalloc(sizeof (struct ib_device), KM_SLEEP); - device->node_guid = htonll(guidp[i]); - sol_cma_add_dev(device); - } - ibt_free_hca_list(guidp, nhcas); -} - /* * Functions to compare to rdma_cm_id *, used by AVL tree * routines. 
@@ -1643,6 +1663,27 @@ cma_generate_event_sync(struct rdma_cm_id *idp, enum rdma_cm_event_type event, } mutex_exit(&chanp->chan_mutex); + root_idp = CHAN_LISTEN_ROOT(chanp); + root_chanp = (sol_cma_chan_t *)root_idp; + SOL_OFS_DPRINTF_L5(sol_rdmacm_dbg_str, "gen_event: root_idp %p", + root_idp); + + if (event == RDMA_CM_EVENT_CONNECT_REQUEST) { + /* + * Update chan_req_state for the REQ CMID. Decrement + * count of REQ CMIDs not notifed to consumer. + */ + ASSERT(root_idp); + mutex_enter(&root_chanp->chan_mutex); + root_chanp->chan_req_cnt--; +#ifdef DEBUG + SOL_OFS_DPRINTF_L5(sol_rdmacm_dbg_str, + "Dec req_cnt of %p IDP, idp %p, req_cnt %x", + root_idp, idp, root_chanp->chan_req_cnt); +#endif + mutex_exit(&root_chanp->chan_mutex); + } + /* Pass the event to the client */ ret = (idp->event_handler) (idp, &cm_event); @@ -1666,6 +1707,7 @@ cma_generate_event_sync(struct rdma_cm_id *idp, enum rdma_cm_event_type event, event); mutex_enter(&chanp->chan_mutex); + chanp->chan_req_state = REQ_CMID_NONE; chanp->chan_connect_flag = SOL_CMA_CONNECT_NONE; chanp->chan_cmid_destroy_state &= ~SOL_CMA_CALLER_EVENT_PROGRESS; @@ -1680,42 +1722,17 @@ cma_generate_event_sync(struct rdma_cm_id *idp, enum rdma_cm_event_type event, return; } ofs_consume_event: - root_idp = CHAN_LISTEN_ROOT(chanp); - root_chanp = (sol_cma_chan_t *)root_idp; - SOL_OFS_DPRINTF_L5(sol_rdmacm_dbg_str, "gen_event: root_idp %p", - root_idp); - if (event == RDMA_CM_EVENT_CONNECT_REQUEST) { - /* - * Update chan_req_state for the REQ CMID. Decrement - * count of REQ CMIDs not notifed to consumer. - */ - if (!root_idp) { - mutex_enter(&chanp->chan_mutex); - chanp->chan_cmid_destroy_state &= - ~SOL_CMA_CALLER_EVENT_PROGRESS; - if (chanp->chan_cmid_destroy_state & - SOL_CMA_CALLER_CMID_DESTROYED) - cv_broadcast(&chanp->chan_destroy_cv); - mutex_exit(&chanp->chan_mutex); - return; - } - + if (event == RDMA_CM_EVENT_DISCONNECTED || event == + RDMA_CM_EVENT_REJECTED) { mutex_enter(&chanp->chan_mutex); - chanp->chan_req_state = REQ_CMID_NOTIFIED; + chanp->chan_connect_flag = SOL_CMA_CONNECT_NONE; + chanp->chan_qp_hdl = NULL; mutex_exit(&chanp->chan_mutex); - mutex_enter(&root_chanp->chan_mutex); - root_chanp->chan_req_cnt--; -#ifdef DEBUG - SOL_OFS_DPRINTF_L5(sol_rdmacm_dbg_str, - "Dec req_cnt of %p IDP, idp %p, req_cnt %x", - root_idp, idp, root_chanp->chan_req_cnt); -#endif - mutex_exit(&root_chanp->chan_mutex); - } else if (event == RDMA_CM_EVENT_DISCONNECTED && root_idp) { + } + if (event == RDMA_CM_EVENT_DISCONNECTED && root_idp) { cma_chan_state_t chan_state; mutex_enter(&chanp->chan_mutex); - chanp->chan_qp_hdl = NULL; cma_handle_nomore_events(chanp); chan_state = cma_get_chan_state(chanp); chanp->chan_cmid_destroy_state &= @@ -2054,13 +2071,16 @@ cma_handle_nomore_events(sol_cma_chan_t *chanp) root_chanp->chan_req_total_cnt--; if (!root_chanp->chan_req_total_cnt) root_chanp->chan_req_state = REQ_CMID_NONE; - if (root_idp->ps == RDMA_PS_TCP && (chanp->chan_req_state == - REQ_CMID_ACCEPTED || chanp->chan_req_state == - REQ_CMID_DISCONNECTED)) + if (root_idp->ps == RDMA_PS_TCP && chanp->chan_req_state == + REQ_CMID_ACCEPTED) { avl_remove(&root_chanp->chan_acpt_avl_tree, idp); + chanp->chan_req_state = REQ_CMID_NONE; + } if (chanp->chan_req_state == REQ_CMID_CREATED || - chanp->chan_req_state == REQ_CMID_NOTIFIED) + chanp->chan_req_state == REQ_CMID_NOTIFIED) { avl_remove(&root_chanp->chan_req_avl_tree, idp); + chanp->chan_req_state = REQ_CMID_NONE; + } state = cma_get_chan_state(root_chanp); req_nodes = 
avl_numnodes(&root_chanp->chan_req_avl_tree); acpt_nodes = avl_numnodes(&root_chanp->chan_acpt_avl_tree); @@ -2069,3 +2089,104 @@ cma_handle_nomore_events(sol_cma_chan_t *chanp) acpt_nodes == 0UL) cma_destroy_id(root_idp); } + +extern int ib_modify_qp(struct ib_qp *, struct ib_qp_attr *, int); +extern int rdma_init_qp_attr(struct rdma_cm_id *, struct ib_qp_attr *, + int *); + +static int +cma_init_ud_qp(sol_cma_chan_t *chanp, struct ib_qp *qp) +{ + struct ib_qp_attr qp_attr; + int qp_attr_mask, ret; + + qp_attr.qp_state = IB_QPS_INIT; + ret = rdma_init_qp_attr(&chanp->chan_rdma_cm, &qp_attr, &qp_attr_mask); + if (ret) + return (ret); + + ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask); + if (ret) + return (ret); + + qp_attr.qp_state = IB_QPS_RTR; + ret = ib_modify_qp(qp, &qp_attr, IB_QP_STATE); + if (ret) + return (ret); + + qp_attr.qp_state = IB_QPS_RTS; + qp_attr.sq_psn = 0; + ret = ib_modify_qp(qp, &qp_attr, IB_QP_STATE | IB_QP_SQ_PSN); + + return (ret); +} + +static int +cma_init_conn_qp(sol_cma_chan_t *chanp, struct ib_qp *qp) +{ + struct ib_qp_attr qp_attr; + int qp_attr_mask, ret; + + qp_attr.qp_state = IB_QPS_INIT; + ret = rdma_init_qp_attr(&chanp->chan_rdma_cm, &qp_attr, &qp_attr_mask); + if (ret) + return (ret); + + return (ib_modify_qp(qp, &qp_attr, qp_attr_mask)); +} + +static inline int +cma_is_ud_ps(enum rdma_port_space ps) +{ + return (ps == RDMA_PS_UDP || ps == RDMA_PS_IPOIB); +} + +int +rdma_create_qp(struct rdma_cm_id *idp, struct ib_pd *pd, + struct ib_qp_init_attr *qp_init_attr) +{ + sol_cma_chan_t *chanp; + struct ib_qp *qp; + int ret; + ofs_client_t *dev_ofs_client; + + ASSERT(idp); + chanp = (sol_cma_chan_t *)idp; + if (idp->device->node_guid != pd->device->node_guid) + return (-EINVAL); + + dev_ofs_client = (ofs_client_t *)pd->device->clnt_hdl; + rdma_map_id2clnthdl(idp, dev_ofs_client->ibt_hdl, NULL); + + qp = ib_create_qp(pd, qp_init_attr); + if ((uintptr_t)qp >= (uintptr_t)-0xFFF) { + return ((intptr_t)qp); + } + rdma_map_id2qphdl(idp, (void *)qp->ibt_qp); + + if (cma_is_ud_ps(idp->ps)) { + ret = cma_init_ud_qp(chanp, qp); + } else { + ret = cma_init_conn_qp(chanp, qp); + } + + if (ret) { + goto err; + } + + idp->qp = qp; + chanp->chan_qp_num = qp->qp_num; + chanp->chan_is_srq = (qp->srq != NULL); + return (0); +err: + (void) ib_destroy_qp(qp); + return (ret); +} + +void +rdma_destroy_qp(struct rdma_cm_id *idp) +{ + ASSERT(idp); + (void) ib_destroy_qp(idp->qp); + idp->qp = NULL; +} diff --git a/usr/src/uts/common/io/ib/clients/of/sol_ofs/sol_ib_cma.c b/usr/src/uts/common/io/ib/clients/of/sol_ofs/sol_ib_cma.c index 2fdc1e266e..31749ebb96 100644 --- a/usr/src/uts/common/io/ib/clients/of/sol_ofs/sol_ib_cma.c +++ b/usr/src/uts/common/io/ib/clients/of/sol_ofs/sol_ib_cma.c @@ -748,9 +748,6 @@ rdma_ib_reject(struct rdma_cm_id *idp, const void *private_data, kmem_free(privp, SOL_REP_PRIV_DATA_SZ); return (EINVAL); } - mutex_enter(&chanp->chan_mutex); - chanp->chan_connect_flag = SOL_CMA_CONNECT_SERVER_DONE; - mutex_exit(&chanp->chan_mutex); } else { SOL_OFS_DPRINTF_L5(sol_rdmacm_dbg_str, "rdma_ib_reject :" "calling ibt_cm_ud_proceed"); @@ -763,9 +760,6 @@ rdma_ib_reject(struct rdma_cm_id *idp, const void *private_data, kmem_free(privp, SOL_REP_PRIV_DATA_SZ); return (EINVAL); } - mutex_enter(&chanp->chan_mutex); - chanp->chan_connect_flag = SOL_CMA_CONNECT_SERVER_DONE; - mutex_exit(&chanp->chan_mutex); } if (privp) @@ -804,8 +798,8 @@ rdma_ib_disconnect(struct rdma_cm_id *idp) mutex_enter(&root_chanp->chan_mutex); avl_remove(&root_chanp->chan_req_avl_tree, idp); 
mutex_exit(&root_chanp->chan_mutex); + chanp->chan_req_state = REQ_CMID_NONE; } - chanp->chan_connect_flag = SOL_CMA_CONNECT_NONE; } if (idp->ps == RDMA_PS_TCP && chanp->chan_connect_flag == SOL_CMA_CONNECT_SERVER_RCVD && chanp->chan_session_id) { @@ -828,8 +822,8 @@ rdma_ib_disconnect(struct rdma_cm_id *idp) mutex_enter(&root_chanp->chan_mutex); avl_remove(&root_chanp->chan_req_avl_tree, idp); mutex_exit(&root_chanp->chan_mutex); + chanp->chan_req_state = REQ_CMID_NONE; } - chanp->chan_connect_flag = SOL_CMA_CONNECT_NONE; } /* @@ -1223,32 +1217,74 @@ ibcma_query_local_ip(struct rdma_cm_id *idp, sol_cma_chan_t *chanp, return (0); } +extern void cma_resolve_addr_callback(sol_cma_chan_t *, int); + +static void +ibcma_path_hdlr(void *arg, ibt_status_t retval, ibt_path_info_t *pathp, + uint8_t num_paths, ibt_path_ip_src_t *src_ip_p) +{ + struct rdma_cm_id *idp = (struct rdma_cm_id *)arg; + sol_cma_chan_t *chanp = (sol_cma_chan_t *)arg; + ibcma_chan_t *ibchanp = &(chanp->chan_ib); + int i; + ibcma_dev_t *devp; + ib_lid_t base_lid; + + if (retval != IBT_SUCCESS && retval != IBT_INSUFF_DATA) { + cma_resolve_addr_callback(chanp, 1); + return; + } + + ibchanp->chan_path_size = 2 * sizeof (ibt_path_info_t); + ibchanp->chan_pathp = kmem_zalloc(ibchanp->chan_path_size, KM_SLEEP); + bcopy(pathp, ibchanp->chan_pathp, num_paths * + sizeof (ibt_path_info_t)); + ibchanp->chan_numpaths = num_paths; + + if (ibchanp->chan_devp == NULL && src_ip_p) { + ipaddr2sockaddr(&(src_ip_p[0].ip_primary), + &(idp->route.addr.src_addr), NULL); + bcopy(&(src_ip_p[0].ip_primary), &ibchanp->chan_local_addr, + sizeof (ibt_ip_addr_t)); + if (ibcma_init_devinfo((struct rdma_cm_id *)chanp, + ibchanp, pathp)) { + kmem_free(ibchanp->chan_pathp, + ibchanp->chan_path_size); + cma_resolve_addr_callback(chanp, 1); + return; + } + } + + if (ibchanp->chan_devp == NULL) { + cma_resolve_addr_callback(chanp, 1); + return; + } + + devp = ibchanp->chan_devp; + (idp->route).num_paths = ibchanp->chan_numpaths; + idp->route.path_rec = kmem_zalloc(sizeof (struct ib_sa_path_rec) * + ibchanp->chan_numpaths, KM_SLEEP); + base_lid = ibt_get_port_state_byguid(devp->dev_node_guid, + devp->dev_port_num, NULL, &base_lid); + for (i = 0; i < ibchanp->chan_numpaths; i++) + ibt_path2sa_path(&((ibchanp->chan_pathp)[i]), + &((idp->route.path_rec)[i]), base_lid); + + cma_resolve_addr_callback(chanp, 0); +} + static int ibcma_get_paths(struct rdma_cm_id *idp, sol_cma_chan_t *chanp, ibcma_chan_t *ibchanp) { ibt_ip_path_attr_t path_attr; ibt_status_t status; - ibt_path_ip_src_t *src_ip_p = NULL; - uint8_t max_paths; - ibcma_dev_t *devp; ibt_ip_addr_t *dst_addrp; - ib_lid_t base_lid; - int i; ASSERT(ibchanp); SOL_OFS_DPRINTF_L5(sol_rdmacm_dbg_str, "ibcma_get_paths(%p, %p)", idp, ibchanp); - max_paths = 2; - ibchanp->chan_path_size = max_paths * sizeof (ibt_path_info_t); - ibchanp->chan_pathp = kmem_zalloc(ibchanp->chan_path_size, KM_SLEEP); - - devp = ibchanp->chan_devp; - if (devp == NULL) { - src_ip_p = kmem_zalloc(sizeof (ibt_path_ip_src_t) * max_paths, - KM_SLEEP); - } bzero(&path_attr, sizeof (ibt_ip_path_attr_t)); dst_addrp = kmem_zalloc(sizeof (ibt_ip_addr_t), KM_SLEEP); bcopy(&ibchanp->chan_remote_addr, dst_addrp, sizeof (ibt_ip_addr_t)); @@ -1256,56 +1292,19 @@ ibcma_get_paths(struct rdma_cm_id *idp, sol_cma_chan_t *chanp, bcopy(&ibchanp->chan_local_addr, &path_attr.ipa_src_ip, sizeof (ibt_ip_addr_t)); path_attr.ipa_ndst = 1; - path_attr.ipa_max_paths = max_paths; + path_attr.ipa_max_paths = 2; if (ibcma_any_addr(&path_attr.ipa_src_ip)) 
path_attr.ipa_src_ip.family = AF_UNSPEC; - status = ibt_get_ip_paths(chanp->chan_ib_client_hdl, IBT_PATH_NO_FLAGS, - &path_attr, ibchanp->chan_pathp, &ibchanp->chan_numpaths, - src_ip_p); - if (status != IBT_SUCCESS && status != IBT_INSUFF_DATA) { + status = ibt_aget_ip_paths(chanp->chan_ib_client_hdl, IBT_PATH_NO_FLAGS, + &path_attr, ibcma_path_hdlr, idp); + if (status != IBT_SUCCESS) { SOL_OFS_DPRINTF_L2(sol_rdmacm_dbg_str, - "cma_get_paths : failed %d", status); + "cma_get_paths : ibt_aget_paths() failed %d", status); kmem_free(dst_addrp, sizeof (ibt_ip_addr_t)); - if (src_ip_p) - kmem_free(src_ip_p, - sizeof (ibt_path_ip_src_t) * max_paths); - kmem_free(ibchanp->chan_pathp, ibchanp->chan_path_size); - ibchanp->chan_pathp = NULL; return (EINVAL); } - if (src_ip_p) { - ipaddr2sockaddr(&(src_ip_p[0].ip_primary), - &(idp->route.addr.src_addr), NULL); - bcopy(&(src_ip_p[0].ip_primary), &ibchanp->chan_local_addr, - sizeof (ibt_ip_addr_t)); - if (ibcma_init_devinfo(idp, ibchanp, ibchanp->chan_pathp)) { - kmem_free(src_ip_p, sizeof (ibt_path_ip_src_t) * - max_paths); - kmem_free(dst_addrp, sizeof (ibt_ip_addr_t)); - kmem_free(ibchanp->chan_pathp, - ibchanp->chan_path_size); - return (EINVAL); - } - kmem_free(src_ip_p, sizeof (ibt_path_ip_src_t) * max_paths); - } - if (!ibchanp->chan_devp) { - SOL_OFS_DPRINTF_L2(sol_rdmacm_dbg_str, - "cma_get_paths : devp ERROR"); - kmem_free(dst_addrp, sizeof (ibt_ip_addr_t)); - return (EINVAL); - } - devp = ibchanp->chan_devp; - (idp->route).num_paths = ibchanp->chan_numpaths; - idp->route.path_rec = kmem_zalloc(sizeof (struct ib_sa_path_rec) * - ibchanp->chan_numpaths, KM_SLEEP); - base_lid = ibt_get_port_state_byguid(devp->dev_node_guid, - devp->dev_port_num, NULL, &base_lid); - for (i = 0; i < ibchanp->chan_numpaths; i++) - ibt_path2sa_path(&((ibchanp->chan_pathp)[i]), - &((idp->route.path_rec)[i]), base_lid); - kmem_free(dst_addrp, sizeof (ibt_ip_addr_t)); return (0); } @@ -1447,6 +1446,7 @@ ibcma_ud_hdlr(void *inp, ibt_cm_ud_event_t *eventp, ASSERT(chanp->chan_connect_flag == SOL_CMA_CONNECT_INITIATED); mutex_enter(&chanp->chan_mutex); chanp->chan_connect_flag = SOL_CMA_CONNECT_NONE; + chanp->chan_cmid_destroy_state |= SOL_CMA_CALLER_EVENT_PROGRESS; mutex_exit(&chanp->chan_mutex); sidr_rep = &((eventp->cm_event).sidr_rep); if (sidr_rep->srep_status == IBT_CM_SREP_CHAN_VALID) { @@ -1666,6 +1666,7 @@ ibcma_handle_req(struct rdma_cm_id *idp, struct rdma_cm_id **event_id_ptr, root_chanp->chan_req_total_cnt++; avl_insert(&root_chanp->chan_req_avl_tree, (void *)event_idp, where); mutex_exit(&root_chanp->chan_mutex); + event_chanp->chan_req_state = REQ_CMID_NOTIFIED; return (IBT_CM_DEFER); } @@ -1714,6 +1715,7 @@ ibcma_handle_est(struct rdma_cm_id *idp, struct rdma_cm_id **event_id_ptr, if (chanp->chan_listenp == NULL) { ASSERT(chanp->chan_connect_flag == SOL_CMA_CONNECT_INITIATED); chanp->chan_connect_flag = SOL_CMA_CONNECT_CLIENT_DONE; + *event_id_ptr = idp; bcopy(&chanp->chan_param, paramp, sizeof (struct rdma_conn_param)); if (paramp->private_data_len) { @@ -1726,6 +1728,9 @@ ibcma_handle_est(struct rdma_cm_id *idp, struct rdma_cm_id **event_id_ptr, paramp->private_data_len); } event_chanp = chanp; + mutex_enter(&chanp->chan_mutex); + chanp->chan_cmid_destroy_state |= SOL_CMA_CALLER_EVENT_PROGRESS; + mutex_exit(&chanp->chan_mutex); goto est_common; } @@ -1734,7 +1739,9 @@ ibcma_handle_est(struct rdma_cm_id *idp, struct rdma_cm_id **event_id_ptr, root_chanp = (sol_cma_chan_t *)root_idp; event_chanp = NULL; + mutex_enter(&root_chanp->chan_mutex); event_idp = 
cma_get_acpt_idp(root_idp, eventp->cm_channel); + mutex_exit(&root_chanp->chan_mutex); if (event_idp == NULL) { SOL_OFS_DPRINTF_L2(sol_rdmacm_dbg_str, "ibcma_handle_est: " "No matching CMID for qp_hdl %p in ACPT AVL of CMID %p", @@ -1743,7 +1750,11 @@ ibcma_handle_est(struct rdma_cm_id *idp, struct rdma_cm_id **event_id_ptr, } *event_id_ptr = event_idp; event_chanp = (sol_cma_chan_t *)event_idp; + mutex_enter(&event_chanp->chan_mutex); event_chanp->chan_connect_flag = SOL_CMA_CONNECT_SERVER_DONE; + event_chanp->chan_cmid_destroy_state |= + SOL_CMA_CALLER_EVENT_PROGRESS; + mutex_exit(&event_chanp->chan_mutex); est_common: #ifdef QP_DEBUG @@ -1766,34 +1777,44 @@ ibcma_handle_closed(struct rdma_cm_id *idp, struct rdma_cm_id **event_id_ptr, ibt_cm_event_t *eventp, enum rdma_cm_event_type *event, int *evt_status) { struct rdma_cm_id *root_idp, *event_idp; - sol_cma_chan_t *chanp, *event_chanp; + sol_cma_chan_t *chanp, *root_chanp, *event_chanp; *event = RDMA_CM_EVENT_DISCONNECTED; *evt_status = 0; chanp = (sol_cma_chan_t *)idp; mutex_enter(&chanp->chan_mutex); root_idp = CHAN_LISTEN_ROOT((chanp)); + root_chanp = (sol_cma_chan_t *)root_idp; chanp->chan_qp_hdl = NULL; if (!root_idp) { - chanp->chan_connect_flag = 0; + chanp->chan_cmid_destroy_state |= + SOL_CMA_CALLER_EVENT_PROGRESS; mutex_exit(&chanp->chan_mutex); + *event_id_ptr = idp; return (IBT_CM_DEFAULT); } mutex_exit(&chanp->chan_mutex); /* On the passive side, search ACPT AVL Tree */ + mutex_enter(&root_chanp->chan_mutex); event_idp = cma_get_acpt_idp(root_idp, eventp->cm_channel); + event_chanp = (sol_cma_chan_t *)event_idp; if (event_idp == NULL) { + mutex_exit(&root_chanp->chan_mutex); SOL_OFS_DPRINTF_L5(sol_rdmacm_dbg_str, "ibcma_handle_closed: " "No matching CMID for qp hdl %p in EST AVL of CMID %p", eventp->cm_channel, root_idp); return (IBT_CM_DEFAULT); } - event_chanp = (sol_cma_chan_t *)event_idp; + avl_remove(&root_chanp->chan_acpt_avl_tree, event_idp); + mutex_exit(&root_chanp->chan_mutex); mutex_enter(&event_chanp->chan_mutex); - event_chanp->chan_connect_flag = SOL_CMA_CONNECT_NONE; + event_chanp->chan_req_state = REQ_CMID_NONE; + event_chanp->chan_cmid_destroy_state |= + SOL_CMA_CALLER_EVENT_PROGRESS; mutex_exit(&event_chanp->chan_mutex); + *event_id_ptr = event_idp; return (IBT_CM_DEFAULT); } @@ -1843,9 +1864,14 @@ ibcma_handle_failed(struct rdma_cm_id *idp, struct rdma_cm_id **event_id_ptr, * event to accepted CMID. 
*/ if (root_idp) { + sol_cma_chan_t *root_chanp; ASSERT(eventp->cm_channel); + + root_chanp = (sol_cma_chan_t *)root_idp; + mutex_enter(&root_chanp->chan_mutex); event_idp = cma_get_acpt_idp(root_idp, eventp->cm_channel); + mutex_exit(&root_chanp->chan_mutex); if (event_idp == NULL) { SOL_OFS_DPRINTF_L2(sol_rdmacm_dbg_str, "ibcma_handle_failed: No matching CMID " @@ -1856,8 +1882,9 @@ ibcma_handle_failed(struct rdma_cm_id *idp, struct rdma_cm_id **event_id_ptr, event_chanp = (sol_cma_chan_t *)event_idp; mutex_enter(&event_chanp->chan_mutex); - event_chanp->chan_connect_flag = - SOL_CMA_CONNECT_NONE; + event_chanp->chan_req_state = REQ_CMID_NONE; + event_chanp->chan_cmid_destroy_state |= + SOL_CMA_CALLER_EVENT_PROGRESS; event_chanp->chan_qp_hdl = NULL; mutex_exit(&event_chanp->chan_mutex); *event_id_ptr = event_idp; @@ -1865,8 +1892,14 @@ ibcma_handle_failed(struct rdma_cm_id *idp, struct rdma_cm_id **event_id_ptr, avl_remove(&root_chanp->chan_acpt_avl_tree, event_idp); mutex_exit(&root_chanp->chan_mutex); - } else - chanp->chan_connect_flag = SOL_CMA_CONNECT_NONE; + } else { + mutex_enter(&chanp->chan_mutex); + chanp->chan_cmid_destroy_state |= + SOL_CMA_CALLER_EVENT_PROGRESS; + chanp->chan_qp_hdl = NULL; + mutex_exit(&chanp->chan_mutex); + *event_id_ptr = idp; + } *evt_status = failedp->cf_reason; *event = RDMA_CM_EVENT_REJECTED; break; @@ -1889,8 +1922,7 @@ ibcma_handle_failed(struct rdma_cm_id *idp, struct rdma_cm_id **event_id_ptr, event_chanp = (sol_cma_chan_t *)event_idp; mutex_enter(&event_chanp->chan_mutex); - event_chanp->chan_connect_flag = - SOL_CMA_CONNECT_NONE; + event_chanp->chan_req_state = REQ_CMID_NONE; event_chanp->chan_qp_hdl = NULL; mutex_exit(&event_chanp->chan_mutex); *event_id_ptr = event_idp; @@ -1909,12 +1941,15 @@ ibcma_handle_failed(struct rdma_cm_id *idp, struct rdma_cm_id **event_id_ptr, "session_id NULL"); } if (!root_idp) { - chanp->chan_connect_flag = SOL_CMA_CONNECT_NONE; + *event_id_ptr = idp; + mutex_enter(&chanp->chan_mutex); + chanp->chan_cmid_destroy_state |= + SOL_CMA_CALLER_EVENT_PROGRESS; + chanp->chan_qp_hdl = NULL; + mutex_exit(&chanp->chan_mutex); *evt_status = IBT_CM_TIMEOUT; *event = RDMA_CM_EVENT_REJECTED; } - chanp->chan_connect_flag = SOL_CMA_CONNECT_NONE; - chanp->chan_qp_hdl = NULL; break; case IBT_CM_FAILURE_STALE : diff --git a/usr/src/uts/common/io/ib/clients/of/sol_ofs/sol_kverbs.c b/usr/src/uts/common/io/ib/clients/of/sol_ofs/sol_kverbs.c new file mode 100644 index 0000000000..92a7fabc14 --- /dev/null +++ b/usr/src/uts/common/io/ib/clients/of/sol_ofs/sol_kverbs.c @@ -0,0 +1,2323 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 
+ */ + +/* Solaris Open Fabric kernel verbs */ + +#include <sys/types.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/modctl.h> +#include <sys/ib/clients/of/rdma/ib_verbs.h> +#include <sys/ib/clients/of/rdma/ib_addr.h> +#include <sys/ib/clients/of/rdma/rdma_cm.h> +#include <sys/ib/clients/of/sol_ofs/sol_kverb_impl.h> + +static void *statep; +char *sol_kverbs_dbg_str = "sol_kverbs"; + +static llist_head_t client_list = LLIST_HEAD_INIT(client_list); +kmutex_t clist_lock; /* mutex for client_list */ + +static void ofs_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t, + ibt_async_event_t *); + +/* + * set ibt_client_t members. clnt->ib_client must be set before + * this func is called. + */ +static int +alloc_ibt_client(ofs_client_t *clnt) +{ + int namelen; + ASSERT(clnt->ib_client != NULL); + + SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str, + "alloc_ibt_client: client: 0x%p", clnt); + + /* + * double-check the name string. if it's longer than MAXNAMELEN + * including the string terminator, assuming the name is invalid, + * return EINVAL. + */ + namelen = strlen(clnt->ib_client->name); + if (namelen >= MAXNAMELEN) { + SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str, + "alloc_ibt_client: client: 0x%p => " + "namelen(%d) is larger than MAXNAMELEN", clnt, namelen); + return (-EINVAL); + } + clnt->ibt_client.mi_clnt_name = kmem_zalloc(namelen + 1, KM_NOSLEEP); + if (clnt->ibt_client.mi_clnt_name == NULL) { + SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str, + "alloc_ibt_client: client: 0x%p => " + "no sufficient memory", clnt); + return (-ENOMEM); + } + bcopy(clnt->ib_client->name, clnt->ibt_client.mi_clnt_name, namelen); + clnt->ibt_client.mi_ibt_version = IBTI_V_CURR; + if (clnt->ib_client->dip) { + clnt->ibt_client.mi_clnt_class = IBT_GENERIC; + } else { + clnt->ibt_client.mi_clnt_class = IBT_GENERIC_MISC; + } + clnt->ibt_client.mi_async_handler = ofs_async_handler; + + return (0); +} + +static void +free_ibt_client(ofs_client_t *clnt) +{ + int namelen = strlen(clnt->ib_client->name); + ASSERT(namelen < MAXNAMELEN); + + SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str, + "free_ibt_client: client: 0x%p", clnt); + + kmem_free(clnt->ibt_client.mi_clnt_name, namelen + 1); + clnt->ibt_client.mi_clnt_name = NULL; +} + +/* + * get_device() returns a pointer to struct ib_devcie with + * the same guid as one passed to the function. + */ +static ib_device_t * +get_device(ofs_client_t *ofs_client, ib_guid_t guid) +{ + ib_device_t *device; + llist_head_t *entry; + + ASSERT(RW_LOCK_HELD(&ofs_client->lock)); + + SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str, + "get_device: client: 0x%p, guid:0x%p", + ofs_client, (void *)(uintptr_t)htonll(guid)); + + list_for_each(entry, &ofs_client->device_list) { + device = entry->ptr; + if (device->node_guid == htonll(guid)) { + ASSERT(device->reg_state == IB_DEV_CLOSE); + ASSERT(device->node_type == RDMA_NODE_IB_CA); + ASSERT(device->clnt_hdl == (ofs_client_p_t)ofs_client); + return (device); + } + } + + SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str, + "get_device: client: 0x%p, guid:0x%p => no match guid", + ofs_client, (void *)(uintptr_t)htonll(guid)); + + return (NULL); +} + +/* + * ofs_async_handler() is a delegated function to handle asynchrnonous events, + * which dispatches each event to corresponding qp/cq handlers registered + * with ib_create_qp() and/or ib_create_cq(). 
+ */ +static void +ofs_async_handler(void *clntp, ibt_hca_hdl_t hdl, ibt_async_code_t code, + ibt_async_event_t *event) +{ + ofs_client_t *ofs_client = (ofs_client_t *)clntp; + struct ib_event ib_event; + struct ib_qp *qpp; + struct ib_cq *cqp; + + + ASSERT(ofs_client != NULL); + + cqp = event->ev_cq_hdl ? ibt_get_cq_private(event->ev_cq_hdl) : NULL; + qpp = event->ev_chan_hdl ? + ibt_get_qp_private(event->ev_chan_hdl) : NULL; + + SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str, + "ofs_async_handler: client: 0x%p, hca_hdl: 0x%p, code:0x%x, " + "event->qp: 0x%p, event->cq: 0x%p, event->srq: 0x%p " + "event->guid: 0x%p, event->port: 0x%x", + clntp, hdl, code, qpp, cqp, event->ev_srq_hdl, + (void *)(uintptr_t)event->ev_hca_guid, event->ev_port); + + bzero(&ib_event, sizeof (struct ib_event)); + switch (code) { + case IBT_EVENT_PATH_MIGRATED: + FIRE_QP_EVENT(ofs_client, hdl, ib_event, qpp, + IB_EVENT_PATH_MIG); + return; + case IBT_EVENT_SQD: + FIRE_QP_EVENT(ofs_client, hdl, ib_event, qpp, + IB_EVENT_SQ_DRAINED); + return; + case IBT_EVENT_COM_EST: + FIRE_QP_EVENT(ofs_client, hdl, ib_event, qpp, + IB_EVENT_COMM_EST); + return; + case IBT_ERROR_CATASTROPHIC_CHAN: + FIRE_QP_EVENT(ofs_client, hdl, ib_event, qpp, + IB_EVENT_QP_FATAL); + return; + case IBT_ERROR_INVALID_REQUEST_CHAN: + FIRE_QP_EVENT(ofs_client, hdl, ib_event, qpp, + IB_EVENT_QP_REQ_ERR); + return; + case IBT_ERROR_ACCESS_VIOLATION_CHAN: + FIRE_QP_EVENT(ofs_client, hdl, ib_event, qpp, + IB_EVENT_QP_ACCESS_ERR); + return; + case IBT_ERROR_PATH_MIGRATE_REQ: + FIRE_QP_EVENT(ofs_client, hdl, ib_event, qpp, + IB_EVENT_PATH_MIG); + return; + case IBT_EVENT_EMPTY_CHAN: + FIRE_QP_EVENT(ofs_client, hdl, ib_event, qpp, + IB_EVENT_QP_LAST_WQE_REACHED); + return; + case IBT_ERROR_CQ: + FIRE_CQ_EVENT(ofs_client, hdl, ib_event, cqp, + IB_EVENT_CQ_ERR); + return; + case IBT_HCA_ATTACH_EVENT: + { + ib_device_t *device; + int rtn; + + /* re-use the device once it was created */ + rw_enter(&ofs_client->lock, RW_WRITER); + device = get_device(ofs_client, event->ev_hca_guid); + if (device == NULL) { + device = kmem_alloc(sizeof (ib_device_t), KM_SLEEP); + device->node_type = RDMA_NODE_IB_CA; + device->reg_state = IB_DEV_CLOSE; + device->clnt_hdl = (ofs_client_p_t)ofs_client; + device->node_guid = htonll(event->ev_hca_guid); + device->data = NULL; + /* add this HCA */ + ofs_client->hca_num++; + llist_head_init(&device->list, device); + llist_add_tail(&device->list, &ofs_client->device_list); + } + device->hca_hdl = NULL; + device->local_dma_lkey = 0; + device->phys_port_cnt = 0; + + /* open this HCA */ + rtn = ibt_open_hca(ofs_client->ibt_hdl, event->ev_hca_guid, + &device->hca_hdl); + if (rtn == IBT_SUCCESS) { + ibt_hca_attr_t hattr; + + ofs_client->hca_open_num++; + device->reg_state = IB_DEV_OPEN; + ibt_set_hca_private(device->hca_hdl, device); + + rtn = ibt_query_hca(device->hca_hdl, &hattr); + if (rtn != IBT_SUCCESS) { + device->reg_state = IB_DEV_CLOSE; + rtn = ibt_close_hca(device->hca_hdl); + ASSERT(rtn == IBT_SUCCESS); + ofs_client->hca_open_num--; + return; + } + + (void) sprintf(device->name, "%x:%x:%x", + hattr.hca_vendor_id, hattr.hca_device_id, + hattr.hca_version_id); + device->local_dma_lkey = hattr.hca_reserved_lkey; + device->phys_port_cnt = hattr.hca_nports; + ibt_set_hca_private(device->hca_hdl, device); + + /* invoke client's callback */ + if (ofs_client->ib_client->add) { + ofs_client->ib_client->add(device); + } + } + rw_exit(&ofs_client->lock); + + return; + } + case IBT_HCA_DETACH_EVENT: + { + struct ib_device *device; + + 
+		rw_enter(&ofs_client->lock, RW_WRITER);
+		device = ibt_get_hca_private(hdl);
+		if (device->reg_state == IB_DEV_OPEN) {
+			ibt_status_t rtn;
+			/* invoke client's callback */
+			if (ofs_client->ib_client->remove) {
+				ofs_client->ib_client->remove(device);
+			}
+			/* change the state only */
+			device->reg_state = IB_DEV_CLOSE;
+			/* close this HCA */
+			rtn = ibt_close_hca(device->hca_hdl);
+			ASSERT(rtn == IBT_SUCCESS);
+			ofs_client->hca_open_num--;
+		}
+		rw_exit(&ofs_client->lock);
+
+		return;
+	}
+	case IBT_EVENT_LIMIT_REACHED_SRQ:
+	case IBT_ERROR_CATASTROPHIC_SRQ:
+	default:
+		SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str,
+		    "sol_ofs does not support this event(0x%x).\n"
+		    "\t clntp=0x%p, hca_hdl=0x%p, code=%d, eventp=0x%p\n",
+		    code, clntp, hdl, code, event);
+		return;
+	}
+}
+
+/*
+ * ib_register_client - Register an IB client
+ * @client:Client to register
+ *
+ * Upper level users of the IB drivers can use ib_register_client() to
+ * register callbacks for IB device addition and removal. When an IB
+ * device is added, each registered client's add method will be called
+ * (in the order the clients were registered), and when a device is
+ * removed, each client's remove method will be called (in the reverse
+ * order that clients were registered). In addition, when
+ * ib_register_client() is called, the client will receive an add
+ * callback for all devices already registered.
+ *
+ * Note that struct ib_client should have a dip pointer to the client,
+ * which is different from the Linux implementation.
+ */
+int
+ib_register_client(struct ib_client *client)
+{
+	uint_t i, nhcas;	/* number of HCAs */
+	ib_guid_t *guidp;
+	ofs_client_t *ofs_client;
+	llist_head_t *entry, *tmp;
+	ib_device_t *device;
+	int rtn;
+
+	SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str,
+	    "ib_register_client: client: 0x%p", client);
+
+	/* get the number of HCAs on this system */
+	if ((nhcas = ibt_get_hca_list(&guidp)) == 0) {
+		SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+		    "ib_register_client: client: 0x%p => no HCA", client);
+		return (-ENXIO);
+	}
+
+	/* allocate a new sol_ofs_client structure */
+	ofs_client = kmem_zalloc(sizeof (ofs_client_t), KM_NOSLEEP);
+	if (ofs_client == NULL) {
+		(void) ibt_free_hca_list(guidp, nhcas);
+		SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+		    "ib_register_client: client: 0x%p => "
+		    "no sufficient memory for ofs_client", client);
+		return (-ENOMEM);
+	}
+
+	/* set members */
+	ofs_client->ib_client = client;
+	if ((rtn = alloc_ibt_client(ofs_client)) != 0) {
+		kmem_free(ofs_client, sizeof (ofs_client_t));
+		(void) ibt_free_hca_list(guidp, nhcas);
+		SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+		    "ib_register_client: client: 0x%p => "
+		    "alloc_ibt_client failed w/ 0x%x", client, rtn);
+		return (rtn);
+	}
+	ofs_client->state = IB_OFS_CLNT_INITIALIZED;
+	llist_head_init(&ofs_client->device_list, NULL);
+	llist_head_init(&ofs_client->client_list, ofs_client);
+	rw_init(&ofs_client->lock, NULL, RW_DEFAULT, NULL);
+
+	/* initialize IB client */
+	rw_enter(&ofs_client->lock, RW_WRITER);
+	if (client->state != IB_CLNT_UNINITIALIZED) {
+		rw_exit(&ofs_client->lock);
+		/* free the client name allocated in alloc_ibt_client() */
+		free_ibt_client(ofs_client);
+		kmem_free(ofs_client, sizeof (ofs_client_t));
+		(void) ibt_free_hca_list(guidp, nhcas);
+		SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+		    "ib_register_client: client: 0x%p => "
+		    "invalid client state(%d)", client, client->state);
+		return (-EPERM);
+	}
+
+	/* attach this client to IBTF */
+	rtn = ibt_attach(&ofs_client->ibt_client, client->dip, ofs_client,
+	    &ofs_client->ibt_hdl);
+	if (rtn != IBT_SUCCESS) {
+		rw_exit(&ofs_client->lock);
+
free_ibt_client(ofs_client); + kmem_free(ofs_client, sizeof (ofs_client_t)); + (void) ibt_free_hca_list(guidp, nhcas); + SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str, + "ib_register_client: client: 0x%p => " + "ibt_attach failed w/ 0x%x", client, rtn); + return (-EINVAL); + } + client->clnt_hdl = (ofs_client_p_t)ofs_client; + client->state = IB_CLNT_INITIALIZED; + + /* link this client */ + mutex_enter(&clist_lock); + llist_add_tail(&ofs_client->client_list, &client_list); + mutex_exit(&clist_lock); + + /* Open HCAs */ + ofs_client->hca_num = nhcas; + for (i = 0; i < ofs_client->hca_num; i++) { + /* allocate the ib_device structure */ + device = kmem_zalloc(sizeof (ib_device_t), KM_NOSLEEP); + if (device == NULL) { + rtn = -ENOMEM; + SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str, + "ib_register_client: client: 0x%p => " + "no sufficient memory for ib_device", client); + goto err; + } + device->node_guid = htonll(guidp[i]); + device->node_type = RDMA_NODE_IB_CA; + device->reg_state = IB_DEV_CLOSE; + device->clnt_hdl = (ofs_client_p_t)ofs_client; + llist_head_init(&device->list, device); + llist_add_tail(&device->list, &ofs_client->device_list); + + rtn = ibt_open_hca(ofs_client->ibt_hdl, guidp[i], + &device->hca_hdl); + if (rtn == IBT_SUCCESS) { + ibt_hca_attr_t hattr; + + ofs_client->hca_open_num++; + device->reg_state = IB_DEV_OPEN; + + rtn = ibt_query_hca(device->hca_hdl, &hattr); + if (rtn != IBT_SUCCESS) { + rtn = -EIO; + SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str, + "ib_register_client: client: 0x%p," + "hca_hdl: 0x%p ==> " + "ibt_query_hca() failed w/ %d", + client, device->hca_hdl, rtn); + goto err; + } + + (void) sprintf(device->name, "%x:%x:%x", + hattr.hca_vendor_id, hattr.hca_device_id, + hattr.hca_version_id); + device->local_dma_lkey = hattr.hca_reserved_lkey; + device->phys_port_cnt = hattr.hca_nports; + ibt_set_hca_private(device->hca_hdl, device); + + /* invoke client's callback */ + if (client->add) { + client->add(device); + } + } + } + if (ofs_client->hca_open_num == 0) { + rtn = -ENXIO; + SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str, + "ib_register_client: client: 0x%p => " + "no available HCA", client); + goto err; + } + rw_exit(&ofs_client->lock); + + (void) ibt_free_hca_list(guidp, nhcas); + return (0); + +err: + /* first close all open HCAs */ + list_for_each(entry, &ofs_client->device_list) { + device = entry->ptr; + /* + * If it's open already, close it after the remove + * callback. 
+		 */
+		if (device->reg_state == IB_DEV_OPEN) {
+			ibt_status_t rtn;
+			/* invoke client's callback */
+			if (client->remove) {
+				client->remove(device);
+			}
+			device->reg_state = IB_DEV_CLOSE;
+			rtn = ibt_close_hca(device->hca_hdl);
+			ASSERT(rtn == IBT_SUCCESS);
+			ofs_client->hca_open_num--;
+		}
+	}
+	ASSERT(ofs_client->hca_open_num == 0);
+
+	/* then free the devices */
+	list_for_each_safe(entry, tmp, &ofs_client->device_list) {
+		device = entry->ptr;
+		/* de-link and free the device */
+		llist_del(entry);
+		kmem_free(device, sizeof (ib_device_t));
+		ofs_client->hca_num--;
+	}
+	ASSERT(ofs_client->hca_num == 0);
+
+	/* delink this client */
+	mutex_enter(&clist_lock);
+	llist_del(&ofs_client->client_list);
+	mutex_exit(&clist_lock);
+
+	/* detach the client */
+	client->clnt_hdl = NULL;
+	client->state = IB_CLNT_UNINITIALIZED;
+	(void) ibt_detach(ofs_client->ibt_hdl);
+	rw_exit(&ofs_client->lock);
+
+	/* free sol_ofs_client */
+	free_ibt_client(ofs_client);
+	kmem_free(ofs_client, sizeof (ofs_client_t));
+
+	(void) ibt_free_hca_list(guidp, nhcas);
+	return (rtn);
+}
+
+/*
+ * ib_unregister_client - Unregister an IB client
+ * @client:Client to unregister
+ *
+ * Upper level users use ib_unregister_client() to remove their client
+ * registration. When ib_unregister_client() is called, the client
+ * will receive a remove callback for each IB device still registered.
+ */
+void
+ib_unregister_client(struct ib_client *client)
+{
+	ofs_client_t *ofs_client;
+	ib_device_t *device;
+	llist_head_t *entry, *tmp;
+
+	ASSERT(client->state == IB_CLNT_INITIALIZED &&
+	    client->clnt_hdl != NULL);
+
+	SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str,
+	    "ib_unregister_client: client: 0x%p", client);
+
+	ofs_client = (ofs_client_t *)client->clnt_hdl;
+	rw_enter(&ofs_client->lock, RW_WRITER);
+
+	/* first close all open HCAs */
+	list_for_each(entry, &ofs_client->device_list) {
+		device = entry->ptr;
+		/*
+		 * If it's open already, close it after the remove
+		 * callback.
+		 */
+		if (device->reg_state == IB_DEV_OPEN) {
+			ibt_status_t rtn;
+			/* invoke client's callback */
+			if (client->remove) {
+				client->remove(device);
+			}
+			device->reg_state = IB_DEV_CLOSE;
+			rtn = ibt_close_hca(device->hca_hdl);
+			if (rtn != IBT_SUCCESS)
+				SOL_OFS_DPRINTF_L3(
+				    sol_kverbs_dbg_str,
+				    "ib_unregister_client(%p) - "
+				    "ibt_close_hca failed %d",
+				    client, rtn);
+
+			ofs_client->hca_open_num--;
+		}
+	}
+	ASSERT(ofs_client->hca_open_num == 0);
+
+	/* then free the devices */
+	list_for_each_safe(entry, tmp, &ofs_client->device_list) {
+		device = entry->ptr;
+		/* de-link and free the device */
+		llist_del(entry);
+		kmem_free(device, sizeof (ib_device_t));
+		ofs_client->hca_num--;
+	}
+	ASSERT(ofs_client->hca_num == 0);
+
+	/* delink this client */
+	mutex_enter(&clist_lock);
+	llist_del(&ofs_client->client_list);
+	mutex_exit(&clist_lock);
+
+	/* detach the client */
+	client->clnt_hdl = NULL;
+	client->state = IB_CLNT_UNINITIALIZED;
+	(void) ibt_detach(ofs_client->ibt_hdl);
+	rw_exit(&ofs_client->lock);
+
+	/* free sol_ofs_client */
+	free_ibt_client(ofs_client);
+	kmem_free(ofs_client, sizeof (ofs_client_t));
+}
+
+/*
+ * ofs_lock_enter() and ofs_lock_exit() are used to avoid the recursive
+ * rwlock while the client callbacks are invoked.
+ *
+ * Note that the writer lock is used only in the client callback case,
+ * so that the kverb functions wanting to acquire the reader lock can
+ * safely ignore the reader lock if the writer lock is already held.
+ * The writer lock shouldn't be used in any other place.
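+ *
+ * To illustrate the resulting pattern (an editor's sketch, not part of
+ * the original comment), every kverb entry point brackets its work with
+ *
+ *	ofs_lock_enter(&ofs_client->lock);
+ *	... check device->reg_state, call ibt_*() ...
+ *	ofs_lock_exit(&ofs_client->lock);
+ *
+ * which takes the reader lock normally, but is a no-op when the thread
+ * already holds the writer lock inside an add/remove callback.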
+ */ +static inline void +ofs_lock_enter(krwlock_t *lock) +{ + if (!RW_WRITE_HELD(lock)) { + rw_enter(lock, RW_READER); + } +} + +static inline void +ofs_lock_exit(krwlock_t *lock) +{ + if (!RW_WRITE_HELD(lock)) { + rw_exit(lock); + } +} + +/* + * ib_get_client_data - Get IB client context + * @device:Device to get context for + * @client:Client to get context for + * + * ib_get_client_data() returns client context set with + * ib_set_client_data() and returns NULL if it's not found. + */ +void *ib_get_client_data(struct ib_device *device, + struct ib_client *client) +{ + ofs_client_t *ofs_client; + struct ib_device *ib_device; + boolean_t found = B_FALSE; + llist_head_t *entry; + void *data; + + ASSERT(device != 0 && client != 0); + + ofs_client = (ofs_client_t *)client->clnt_hdl; + if (ofs_client == 0) { + SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str, + "ib_get_client_data: device: 0x%p, client: 0x%p => " + "no ofs_client", device, client); + return (NULL); + } + + ofs_lock_enter(&ofs_client->lock); + list_for_each(entry, &ofs_client->device_list) { + ib_device = entry->ptr; + if (ib_device->node_guid == device->node_guid) { + found = B_TRUE; + break; + } + } + if (!found) { + ofs_lock_exit(&ofs_client->lock); + SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str, + "ib_get_client_data: device: 0x%p, client: 0x%p => " + "no ib_device found", device, client); + return (NULL); + } + data = ib_device->data; + ofs_lock_exit(&ofs_client->lock); + + SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str, + "ib_get_client_data: device: 0x%p, client: 0x%p", + device, client); + + return (data); +} + +/* + * ib_set_client_data - Set IB client context + * @device:Device to set context for + * @client:Client to set context for + * @data:Context to set + * + * ib_set_client_data() sets client context that can be retrieved with + * ib_get_client_data(). If the specified device is not found, the function + * returns w/o any operations. + */ +void ib_set_client_data(struct ib_device *device, struct ib_client *client, + void *data) +{ + ofs_client_t *ofs_client; + struct ib_device *ib_device; + boolean_t found = B_FALSE; + llist_head_t *entry; + + ASSERT(device != 0 && client != 0); + + ofs_client = (ofs_client_t *)client->clnt_hdl; + if (ofs_client == 0) { + cmn_err(CE_WARN, "No client context found for %s/%s\n", + device->name, client->name); + return; + } + + ofs_lock_enter(&ofs_client->lock); + list_for_each(entry, &ofs_client->device_list) { + ib_device = entry->ptr; + if (ib_device->node_guid == device->node_guid) { + found = B_TRUE; + break; + } + } + if (!found) { + cmn_err(CE_WARN, "No client context found for %s/%s\n", + device->name, client->name); + ofs_lock_exit(&ofs_client->lock); + return; + } + ib_device->data = data; + ofs_lock_exit(&ofs_client->lock); + + SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str, + "ib_set_client_data: device: 0x%p, client: 0x%p, " + "data: 0x%p", device, client, data); +} + +/* + * ib_query_device - Query IB device attributes + * @device:Device to query + * @device_attr:Device attributes + * + * ib_query_device() returns the attributes of a device through the + * @device_attr pointer. 
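+ *
+ * A minimal usage sketch (added for illustration; "dev" is assumed to
+ * be an open struct ib_device delivered via the client's add callback):
+ *
+ *	struct ib_device_attr dattr;
+ *
+ *	if (ib_query_device(dev, &dattr) == 0) {
+ *		cmn_err(CE_CONT, "max_qp=%d max_cqe=%d\n",
+ *		    dattr.max_qp, dattr.max_cqe);
+ *	}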
+ */
+int
+ib_query_device(struct ib_device *device, struct ib_device_attr *attr)
+{
+	ofs_client_t *ofs_client = (ofs_client_t *)device->clnt_hdl;
+	ibt_hca_attr_t hattr;
+	int rtn;
+
+	ofs_lock_enter(&ofs_client->lock);
+	if (device->reg_state != IB_DEV_OPEN) {
+		ofs_lock_exit(&ofs_client->lock);
+		SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+		    "ib_query_device: device: 0x%p => "
+		    "invalid device state (%d)", device, device->reg_state);
+		return (-ENXIO);
+	}
+	if ((rtn = ibt_query_hca(device->hca_hdl, &hattr)) != IBT_SUCCESS) {
+		ofs_lock_exit(&ofs_client->lock);
+		SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+		    "ib_query_device: device: 0x%p => "
+		    "ibt_query_hca failed w/ 0x%x", device, rtn);
+		return (-EIO);
+	}
+	ofs_lock_exit(&ofs_client->lock);
+
+	SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str,
+	    "ib_query_device: device: 0x%p, attr: 0x%p, rtn: 0x%x",
+	    device, attr, rtn);
+
+	/* OF order is major.micro.minor, so keep it here */
+	attr->fw_ver = (uint64_t)hattr.hca_fw_major_version << 32 |
+	    hattr.hca_fw_micro_version << 16 & 0xFFFF0000 |
+	    hattr.hca_fw_minor_version & 0xFFFF;
+
+	attr->device_cap_flags = IB_DEVICE_CHANGE_PHY_PORT |
+	    IB_DEVICE_PORT_ACTIVE_EVENT |
+	    IB_DEVICE_SYS_IMAGE_GUID |
+	    IB_DEVICE_RC_RNR_NAK_GEN;
+	if (hattr.hca_flags & IBT_HCA_PKEY_CNTR) {
+		attr->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR;
+	}
+	if (hattr.hca_flags & IBT_HCA_QKEY_CNTR) {
+		attr->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR;
+	}
+	if (hattr.hca_flags & IBT_HCA_AUTO_PATH_MIG) {
+		attr->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG;
+	}
+	if (hattr.hca_flags & IBT_HCA_AH_PORT_CHECK) {
+		attr->device_cap_flags |= IB_DEVICE_UD_AV_PORT_ENFORCE;
+	}
+
+	attr->vendor_id = hattr.hca_vendor_id;
+	attr->vendor_part_id = hattr.hca_device_id;
+	attr->hw_ver = hattr.hca_version_id;
+	attr->sys_image_guid = htonll(hattr.hca_si_guid);
+	attr->max_mr_size = ~0ull;
+	attr->page_size_cap = IBTF2OF_PGSZ(hattr.hca_page_sz);
+	attr->max_qp = hattr.hca_max_qp;
+	attr->max_qp_wr = hattr.hca_max_qp_sz;
+	attr->max_sge = hattr.hca_max_sgl;
+	attr->max_sge_rd = hattr.hca_max_rd_sgl;
+	attr->max_cq = hattr.hca_max_cq;
+	attr->max_cqe = hattr.hca_max_cq_sz;
+	attr->max_mr = hattr.hca_max_memr;
+	attr->max_pd = hattr.hca_max_pd;
+	attr->max_qp_rd_atom = hattr.hca_max_rdma_in_qp;
+	attr->max_qp_init_rd_atom = hattr.hca_max_rdma_in_qp;
+	attr->max_ee_rd_atom = hattr.hca_max_rdma_in_ee;
+	attr->max_ee_init_rd_atom = hattr.hca_max_rdma_in_ee;
+	attr->max_res_rd_atom = hattr.hca_max_rsc;
+	attr->max_srq = hattr.hca_max_srqs;
+	attr->max_srq_wr = hattr.hca_max_srqs_sz - 1;
+	attr->max_srq_sge = hattr.hca_max_srq_sgl;
+	attr->local_ca_ack_delay = hattr.hca_local_ack_delay;
+	attr->atomic_cap = hattr.hca_flags & IBT_HCA_ATOMICS_GLOBAL ?
+	    IB_ATOMIC_GLOB : (hattr.hca_flags & IBT_HCA_ATOMICS_HCA ?
+	    IB_ATOMIC_HCA : IB_ATOMIC_NONE);
+	attr->max_ee = hattr.hca_max_eec;
+	attr->max_rdd = hattr.hca_max_rdd;
+	attr->max_mw = hattr.hca_max_mem_win;
+	attr->max_pkeys = hattr.hca_max_port_pkey_tbl_sz;
+	attr->max_raw_ipv6_qp = hattr.hca_max_ipv6_qp;
+	attr->max_raw_ethy_qp = hattr.hca_max_ether_qp;
+	attr->max_mcast_grp = hattr.hca_max_mcg;
+	attr->max_mcast_qp_attach = hattr.hca_max_qp_per_mcg;
+	attr->max_total_mcast_qp_attach = hattr.hca_max_mcg_qps;
+	attr->max_ah = hattr.hca_max_ah;
+	attr->max_fmr = hattr.hca_max_fmrs;
+	attr->max_map_per_fmr = hattr.hca_opaque9; /* hca_max_map_per_fmr */
+
+	return (0);
+}
+
+/* Protection domains */
+struct ib_pd *
+ib_alloc_pd(struct ib_device *device)
+{
+	ofs_client_t *ofs_client = (ofs_client_t *)device->clnt_hdl;
+	struct ib_pd *pd;
+	int rtn;
+
+	if ((pd = kmem_alloc(sizeof (struct ib_pd), KM_NOSLEEP)) == NULL) {
+		SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+		    "ib_alloc_pd: device: 0x%p => no sufficient memory",
+		    device);
+		return ((struct ib_pd *)-ENOMEM);
+	}
+
+	ofs_lock_enter(&ofs_client->lock);
+	if (device->reg_state != IB_DEV_OPEN) {
+		ofs_lock_exit(&ofs_client->lock);
+		/* don't leak pd on this error path */
+		kmem_free(pd, sizeof (struct ib_pd));
+		SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+		    "ib_alloc_pd: device: 0x%p => invalid device state (%d)",
+		    device, device->reg_state);
+		return ((struct ib_pd *)-ENXIO);
+	}
+
+	SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str,
+	    "ib_alloc_pd: device: 0x%p", device);
+
+	rtn = ibt_alloc_pd(device->hca_hdl, IBT_PD_NO_FLAGS, &pd->ibt_pd);
+	ofs_lock_exit(&ofs_client->lock);
+
+	if (rtn == IBT_SUCCESS) {
+		pd->device = device;
+		SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str,
+		    "ib_alloc_pd: device: 0x%p, pd: 0x%p, ibt_pd: 0x%p, "
+		    "rtn: 0x%x", device, pd, pd->ibt_pd, rtn);
+		return (pd);
+	}
+
+	SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+	    "ib_alloc_pd: device: 0x%p, pd: 0x%p, ibt_pd: 0x%p => "
+	    "ibt_alloc_pd failed w/ 0x%x", device, pd, pd->ibt_pd, rtn);
+	/* log first, then free; don't dereference pd after the free */
+	kmem_free(pd, sizeof (struct ib_pd));
+
+	switch (rtn) {
+	case IBT_INSUFF_RESOURCE:
+		return ((struct ib_pd *)-ENOMEM);
+	case IBT_HCA_HDL_INVALID:
+		return ((struct ib_pd *)-EFAULT);
+	default:
+		return ((struct ib_pd *)-EIO);
+	}
+}
+
+int
+ib_dealloc_pd(struct ib_pd *pd)
+{
+	ofs_client_t *ofs_client = (ofs_client_t *)pd->device->clnt_hdl;
+	int rtn;
+
+	ofs_lock_enter(&ofs_client->lock);
+	if (pd->device->reg_state != IB_DEV_OPEN) {
+		ofs_lock_exit(&ofs_client->lock);
+		SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+		    "ib_dealloc_pd: pd: 0x%p => invalid device state (%d)",
+		    pd, pd->device->reg_state);
+		return (-ENXIO);
+	}
+
+	SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str,
+	    "ib_dealloc_pd: pd: 0x%p", pd);
+
+	rtn = ibt_free_pd(pd->device->hca_hdl, pd->ibt_pd);
+	ofs_lock_exit(&ofs_client->lock);
+
+	if (rtn == IBT_SUCCESS) {
+		SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str,
+		    "ib_dealloc_pd: pd: 0x%p, device: 0x%p, ibt_pd: 0x%p, "
+		    "rtn: 0x%x", pd, pd->device, pd->ibt_pd, rtn);
+		kmem_free(pd, sizeof (struct ib_pd));
+		return (0);
+	}
+
+	SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+	    "ib_dealloc_pd: pd: 0x%p => ibt_free_pd failed w/ 0x%x",
+	    pd, rtn);
+
+	switch (rtn) {
+	case IBT_PD_IN_USE:
+		return (-EBUSY);
+	case IBT_HCA_HDL_INVALID:
+		return (-EFAULT);
+	default:
+		return (-EIO);
+	}
+}
+
+/*
+ * ofs_cq_handler() is a delegated function to handle CQ events,
+ * which dispatches them to the corresponding cq handlers registered
+ * with ib_create_cq().
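+ *
+ * For example (an editor's illustration, not part of the original
+ * comment; my_comp_handler and my_ctx are hypothetical names), a CQ
+ * created as
+ *
+ *	cq = ib_create_cq(dev, my_comp_handler, NULL, my_ctx, 256, 0);
+ *
+ * has my_comp_handler(cq, my_ctx) invoked from here whenever the
+ * underlying IBTF CQ fires.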
+ */
+static void
+ofs_cq_handler(ibt_cq_hdl_t ibt_cq, void *arg)
+{
+	struct ib_cq *cq = (struct ib_cq *)ibt_get_cq_private(ibt_cq);
+
+	SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str,
+	    "ofs_cq_handler: ibt_cq: 0x%p, ib_cq: 0x%p, comp_handler: 0x%p, "
+	    "arg: 0x%p", ibt_cq, cq, cq->comp_handler, arg);
+
+	if (cq->comp_handler) {
+		cq->comp_handler(cq, cq->cq_context);
+	}
+}
+
+/*
+ * ib_create_cq - Creates a CQ on the specified device.
+ * @device: The device on which to create the CQ.
+ * @comp_handler: A user-specified callback that is invoked when a
+ * completion event occurs on the CQ.
+ * @event_handler: A user-specified callback that is invoked when an
+ * asynchronous event not associated with a completion occurs on the CQ.
+ * @cq_context: Context associated with the CQ returned to the user via
+ * the associated completion and event handlers.
+ * @cqe: The minimum size of the CQ.
+ * @comp_vector - Completion vector used to signal completion events.
+ * Must be >= 0 and < context->num_comp_vectors.
+ *
+ * Users can examine the cq structure to determine the actual CQ size.
+ *
+ * Note that comp_vector is not supported currently.
+ */
+struct ib_cq *
+ib_create_cq(struct ib_device *device, ib_comp_handler comp_handler,
+    void (*event_handler)(struct ib_event *, void *), void *cq_context,
+    int cqe, int comp_vector)
+{
+	ofs_client_t *ofs_client = (ofs_client_t *)device->clnt_hdl;
+	ibt_cq_attr_t cq_attr;
+	uint32_t real_size;
+	struct ib_cq *cq;
+	int rtn;
+
+	if ((cq = kmem_alloc(sizeof (struct ib_cq), KM_NOSLEEP)) == NULL) {
+		SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+		    "ib_create_cq: device: 0x%p, comp_handler: 0x%p, "
+		    "event_handler: 0x%p, cq_context: 0x%p, cqe: 0x%x, "
+		    "comp_vector: %d => no sufficient memory", device,
+		    comp_handler, event_handler, cq_context, cqe, comp_vector);
+		return ((struct ib_cq *)-ENOMEM);
+	}
+
+	ofs_lock_enter(&ofs_client->lock);
+	if (device->reg_state != IB_DEV_OPEN) {
+		ofs_lock_exit(&ofs_client->lock);
+		/* don't leak cq on this error path */
+		kmem_free(cq, sizeof (struct ib_cq));
+		SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+		    "ib_create_cq: device: 0x%p, comp_handler: 0x%p, "
+		    "event_handler: 0x%p, cq_context: 0x%p, cqe: 0x%x, "
+		    "comp_vector: %d => invalid device state (%d)", device,
+		    comp_handler, event_handler, cq_context, cqe, comp_vector,
+		    device->reg_state);
+		return ((struct ib_cq *)-ENXIO);
+	}
+
+	SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str,
+	    "ib_create_cq: device: 0x%p, comp_handler: 0x%p, "
+	    "event_handler: 0x%p, cq_context: 0x%p, cqe: 0x%x, "
+	    "comp_vector: %d", device, comp_handler, event_handler,
+	    cq_context, cqe, comp_vector);
+
+	cq_attr.cq_size = cqe;
+	cq_attr.cq_sched = 0;	/* no hint */
+	cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
+	rtn = ibt_alloc_cq(device->hca_hdl, &cq_attr, &cq->ibt_cq, &real_size);
+	ofs_lock_exit(&ofs_client->lock);
+
+	if (rtn == IBT_SUCCESS) {
+		cq->device = device;
+		cq->comp_handler = comp_handler;
+		cq->event_handler = event_handler;
+		cq->cq_context = cq_context;
+		cq->cqe = real_size;
+		ibt_set_cq_private(cq->ibt_cq, cq);
+		ibt_set_cq_handler(cq->ibt_cq, ofs_cq_handler, cq_context);
+		mutex_init(&cq->lock, NULL, MUTEX_DEFAULT, NULL);
+		SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str,
+		    "ib_create_cq: device: 0x%p, cqe: 0x%x, ibt_cq: 0x%p, "
+		    "rtn: 0x%x", device, cqe, cq->ibt_cq, rtn);
+		return (cq);
+	}
+
+	SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+	    "ib_create_cq: device: 0x%p, cqe: 0x%x, ibt_cq: 0x%p => "
+	    "ibt_alloc_cq failed w/ 0x%x", device, cqe, cq->ibt_cq, rtn);
+	/* log first, then free; don't dereference cq after the free */
+	kmem_free(cq, sizeof (struct ib_cq));
+
+	switch (rtn) {
+	case IBT_HCA_CQ_EXCEEDED:
+	case IBT_INVALID_PARAM:
+	case IBT_HCA_HDL_INVALID:
+		return ((struct ib_cq *)-EINVAL);
+	case IBT_INSUFF_RESOURCE:
+		return ((struct ib_cq *)-ENOMEM);
+	default:
+		return ((struct ib_cq *)-EIO);
+	}
+}
+
+int
+ib_destroy_cq(struct ib_cq *cq)
+{
+	ofs_client_t *ofs_client = (ofs_client_t *)cq->device->clnt_hdl;
+	int rtn;
+
+	ofs_lock_enter(&ofs_client->lock);
+	if (cq->device->reg_state != IB_DEV_OPEN) {
+		ofs_lock_exit(&ofs_client->lock);
+		SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+		    "ib_destroy_cq: cq: 0x%p => invalid device state (%d)",
+		    cq, cq->device->reg_state);
+		return (-ENXIO);
+	}
+
+	SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str,
+	    "ib_destroy_cq: cq: 0x%p", cq);
+
+	/*
+	 * if IBTL_ASYNC_PENDING is set, ibt_cq is not freed
+	 * at this moment, but is yet alive for a while. Then
+	 * there is a possibility that this cq is used even after
+	 * ib_destroy_cq() is called. To distinguish this case from
+	 * others, clear the cq private pointer here.
+	 */
+	ibt_set_cq_private(cq->ibt_cq, NULL);
+
+	rtn = ibt_free_cq(cq->ibt_cq);
+	if (rtn == IBT_SUCCESS) {
+		ofs_lock_exit(&ofs_client->lock);
+		kmem_free(cq, sizeof (struct ib_cq));
+		SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str,
+		    "ib_destroy_cq: cq: 0x%p, rtn: 0x%x", cq, rtn);
+		return (0);
+	}
+	ibt_set_cq_private(cq->ibt_cq, cq);
+	ofs_lock_exit(&ofs_client->lock);
+
+	SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+	    "ib_destroy_cq: cq: 0x%p => ibt_free_cq failed w/ 0x%x", cq, rtn);
+
+	switch (rtn) {
+	case IBT_CQ_BUSY:
+		return (-EBUSY);
+	case IBT_HCA_HDL_INVALID:
+	case IBT_CQ_HDL_INVALID:
+		return (-EINVAL);
+	default:
+		return (-EIO);
+	}
+}
+
+struct ib_qp *
+ib_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *qp_init_attr)
+{
+	ofs_client_t *ofs_client = pd->device->clnt_hdl;
+	ibt_qp_alloc_attr_t attrs;
+	ibt_chan_sizes_t sizes;
+	ib_qpn_t qpn;
+	ibt_qp_hdl_t ibt_qp;
+	struct ib_qp *qp;
+	int rtn;
+
+	/* sanity check */
+	if (!(qp_init_attr->send_cq && qp_init_attr->recv_cq)) {
+		SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+		    "ib_create_qp: pd: 0x%p => invalid cqs "
+		    "(send_cq=0x%p, recv_cq=0x%p)", pd,
+		    qp_init_attr->send_cq, qp_init_attr->recv_cq);
+		return ((struct ib_qp *)-EINVAL);
+	}
+
+	/* UC, Raw IPv6 and Raw Ethernet are not supported */
+	if (qp_init_attr->qp_type == IB_QPT_UC ||
+	    qp_init_attr->qp_type == IB_QPT_RAW_IPV6 ||
+	    qp_init_attr->qp_type == IB_QPT_RAW_ETY) {
+		SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+		    "ib_create_qp: pd: 0x%p => invalid qp_type(%d)",
+		    pd, qp_init_attr->qp_type);
+		return ((struct ib_qp *)-EINVAL);
+	}
+
+	if ((qp = kmem_alloc(sizeof (struct ib_qp), KM_NOSLEEP)) == NULL) {
+		SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+		    "ib_create_qp: pd: 0x%p, init_attr: 0x%p => "
+		    "no sufficient memory", pd, qp_init_attr);
+		return ((struct ib_qp *)-ENOMEM);
+	}
+
+	ofs_lock_enter(&ofs_client->lock);
+	if (pd->device->reg_state != IB_DEV_OPEN) {
+		ofs_lock_exit(&ofs_client->lock);
+		kmem_free(qp, sizeof (struct ib_qp));
+		SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+		    "ib_create_qp: pd: 0x%p, init_attr: 0x%p => "
+		    "invalid device state (%d)", pd, qp_init_attr,
+		    pd->device->reg_state);
+		return ((struct ib_qp *)-ENXIO);
+	}
+
+	SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str,
+	    "ib_create_qp: pd: 0x%p, event_handler: 0x%p, qp_context: 0x%p, "
+	    "send_cq: 0x%p, recv_cq: 0x%p, srq: 0x%p, max_send_wr: 0x%x, "
+	    "max_recv_wr: 0x%x, max_send_sge: 0x%x, max_recv_sge: 0x%x, "
+	    "max_inline_data: 0x%x, sq_sig_type: %d, qp_type: %d, "
+	    "port_num: %d",
+	    pd, qp_init_attr->event_handler, qp_init_attr->qp_context,
+	    qp_init_attr->send_cq, qp_init_attr->recv_cq, qp_init_attr->srq,
+	    qp_init_attr->cap.max_send_wr, qp_init_attr->cap.max_recv_wr,
+	    qp_init_attr->cap.max_send_sge, qp_init_attr->cap.max_recv_sge,
+	    qp_init_attr->cap.max_inline_data, qp_init_attr->sq_sig_type,
+	    qp_init_attr->qp_type, qp_init_attr->port_num);
+
+	attrs.qp_alloc_flags = IBT_QP_NO_FLAGS;
+	if (qp_init_attr->srq) {
+		attrs.qp_alloc_flags |= IBT_QP_USES_SRQ;
+	}
+
+	attrs.qp_flags = IBT_ALL_SIGNALED | IBT_FAST_REG_RES_LKEY;
+	if (qp_init_attr->sq_sig_type == IB_SIGNAL_REQ_WR) {
+		attrs.qp_flags |= IBT_WR_SIGNALED;
+	}
+
+	attrs.qp_scq_hdl = qp_init_attr->send_cq->ibt_cq;
+	attrs.qp_rcq_hdl = qp_init_attr->recv_cq->ibt_cq;
+	attrs.qp_pd_hdl = pd->ibt_pd;
+
+	attrs.qp_sizes.cs_sq = qp_init_attr->cap.max_send_wr;
+	attrs.qp_sizes.cs_rq = qp_init_attr->cap.max_recv_wr;
+	attrs.qp_sizes.cs_sq_sgl = qp_init_attr->cap.max_send_sge;
+	attrs.qp_sizes.cs_rq_sgl = qp_init_attr->cap.max_recv_sge;
+	attrs.qp_sizes.cs_inline = qp_init_attr->cap.max_inline_data;
+
+	switch (qp_init_attr->qp_type) {
+	case IB_QPT_RC:
+		rtn = ibt_alloc_qp(pd->device->hca_hdl, IBT_RC_RQP, &attrs,
+		    &sizes, &qpn, &ibt_qp);
+		break;
+	case IB_QPT_UD:
+		rtn = ibt_alloc_qp(pd->device->hca_hdl, IBT_UD_RQP, &attrs,
+		    &sizes, &qpn, &ibt_qp);
+		break;
+	case IB_QPT_SMI:
+		rtn = ibt_alloc_special_qp(pd->device->hca_hdl,
+		    qp_init_attr->port_num, IBT_SMI_SQP, &attrs, &sizes,
+		    &ibt_qp);
+		break;
+	case IB_QPT_GSI:
+		rtn = ibt_alloc_special_qp(pd->device->hca_hdl,
+		    qp_init_attr->port_num, IBT_GSI_SQP, &attrs, &sizes,
+		    &ibt_qp);
+		break;
+	default:
+		/* this should never happen */
+		ofs_lock_exit(&ofs_client->lock);
+		kmem_free(qp, sizeof (struct ib_qp));
+		return ((struct ib_qp *)-EINVAL);
+	}
+	ofs_lock_exit(&ofs_client->lock);
+
+	if (rtn == IBT_SUCCESS) {
+		/* fill in ib_qp_cap w/ the real values */
+		qp_init_attr->cap.max_send_wr = sizes.cs_sq;
+		qp_init_attr->cap.max_recv_wr = sizes.cs_rq;
+		qp_init_attr->cap.max_send_sge = sizes.cs_sq_sgl;
+		qp_init_attr->cap.max_recv_sge = sizes.cs_rq_sgl;
+		/* max_inline_data is not supported */
+		qp_init_attr->cap.max_inline_data = 0;
+		/* fill in ib_qp */
+		qp->device = pd->device;
+		qp->pd = pd;
+		qp->send_cq = qp_init_attr->send_cq;
+		qp->recv_cq = qp_init_attr->recv_cq;
+		qp->srq = qp_init_attr->srq;
+		qp->event_handler = qp_init_attr->event_handler;
+		qp->qp_context = qp_init_attr->qp_context;
+		qp->qp_num = qp_init_attr->qp_type == IB_QPT_SMI ? 0 :
+		    qp_init_attr->qp_type == IB_QPT_GSI ?
1 : qpn; + qp->qp_type = qp_init_attr->qp_type; + qp->ibt_qp = ibt_qp; + ibt_set_qp_private(qp->ibt_qp, qp); + mutex_init(&qp->lock, NULL, MUTEX_DEFAULT, NULL); + SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str, + "ib_create_qp: device: 0x%p, pd: 0x%x, init_attr: 0x%p, " + "rtn: 0x%x", pd->device, pd, qp_init_attr, rtn); + return (qp); + } + kmem_free(qp, sizeof (struct ib_qp)); + + SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str, + "ib_create_qp: device: 0x%p, pd: 0x%x, init_attr: 0x%p => " + "ibt_alloc_(special)_qp failed w/ rtn: 0x%x", pd->device, pd, + qp_init_attr, rtn); + + switch (rtn) { + case IBT_NOT_SUPPORTED: + case IBT_QP_SRV_TYPE_INVALID: + case IBT_CQ_HDL_INVALID: + case IBT_HCA_HDL_INVALID: + case IBT_INVALID_PARAM: + case IBT_SRQ_HDL_INVALID: + case IBT_PD_HDL_INVALID: + case IBT_HCA_SGL_EXCEEDED: + case IBT_HCA_WR_EXCEEDED: + return ((struct ib_qp *)-EINVAL); + case IBT_INSUFF_RESOURCE: + return ((struct ib_qp *)-ENOMEM); + default: + return ((struct ib_qp *)-EIO); + } +} + +int +ib_destroy_qp(struct ib_qp *qp) +{ + ofs_client_t *ofs_client = (ofs_client_t *)qp->device->clnt_hdl; + int rtn; + + ofs_lock_enter(&ofs_client->lock); + if (qp->device->reg_state != IB_DEV_OPEN) { + ofs_lock_exit(&ofs_client->lock); + SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str, + "ib_destroy_qp: qp: 0x%p => invalid device state (%d)", + qp, qp->device->reg_state); + return (-ENXIO); + } + + /* + * if IBTL_ASYNC_PENDING is set, ibt_qp is not freed + * at this moment, but yet alive for a while. Then + * there is a possibility that this qp is used even after + * ib_destroy_qp() is called. To distinguish this case from + * others, clear ibt_qp here. + */ + ibt_set_qp_private(qp->ibt_qp, NULL); + + rtn = ibt_free_qp(qp->ibt_qp); + if (rtn == IBT_SUCCESS) { + ofs_lock_exit(&ofs_client->lock); + kmem_free(qp, sizeof (struct ib_qp)); + SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str, + "ib_destroy_qp: qp: 0x%p, rtn: 0x%x", qp, rtn); + return (0); + } + ibt_set_qp_private(qp->ibt_qp, qp); + ofs_lock_exit(&ofs_client->lock); + + SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str, + "ib_destroy_qp: qp: 0x%p => ibt_free_qp failed w/ 0x%x", qp, rtn); + + switch (rtn) { + case IBT_CHAN_STATE_INVALID: + case IBT_HCA_HDL_INVALID: + case IBT_QP_HDL_INVALID: + return (-EINVAL); + default: + return (-EIO); + } +} + +/* + * ib_req_notify_cq - Request completion notification on a CQ. + * @cq: The CQ to generate an event for. + * @flags: + * Must contain exactly one of %IB_CQ_SOLICITED or %IB_CQ_NEXT_COMP + * to request an event on the next solicited event or next work + * completion at any type, respectively. %IB_CQ_REPORT_MISSED_EVENTS + * may also be |ed in to request a hint about missed events, as + * described below. + * + * Return Value: + * < 0 means an error occurred while requesting notification + * == 0 means notification was requested successfully, and if + * IB_CQ_REPORT_MISSED_EVENTS was passed in, then no events + * were missed and it is safe to wait for another event. In + * this case is it guaranteed that any work completions added + * to the CQ since the last CQ poll will trigger a completion + * notification event. + * > 0 is only returned if IB_CQ_REPORT_MISSED_EVENTS was passed + * in. It means that the consumer must poll the CQ again to + * make sure it is empty to avoid missing an event because of a + * race between requesting notification and an entry being + * added to the CQ. 
This return value means it is possible + * (but not guaranteed) that a work completion has been added + * to the CQ since the last poll without triggering a + * completion notification event. + * + * Note that IB_CQ_REPORT_MISSED_EVENTS is currently not supported. + */ +int +ib_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags) +{ + ibt_cq_notify_flags_t notify_type; + int rtn; + ofs_client_t *ofs_client = cq->device->clnt_hdl; + + ofs_lock_enter(&ofs_client->lock); + if (cq->device->reg_state != IB_DEV_OPEN) { + ofs_lock_exit(&ofs_client->lock); + SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str, + "ib_req_notify_cq: cq: 0x%p, flag: 0x%x", cq, flags); + return (-ENXIO); + } + + SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str, + "ib_req_notify_cq: cq: 0x%p, flag: 0x%x", cq, flags); + + switch (flags & IB_CQ_SOLICITED_MASK) { + case IB_CQ_SOLICITED: + notify_type = IBT_NEXT_SOLICITED; + break; + case IB_CQ_NEXT_COMP: + notify_type = IBT_NEXT_COMPLETION; + break; + default: + /* Currently only two flags are supported */ + ofs_lock_exit(&ofs_client->lock); + SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str, + "ib_req_notify_cq: cq: 0x%p, flag: 0x%x => invalid flag", + cq, flags); + return (-EINVAL); + } + + rtn = ibt_enable_cq_notify(cq->ibt_cq, notify_type); + ofs_lock_exit(&ofs_client->lock); + + if (rtn == IBT_SUCCESS) { + SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str, + "ib_req_notify_cq: cq: 0x%p, flag: 0x%x rtn: 0x%x", + cq, flags, rtn); + return (0); + } + + SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str, + "ib_req_notify_cq: cq: 0x%p, flag: 0x%x => ibt_enable_cq_notify " + "failed w/ 0x%x", cq, flags, rtn); + + switch (rtn) { + case IBT_HCA_HDL_INVALID: + case IBT_CQ_HDL_INVALID: + case IBT_CQ_NOTIFY_TYPE_INVALID: + return (-EINVAL); + default: + return (-EIO); + } +} + +static const struct { + int valid; + enum ib_qp_attr_mask req_param[IB_QPT_RAW_ETY + 1]; + enum ib_qp_attr_mask opt_param[IB_QPT_RAW_ETY + 1]; +} qp_state_table[IB_QPS_ERR + 1][IB_QPS_ERR + 1] = { + + [IB_QPS_RESET] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_INIT] = { + .valid = 1, + .req_param = { + [IB_QPT_UD] = (IB_QP_PKEY_INDEX | IB_QP_PORT | + IB_QP_QKEY), + [IB_QPT_UC] = (IB_QP_PKEY_INDEX | IB_QP_PORT | + IB_QP_ACCESS_FLAGS), + [IB_QPT_RC] = (IB_QP_PKEY_INDEX | IB_QP_PORT | + IB_QP_ACCESS_FLAGS), + [IB_QPT_SMI] = (IB_QP_PKEY_INDEX | IB_QP_QKEY), + [IB_QPT_GSI] = (IB_QP_PKEY_INDEX | IB_QP_QKEY), + } + }, + }, + [IB_QPS_INIT] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_ERR] = { .valid = 1 }, + [IB_QPS_INIT] = { + .valid = 1, + .opt_param = { + [IB_QPT_UD] = (IB_QP_PKEY_INDEX | IB_QP_PORT | + IB_QP_QKEY), + [IB_QPT_UC] = (IB_QP_PKEY_INDEX | IB_QP_PORT | + IB_QP_ACCESS_FLAGS), + [IB_QPT_RC] = (IB_QP_PKEY_INDEX | IB_QP_PORT | + IB_QP_ACCESS_FLAGS), + [IB_QPT_SMI] = (IB_QP_PKEY_INDEX | IB_QP_QKEY), + [IB_QPT_GSI] = (IB_QP_PKEY_INDEX | IB_QP_QKEY), + } + }, + [IB_QPS_RTR] = { + .valid = 1, + .req_param = { + [IB_QPT_UC] = (IB_QP_AV | IB_QP_PATH_MTU | + IB_QP_DEST_QPN | IB_QP_RQ_PSN), + [IB_QPT_RC] = (IB_QP_AV | IB_QP_PATH_MTU | + IB_QP_DEST_QPN | IB_QP_RQ_PSN | + IB_QP_MAX_DEST_RD_ATOMIC | + IB_QP_MIN_RNR_TIMER), + }, + .opt_param = { + [IB_QPT_UD] = (IB_QP_PKEY_INDEX | IB_QP_QKEY), + [IB_QPT_UC] = (IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX), + [IB_QPT_RC] = (IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX), + [IB_QPT_SMI] = (IB_QP_PKEY_INDEX | IB_QP_QKEY), + [IB_QPT_GSI] = (IB_QP_PKEY_INDEX | IB_QP_QKEY), + } + } + }, + [IB_QPS_RTR] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_ERR] = { .valid = 1 }, + [IB_QPS_RTS] = 
{ + .valid = 1, + .req_param = { + [IB_QPT_UD] = IB_QP_SQ_PSN, + [IB_QPT_UC] = IB_QP_SQ_PSN, + [IB_QPT_RC] = (IB_QP_TIMEOUT | + IB_QP_RETRY_CNT | IB_QP_RNR_RETRY | + IB_QP_SQ_PSN | IB_QP_MAX_QP_RD_ATOMIC), + [IB_QPT_SMI] = IB_QP_SQ_PSN, + [IB_QPT_GSI] = IB_QP_SQ_PSN, + }, + .opt_param = { + [IB_QPT_UD] = (IB_QP_CUR_STATE | IB_QP_QKEY), + [IB_QPT_UC] = (IB_QP_CUR_STATE | + IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS | + IB_QP_PATH_MIG_STATE), + [IB_QPT_RC] = (IB_QP_CUR_STATE | + IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS | + IB_QP_MIN_RNR_TIMER | IB_QP_PATH_MIG_STATE), + [IB_QPT_SMI] = (IB_QP_CUR_STATE | IB_QP_QKEY), + [IB_QPT_GSI] = (IB_QP_CUR_STATE | IB_QP_QKEY), + } + } + }, + [IB_QPS_RTS] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_ERR] = { .valid = 1 }, + [IB_QPS_RTS] = { + .valid = 1, + .opt_param = { + [IB_QPT_UD] = (IB_QP_CUR_STATE | IB_QP_QKEY), + [IB_QPT_UC] = (IB_QP_CUR_STATE | + IB_QP_ACCESS_FLAGS | IB_QP_ALT_PATH | + IB_QP_PATH_MIG_STATE), + [IB_QPT_RC] = (IB_QP_CUR_STATE | + IB_QP_ACCESS_FLAGS | IB_QP_ALT_PATH | + IB_QP_PATH_MIG_STATE | IB_QP_MIN_RNR_TIMER), + [IB_QPT_SMI] = (IB_QP_CUR_STATE | IB_QP_QKEY), + [IB_QPT_GSI] = (IB_QP_CUR_STATE | IB_QP_QKEY), + } + }, + [IB_QPS_SQD] = { + .valid = 1, + .opt_param = { + [IB_QPT_UD] = IB_QP_EN_SQD_ASYNC_NOTIFY, + [IB_QPT_UC] = IB_QP_EN_SQD_ASYNC_NOTIFY, + [IB_QPT_RC] = IB_QP_EN_SQD_ASYNC_NOTIFY, + [IB_QPT_SMI] = IB_QP_EN_SQD_ASYNC_NOTIFY, + [IB_QPT_GSI] = IB_QP_EN_SQD_ASYNC_NOTIFY + } + }, + }, + [IB_QPS_SQD] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_ERR] = { .valid = 1 }, + [IB_QPS_RTS] = { + .valid = 1, + .opt_param = { + [IB_QPT_UD] = (IB_QP_CUR_STATE | IB_QP_QKEY), + [IB_QPT_UC] = (IB_QP_CUR_STATE | + IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS | + IB_QP_PATH_MIG_STATE), + [IB_QPT_RC] = (IB_QP_CUR_STATE | + IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS | + IB_QP_MIN_RNR_TIMER | IB_QP_PATH_MIG_STATE), + [IB_QPT_SMI] = (IB_QP_CUR_STATE | IB_QP_QKEY), + [IB_QPT_GSI] = (IB_QP_CUR_STATE | IB_QP_QKEY), + } + }, + [IB_QPS_SQD] = { + .valid = 1, + .opt_param = { + [IB_QPT_UD] = (IB_QP_PKEY_INDEX | IB_QP_QKEY), + [IB_QPT_UC] = (IB_QP_AV | IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX | + IB_QP_PATH_MIG_STATE), + [IB_QPT_RC] = (IB_QP_PORT | IB_QP_AV | + IB_QP_TIMEOUT | IB_QP_RETRY_CNT | + IB_QP_RNR_RETRY | IB_QP_MAX_QP_RD_ATOMIC | + IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX | + IB_QP_MIN_RNR_TIMER | IB_QP_PATH_MIG_STATE), + [IB_QPT_SMI] = (IB_QP_PKEY_INDEX | IB_QP_QKEY), + [IB_QPT_GSI] = (IB_QP_PKEY_INDEX | IB_QP_QKEY), + } + } + }, + [IB_QPS_SQE] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_ERR] = { .valid = 1 }, + [IB_QPS_RTS] = { + .valid = 1, + .opt_param = { + [IB_QPT_UD] = (IB_QP_CUR_STATE | IB_QP_QKEY), + [IB_QPT_UC] = (IB_QP_CUR_STATE | + IB_QP_ACCESS_FLAGS), + [IB_QPT_SMI] = (IB_QP_CUR_STATE | IB_QP_QKEY), + [IB_QPT_GSI] = (IB_QP_CUR_STATE | IB_QP_QKEY), + } + } + }, + [IB_QPS_ERR] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_ERR] = { .valid = 1 } + } +}; + +static inline int +ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state, + enum ib_qp_type type, enum ib_qp_attr_mask mask) +{ + enum ib_qp_attr_mask req_param, opt_param; + + if (cur_state < 0 || cur_state > IB_QPS_ERR || + next_state < 0 || next_state > IB_QPS_ERR) { + SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str, + "ib_modify_qp_is_ok: cur_state: %d, next_state: %d, " + "qp_type: %d, attr_mask: 0x%x => invalid state(1)", + cur_state, next_state, type, mask); + return (0); + } + + if (mask & IB_QP_CUR_STATE && + 
cur_state != IB_QPS_RTR && cur_state != IB_QPS_RTS && + cur_state != IB_QPS_SQD && cur_state != IB_QPS_SQE) { + SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str, + "ib_modify_qp_is_ok: cur_state: %d, next_state: %d, " + "qp_type: %d, attr_mask: 0x%x => invalid state(2)", + cur_state, next_state, type, mask); + return (0); + } + + if (!qp_state_table[cur_state][next_state].valid) { + SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str, + "ib_modify_qp_is_ok: cur_state: %d, next_state: %d, " + "qp_type: %d, attr_mask: 0x%x => state is not valid", + cur_state, next_state, type, mask); + return (0); + } + + req_param = qp_state_table[cur_state][next_state].req_param[type]; + opt_param = qp_state_table[cur_state][next_state].opt_param[type]; + + if ((mask & req_param) != req_param) { + SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str, + "ib_modify_qp_is_ok: cur_state: %d, next_state: %d, " + "qp_type: %d, attr_mask: 0x%x => " + "required param doesn't match. req_param = 0x%x", + cur_state, next_state, type, mask, req_param); + return (0); + } + + if (mask & ~(req_param | opt_param | IB_QP_STATE)) { + SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str, + "ib_modify_qp_is_ok: cur_state: %d, next_state: %d, " + "qp_type: %d, attr_mask: 0x%x => " + "unsupported options. req_param = 0x%x, opt_param = 0x%x", + cur_state, next_state, type, mask, req_param, opt_param); + return (0); + } + + return (1); +} + +static inline enum ib_qp_state +qp_current_state(ibt_qp_query_attr_t *qp_attr) +{ + ASSERT(qp_attr->qp_info.qp_state != IBT_STATE_SQDRAIN); + return (enum ib_qp_state)(qp_attr->qp_info.qp_state); +} + +static inline ibt_tran_srv_t +of2ibtf_qp_type(enum ib_qp_type type) +{ + switch (type) { + case IB_QPT_SMI: + case IB_QPT_GSI: + case IB_QPT_UD: + return (IBT_UD_SRV); + case IB_QPT_RC: + return (IBT_RC_SRV); + case IB_QPT_UC: + return (IBT_UC_SRV); + case IB_QPT_RAW_IPV6: + return (IBT_RAWIP_SRV); + case IB_QPT_RAW_ETY: + default: + ASSERT(type == IB_QPT_RAW_ETY); + return (IBT_RAWETHER_SRV); + } +} + +static inline void +set_av(struct ib_ah_attr *attr, ibt_cep_path_t *pathp) +{ + ibt_adds_vect_t *av = &pathp->cep_adds_vect; + + pathp->cep_hca_port_num = attr->port_num; + av->av_srate = OF2IBTF_SRATE(attr->static_rate); + av->av_srvl = attr->sl & 0xF; + av->av_send_grh = attr->ah_flags & IB_AH_GRH ? 
1 : 0; + + if (av->av_send_grh) { + av->av_dgid.gid_prefix = + attr->grh.dgid.global.subnet_prefix; + av->av_dgid.gid_guid = + attr->grh.dgid.global.interface_id; + av->av_flow = attr->grh.flow_label & 0xFFFFF; + av->av_tclass = attr->grh.traffic_class; + av->av_hop = attr->grh.hop_limit; + av->av_sgid_ix = attr->grh.sgid_index; + } + av->av_dlid = attr->dlid; + av->av_src_path = attr->src_path_bits; +} + +int +ib_modify_qp(struct ib_qp *qp, struct ib_qp_attr *attr, int attr_mask) +{ + enum ib_qp_state cur_state, new_state; + ibt_hca_attr_t hattr; + ibt_qp_query_attr_t qp_attr; + ibt_qp_info_t modify_attr; + ibt_cep_modify_flags_t flags; + int rtn; + ofs_client_t *ofs_client = qp->device->clnt_hdl; + + ofs_lock_enter(&ofs_client->lock); + if (qp->device->reg_state != IB_DEV_OPEN) { + ofs_lock_exit(&ofs_client->lock); + SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str, + "ib_modify_qp: qp: 0x%p => invalid device state (%d)", + qp, qp->device->reg_state); + return (-ENXIO); + } + + rtn = ibt_query_hca(qp->device->hca_hdl, &hattr); + if (rtn != IBT_SUCCESS) { + ofs_lock_exit(&ofs_client->lock); + SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str, + "ib_modify_qp: qp: 0x%p, hca_hdl: 0x%p => " + "ibt_query_hca() failed w/ %d", + qp, qp->device->hca_hdl, rtn); + return (-EIO); + } + + /* only one thread per qp is allowed during the qp modification */ + mutex_enter(&qp->lock); + + /* Get the current QP attributes first */ + bzero(&qp_attr, sizeof (ibt_qp_query_attr_t)); + if ((rtn = ibt_query_qp(qp->ibt_qp, &qp_attr)) != IBT_SUCCESS) { + mutex_exit(&qp->lock); + ofs_lock_exit(&ofs_client->lock); + SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str, + "ib_modify_qp: qp: 0x%p, attr: 0x%p, attr_mask: 0x%x => " + "ibt_query_qp failed w/ 0x%x", qp, attr, attr_mask, rtn); + return (-EIO); + } + + /* Get the current and new state for this QP */ + cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : + qp_current_state(&qp_attr); + new_state = attr_mask & IB_QP_STATE ? attr->qp_state : + cur_state; + + /* Sanity check of the current/new states */ + if (cur_state == new_state && cur_state == IB_QPS_RESET) { + /* Linux OF returns 0 in this case */ + mutex_exit(&qp->lock); + ofs_lock_exit(&ofs_client->lock); + SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str, + "ib_modify_qp: qp: 0x%p, attr: 0x%p, attr_mask: 0x%x => " + "invalid state (both of current/new states are RESET)", + qp, attr, attr_mask); + return (0); + } + + /* + * Check if this modification request is supported with the new + * and/or current state. 
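+	 * For example (an illustrative note derived from qp_state_table
+	 * above, not in the original comment): an INIT->RTR transition
+	 * on an RC QP must carry at least IB_QP_AV, IB_QP_PATH_MTU,
+	 * IB_QP_DEST_QPN, IB_QP_RQ_PSN, IB_QP_MAX_DEST_RD_ATOMIC and
+	 * IB_QP_MIN_RNR_TIMER in attr_mask.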
+	 */
+	if (!ib_modify_qp_is_ok(cur_state, new_state, qp->qp_type, attr_mask)) {
+		mutex_exit(&qp->lock);
+		ofs_lock_exit(&ofs_client->lock);
+		SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+		    "ib_modify_qp: qp: 0x%p, attr: 0x%p, attr_mask: 0x%x => "
+		    "invalid arguments",
+		    qp, attr, attr_mask);
+		return (-EINVAL);
+	}
+
+	/* Sanity checks */
+	if (attr_mask & IB_QP_PORT && (attr->port_num == 0 ||
+	    attr->port_num > hattr.hca_nports)) {
+		mutex_exit(&qp->lock);
+		ofs_lock_exit(&ofs_client->lock);
+		SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+		    "ib_modify_qp: qp: 0x%p, attr: 0x%p, attr_mask: 0x%x => "
+		    "invalid attr->port_num(%d), max_nports(%d)",
+		    qp, attr, attr_mask, attr->port_num, hattr.hca_nports);
+		return (-EINVAL);
+	}
+
+	if (attr_mask & IB_QP_PKEY_INDEX &&
+	    attr->pkey_index >= hattr.hca_max_port_pkey_tbl_sz) {
+		mutex_exit(&qp->lock);
+		ofs_lock_exit(&ofs_client->lock);
+		SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+		    "ib_modify_qp: qp: 0x%p, attr: 0x%p, attr_mask: 0x%x => "
+		    "invalid attr->pkey_index(%d), max_pkey_index(%d)",
+		    qp, attr, attr_mask, attr->pkey_index,
+		    hattr.hca_max_port_pkey_tbl_sz);
+		return (-EINVAL);
+	}
+
+	if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC &&
+	    attr->max_rd_atomic > hattr.hca_max_rdma_out_qp) {
+		mutex_exit(&qp->lock);
+		ofs_lock_exit(&ofs_client->lock);
+		SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+		    "ib_modify_qp: qp: 0x%p, attr: 0x%p, attr_mask: 0x%x => "
+		    "invalid attr->max_rd_atomic(0x%x), max_rdma_out_qp(0x%x)",
+		    qp, attr, attr_mask, attr->max_rd_atomic,
+		    hattr.hca_max_rdma_out_qp);
+		return (-EINVAL);
+	}
+
+	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC &&
+	    attr->max_dest_rd_atomic > hattr.hca_max_rdma_in_qp) {
+		mutex_exit(&qp->lock);
+		ofs_lock_exit(&ofs_client->lock);
+		SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+		    "ib_modify_qp: qp: 0x%p, attr: 0x%p, attr_mask: 0x%x => "
+		    "invalid attr->max_dest_rd_atomic(0x%x), "
+		    "max_rdma_in_qp(0x%x)", qp, attr, attr_mask,
+		    attr->max_dest_rd_atomic, hattr.hca_max_rdma_in_qp);
+		return (-EINVAL);
+	}
+
+	/* copy the current setting */
+	modify_attr = qp_attr.qp_info;
+
+	/*
+	 * Since it's already checked if the modification request matches
+	 * the new and/or current states, just assign both states to
+	 * modify_attr here. The current state is required if qp_state
+	 * is RTR, but it's harmless otherwise, so it's always set.
+ */ + modify_attr.qp_current_state = OF2IBTF_STATE(cur_state); + modify_attr.qp_state = OF2IBTF_STATE(new_state); + modify_attr.qp_trans = of2ibtf_qp_type(qp->qp_type); + + /* Convert OF modification requests into IBTF ones */ + flags = IBT_CEP_SET_STATE; /* IBTF needs IBT_CEP_SET_STATE */ + if (cur_state == IB_QPS_RESET && + new_state == IB_QPS_INIT) { + flags |= IBT_CEP_SET_RESET_INIT; + } else if (cur_state == IB_QPS_INIT && + new_state == IB_QPS_RTR) { + flags |= IBT_CEP_SET_INIT_RTR; + } else if (cur_state == IB_QPS_RTR && + new_state == IB_QPS_RTS) { + flags |= IBT_CEP_SET_RTR_RTS; + } + if (attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY) { + flags |= IBT_CEP_SET_SQD_EVENT; + } + if (attr_mask & IB_QP_ACCESS_FLAGS) { + modify_attr.qp_flags &= ~(IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR | + IBT_CEP_ATOMIC); + if (attr->qp_access_flags & IB_ACCESS_REMOTE_READ) { + flags |= IBT_CEP_SET_RDMA_R; + modify_attr.qp_flags |= IBT_CEP_RDMA_RD; + } + if (attr->qp_access_flags & IB_ACCESS_REMOTE_WRITE) { + flags |= IBT_CEP_SET_RDMA_W; + modify_attr.qp_flags |= IBT_CEP_RDMA_WR; + } + if (attr->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC) { + flags |= IBT_CEP_SET_ATOMIC; + modify_attr.qp_flags |= IBT_CEP_ATOMIC; + } + } + if (attr_mask & IB_QP_PKEY_INDEX) { + flags |= IBT_CEP_SET_PKEY_IX; + switch (qp->qp_type) { + case IB_QPT_SMI: + case IB_QPT_GSI: + case IB_QPT_UD: + modify_attr.qp_transport.ud.ud_pkey_ix = + attr->pkey_index; + break; + case IB_QPT_RC: + modify_attr.qp_transport.rc.rc_path.cep_pkey_ix = + attr->pkey_index; + break; + case IB_QPT_UC: + modify_attr.qp_transport.uc.uc_path.cep_pkey_ix = + attr->pkey_index; + break; + default: + /* This should never happen */ + mutex_exit(&qp->lock); + ofs_lock_exit(&ofs_client->lock); + SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str, + "ib_modify_qp(IB_QP_PKEY_INDEX): qp: 0x%p, " + "attr: 0x%p, attr_mask: 0x%x => " + "invalid qp->qp_type(%d)", + qp, attr, attr_mask, qp->qp_type); + return (-EINVAL); + } + } + if (attr_mask & IB_QP_PORT) { + flags |= IBT_CEP_SET_PORT; + switch (qp->qp_type) { + case IB_QPT_SMI: + case IB_QPT_GSI: + case IB_QPT_UD: + modify_attr.qp_transport.ud.ud_port = attr->port_num; + break; + case IB_QPT_RC: + modify_attr.qp_transport.rc.rc_path.cep_hca_port_num = + attr->port_num; + break; + case IB_QPT_UC: + modify_attr.qp_transport.uc.uc_path.cep_hca_port_num = + attr->port_num; + break; + default: + /* This should never happen */ + mutex_exit(&qp->lock); + ofs_lock_exit(&ofs_client->lock); + SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str, + "ib_modify_qp(IB_QP_PORT): qp: 0x%p, " + "attr: 0x%p, attr_mask: 0x%x => " + "invalid qp->qp_type(%d)", + qp, attr, attr_mask, qp->qp_type); + return (-EINVAL); + } + } + if (attr_mask & IB_QP_QKEY) { + ASSERT(qp->qp_type == IB_QPT_UD || qp->qp_type == IB_QPT_SMI || + qp->qp_type == IB_QPT_GSI); + flags |= IBT_CEP_SET_QKEY; + modify_attr.qp_transport.ud.ud_qkey = attr->qkey; + } + if (attr_mask & IB_QP_AV) { + flags |= IBT_CEP_SET_ADDS_VECT; + switch (qp->qp_type) { + case IB_QPT_RC: + set_av(&attr->ah_attr, + &modify_attr.qp_transport.rc.rc_path); + break; + case IB_QPT_UC: + set_av(&attr->ah_attr, + &modify_attr.qp_transport.uc.uc_path); + break; + case IB_QPT_SMI: + case IB_QPT_GSI: + case IB_QPT_UD: + default: + /* This should never happen */ + mutex_exit(&qp->lock); + ofs_lock_exit(&ofs_client->lock); + SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str, + "ib_modify_qp(IB_QP_AV): qp: 0x%p, " + "attr: 0x%p, attr_mask: 0x%x => " + "invalid qp->qp_type(%d)", + qp, attr, attr_mask, qp->qp_type); + return (-EINVAL); + } + } + if 
(attr_mask & IB_QP_PATH_MTU) { + switch (qp->qp_type) { + case IB_QPT_RC: + modify_attr.qp_transport.rc.rc_path_mtu = + OF2IBTF_PATH_MTU(attr->path_mtu); + break; + case IB_QPT_UC: + modify_attr.qp_transport.uc.uc_path_mtu = + OF2IBTF_PATH_MTU(attr->path_mtu); + break; + case IB_QPT_SMI: + case IB_QPT_GSI: + case IB_QPT_UD: + default: + /* nothing to do */ + break; + } + } + if (attr_mask & IB_QP_TIMEOUT && qp->qp_type == IB_QPT_RC) { + flags |= IBT_CEP_SET_TIMEOUT; + modify_attr.qp_transport.rc.rc_path.cep_timeout = + attr->timeout; + } + if (attr_mask & IB_QP_RETRY_CNT && qp->qp_type == IB_QPT_RC) { + flags |= IBT_CEP_SET_RETRY; + modify_attr.qp_transport.rc.rc_retry_cnt = + attr->retry_cnt & 0x7; + } + if (attr_mask & IB_QP_RNR_RETRY && qp->qp_type == IB_QPT_RC) { + flags |= IBT_CEP_SET_RNR_NAK_RETRY; + modify_attr.qp_transport.rc.rc_rnr_retry_cnt = + attr->rnr_retry & 0x7; + } + if (attr_mask & IB_QP_RQ_PSN) { + switch (qp->qp_type) { + case IB_QPT_RC: + modify_attr.qp_transport.rc.rc_rq_psn = + attr->rq_psn & 0xFFFFFF; + break; + case IB_QPT_UC: + modify_attr.qp_transport.uc.uc_rq_psn = + attr->rq_psn & 0xFFFFFF; + break; + case IB_QPT_SMI: + case IB_QPT_GSI: + case IB_QPT_UD: + default: + /* nothing to do */ + break; + } + } + if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC && qp->qp_type == IB_QPT_RC) { + if (attr->max_rd_atomic) { + flags |= IBT_CEP_SET_RDMARA_OUT; + modify_attr.qp_transport.rc.rc_rdma_ra_out = + attr->max_rd_atomic; + } + } + if (attr_mask & IB_QP_ALT_PATH) { + /* Sanity checks */ + if (attr->alt_port_num == 0 || + attr->alt_port_num > hattr.hca_nports) { + mutex_exit(&qp->lock); + ofs_lock_exit(&ofs_client->lock); + SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str, + "ib_modify_qp: qp: 0x%p, attr: 0x%p, " + "attr_mask: 0x%x => invalid attr->alt_port_num" + "(%d), max_nports(%d)", + qp, attr, attr_mask, attr->alt_port_num, + hattr.hca_nports); + return (-EINVAL); + } + if (attr->alt_pkey_index >= hattr.hca_max_port_pkey_tbl_sz) { + mutex_exit(&qp->lock); + ofs_lock_exit(&ofs_client->lock); + SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str, + "ib_modify_qp: qp: 0x%p, attr: 0x%p, " + "attr_mask: 0x%x => invalid attr->alt_pkey_index" + "(%d), max_port_key_index(%d)", + qp, attr, attr_mask, attr->alt_pkey_index, + hattr.hca_max_port_pkey_tbl_sz); + return (-EINVAL); + } + flags |= IBT_CEP_SET_ALT_PATH; + switch (qp->qp_type) { + case IB_QPT_RC: + modify_attr.qp_transport.rc.rc_alt_path. + cep_pkey_ix = attr->alt_pkey_index; + modify_attr.qp_transport.rc.rc_alt_path. + cep_hca_port_num = attr->alt_port_num; + set_av(&attr->alt_ah_attr, + &modify_attr.qp_transport.rc.rc_alt_path); + modify_attr.qp_transport.rc.rc_alt_path. + cep_timeout = attr->alt_timeout; + break; + case IB_QPT_UC: + modify_attr.qp_transport.uc.uc_alt_path. + cep_pkey_ix = attr->alt_pkey_index; + modify_attr.qp_transport.uc.uc_alt_path. + cep_hca_port_num = attr->alt_port_num; + set_av(&attr->alt_ah_attr, + &modify_attr.qp_transport.uc.uc_alt_path); + modify_attr.qp_transport.uc.uc_alt_path. 
+ cep_timeout = attr->alt_timeout; + break; + case IB_QPT_SMI: + case IB_QPT_GSI: + case IB_QPT_UD: + default: + /* This should never happen */ + mutex_exit(&qp->lock); + ofs_lock_exit(&ofs_client->lock); + SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str, + "ib_modify_qp(IB_QP_ALT_PATH): qp: 0x%p, " + "attr: 0x%p, attr_mask: 0x%x => " + "invalid qp->qp_type(%d)", + qp, attr, attr_mask, qp->qp_type); + return (-EINVAL); + } + } + if (attr_mask & IB_QP_MIN_RNR_TIMER && qp->qp_type == IB_QPT_RC) { + flags |= IBT_CEP_SET_MIN_RNR_NAK; + modify_attr.qp_transport.rc.rc_min_rnr_nak = + attr->min_rnr_timer & 0x1F; + } + if (attr_mask & IB_QP_SQ_PSN) { + switch (qp->qp_type) { + case IB_QPT_SMI: + case IB_QPT_GSI: + case IB_QPT_UD: + modify_attr.qp_transport.ud.ud_sq_psn = + attr->sq_psn; + break; + case IB_QPT_RC: + modify_attr.qp_transport.rc.rc_sq_psn = + attr->sq_psn; + break; + case IB_QPT_UC: + modify_attr.qp_transport.uc.uc_sq_psn = + attr->sq_psn; + break; + default: + /* This should never happen */ + mutex_exit(&qp->lock); + ofs_lock_exit(&ofs_client->lock); + SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str, + "ib_modify_qp(IB_QP_SQ_PSN): qp: 0x%p, " + "attr: 0x%p, attr_mask: 0x%x => " + "invalid qp->qp_type(%d)", + qp, attr, attr_mask, qp->qp_type); + return (-EINVAL); + } + } + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC && qp->qp_type == IB_QPT_RC) { + /* Linux OF sets the value if max_dest_rd_atomic is not zero */ + if (attr->max_dest_rd_atomic) { + flags |= IBT_CEP_SET_RDMARA_IN; + modify_attr.qp_transport.rc.rc_rdma_ra_in = + attr->max_dest_rd_atomic; + } + } + if (attr_mask & IB_QP_PATH_MIG_STATE) { + flags |= IBT_CEP_SET_MIG; + switch (qp->qp_type) { + case IB_QPT_RC: + modify_attr.qp_transport.rc.rc_mig_state = + OF2IBTF_PATH_MIG_STATE(attr->path_mig_state); + break; + case IB_QPT_UC: + modify_attr.qp_transport.uc.uc_mig_state = + OF2IBTF_PATH_MIG_STATE(attr->path_mig_state); + break; + case IB_QPT_SMI: + case IB_QPT_GSI: + case IB_QPT_UD: + default: + /* This should never happen */ + mutex_exit(&qp->lock); + ofs_lock_exit(&ofs_client->lock); + SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str, + "ib_modify_qp(IB_QP_PATH_MIG_STATE): qp: 0x%p, " + "attr: 0x%p, attr_mask: 0x%x => " + "invalid qp->qp_type(%d)", + qp, attr, attr_mask, qp->qp_type); + return (-EINVAL); + } + } + if (attr_mask & IB_QP_CAP) { + /* IB_QP_CAP is not supported */ + mutex_exit(&qp->lock); + ofs_lock_exit(&ofs_client->lock); + SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str, + "ib_modify_qp: qp: 0x%p, attr: 0x%p, " + "attr_mask: 0x%x => IB_QP_CAP is not supported", + qp, attr, attr_mask); + return (-EINVAL); + } + if (attr_mask & IB_QP_DEST_QPN) { + switch (qp->qp_type) { + case IB_QPT_RC: + modify_attr.qp_transport.rc.rc_dst_qpn = + attr->dest_qp_num; + break; + case IB_QPT_UC: + modify_attr.qp_transport.uc.uc_dst_qpn = + attr->dest_qp_num; + break; + case IB_QPT_SMI: + case IB_QPT_GSI: + case IB_QPT_UD: + default: + /* This should never happen */ + mutex_exit(&qp->lock); + ofs_lock_exit(&ofs_client->lock); + SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str, + "ib_modify_qp(IB_QP_DEST_PSN): qp: 0x%p, " + "attr: 0x%p, attr_mask: 0x%x => " + "invalid qp->qp_type(%d)", + qp, attr, attr_mask, qp->qp_type); + return (-EINVAL); + } + } + + SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str, + "ib_modify_qp: qp: 0x%p, attr: 0x%p, attr_mask: 0x%x, " + "flags: 0x%x, modify_attr: 0x%p", + qp, attr, attr_mask, flags, &modify_attr); + + /* Modify the QP attributes */ + rtn = ibt_modify_qp(qp->ibt_qp, flags, &modify_attr, NULL); + if (rtn == IBT_SUCCESS) { + mutex_exit(&qp->lock); + 
ofs_lock_exit(&ofs_client->lock); + return (0); + } + mutex_exit(&qp->lock); + ofs_lock_exit(&ofs_client->lock); + + SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str, + "ib_modify_qp: qp: 0x%p, attr: 0x%p, attr_mask: 0x%x => " + "ibt_modify_qp failed w/ %d, flags: 0x%x", + qp, attr, attr_mask, rtn, flags); + + switch (rtn) { + case IBT_HCA_HDL_INVALID: + case IBT_QP_HDL_INVALID: + case IBT_QP_SRV_TYPE_INVALID: + case IBT_QP_STATE_INVALID: + case IBT_HCA_PORT_INVALID: + case IBT_PKEY_IX_ILLEGAL: + return (-EINVAL); + default: + return (-EIO); + } +} + +static inline enum ib_wc_status +ibt2of_wc_status(ibt_wc_status_t status) +{ + switch (status) { + case IBT_WC_LOCAL_LEN_ERR: + return (IB_WC_LOC_LEN_ERR); + case IBT_WC_LOCAL_CHAN_OP_ERR: + return (IB_WC_LOC_QP_OP_ERR); + case IBT_WC_LOCAL_PROTECT_ERR: + return (IB_WC_LOC_PROT_ERR); + case IBT_WC_WR_FLUSHED_ERR: + return (IB_WC_WR_FLUSH_ERR); + case IBT_WC_MEM_WIN_BIND_ERR: + return (IB_WC_MW_BIND_ERR); + case IBT_WC_BAD_RESPONSE_ERR: + return (IB_WC_BAD_RESP_ERR); + case IBT_WC_LOCAL_ACCESS_ERR: + return (IB_WC_LOC_ACCESS_ERR); + case IBT_WC_REMOTE_INVALID_REQ_ERR: + return (IB_WC_REM_INV_REQ_ERR); + case IBT_WC_REMOTE_ACCESS_ERR: + return (IB_WC_REM_ACCESS_ERR); + case IBT_WC_REMOTE_OP_ERR: + return (IB_WC_REM_OP_ERR); + case IBT_WC_TRANS_TIMEOUT_ERR: + return (IB_WC_RETRY_EXC_ERR); + case IBT_WC_RNR_NAK_TIMEOUT_ERR: + return (IB_WC_RNR_RETRY_EXC_ERR); + case IBT_WC_SUCCESS: + default: + /* Hermon doesn't support EEC yet */ + ASSERT(status == IBT_WC_SUCCESS); + return (IB_WC_SUCCESS); + } +} + +static inline enum ib_wc_opcode +ibt2of_wc_opcode(ibt_wrc_opcode_t wc_type) +{ + switch (wc_type) { + case IBT_WRC_SEND: + return (IB_WC_SEND); + case IBT_WRC_RDMAR: + return (IB_WC_RDMA_READ); + case IBT_WRC_RDMAW: + return (IB_WC_RDMA_WRITE); + case IBT_WRC_CSWAP: + return (IB_WC_COMP_SWAP); + case IBT_WRC_FADD: + return (IB_WC_FETCH_ADD); + case IBT_WRC_BIND: + return (IB_WC_BIND_MW); + case IBT_WRC_RECV: + return (IB_WC_RECV); + case IBT_WRC_RECV_RDMAWI: + default: + ASSERT(wc_type == IBT_WRC_RECV_RDMAWI); + return (IB_WC_RECV_RDMA_WITH_IMM); + } +} + +static inline int +ibt2of_wc_flags(ibt_wc_flags_t wc_flags) +{ + return (wc_flags & ~IBT_WC_CKSUM_OK); +} + +static inline void +set_wc(ibt_wc_t *ibt_wc, struct ib_wc *wc) +{ + wc->wr_id = ibt_wc->wc_id; + wc->status = ibt2of_wc_status(ibt_wc->wc_status); + /* opcode can be undefined if status is not success */ + if (wc->status == IB_WC_SUCCESS) { + wc->opcode = ibt2of_wc_opcode(ibt_wc->wc_type); + } + wc->vendor_err = 0; /* not supported */ + wc->byte_len = ibt_wc->wc_bytes_xfer; + wc->qp = NULL; /* not supported */ + wc->imm_data = htonl(ibt_wc->wc_immed_data); + wc->src_qp = ibt_wc->wc_qpn; + wc->wc_flags = ibt2of_wc_flags(ibt_wc->wc_flags); + wc->pkey_index = ibt_wc->wc_pkey_ix; + wc->slid = ibt_wc->wc_slid; + wc->sl = ibt_wc->wc_sl; + wc->dlid_path_bits = ibt_wc->wc_path_bits; + wc->port_num = 0; /* not supported */ +} + +/* + * ib_poll_cq - poll a CQ for completion(s) + * @cq:the CQ being polled + * @num_entries:maximum number of completions to return + * @wc:array of at least @num_entries &struct ib_wc where completions + * will be returned + * + * Poll a CQ for (possibly multiple) completions. If the return value + * is < 0, an error occurred. If the return value is >= 0, it is the + * number of completions returned. If the return value is + * non-negative and < num_entries, then the CQ was emptied. 
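+ *
+ * A typical polling loop, for illustration only (an editor's sketch;
+ * handle_wc() is a hypothetical consumer routine):
+ *
+ *	struct ib_wc wc[8];
+ *	int i, n;
+ *
+ *	while ((n = ib_poll_cq(cq, 8, wc)) > 0) {
+ *		for (i = 0; i < n; i++)
+ *			handle_wc(&wc[i]);
+ *	}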
+ *
+ * Note that the following three members in struct ib_wc are not supported
+ * currently, and the values are always either 0 or NULL.
+ * u32 vendor_err;
+ * struct ib_qp *qp;
+ * u8 port_num;
+ */
+int
+ib_poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc)
+{
+ ibt_wc_t ibt_wc;
+ int npolled;
+ ibt_status_t rtn = IBT_CQ_EMPTY; /* in case num_entries is 0 */
+ ofs_client_t *ofs_client = (ofs_client_t *)cq->device->clnt_hdl;
+
+ ofs_lock_enter(&ofs_client->lock);
+ if (cq->device->reg_state != IB_DEV_OPEN) {
+ ofs_lock_exit(&ofs_client->lock);
+ SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+ "ib_poll_cq: cq: 0x%p => invalid device state (%d)",
+ cq, cq->device->reg_state);
+ return (-ENXIO);
+ }
+
+ SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str,
+ "ib_poll_cq: cq: 0x%p, num_entries: %d, wc: 0x%p, "
+ "ibt_cq: 0x%p, ibt_wc: 0x%p",
+ cq, num_entries, wc, cq->ibt_cq, &ibt_wc);
+
+ /* only one thread per cq is allowed during ibt_poll_cq() */
+ mutex_enter(&cq->lock);
+ for (npolled = 0; npolled < num_entries; ++npolled) {
+ bzero(&ibt_wc, sizeof (ibt_wc_t));
+ rtn = ibt_poll_cq(cq->ibt_cq, &ibt_wc, 1, NULL);
+ if (rtn != IBT_SUCCESS) {
+ break;
+ }
+ /* save this result to struct ib_wc */
+ set_wc(&ibt_wc, wc + npolled);
+ }
+ mutex_exit(&cq->lock);
+ ofs_lock_exit(&ofs_client->lock);
+
+ if (rtn == IBT_SUCCESS || rtn == IBT_CQ_EMPTY) {
+ return (npolled);
+ }
+
+ SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
+ "ib_poll_cq: cq: 0x%p, num_entries: %d, wc: 0x%p => "
+ "ibt_poll_cq failed w/ %d, npolled = %d",
+ cq, num_entries, wc, rtn, npolled);
+
+ switch (rtn) {
+ case IBT_HCA_HDL_INVALID:
+ case IBT_CQ_HDL_INVALID:
+ case IBT_INVALID_PARAM:
+ return (-EINVAL);
+ default:
+ return (-EIO);
+ }
+}
+
+ibt_hca_hdl_t
+ib_get_ibt_hca_hdl(struct ib_device *device)
+{
+ return (device->hca_hdl);
+}
+
+ibt_channel_hdl_t
+ib_get_ibt_channel_hdl(struct rdma_cm_id *cm)
+{
+ return (cm->qp == NULL ? NULL : cm->qp->ibt_qp);
+}
diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/LICENSE b/usr/src/uts/common/io/ib/clients/rdsv3/LICENSE
new file mode 100644
index 0000000000..29c3aeca4f
--- /dev/null
+++ b/usr/src/uts/common/io/ib/clients/rdsv3/LICENSE
@@ -0,0 +1,40 @@
+DO NOT TRANSLATE OR LOCALIZE.
+
+This package includes software from the OpenFabrics SW Stack.
+Use of any of this software is governed by the terms of the license below:
+
+OpenIB BSD License
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice, this
+list of conditions and the following disclaimer in the documentation and/or
+other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +The Oracle components of this package are licensed under version 1.0 of +the Common Development and Distribution License ("CDDL"). +You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE or +http://www.opensolaris.org/os/licensing. See the License for the +specific language governing permissions and limitations under the License. + +When distributing Covered Code, include this CDDL HEADER in each +file and include the License file at usr/src/OPENSOLARIS.LICENSE. +If applicable, add the following below this CDDL HEADER, with the +fields enclosed by brackets "[]" replaced with your own identifying +information: Portions Copyright [yyyy] [name of copyright owner] diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/LICENSE.descrip b/usr/src/uts/common/io/ib/clients/rdsv3/LICENSE.descrip new file mode 100644 index 0000000000..dc724239e2 --- /dev/null +++ b/usr/src/uts/common/io/ib/clients/rdsv3/LICENSE.descrip @@ -0,0 +1 @@ +Support of Reliable Datagram Service (RDS) diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/af_rds.c b/usr/src/uts/common/io/ib/clients/rdsv3/af_rds.c new file mode 100644 index 0000000000..49789637f6 --- /dev/null +++ b/usr/src/uts/common/io/ib/clients/rdsv3/af_rds.c @@ -0,0 +1,1009 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/conf.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/modctl.h>
+#include <sys/rds.h>
+#include <sys/stropts.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sockio.h>
+#include <sys/sysmacros.h>
+
+#include <inet/ip.h>
+#include <net/if_types.h>
+
+#include <sys/ib/clients/rdsv3/rdsv3.h>
+#include <sys/ib/clients/rdsv3/rdma.h>
+#include <sys/ib/clients/rdsv3/rdma_transport.h>
+#include <sys/ib/clients/rdsv3/rdsv3_debug.h>
+
+extern void rdsv3_remove_bound(struct rdsv3_sock *rds);
+extern int rdsv3_verify_bind_address(ipaddr_t addr);
+
+extern ddi_taskq_t *rdsv3_taskq;
+extern struct rdma_cm_id *rdsv3_rdma_listen_id;
+
+/* this is just used for stats gathering :/ */
+kmutex_t rdsv3_sock_lock;
+static unsigned long rdsv3_sock_count;
+list_t rdsv3_sock_list;
+rdsv3_wait_queue_t rdsv3_poll_waitq;
+
+/*
+ * This is called as the final descriptor referencing this socket is closed.
+ * We have to unbind the socket so that another socket can be bound to the
+ * address it was using.
+ *
+ * We have to be careful about racing with the incoming path. sock_orphan()
+ * sets SOCK_DEAD and we use that as an indicator to the rx path that new
+ * messages shouldn't be queued.
+ */
+/* ARGSUSED */
+static int
+rdsv3_release(sock_lower_handle_t proto_handle, int flgs, cred_t *cr)
+{
+ struct rsock *sk = (struct rsock *)proto_handle;
+ struct rdsv3_sock *rs;
+
+ if (sk == NULL)
+ goto out;
+
+ rs = rdsv3_sk_to_rs(sk);
+ RDSV3_DPRINTF4("rdsv3_release", "Enter(rs: %p, sk: %p)", rs, sk);
+
+ rdsv3_sk_sock_orphan(sk);
+ rdsv3_cong_remove_socket(rs);
+ rdsv3_remove_bound(rs);
+ /*
+ * Note - rdsv3_clear_recv_queue grabs rs_recv_lock, which
+ * ensures the recv path has completed messing
+ * with the socket.
+ */
+ rdsv3_clear_recv_queue(rs);
+ rdsv3_send_drop_to(rs, NULL);
+ rdsv3_rdma_drop_keys(rs);
+ (void) rdsv3_notify_queue_get(rs, NULL);
+
+ mutex_enter(&rdsv3_sock_lock);
+ list_remove_node(&rs->rs_item);
+ rdsv3_sock_count--;
+ mutex_exit(&rdsv3_sock_lock);
+
+ rdsv3_sk_sock_put(sk);
+
+ RDSV3_DPRINTF4("rdsv3_release", "Return (rds: %p)", rs);
+out:
+ return (0);
+}
+
+void
+__rdsv3_wake_sk_sleep(struct rsock *sk)
+{
+ /* wake up anyone waiting in recvmsg */
+ if (!rdsv3_sk_sock_flag(sk, SOCK_DEAD) && sk->sk_sleep)
+ rdsv3_wake_up(sk->sk_sleep);
+}
+
+/*
+ * Careful not to race with rdsv3_release -> sock_orphan which clears sk_sleep.
+ * _bh() isn't OK here, we're called from interrupt handlers. It's probably OK
+ * to wake the waitqueue after sk_sleep is clear as we hold a sock ref, but
+ * this seems more conservative.
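+ * (The wakeup below runs with rs_recv_lock held as reader.)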
+ * NB - normally, one would use sk_callback_lock for this, but we can + * get here from interrupts, whereas the network code grabs sk_callback_lock + * with _lock_bh only - so relying on sk_callback_lock introduces livelocks. + */ +void +rdsv3_wake_sk_sleep(struct rdsv3_sock *rs) +{ + RDSV3_DPRINTF4("rdsv3_wake_sk_sleep", "Enter(rs: %p)", rs); + + rw_enter(&rs->rs_recv_lock, RW_READER); + __rdsv3_wake_sk_sleep(rdsv3_rs_to_sk(rs)); + rw_exit(&rs->rs_recv_lock); +} + +/*ARGSUSED*/ +static int +rdsv3_getname(sock_lower_handle_t proto_handle, struct sockaddr *addr, + socklen_t *addr_len, cred_t *cr) +{ + struct rsock *sk = (struct rsock *)proto_handle; + struct sockaddr_in *sin = (struct sockaddr_in *)addr; + struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); + + RDSV3_DPRINTF4("rdsv3_getname", "Enter(rs: %p, port: %d)", rs, + rs->rs_bound_port); + + sin->sin_port = rs->rs_bound_port; + sin->sin_addr.s_addr = rs->rs_bound_addr; + + sin->sin_family = AF_INET_OFFLOAD; + + *addr_len = sizeof (*sin); + return (0); +} + +/* + * RDS' poll is without a doubt the least intuitive part of the interface, + * as POLLIN and POLLOUT do not behave entirely as you would expect from + * a network protocol. + * + * POLLIN is asserted if + * - there is data on the receive queue. + * - to signal that a previously congested destination may have become + * uncongested + * - A notification has been queued to the socket (this can be a congestion + * update, or a RDMA completion). + * + * POLLOUT is asserted if there is room on the send queue. This does not mean + * however, that the next sendmsg() call will succeed. If the application tries + * to send to a congested destination, the system call may still fail (and + * return ENOBUFS). + */ +/* ARGSUSED */ +static short +rdsv3_poll(sock_lower_handle_t proto_handle, short events, int anyyet, + cred_t *cr) +{ + struct rsock *sk = (struct rsock *)proto_handle; + struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); + unsigned short mask = 0; + +#if 0 + RDSV3_DPRINTF4("rdsv3_poll", "enter(%p %x %d)", rs, events, anyyet); +#endif + + rw_enter(&rs->rs_recv_lock, RW_READER); + if (!rs->rs_cong_monitor) { + /* + * When a congestion map was updated, we signal POLLIN for + * "historical" reasons. Applications can also poll for + * WRBAND instead. 
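+ * (Hence the mask set just below includes POLLWRBAND alongside
+ * POLLIN.)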
+ */ + if (rdsv3_cong_updated_since(&rs->rs_cong_track)) + mask |= (POLLIN | POLLRDNORM | POLLWRBAND); + } else { + mutex_enter(&rs->rs_lock); + if (rs->rs_cong_notify) + mask |= (POLLIN | POLLRDNORM); + mutex_exit(&rs->rs_lock); + } + if (!list_is_empty(&rs->rs_recv_queue) || + !list_is_empty(&rs->rs_notify_queue)) + mask |= (POLLIN | POLLRDNORM); + if (rs->rs_snd_bytes < rdsv3_sk_sndbuf(rs)) + mask |= (POLLOUT | POLLWRNORM); + rw_exit(&rs->rs_recv_lock); + +#if 0 + RDSV3_DPRINTF4("rdsv3_poll", "return(%p %x)", rs, mask); +#endif + + return (mask); +} + +/* ARGSUSED */ +static int +rdsv3_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg, + int mode, int32_t *rvalp, cred_t *cr) +{ + ksocket_t so4; + struct lifconf lifc; + struct lifreq lifr, *lifrp; + struct ifconf ifc; + struct ifreq ifr; + int rval = 0, rc, len; + int numifs; + int bufsize; + void *buf; + + RDSV3_DPRINTF4("rdsv3_ioctl", "enter: cmd: %d", cmd); + + /* Only ipv4 for now */ + rval = ksocket_socket(&so4, PF_INET, SOCK_DGRAM, 0, KSOCKET_NOSLEEP, + CRED()); + if (rval != 0) { + RDSV3_DPRINTF2("rdsv3_ioctl", "ksocket_socket returned %d", + rval); + return (rval); + } + + switch (cmd) { + case SIOCGLIFNUM : + case SIOCGIFNUM : + rval = rdsv3_do_ip_ioctl(so4, &buf, &bufsize, &numifs); + if (rval != 0) break; + if (cmd == SIOCGLIFNUM) { + (void) ddi_copyout(&numifs, (void *)arg, + sizeof (int), 0); + } else { + len = 0; + for (lifrp = (struct lifreq *)buf, rc = 0; rc < numifs; + rc++, lifrp++) { + if (strlen(lifrp->lifr_name) <= IFNAMSIZ) { + len++; + } + } + (void) ddi_copyout(&len, (void *)arg, + sizeof (int), 0); + } + kmem_free(buf, bufsize); + break; + + case SIOCGLIFCONF : + if (ddi_copyin((void *)arg, &lifc, sizeof (struct lifconf), 0) + != 0) { + RDSV3_DPRINTF2("rdsv3_ioctl", "ddi_copyin failed lifc"); + rval = EFAULT; + break; + } + + rval = rdsv3_do_ip_ioctl(so4, &buf, &bufsize, &numifs); + if (rval != 0) { + RDSV3_DPRINTF2("rdsv3_ioctl", + "rdsv3_do_ip_ioctl failed: %d", rval); + break; + } + + if ((lifc.lifc_len > 0) && (numifs > 0)) { + if (ddi_copyout(buf, (void *)lifc.lifc_req, + (lifc.lifc_len < bufsize) ? lifc.lifc_len : + bufsize, 0) != 0) { + RDSV3_DPRINTF2("rdsv3_ioctl", + "copyout of records failed"); + rval = EFAULT; + } + + } + + lifc.lifc_len = bufsize; + if (ddi_copyout(&lifc, (void *)arg, sizeof (struct lifconf), + 0) != 0) { + RDSV3_DPRINTF2("rdsv3_ioctl", + "copyout of lifconf failed"); + rval = EFAULT; + } + + kmem_free(buf, bufsize); + break; + + case SIOCGIFCONF : + case O_SIOCGIFCONF : + if (ddi_copyin((void *)arg, &ifc, sizeof (struct ifconf), 0) + != 0) { + RDSV3_DPRINTF2("rdsv3_ioctl", "ddi_copyin failed ifc"); + rval = EFAULT; + break; + } + + RDSV3_DPRINTF2("rdsv3_ioctl", + "O_SIOCGIFCONF: ifc_len: %d, req: %p", + ifc.ifc_len, ifc.ifc_req); + + rval = rdsv3_do_ip_ioctl_old(so4, &buf, &bufsize, &numifs); + if (rval != 0) { + RDSV3_DPRINTF2("rdsv3_ioctl", + "rdsv3_do_ip_ioctl_old failed: %d", rval); + break; + } + + if ((ifc.ifc_len > 0) && (numifs > 0)) { + if (ddi_copyout(buf, (void *)ifc.ifc_req, + (ifc.ifc_len < bufsize) ? 
ifc.ifc_len : + bufsize, 0) != 0) { + RDSV3_DPRINTF2("rdsv3_ioctl", + "copyout of records failed"); + rval = EFAULT; + } + + } + + ifc.ifc_len = bufsize; + if (ddi_copyout(&ifc, (void *)arg, sizeof (struct ifconf), + 0) != 0) { + RDSV3_DPRINTF2("rdsv3_ioctl", + "copyout of ifconf failed"); + rval = EFAULT; + } + + kmem_free(buf, bufsize); + break; + + case SIOCGLIFFLAGS : + case SIOCSLIFFLAGS : + case SIOCGLIFMTU : + case SIOCGLIFNETMASK : + case SIOCGLIFINDEX : + if (ddi_copyin((void *)arg, &lifr, sizeof (struct lifreq), 0) + != 0) { + RDSV3_DPRINTF2("rdsv3_ioctl", "ddi_copyin failed lifr"); + rval = EFAULT; + break; + } + + rc = ksocket_ioctl(so4, cmd, (intptr_t)&lifr, &rval, CRED()); + if (rc != 0) { + RDSV3_DPRINTF2("rdsv3_ioctl", + "ksocket_ioctl failed: %d, name: %s cmd: 0x%x", + rc, lifr.lifr_name, cmd); + break; + } + + (void) ddi_copyout(&lifr, (void *)arg, + sizeof (struct lifreq), 0); + break; + + case SIOCGIFFLAGS : + case SIOCSIFFLAGS : + case SIOCGIFMTU : + case SIOCGIFNETMASK : + case SIOCGIFINDEX : + if (ddi_copyin((void *)arg, &ifr, sizeof (struct ifreq), 0) + != 0) { + RDSV3_DPRINTF2("rdsv3_ioctl", "ddi_copyin failed ifr"); + rval = EFAULT; + break; + } + + RDSV3_DPRINTF2("rdsv3_ioctl", "1. name: %s", ifr.ifr_name); + + rc = ksocket_ioctl(so4, cmd, (intptr_t)&ifr, &rval, CRED()); + if (rc != 0) { + RDSV3_DPRINTF2("rdsv3_ioctl", + "ksocket_ioctl failed: %d, name: %s cmd: 0x%x", + rc, ifr.ifr_name, cmd); + + break; + } + + RDSV3_DPRINTF2("rdsv3_ioctl", "2. name: %s", ifr.ifr_name); + + (void) ddi_copyout(&ifr, (void *)arg, + sizeof (struct ifreq), 0); + break; + + default: + cmn_err(CE_CONT, "unsupported IOCTL cmd: %d \n", cmd); + rval = EOPNOTSUPP; + } + + (void) ksocket_close(so4, CRED()); + + RDSV3_DPRINTF4("rdsv3_ioctl", "return: %d cmd: %d", rval, cmd); + + *rvalp = rval; + return (rval); +} + +static int +rdsv3_cancel_sent_to(struct rdsv3_sock *rs, char *optval, int len) +{ + struct sockaddr_in sin; + + /* racing with another thread binding seems ok here */ + if (rs->rs_bound_addr == 0) + return (-ENOTCONN); /* XXX not a great errno */ + + if (len < sizeof (struct sockaddr_in)) + return (-EINVAL); + + if (ddi_copyin((void *)optval, &sin, sizeof (struct sockaddr_in), + 0) != 0) { + RDSV3_DPRINTF2("rdsv3_cancel_sent_to", "ddi_copyin failed sin"); + return (-EFAULT); + } + + rdsv3_send_drop_to(rs, &sin); + + return (0); +} + +static int +rdsv3_set_bool_option(unsigned char *optvar, char *optval, int optlen) +{ + int value = *optval; + + if (optlen < sizeof (int)) + return (-EINVAL); + *optvar = !!value; + return (0); +} + +static int +rdsv3_cong_monitor(struct rdsv3_sock *rs, char *optval, int optlen) +{ + int ret; + + ret = rdsv3_set_bool_option(&rs->rs_cong_monitor, optval, optlen); + if (ret == 0) { + if (rs->rs_cong_monitor) { + rdsv3_cong_add_socket(rs); + } else { + rdsv3_cong_remove_socket(rs); + rs->rs_cong_mask = 0; + rs->rs_cong_notify = 0; + } + } + return (ret); +} + +/*ARGSUSED*/ +static int +rdsv3_setsockopt(sock_lower_handle_t proto_handle, int level, + int optname, const void *optval, socklen_t optlen, cred_t *cr) +{ + struct rsock *sk = (struct rsock *)proto_handle; + struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); + int ret = 0; + + RDSV3_DPRINTF4("rdsv3_setsockopt", "enter(%p %d %d)", + rs, level, optname); + + switch (optname) { + case RDSV3_CANCEL_SENT_TO: + ret = rdsv3_cancel_sent_to(rs, (char *)optval, optlen); + break; + case RDSV3_GET_MR: + ret = rdsv3_get_mr(rs, optval, optlen); + break; + case RDSV3_FREE_MR: + ret = rdsv3_free_mr(rs, optval, 
optlen); + break; + case RDSV3_RECVERR: + ret = rdsv3_set_bool_option(&rs->rs_recverr, + (char *)optval, optlen); + break; + case RDSV3_CONG_MONITOR: + ret = rdsv3_cong_monitor(rs, (char *)optval, optlen); + break; + case SO_SNDBUF: + sk->sk_sndbuf = *(uint_t *)optval; + return (ret); + case SO_RCVBUF: + sk->sk_rcvbuf = *(uint_t *)optval; + return (ret); + default: +#if 1 + break; +#else + ret = -ENOPROTOOPT; +#endif + } +out: + return (ret); +} + +/* XXX */ +/*ARGSUSED*/ +static int +rdsv3_getsockopt(sock_lower_handle_t proto_handle, int level, + int optname, void *optval, socklen_t *optlen, cred_t *cr) +{ + struct rsock *sk = (struct rsock *)proto_handle; + struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); + int ret = 0; + + RDSV3_DPRINTF4("rdsv3_getsockopt", "enter(%p %d %d)", + rs, optname, *optlen); + + switch (optname) { + case SO_SNDBUF: + RDSV3_DPRINTF4("rdsv3_getsockopt", "SO_SNDBUF(%d)", + sk->sk_sndbuf); + if (*optlen != 0) { + *((int *)optval) = sk->sk_sndbuf; + *optlen = sizeof (uint_t); + } + return (ret); + case SO_RCVBUF: + RDSV3_DPRINTF4("rdsv3_getsockopt", "SO_RCVBUF(%d)", + sk->sk_rcvbuf); + if (*optlen != 0) { + *((int *)optval) = sk->sk_rcvbuf; + *optlen = sizeof (uint_t); + } + return (ret); + case RDSV3_RECVERR: + RDSV3_DPRINTF4("rdsv3_getsockopt", "RDSV3_RECVERR(%d)", + rs->rs_recverr); + if (*optlen < sizeof (int)) + return (-EINVAL); + else { + *(int *)optval = rs->rs_recverr; + *optlen = sizeof (int); + } + return (0); + default: + if ((optname >= RDSV3_INFO_FIRST) && + (optname <= RDSV3_INFO_LAST)) { + return (rdsv3_info_getsockopt(sk, optname, optval, + optlen)); + } + RDSV3_DPRINTF2("rdsv3_getsockopt", + "Unknown: level: %d optname: %d", level, optname); + ret = -ENOPROTOOPT; + } + + RDSV3_DPRINTF4("rdsv3_getsockopt", "return(%p %d %d)", + rs, optname, ret); + return (ret); +} + +/*ARGSUSED*/ +static int rdsv3_connect(sock_lower_handle_t proto_handle, + const struct sockaddr *addr, socklen_t addr_len, sock_connid_t *conn, + cred_t *cr) +{ + struct rsock *sk = (struct rsock *)proto_handle; + struct sockaddr_in *sin = (struct sockaddr_in *)addr; + struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); + int ret = 0; + + RDSV3_DPRINTF4("rdsv3_connect", "Enter(rs: %p)", rs); + + mutex_enter(&sk->sk_lock); + + if (addr_len != sizeof (struct sockaddr_in)) { + ret = -EINVAL; + goto out; + } + + if (sin->sin_family != AF_INET_OFFLOAD) { + ret = -EAFNOSUPPORT; + goto out; + } + + if (sin->sin_addr.s_addr == htonl(INADDR_ANY)) { + ret = -EDESTADDRREQ; + goto out; + } + + rs->rs_conn_addr = sin->sin_addr.s_addr; + rs->rs_conn_port = sin->sin_port; + + sk->sk_upcalls->su_connected(sk->sk_upper_handle, 0, NULL, -1); + + RDSV3_DPRINTF4("rdsv3_connect", "Return(rs: %p)", rs); + +out: + mutex_exit(&sk->sk_lock); + return (ret); +} + +/*ARGSUSED*/ +static int +rdsv3_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr) +{ + struct rsock *sk = (struct rsock *)proto_handle; + struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); + + RDSV3_DPRINTF4("rdsv3_shutdown", "Enter(rs: %p)", rs); + + return (0); +} + +/*ARGSUSED*/ +void +rdsv3_activate(sock_lower_handle_t proto_handle, + sock_upper_handle_t sock_handle, sock_upcalls_t *sock_upcalls, + int flags, cred_t *cr) +{ + struct rsock *sk = (struct rsock *)proto_handle; + struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); + + RDSV3_DPRINTF4("rdsv3_activate", "Enter(rs: %p)", rs); + + sk->sk_upcalls = sock_upcalls; + sk->sk_upper_handle = sock_handle; + + RDSV3_DPRINTF4("rdsv3_activate", "Return (rs: %p)", rs); +} + + +/* ARGSUSED */ +int 
+rdsv3_send_uio(sock_lower_handle_t proto_handle, uio_t *uio, + struct nmsghdr *msg, cred_t *cr) +{ + struct rsock *sk = (struct rsock *)proto_handle; + struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); + int ret; + + RDSV3_DPRINTF4("rdsv3_send_uio", "Enter(rs: %p)", rs); + ret = rdsv3_sendmsg(rs, uio, msg, uio->uio_resid); + + RDSV3_DPRINTF4("rdsv3_send_uio", "Return(rs: %p ret %d)", rs, ret); + if (ret < 0) { + return (-ret); + } + + return (0); +} + +/* ARGSUSED */ +int +rdsv3_recv_uio(sock_lower_handle_t proto_handle, uio_t *uio, + struct nmsghdr *msg, cred_t *cr) +{ + struct rsock *sk = (struct rsock *)proto_handle; + struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); + int ret; + + RDSV3_DPRINTF4("rdsv3_recv_uio", "Enter (rs: %p)", rs); + ret = rdsv3_recvmsg(rs, uio, msg, uio->uio_resid, msg->msg_flags); + + RDSV3_DPRINTF4("rdsv3_recv_uio", "Return(rs: %p ret %d)", rs, ret); + + if (ret < 0) { + return (-ret); + } + + return (0); +} + +/*ARGSUSED*/ +int +rdsv3_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *addr, + socklen_t *addr_len, cred_t *cr) +{ + struct sockaddr_in *sin = (struct sockaddr_in *)addr; + struct rsock *sk = (struct rsock *)proto_handle; + struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); + + RDSV3_DPRINTF2("rdsv3_getpeername", "enter(rs: %p)", rs); + + (void) memset(sin->sin_zero, 0, sizeof (sin->sin_zero)); + + /* racey, don't care */ + if (!rs->rs_conn_addr) + return (-ENOTCONN); + + sin->sin_port = rs->rs_conn_port; + sin->sin_addr.s_addr = rs->rs_conn_addr; + + sin->sin_family = AF_INET_OFFLOAD; + + *addr_len = sizeof (*sin); + return (0); +} + +void +rdsv3_clrflowctrl(sock_lower_handle_t proto_handle) +{ + struct rsock *sk = (struct rsock *)proto_handle; + struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); + + RDSV3_DPRINTF2("rdsv3_clrflowctrl", "enter(rs: %p)", rs); +} + +#ifndef __lock_lint +static struct sock_downcalls_s rdsv3_sock_downcalls = { + .sd_close = rdsv3_release, + .sd_bind = rdsv3_bind, + .sd_connect = rdsv3_connect, + .sd_accept = NULL, + .sd_getsockname = rdsv3_getname, + .sd_poll = rdsv3_poll, + .sd_ioctl = rdsv3_ioctl, + .sd_listen = NULL, + .sd_shutdown = rdsv3_shutdown, + .sd_setsockopt = rdsv3_setsockopt, + .sd_getsockopt = rdsv3_getsockopt, + .sd_send_uio = rdsv3_send_uio, + .sd_recv_uio = rdsv3_recv_uio, + .sd_activate = rdsv3_activate, + .sd_getpeername = rdsv3_getpeername, + .sd_send = NULL, + .sd_clr_flowctrl = NULL +}; +#else +static struct sock_downcalls_s rdsv3_sock_downcalls = { + rdsv3_activate, + NULL, + rdsv3_bind, + NULL, + rdsv3_connect, + rdsv3_getpeername, + rdsv3_getname, + rdsv3_getsockopt, + rdsv3_setsockopt, + NULL, + rdsv3_send_uio, + rdsv3_recv_uio, + rdsv3_poll, + rdsv3_shutdown, + NULL, + rdsv3_ioctl, + rdsv3_release +}; +#endif + +sock_lower_handle_t +rdsv3_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls, + uint_t *smodep, int *errorp, int flags, cred_t *credp) +{ + struct rdsv3_sock *rs; + struct rsock *sk; + + RDSV3_DPRINTF4("rdsv3_create", "Enter (family: %d type: %d, proto: %d " + "flags: %d", family, type, proto, flags); + + sk = rdsv3_sk_alloc(); + if (sk == NULL) + return (NULL); + rdsv3_sock_init_data(sk); + + rs = rdsv3_sk_to_rs(sk); + rs->rs_sk = sk; + mutex_init(&rs->rs_lock, NULL, MUTEX_DRIVER, NULL); + rw_init(&rs->rs_recv_lock, NULL, RW_DRIVER, NULL); + list_create(&rs->rs_send_queue, sizeof (struct rdsv3_message), + offsetof(struct rdsv3_message, m_sock_item)); + list_create(&rs->rs_recv_queue, sizeof (struct rdsv3_incoming), + offsetof(struct rdsv3_incoming, i_item)); + 
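+ /* pending-notification queue: congestion updates and RDMA completions */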
list_create(&rs->rs_notify_queue, sizeof (struct rdsv3_notifier), + offsetof(struct rdsv3_notifier, n_list)); + mutex_init(&rs->rs_rdma_lock, NULL, MUTEX_DRIVER, NULL); + avl_create(&rs->rs_rdma_keys, rdsv3_mr_compare, + sizeof (struct rdsv3_mr), offsetof(struct rdsv3_mr, r_rb_node)); + mutex_init(&rs->rs_conn_lock, NULL, MUTEX_DRIVER, NULL); + rs->rs_cred = credp; + rs->rs_zoneid = getzoneid(); + crhold(credp); + + mutex_enter(&rdsv3_sock_lock); + list_insert_tail(&rdsv3_sock_list, rs); + rdsv3_sock_count++; + /* Initialize RDMA/IB on the 1st socket if not done at attach */ + if (rdsv3_sock_count == 1) { + rdsv3_rdma_init(); + } + mutex_exit(&rdsv3_sock_lock); + + *errorp = 0; + *smodep = SM_ATOMIC; + *sock_downcalls = &rdsv3_sock_downcalls; + + RDSV3_DPRINTF4("rdsv3_create", "Return: %p", rs); + + return ((sock_lower_handle_t)rdsv3_rs_to_sk(rs)); +} + +void +rdsv3_sock_addref(struct rdsv3_sock *rs) +{ + RDSV3_DPRINTF4("rdsv3_sock_addref", "Enter(rs: %p)", rs); + rdsv3_sk_sock_hold(rdsv3_rs_to_sk(rs)); +} + +void +rdsv3_sock_put(struct rdsv3_sock *rs) +{ + RDSV3_DPRINTF4("rdsv3_sock_put", "Enter(rs: %p)", rs); + rdsv3_sk_sock_put(rdsv3_rs_to_sk(rs)); +} + +static void +rdsv3_sock_inc_info(struct rsock *sock, unsigned int len, + struct rdsv3_info_iterator *iter, struct rdsv3_info_lengths *lens) +{ + struct rdsv3_sock *rs; + struct rdsv3_incoming *inc; + unsigned int total = 0; + + RDSV3_DPRINTF4("rdsv3_sock_inc_info", "Enter(rs: %p)", + rdsv3_sk_to_rs(sock)); + + len /= sizeof (struct rdsv3_info_message); + + mutex_enter(&rdsv3_sock_lock); + + RDSV3_FOR_EACH_LIST_NODE(rs, &rdsv3_sock_list, rs_item) { + rw_enter(&rs->rs_recv_lock, RW_READER); + + /* XXX too lazy to maintain counts.. */ + RDSV3_FOR_EACH_LIST_NODE(inc, &rs->rs_recv_queue, i_item) { + total++; + if (total <= len) + rdsv3_inc_info_copy(inc, iter, inc->i_saddr, + rs->rs_bound_addr, 1); + } + + rw_exit(&rs->rs_recv_lock); + } + + mutex_exit(&rdsv3_sock_lock); + + lens->nr = total; + lens->each = sizeof (struct rdsv3_info_message); + + RDSV3_DPRINTF4("rdsv3_sock_inc_info", "return(rs: %p)", + rdsv3_sk_to_rs(sock)); +} + +static void +rdsv3_sock_info(struct rsock *sock, unsigned int len, + struct rdsv3_info_iterator *iter, struct rdsv3_info_lengths *lens) +{ + struct rdsv3_info_socket sinfo; + struct rdsv3_sock *rs; + unsigned long bytes; + + RDSV3_DPRINTF4("rdsv3_sock_info", "Enter(rs: %p)", + rdsv3_sk_to_rs(sock)); + + len /= sizeof (struct rdsv3_info_socket); + + mutex_enter(&rdsv3_sock_lock); + + if ((len < rdsv3_sock_count) || (iter->addr == NULL)) + goto out; + + bytes = sizeof (struct rdsv3_info_socket); + RDSV3_FOR_EACH_LIST_NODE(rs, &rdsv3_sock_list, rs_item) { + sinfo.sndbuf = rdsv3_sk_sndbuf(rs); + sinfo.rcvbuf = rdsv3_sk_rcvbuf(rs); + sinfo.bound_addr = rs->rs_bound_addr; + sinfo.connected_addr = rs->rs_conn_addr; + sinfo.bound_port = rs->rs_bound_port; + sinfo.connected_port = rs->rs_conn_port; + + rdsv3_info_copy(iter, &sinfo, bytes); + } + + RDSV3_DPRINTF4("rdsv3_sock_info", "Return(rs: %p)", + rdsv3_sk_to_rs(sock)); + +out: + lens->nr = rdsv3_sock_count; + lens->each = sizeof (struct rdsv3_info_socket); + + mutex_exit(&rdsv3_sock_lock); +} + +rdsv3_delayed_work_t *rdsv3_rdma_dwp = NULL; +uint_t rdsv3_rdma_init_delay = 5; /* secs */ +extern void rdsv3_rdma_init_worker(struct rdsv3_work_s *work); + +void +rdsv3_exit(void) +{ + RDSV3_DPRINTF4("rdsv3_exit", "Enter"); + + if (rdsv3_rdma_dwp) { + rdsv3_cancel_delayed_work(rdsv3_rdma_dwp); + } + + (void) ddi_taskq_dispatch(rdsv3_taskq, rdsv3_rdma_exit, + NULL, DDI_SLEEP); 
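+ /*
+ * rdsv3_rdma_exit clears rdsv3_rdma_listen_id once the RDMA
+ * listener is torn down; wait for that before continuing teardown.
+ */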
+ while (rdsv3_rdma_listen_id != NULL) {
+#ifndef __lock_lint
+ RDSV3_DPRINTF5("rdsv3", "%s-%d Waiting for rdsv3_rdma_exit",
+ __func__, __LINE__);
+#endif
+ delay(drv_usectohz(1000));
+ }
+
+ rdsv3_conn_exit();
+ rdsv3_cong_exit();
+ rdsv3_sysctl_exit();
+ rdsv3_threads_exit();
+ rdsv3_stats_exit();
+ rdsv3_info_deregister_func(RDSV3_INFO_SOCKETS, rdsv3_sock_info);
+ rdsv3_info_deregister_func(RDSV3_INFO_RECV_MESSAGES,
+ rdsv3_sock_inc_info);
+
+ if (rdsv3_rdma_dwp) {
+ kmem_free(rdsv3_rdma_dwp, sizeof (rdsv3_delayed_work_t));
+ rdsv3_rdma_dwp = NULL;
+ }
+
+ RDSV3_DPRINTF4("rdsv3_exit", "Return");
+}
+
+/*ARGSUSED*/
+int
+rdsv3_init()
+{
+ int ret;
+
+ RDSV3_DPRINTF4("rdsv3_init", "Enter");
+
+ rdsv3_cong_init();
+ ret = rdsv3_conn_init();
+ if (ret)
+ goto out;
+ ret = rdsv3_threads_init();
+ if (ret)
+ goto out_conn;
+ ret = rdsv3_sysctl_init();
+ if (ret)
+ goto out_threads;
+ ret = rdsv3_stats_init();
+ if (ret)
+ goto out_sysctl;
+
+ rdsv3_info_register_func(RDSV3_INFO_SOCKETS, rdsv3_sock_info);
+ rdsv3_info_register_func(RDSV3_INFO_RECV_MESSAGES, rdsv3_sock_inc_info);
+
+ /* rdsv3_rdma_init needs to be called with a little delay */
+ rdsv3_rdma_dwp = kmem_zalloc(sizeof (rdsv3_delayed_work_t), KM_SLEEP);
+ RDSV3_INIT_DELAYED_WORK(rdsv3_rdma_dwp, rdsv3_rdma_init_worker);
+ rdsv3_queue_delayed_work(rdsv3_wq, rdsv3_rdma_dwp,
+ rdsv3_rdma_init_delay);
+
+ RDSV3_DPRINTF4("rdsv3_init", "Return");
+
+ goto out;
+
+out_stats:
+ rdsv3_stats_exit();
+out_sysctl:
+ rdsv3_sysctl_exit();
+out_threads:
+ rdsv3_threads_exit();
+out_conn:
+ rdsv3_conn_exit();
+ rdsv3_cong_exit();
+out:
+ return (ret);
+}
diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/bind.c b/usr/src/uts/common/io/ib/clients/rdsv3/bind.c
new file mode 100644
index 0000000000..965b2977d0
--- /dev/null
+++ b/usr/src/uts/common/io/ib/clients/rdsv3/bind.c
@@ -0,0 +1,202 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/random.h>
+#include <sys/rds.h>
+
+#include <sys/ib/clients/rdsv3/rdsv3.h>
+#include <sys/ib/clients/rdsv3/rdsv3_debug.h>
+
+/*
+ * XXX this probably still needs more work.. no INADDR_ANY, and rbtrees aren't
+ * particularly zippy.
+ *
+ * This is now called for every incoming frame so we arguably care much more
+ * about it than we used to.
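+ *
+ * In this port the lookup structure is an AVL tree keyed on the bound
+ * port (see rdsv3_bind_tree_walk() below).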
+ */ +kmutex_t rdsv3_bind_lock; +avl_tree_t rdsv3_bind_tree; + +static struct rdsv3_sock * +rdsv3_bind_tree_walk(uint32_be_t addr, uint16_be_t port, + struct rdsv3_sock *insert) +{ + struct rdsv3_sock *rs; + avl_index_t where; + + rs = avl_find(&rdsv3_bind_tree, &port, &where); + if ((rs == NULL) && (insert != NULL)) { + insert->rs_bound_addr = addr; + insert->rs_bound_port = port; + avl_insert(&rdsv3_bind_tree, insert, where); + } + + return (rs); +} + +/* + * Return the rdsv3_sock bound at the given local address. + * + * The rx path can race with rdsv3_release. We notice if rdsv3_release() has + * marked this socket and don't return a rs ref to the rx path. + */ +struct rdsv3_sock * +rdsv3_find_bound(uint32_be_t addr, uint16_be_t port) +{ + struct rdsv3_sock *rs; + + RDSV3_DPRINTF4("rdsv3_find_bound", "Enter(port: %x)", port); + + mutex_enter(&rdsv3_bind_lock); + rs = rdsv3_bind_tree_walk(addr, port, NULL); + if (rs && !rdsv3_sk_sock_flag(rdsv3_rs_to_sk(rs), SOCK_DEAD)) + rdsv3_sock_addref(rs); + else + rs = NULL; + mutex_exit(&rdsv3_bind_lock); + + RDSV3_DPRINTF5("rdsv3_find_bound", "returning rs %p for %u.%u.%u.%u:%x", + rs, NIPQUAD(addr), port); + return (rs); +} + +/* returns -ve errno or +ve port */ +static int +rdsv3_add_bound(struct rdsv3_sock *rs, uint32_be_t addr, uint16_be_t *port) +{ + int ret = -EADDRINUSE; + uint16_t rover, last; + + RDSV3_DPRINTF4("rdsv3_add_bound", "Enter(port: %x)", *port); + + if (*port != 0) { + rover = ntohs(*port); + last = rover; + } else { + (void) random_get_pseudo_bytes((uint8_t *)&rover, + sizeof (uint16_t)); + rover = MAX(rover, 2); + last = rover - 1; + } + + mutex_enter(&rdsv3_bind_lock); + + do { + if (rover == 0) + rover++; + if (rdsv3_bind_tree_walk(addr, htons(rover), rs) == NULL) { + *port = htons(rover); + ret = 0; + break; + } + } while (rover++ != last); + + if (ret == 0) { + rs->rs_bound_addr = addr; + rs->rs_bound_port = *port; + rdsv3_sock_addref(rs); + + RDSV3_DPRINTF5("rdsv3_add_bound", + "rs %p binding to %u.%u.%u.%u:%x", + rs, NIPQUAD(addr), *port); + + } + + mutex_exit(&rdsv3_bind_lock); + + RDSV3_DPRINTF4("rdsv3_add_bound", "Return(port: %x)", *port); + + return (ret); +} + +void +rdsv3_remove_bound(struct rdsv3_sock *rs) +{ + RDSV3_DPRINTF4("rdsv3_remove_bound", "Enter(rs: %p)", rs); + + mutex_enter(&rdsv3_bind_lock); + + if (rs->rs_bound_addr) { + RDSV3_DPRINTF5("rdsv3_remove_bound", + "rs %p unbinding from %u.%u.%u.%u:%x", + rs, NIPQUAD(rs->rs_bound_addr), rs->rs_bound_port); + + avl_remove(&rdsv3_bind_tree, rs); + rdsv3_sock_put(rs); + rs->rs_bound_addr = 0; + } + + mutex_exit(&rdsv3_bind_lock); + + RDSV3_DPRINTF4("rdsv3_remove_bound", "Return(rs: %p)", rs); +} + +/* ARGSUSED */ +int +rdsv3_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa, + socklen_t len, cred_t *cr) +{ + struct rsock *sk = (struct rsock *)proto_handle; + sin_t *sin = (sin_t *)sa; + struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); + int ret; + + if (len != sizeof (sin_t) || (sin == NULL) || + !OK_32PTR((char *)sin)) { + RDSV3_DPRINTF2("rdsv3_bind", "address to bind not specified"); + return (EINVAL); + } + + RDSV3_DPRINTF4("rdsv3_bind", "Enter(rs: %p, addr: 0x%x, port: %x)", + rs, ntohl(sin->sin_addr.s_addr), htons(sin->sin_port)); + + if (sin->sin_addr.s_addr == INADDR_ANY) { + RDSV3_DPRINTF2("rdsv3_bind", "Invalid address"); + return (EINVAL); + } + + /* We don't allow multiple binds */ + if (rs->rs_bound_addr) { + RDSV3_DPRINTF2("rdsv3_bind", "Multiple binds not allowed"); + return (EINVAL); + } + + ret = rdsv3_add_bound(rs, sin->sin_addr.s_addr, 
&sin->sin_port);
+ if (ret) {
+ return (ret);
+ }
+
+ rs->rs_transport = rdsv3_trans_get_preferred(sin->sin_addr.s_addr);
+ if (rs->rs_transport == NULL) {
+ rdsv3_remove_bound(rs);
+ return (EADDRNOTAVAIL);
+ }
+
+ RDSV3_DPRINTF4("rdsv3_bind", "Return: Assigned port: %x to sock: %p",
+ sin->sin_port, rs);
+
+ return (0);
+}
diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/cong.c b/usr/src/uts/common/io/ib/clients/rdsv3/cong.c
new file mode 100644
index 0000000000..634459f0fe
--- /dev/null
+++ b/usr/src/uts/common/io/ib/clients/rdsv3/cong.c
@@ -0,0 +1,523 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+/*
+ * Copyright (c) 2007 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <sys/rds.h>
+
+#include <sys/ib/clients/rdsv3/rdsv3.h>
+#include <sys/ib/clients/rdsv3/rdsv3_impl.h>
+#include <sys/ib/clients/rdsv3/rdsv3_debug.h>
+
+/*
+ * This file implements the receive side of the unconventional congestion
+ * management in RDS.
+ *
+ * Messages waiting in the receive queue on the receiving socket are accounted
+ * against the socket's SO_RCVBUF option value. Only the payload bytes in the
+ * message are accounted for. If the number of bytes queued equals or exceeds
+ * rcvbuf then the socket is congested. All sends attempted to this socket's
+ * address should block or return -EWOULDBLOCK.
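+ *
+ * In sketch form (illustrative only, not the actual send path), a
+ * sender consults the peer map via rdsv3_cong_wait(conn->c_fcong,
+ * dport, nonblock, rs) below and blocks or fails the send when it
+ * returns nonzero.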
+ * + * Applications are expected to be reasonably tuned such that this situation + * very rarely occurs. An application encountering this "back-pressure" is + * considered a bug. + * + * This is implemented by having each node maintain bitmaps which indicate + * which ports on bound addresses are congested. As the bitmap changes it is + * sent through all the connections which terminate in the local address of the + * bitmap which changed. + * + * The bitmaps are allocated as connections are brought up. This avoids + * allocation in the interrupt handling path which queues messages on sockets. + * The dense bitmaps let transports send the entire bitmap on any bitmap change + * reasonably efficiently. This is much easier to implement than some + * finer-grained communication of per-port congestion. The sender does a very + * inexpensive bit test to test if the port it's about to send to is congested + * or not. + */ + +/* + * Interaction with poll is a tad tricky. We want all processes stuck in + * poll to wake up and check whether a congested destination became uncongested. + * The really sad thing is we have no idea which destinations the application + * wants to send to - we don't even know which rdsv3_connections are involved. + * So until we implement a more flexible rds poll interface, we have to make + * do with this: + * We maintain a global counter that is incremented each time a congestion map + * update is received. Each rds socket tracks this value, and if rdsv3_poll + * finds that the saved generation number is smaller than the global generation + * number, it wakes up the process. + */ +static atomic_t rdsv3_cong_generation = ATOMIC_INIT(0); + +/* + * Congestion monitoring + */ +static struct list rdsv3_cong_monitor; +static krwlock_t rdsv3_cong_monitor_lock; + +/* + * Yes, a global lock. It's used so infrequently that it's worth keeping it + * global to simplify the locking. It's only used in the following + * circumstances: + * + * - on connection buildup to associate a conn with its maps + * - on map changes to inform conns of a new map to send + * + * It's sadly ordered under the socket callback lock and the connection lock. + * Receive paths can mark ports congested from interrupt context so the + * lock masks interrupts. + */ +static kmutex_t rdsv3_cong_lock; +static struct avl_tree rdsv3_cong_tree; + +static struct rdsv3_cong_map * +rdsv3_cong_tree_walk(uint32_be_t addr, struct rdsv3_cong_map *insert) +{ + struct rdsv3_cong_map *map; + avl_index_t where; + + if (insert) { + map = avl_find(&rdsv3_cong_tree, insert, &where); + if (map == NULL) { + avl_insert(&rdsv3_cong_tree, insert, where); + return (NULL); + } + } else { + struct rdsv3_cong_map map1; + map1.m_addr = addr; + map = avl_find(&rdsv3_cong_tree, &map1, &where); + } + + return (map); +} + +/* + * There is only ever one bitmap for any address. Connections try and allocate + * these bitmaps in the process getting pointers to them. The bitmaps are only + * ever freed as the module is removed after all connections have been freed. 
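+ *
+ * Each map covers the full 16-bit port space: a (host-order) port p
+ * lands on bit (p % RDSV3_CONG_MAP_PAGE_BITS) of page
+ * (p / RDSV3_CONG_MAP_PAGE_BITS), as computed in
+ * rdsv3_cong_{set,clear,test}_bit() below.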
+ */ +static struct rdsv3_cong_map * +rdsv3_cong_from_addr(uint32_be_t addr) +{ + struct rdsv3_cong_map *map; + struct rdsv3_cong_map *ret = NULL; + unsigned long zp; + unsigned long i; + + RDSV3_DPRINTF4("rdsv3_cong_from_addr", "Enter(addr: %x)", ntohl(addr)); + + map = kmem_zalloc(sizeof (struct rdsv3_cong_map), KM_NOSLEEP); + if (map == NULL) + return (NULL); + + map->m_addr = addr; + rdsv3_init_waitqueue(&map->m_waitq); + list_create(&map->m_conn_list, sizeof (struct rdsv3_connection), + offsetof(struct rdsv3_connection, c_map_item)); + + for (i = 0; i < RDSV3_CONG_MAP_PAGES; i++) { + zp = (unsigned long)kmem_zalloc(PAGE_SIZE, KM_NOSLEEP); + if (zp == 0) + goto out; + map->m_page_addrs[i] = zp; + } + + mutex_enter(&rdsv3_cong_lock); + ret = rdsv3_cong_tree_walk(addr, map); + mutex_exit(&rdsv3_cong_lock); + + if (ret == NULL) { + ret = map; + map = NULL; + } + +out: + if (map) { + for (i = 0; i < RDSV3_CONG_MAP_PAGES && map->m_page_addrs[i]; + i++) + kmem_free((void *)map->m_page_addrs[i], PAGE_SIZE); + kmem_free(map, sizeof (*map)); + } + + RDSV3_DPRINTF5("rdsv3_cong_from_addr", "map %p for addr %x", + ret, ntohl(addr)); + + return (ret); +} + +/* + * Put the conn on its local map's list. This is called when the conn is + * really added to the hash. It's nested under the rdsv3_conn_lock, sadly. + */ +void +rdsv3_cong_add_conn(struct rdsv3_connection *conn) +{ + RDSV3_DPRINTF4("rdsv3_cong_add_conn", "Enter(conn: %p)", conn); + + RDSV3_DPRINTF5("rdsv3_cong_add_conn", "conn %p now on map %p", + conn, conn->c_lcong); + mutex_enter(&rdsv3_cong_lock); + list_insert_tail(&conn->c_lcong->m_conn_list, conn); + mutex_exit(&rdsv3_cong_lock); + + RDSV3_DPRINTF4("rdsv3_cong_add_conn", "Return(conn: %p)", conn); +} + +void +rdsv3_cong_remove_conn(struct rdsv3_connection *conn) +{ + RDSV3_DPRINTF4("rdsv3_cong_remove_conn", "Enter(conn: %p)", conn); + + RDSV3_DPRINTF5("rdsv3_cong_remove_conn", "removing conn %p from map %p", + conn, conn->c_lcong); + mutex_enter(&rdsv3_cong_lock); + list_remove_node(&conn->c_map_item); + mutex_exit(&rdsv3_cong_lock); + + RDSV3_DPRINTF4("rdsv3_cong_remove_conn", "Return(conn: %p)", conn); +} + +int +rdsv3_cong_get_maps(struct rdsv3_connection *conn) +{ + conn->c_lcong = rdsv3_cong_from_addr(conn->c_laddr); + conn->c_fcong = rdsv3_cong_from_addr(conn->c_faddr); + + if (conn->c_lcong == NULL || conn->c_fcong == NULL) + return (-ENOMEM); + + return (0); +} + +void +rdsv3_cong_queue_updates(struct rdsv3_cong_map *map) +{ + struct rdsv3_connection *conn; + + RDSV3_DPRINTF4("rdsv3_cong_queue_updates", "Enter(map: %p)", map); + + mutex_enter(&rdsv3_cong_lock); + + RDSV3_FOR_EACH_LIST_NODE(conn, &map->m_conn_list, c_map_item) { + if (!test_and_set_bit(0, &conn->c_map_queued)) { + rdsv3_stats_inc(s_cong_update_queued); + rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_send_w, 0); + } + } + + mutex_exit(&rdsv3_cong_lock); + + RDSV3_DPRINTF4("rdsv3_cong_queue_updates", "Return(map: %p)", map); +} + +void +rdsv3_cong_map_updated(struct rdsv3_cong_map *map, uint64_t portmask) +{ + RDSV3_DPRINTF4("rdsv3_cong_map_updated", + "waking map %p for %u.%u.%u.%u", + map, NIPQUAD(map->m_addr)); + rdsv3_stats_inc(s_cong_update_received); + atomic_add_32(&rdsv3_cong_generation, 1); +#if 0 +XXX + if (waitqueue_active(&map->m_waitq)) +#endif + rdsv3_wake_up(&map->m_waitq); +#if 0 +XXX + if (waitqueue_active(&rds_poll_waitq)) +#endif + rdsv3_wake_up_all(&rdsv3_poll_waitq); + + if (portmask && !list_is_empty(&rdsv3_cong_monitor)) { + struct rdsv3_sock *rs; + + rw_enter(&rdsv3_cong_monitor_lock, 
RW_READER); + RDSV3_FOR_EACH_LIST_NODE(rs, &rdsv3_cong_monitor, + rs_cong_list) { + mutex_enter(&rs->rs_lock); + rs->rs_cong_notify |= (rs->rs_cong_mask & portmask); + rs->rs_cong_mask &= ~portmask; + mutex_exit(&rs->rs_lock); + if (rs->rs_cong_notify) + rdsv3_wake_sk_sleep(rs); + } + rw_exit(&rdsv3_cong_monitor_lock); + } + + RDSV3_DPRINTF4("rdsv3_cong_map_updated", "Return(map: %p)", map); +} + +int +rdsv3_cong_updated_since(unsigned long *recent) +{ + unsigned long gen = atomic_get(&rdsv3_cong_generation); + + if (*recent == gen) + return (0); + *recent = gen; + return (1); +} + +/* + * These should be using generic_{test,__{clear,set}}_le_bit() but some old + * kernels don't have them. Sigh. + */ +#if defined(sparc) +#define LE_BIT_XOR ((BITS_PER_LONG-1) & ~0x7) +#else +#define LE_BIT_XOR 0 +#endif + +/* + * We're called under the locking that protects the sockets receive buffer + * consumption. This makes it a lot easier for the caller to only call us + * when it knows that an existing set bit needs to be cleared, and vice versa. + * We can't block and we need to deal with concurrent sockets working against + * the same per-address map. + */ +void +rdsv3_cong_set_bit(struct rdsv3_cong_map *map, uint16_be_t port) +{ + unsigned long i; + unsigned long off; + + RDSV3_DPRINTF4("rdsv3_cong_set_bit", + "setting congestion for %u.%u.%u.%u:%u in map %p", + NIPQUAD(map->m_addr), ntohs(port), map); + + i = ntohs(port) / RDSV3_CONG_MAP_PAGE_BITS; + off = ntohs(port) % RDSV3_CONG_MAP_PAGE_BITS; + + set_bit(off ^ LE_BIT_XOR, (void *)map->m_page_addrs[i]); +} + +void +rdsv3_cong_clear_bit(struct rdsv3_cong_map *map, uint16_be_t port) +{ + unsigned long i; + unsigned long off; + + RDSV3_DPRINTF4("rdsv3_cong_clear_bit", + "clearing congestion for %u.%u.%u.%u:%u in map %p\n", + NIPQUAD(map->m_addr), ntohs(port), map); + + i = ntohs(port) / RDSV3_CONG_MAP_PAGE_BITS; + off = ntohs(port) % RDSV3_CONG_MAP_PAGE_BITS; + + clear_bit(off ^ LE_BIT_XOR, (void *)map->m_page_addrs[i]); +} + +static int +rdsv3_cong_test_bit(struct rdsv3_cong_map *map, uint16_be_t port) +{ + unsigned long i; + unsigned long off; + + i = ntohs(port) / RDSV3_CONG_MAP_PAGE_BITS; + off = ntohs(port) % RDSV3_CONG_MAP_PAGE_BITS; + + RDSV3_DPRINTF5("rdsv3_cong_test_bit", "port: 0x%x i = %lx off = %lx", + ntohs(port), i, off); + + return (test_bit(off ^ LE_BIT_XOR, (void *)map->m_page_addrs[i])); +} + +#undef LE_BIT_XOR + +void +rdsv3_cong_add_socket(struct rdsv3_sock *rs) +{ + RDSV3_DPRINTF4("rdsv3_cong_add_socket", "Enter(rs: %p)", rs); + + rw_enter(&rdsv3_cong_monitor_lock, RW_WRITER); + if (!list_link_active(&rs->rs_cong_list)) + list_insert_head(&rdsv3_cong_monitor, rs); + rw_exit(&rdsv3_cong_monitor_lock); +} + +void +rdsv3_cong_remove_socket(struct rdsv3_sock *rs) +{ + struct rdsv3_cong_map *map; + + RDSV3_DPRINTF4("rdsv3_cong_remove_socket", "Enter(rs: %p)", rs); + + rw_enter(&rdsv3_cong_monitor_lock, RW_WRITER); + list_remove_node(&rs->rs_cong_list); + rw_exit(&rdsv3_cong_monitor_lock); + + /* update congestion map for now-closed port */ + mutex_enter(&rdsv3_cong_lock); + map = rdsv3_cong_tree_walk(rs->rs_bound_addr, NULL); + mutex_exit(&rdsv3_cong_lock); + + if (map && rdsv3_cong_test_bit(map, rs->rs_bound_port)) { + rdsv3_cong_clear_bit(map, rs->rs_bound_port); + rdsv3_cong_queue_updates(map); + } +} + +int +rdsv3_cong_wait(struct rdsv3_cong_map *map, uint16_be_t port, int nonblock, + struct rdsv3_sock *rs) +{ + int ret = 0; + + RDSV3_DPRINTF4("rdsv3_cong_wait", "Enter(rs: %p, mode: %d)", + rs, nonblock); + + if 
(!rdsv3_cong_test_bit(map, port)) + return (0); + if (nonblock) { + if (rs && rs->rs_cong_monitor) { + /* + * It would have been nice to have an atomic set_bit on + * a uint64_t. + */ + mutex_enter(&rs->rs_lock); + rs->rs_cong_mask |= + RDSV3_CONG_MONITOR_MASK(ntohs(port)); + mutex_exit(&rs->rs_lock); + + /* + * Test again - a congestion update may have arrived in + * the meantime. + */ + if (!rdsv3_cong_test_bit(map, port)) + return (0); + } + rdsv3_stats_inc(s_cong_send_error); + return (-ENOBUFS); + } + + rdsv3_stats_inc(s_cong_send_blocked); + RDSV3_DPRINTF3("rdsv3_cong_wait", "waiting on map %p for port %u", + map, ntohs(port)); + + mutex_enter(&map->m_waitq.waitq_mutex); + while (rdsv3_cong_test_bit(map, port)) { + if (cv_wait_sig(&map->m_waitq.waitq_cv, + &map->m_waitq.waitq_mutex) == 0) { + ret = -ERESTART; + break; + } + } + mutex_exit(&map->m_waitq.waitq_mutex); + + return (ret); +} + +void +rdsv3_cong_exit(void) +{ + struct rdsv3_cong_map *map; + unsigned long i; + + RDSV3_DPRINTF4("rdsv3_cong_exit", "Enter"); + + while ((map = avl_first(&rdsv3_cong_tree))) { + RDSV3_DPRINTF5("rdsv3_cong_exit", "freeing map %p\n", map); + avl_remove(&rdsv3_cong_tree, map); + for (i = 0; i < RDSV3_CONG_MAP_PAGES && map->m_page_addrs[i]; + i++) + kmem_free((void *)map->m_page_addrs[i], PAGE_SIZE); + kmem_free(map, sizeof (*map)); + } + + RDSV3_DPRINTF4("rdsv3_cong_exit", "Return"); +} + +/* + * Allocate a RDS message containing a congestion update. + */ +struct rdsv3_message * +rdsv3_cong_update_alloc(struct rdsv3_connection *conn) +{ + struct rdsv3_cong_map *map = conn->c_lcong; + struct rdsv3_message *rm; + + rm = rdsv3_message_map_pages(map->m_page_addrs, RDSV3_CONG_MAP_BYTES); + if (!IS_ERR(rm)) + rm->m_inc.i_hdr.h_flags = RDSV3_FLAG_CONG_BITMAP; + + return (rm); +} + +static int +rdsv3_cong_compare(const void *map1, const void *map2) +{ +#define addr1 ((struct rdsv3_cong_map *)map1)->m_addr +#define addr2 ((struct rdsv3_cong_map *)map2)->m_addr + + if (addr1 < addr2) + return (-1); + if (addr1 > addr2) + return (1); + return (0); +} + +void +rdsv3_cong_init(void) +{ + list_create(&rdsv3_cong_monitor, sizeof (struct rdsv3_sock), + offsetof(struct rdsv3_sock, rs_cong_list)); + rw_init(&rdsv3_cong_monitor_lock, NULL, RW_DRIVER, NULL); + mutex_init(&rdsv3_cong_lock, NULL, MUTEX_DRIVER, NULL); + avl_create(&rdsv3_cong_tree, rdsv3_cong_compare, + sizeof (struct rdsv3_cong_map), offsetof(struct rdsv3_cong_map, + m_rb_node)); +} diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/connection.c b/usr/src/uts/common/io/ib/clients/rdsv3/connection.c new file mode 100644 index 0000000000..4df9489c9f --- /dev/null +++ b/usr/src/uts/common/io/ib/clients/rdsv3/connection.c @@ -0,0 +1,546 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <sys/types.h> +#include <sys/kmem.h> +#include <sys/rds.h> + +#include <sys/ib/clients/rdsv3/rdsv3.h> +#include <sys/ib/clients/rdsv3/loop.h> +#include <sys/ib/clients/rdsv3/rdsv3_debug.h> + +/* converting this to RCU is a chore for another day.. */ +static krwlock_t rdsv3_conn_lock; +static unsigned long rdsv3_conn_count; +struct avl_tree rdsv3_conn_hash; +static struct kmem_cache *rdsv3_conn_slab = NULL; + +#define rdsv3_conn_info_set(var, test, suffix) do { \ + if (test) \ + var |= RDSV3_INFO_CONNECTION_FLAG_##suffix; \ +} while (0) + +static inline int +rdsv3_conn_is_sending(struct rdsv3_connection *conn) +{ + int ret = 0; + + if (!mutex_tryenter(&conn->c_send_lock)) + ret = 1; + else + mutex_exit(&conn->c_send_lock); + + return (ret); +} + +static struct rdsv3_connection * +rdsv3_conn_lookup(uint32_be_t laddr, uint32_be_t faddr, avl_index_t *pos) +{ + struct rdsv3_connection *conn; + struct rdsv3_conn_info_s conn_info; + avl_index_t place = 0; + + conn_info.c_laddr = laddr; + conn_info.c_faddr = faddr; + + conn = avl_find(&rdsv3_conn_hash, &conn_info, &place); + + RDSV3_DPRINTF5("rdsv3_conn_lookup", + "returning conn %p for %u.%u.%u.%u -> %u.%u.%u.%u", + conn, NIPQUAD(laddr), NIPQUAD(faddr)); + + if (pos != NULL) + *pos = place; + + return (conn); +} + +/* + * This is called by transports as they're bringing down a connection. + * It clears partial message state so that the transport can start sending + * and receiving over this connection again in the future. It is up to + * the transport to have serialized this call with its send and recv. 
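+ * (Note that rdsv3_conn_reset() deliberately preserves c_next_rx_seq;
+ * see the comment in its body below.)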
+ */
+void
+rdsv3_conn_reset(struct rdsv3_connection *conn)
+{
+ RDSV3_DPRINTF2("rdsv3_conn_reset",
+ "connection %u.%u.%u.%u to %u.%u.%u.%u reset",
+ NIPQUAD(conn->c_laddr), NIPQUAD(conn->c_faddr));
+
+ rdsv3_stats_inc(s_conn_reset);
+ rdsv3_send_reset(conn);
+ conn->c_flags = 0;
+
+ /*
+ * Do not clear next_rx_seq here, else we cannot distinguish
+ * retransmitted packets from new packets, and will hand all
+ * of them to the application. That is not consistent with the
+ * reliability guarantees of RDS.
+ */
+}
+
+/*
+ * There is only ever one 'conn' for a given pair of addresses in the
+ * system at a time. They contain messages to be retransmitted and so
+ * span the lifetime of the actual underlying transport connections.
+ *
+ * For now they are not garbage collected once they're created. They
+ * are torn down as the module is removed, if ever.
+ */
+static struct rdsv3_connection *
+__rdsv3_conn_create(uint32_be_t laddr, uint32_be_t faddr,
+ struct rdsv3_transport *trans, int gfp,
+ int is_outgoing)
+{
+ struct rdsv3_connection *conn, *parent = NULL;
+ avl_index_t pos;
+ int ret;
+
+ rw_enter(&rdsv3_conn_lock, RW_READER);
+ conn = rdsv3_conn_lookup(laddr, faddr, &pos);
+ if (conn &&
+ conn->c_loopback &&
+ conn->c_trans != &rdsv3_loop_transport &&
+ !is_outgoing) {
+ /*
+ * This is a looped back IB connection, and we're
+ * called by the code handling the incoming connect.
+ * We need a second connection object into which we
+ * can stick the other QP.
+ */
+ parent = conn;
+ conn = parent->c_passive;
+ }
+ rw_exit(&rdsv3_conn_lock);
+ if (conn)
+ goto out;
+
+ RDSV3_DPRINTF2("__rdsv3_conn_create", "Enter(%x -> %x)",
+ ntohl(laddr), ntohl(faddr));
+
+ conn = kmem_cache_alloc(rdsv3_conn_slab, gfp);
+ if (conn == NULL) {
+ conn = ERR_PTR(-ENOMEM);
+ goto out;
+ }
+
+ /* see rdsv3_conn_constructor */
+ conn->c_laddr = laddr;
+ conn->c_faddr = faddr;
+
+ ret = rdsv3_cong_get_maps(conn);
+ if (ret) {
+ kmem_cache_free(rdsv3_conn_slab, conn);
+ conn = ERR_PTR(ret);
+ goto out;
+ }
+
+ /*
+ * This is where a connection becomes loopback. If *any* RDS sockets
+ * can bind to the destination address then we'd rather the messages
+ * flow through loopback rather than either transport.
+ */
+ if (rdsv3_trans_get_preferred(faddr)) {
+ conn->c_loopback = 1;
+ if (is_outgoing && trans->t_prefer_loopback) {
+ /*
+ * "outgoing" connection - and the transport
+ * says it wants the connection handled by the
+ * loopback transport. This is what TCP does.
+ */
+ trans = &rdsv3_loop_transport;
+ }
+ }
+
+ conn->c_trans = trans;
+
+ ret = trans->conn_alloc(conn, gfp);
+ if (ret) {
+ kmem_cache_free(rdsv3_conn_slab, conn);
+ conn = ERR_PTR(ret);
+ goto out;
+ }
+
+ conn->c_state = RDSV3_CONN_DOWN;
+ conn->c_reconnect_jiffies = 0;
+ RDSV3_INIT_DELAYED_WORK(&conn->c_send_w, rdsv3_send_worker);
+ RDSV3_INIT_DELAYED_WORK(&conn->c_recv_w, rdsv3_recv_worker);
+ RDSV3_INIT_DELAYED_WORK(&conn->c_conn_w, rdsv3_connect_worker);
+ RDSV3_INIT_WORK(&conn->c_down_w, rdsv3_shutdown_worker);
+ mutex_init(&conn->c_cm_lock, NULL, MUTEX_DRIVER, NULL);
+ conn->c_flags = 0;
+
+ RDSV3_DPRINTF2("__rdsv3_conn_create",
+ "allocated conn %p for %u.%u.%u.%u -> %u.%u.%u.%u over %s %s",
+ conn, NIPQUAD(laddr), NIPQUAD(faddr),
+ trans->t_name ? trans->t_name : "[unknown]",
+ is_outgoing ? "(outgoing)" : "");
+
+ /*
+ * Since we ran without holding the conn lock, someone could
+ * have created the same conn (either normal or passive) in the
+ * interim. We check while holding the lock.
If we won, we complete + * init and return our conn. If we lost, we rollback and return the + * other one. + */ + rw_enter(&rdsv3_conn_lock, RW_WRITER); + if (parent) { + /* Creating passive conn */ + if (parent->c_passive) { + trans->conn_free(conn->c_transport_data); + kmem_cache_free(rdsv3_conn_slab, conn); + conn = parent->c_passive; + } else { + parent->c_passive = conn; + rdsv3_cong_add_conn(conn); + rdsv3_conn_count++; + } + } else { + /* Creating normal conn */ + struct rdsv3_connection *found; + + found = rdsv3_conn_lookup(laddr, faddr, &pos); + if (found) { + trans->conn_free(conn->c_transport_data); + kmem_cache_free(rdsv3_conn_slab, conn); + conn = found; + } else { + avl_insert(&rdsv3_conn_hash, conn, pos); + rdsv3_cong_add_conn(conn); + rdsv3_conn_count++; + } + } + + rw_exit(&rdsv3_conn_lock); + + RDSV3_DPRINTF2("__rdsv3_conn_create", "Return(conn: %p)", conn); + +out: + return (conn); +} + +struct rdsv3_connection * +rdsv3_conn_create(uint32_be_t laddr, uint32_be_t faddr, + struct rdsv3_transport *trans, int gfp) +{ + return (__rdsv3_conn_create(laddr, faddr, trans, gfp, 0)); +} + +struct rdsv3_connection * +rdsv3_conn_create_outgoing(uint32_be_t laddr, uint32_be_t faddr, + struct rdsv3_transport *trans, int gfp) +{ + return (__rdsv3_conn_create(laddr, faddr, trans, gfp, 1)); +} + +void +rdsv3_conn_destroy(struct rdsv3_connection *conn) +{ + struct rdsv3_message *rm, *rtmp; + + RDSV3_DPRINTF4("rdsv3_conn_destroy", + "freeing conn %p for %u.%u.%u.%u -> %u.%u.%u.%u", + conn, NIPQUAD(conn->c_laddr), NIPQUAD(conn->c_faddr)); + + avl_remove(&rdsv3_conn_hash, conn); + + /* wait for the rds thread to shut it down */ + conn->c_state = RDSV3_CONN_ERROR; + rdsv3_cancel_delayed_work(&conn->c_conn_w); + rdsv3_cancel_delayed_work(&conn->c_send_w); + rdsv3_cancel_delayed_work(&conn->c_recv_w); + rdsv3_shutdown_worker(&conn->c_down_w); + rdsv3_flush_workqueue(rdsv3_wq); + + /* tear down queued messages */ + RDSV3_FOR_EACH_LIST_NODE_SAFE(rm, rtmp, + &conn->c_send_queue, + m_conn_item) { + list_remove_node(&rm->m_conn_item); + ASSERT(!list_link_active(&rm->m_sock_item)); + rdsv3_message_put(rm); + } + if (conn->c_xmit_rm) + rdsv3_message_put(conn->c_xmit_rm); + + conn->c_trans->conn_free(conn->c_transport_data); + + /* + * The congestion maps aren't freed up here. They're + * freed by rdsv3_cong_exit() after all the connections + * have been freed. + */ + rdsv3_cong_remove_conn(conn); + + ASSERT(list_is_empty(&conn->c_retrans)); + kmem_cache_free(rdsv3_conn_slab, conn); + + rdsv3_conn_count--; +} + +/* ARGSUSED */ +static void +rdsv3_conn_message_info(struct rsock *sock, unsigned int len, + struct rdsv3_info_iterator *iter, + struct rdsv3_info_lengths *lens, + int want_send) +{ + struct list *list; + struct rdsv3_connection *conn; + struct rdsv3_message *rm; + unsigned int total = 0; + + RDSV3_DPRINTF4("rdsv3_conn_message_info", "Enter"); + + len /= sizeof (struct rdsv3_info_message); + + rw_enter(&rdsv3_conn_lock, RW_READER); + + if (avl_is_empty(&rdsv3_conn_hash)) { + /* no connections */ + rw_exit(&rdsv3_conn_lock); + return; + } + + conn = (struct rdsv3_connection *)avl_first(&rdsv3_conn_hash); + + do { + if (want_send) + list = &conn->c_send_queue; + else + list = &conn->c_retrans; + + mutex_enter(&conn->c_lock); + + /* XXX too lazy to maintain counts.. 
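+		 * Every queued message is counted in 'total', but only the
+		 * first 'len' entries are copied out, so the caller can
+		 * compare lens->nr against its buffer size and retry with
+		 * a bigger buffer if the snapshot was truncated.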
*/ + RDSV3_FOR_EACH_LIST_NODE(rm, list, m_conn_item) { + total++; + if (total <= len) + rdsv3_inc_info_copy(&rm->m_inc, iter, + conn->c_laddr, conn->c_faddr, 0); + } + + mutex_exit(&conn->c_lock); + + conn = AVL_NEXT(&rdsv3_conn_hash, conn); + } while (conn != NULL); + + rw_exit(&rdsv3_conn_lock); + + lens->nr = total; + lens->each = sizeof (struct rdsv3_info_message); + + RDSV3_DPRINTF4("rdsv3_conn_message_info", "Return"); +} + +static void +rdsv3_conn_message_info_send(struct rsock *sock, unsigned int len, + struct rdsv3_info_iterator *iter, + struct rdsv3_info_lengths *lens) +{ + rdsv3_conn_message_info(sock, len, iter, lens, 1); +} + +static void +rdsv3_conn_message_info_retrans(struct rsock *sock, + unsigned int len, + struct rdsv3_info_iterator *iter, + struct rdsv3_info_lengths *lens) +{ + rdsv3_conn_message_info(sock, len, iter, lens, 0); +} + +/* ARGSUSED */ +void +rdsv3_for_each_conn_info(struct rsock *sock, unsigned int len, + struct rdsv3_info_iterator *iter, + struct rdsv3_info_lengths *lens, + int (*visitor)(struct rdsv3_connection *, void *), + size_t item_len) +{ +#ifndef __lock_lint + uint64_t buffer[(item_len + 7) / 8]; +#else + uint64_t buffer[256]; +#endif + struct rdsv3_connection *conn; + + rw_enter(&rdsv3_conn_lock, RW_READER); + + lens->nr = 0; + lens->each = item_len; + + if (avl_is_empty(&rdsv3_conn_hash)) { + /* no connections */ + rw_exit(&rdsv3_conn_lock); + return; + } + + conn = (struct rdsv3_connection *)avl_first(&rdsv3_conn_hash); + + do { + /* XXX no c_lock usage.. */ + if (!visitor(conn, buffer)) + continue; + + /* + * We copy as much as we can fit in the buffer, + * but we count all items so that the caller + * can resize the buffer. + */ + if (len >= item_len) { + rdsv3_info_copy(iter, buffer, item_len); + len -= item_len; + } + lens->nr++; + conn = AVL_NEXT(&rdsv3_conn_hash, conn); + } while (conn != NULL); + + rw_exit(&rdsv3_conn_lock); +} + +static int +rdsv3_conn_info_visitor(struct rdsv3_connection *conn, void *buffer) +{ + struct rdsv3_info_connection *cinfo = buffer; + + cinfo->next_tx_seq = conn->c_next_tx_seq; + cinfo->next_rx_seq = conn->c_next_rx_seq; + cinfo->laddr = conn->c_laddr; + cinfo->faddr = conn->c_faddr; + (void) strncpy((char *)cinfo->transport, conn->c_trans->t_name, + sizeof (cinfo->transport)); + cinfo->flags = 0; + + rdsv3_conn_info_set(cinfo->flags, + rdsv3_conn_is_sending(conn), SENDING); + /* XXX Future: return the state rather than these funky bits */ + rdsv3_conn_info_set(cinfo->flags, + atomic_get(&conn->c_state) == RDSV3_CONN_CONNECTING, + CONNECTING); + rdsv3_conn_info_set(cinfo->flags, + atomic_get(&conn->c_state) == RDSV3_CONN_UP, + CONNECTED); + return (1); +} + +static void +rdsv3_conn_info(struct rsock *sock, unsigned int len, + struct rdsv3_info_iterator *iter, struct rdsv3_info_lengths *lens) +{ + rdsv3_for_each_conn_info(sock, len, iter, lens, + rdsv3_conn_info_visitor, sizeof (struct rdsv3_info_connection)); +} + +int +rdsv3_conn_init() +{ + RDSV3_DPRINTF4("rdsv3_conn_init", "Enter"); + + rdsv3_conn_slab = kmem_cache_create("rdsv3_connection", + sizeof (struct rdsv3_connection), 0, rdsv3_conn_constructor, + rdsv3_conn_destructor, NULL, NULL, NULL, 0); + if (rdsv3_conn_slab == NULL) { + RDSV3_DPRINTF1("rdsv3_conn_init", + "kmem_cache_create(rdsv3_conn_slab) failed"); + return (-1); + } + + avl_create(&rdsv3_conn_hash, rdsv3_conn_compare, + sizeof (struct rdsv3_connection), offsetof(struct rdsv3_connection, + c_hash_node)); + + rw_init(&rdsv3_conn_lock, NULL, RW_DRIVER, NULL); + + rdsv3_loop_init(); + + 
rdsv3_info_register_func(RDSV3_INFO_CONNECTIONS, rdsv3_conn_info); + rdsv3_info_register_func(RDSV3_INFO_SEND_MESSAGES, + rdsv3_conn_message_info_send); + rdsv3_info_register_func(RDSV3_INFO_RETRANS_MESSAGES, + rdsv3_conn_message_info_retrans); + + RDSV3_DPRINTF4("rdsv3_conn_init", "Return"); + + return (0); +} + +void +rdsv3_conn_exit() +{ + RDSV3_DPRINTF4("rdsv3_conn_exit", "Enter"); + + rdsv3_loop_exit(); + + rw_destroy(&rdsv3_conn_lock); + avl_destroy(&rdsv3_conn_hash); + + ASSERT(rdsv3_conn_slab); + kmem_cache_destroy(rdsv3_conn_slab); + + RDSV3_DPRINTF4("rdsv3_conn_exit", "Return"); +} + +/* + * Force a disconnect + */ +void +rdsv3_conn_drop(struct rdsv3_connection *conn) +{ + conn->c_state = RDSV3_CONN_ERROR; + rdsv3_queue_work(rdsv3_wq, &conn->c_down_w); +} diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/ib.c b/usr/src/uts/common/io/ib/clients/rdsv3/ib.c new file mode 100644 index 0000000000..3b2adb3932 --- /dev/null +++ b/usr/src/uts/common/io/ib/clients/rdsv3/ib.c @@ -0,0 +1,410 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ +#include <sys/sysmacros.h> +#include <sys/rds.h> + +#include <sys/ib/ibtl/ibti.h> +#include <sys/ib/clients/rdsv3/rdsv3.h> +#include <sys/ib/clients/rdsv3/ib.h> +#include <sys/ib/clients/rdsv3/rdsv3_debug.h> + +unsigned int rdsv3_ib_retry_count = RDSV3_IB_DEFAULT_RETRY_COUNT; + +struct list rdsv3_ib_devices; + +/* NOTE: if also grabbing ibdev lock, grab this first */ +kmutex_t ib_nodev_conns_lock; +list_t ib_nodev_conns; + +void +rdsv3_ib_add_one(ib_device_t *device) +{ + struct rdsv3_ib_device *rds_ibdev; + ibt_hca_attr_t *dev_attr; + + RDSV3_DPRINTF4("rdsv3_ib_add_one", "device: %p", device); + + /* Only handle IB (no iWARP) devices */ + if (device->node_type != RDMA_NODE_IB_CA) + return; + + dev_attr = (ibt_hca_attr_t *)kmem_alloc(sizeof (*dev_attr), + KM_NOSLEEP); + if (!dev_attr) + return; + + if (ibt_query_hca(ib_get_ibt_hca_hdl(device), dev_attr)) { + RDSV3_DPRINTF5("rdsv3_ib_add_one", + "Query device failed for %s", device->name); + goto free_attr; + } + + /* We depend on Reserved Lkey */ + if (!(dev_attr->hca_flags2 & IBT_HCA2_RES_LKEY)) { + RDSV3_DPRINTF5("rdsv3_ib_add_one", + "Reserved Lkey support is required: %s", + device->name); + goto free_attr; + } + + rds_ibdev = kmem_zalloc(sizeof (*rds_ibdev), KM_NOSLEEP); + if (!rds_ibdev) + goto free_attr; + + mutex_init(&rds_ibdev->spinlock, NULL, MUTEX_DRIVER, NULL); + + rds_ibdev->max_wrs = dev_attr->hca_max_chan_sz; + rds_ibdev->max_sge = min(dev_attr->hca_max_sgl, RDSV3_IB_MAX_SGE); + + rds_ibdev->dev = device; + rds_ibdev->pd = ib_alloc_pd(device); + if (IS_ERR(rds_ibdev->pd)) + goto free_dev; + + if (rdsv3_ib_create_mr_pool(rds_ibdev) != 0) { + goto free_dev; + } + + list_create(&rds_ibdev->ipaddr_list, sizeof (struct rdsv3_ib_ipaddr), + offsetof(struct rdsv3_ib_ipaddr, list)); + list_create(&rds_ibdev->conn_list, sizeof (struct rdsv3_ib_connection), + offsetof(struct rdsv3_ib_connection, ib_node)); + + list_insert_tail(&rdsv3_ib_devices, rds_ibdev); + + ib_set_client_data(device, &rdsv3_ib_client, rds_ibdev); + + RDSV3_DPRINTF4("rdsv3_ib_add_one", "Return: device: %p", device); + + goto free_attr; + +err_pd: + (void) ib_dealloc_pd(rds_ibdev->pd); +free_dev: + kmem_free(rds_ibdev, sizeof (*rds_ibdev)); +free_attr: + kmem_free(dev_attr, sizeof (*dev_attr)); +} + +void +rdsv3_ib_remove_one(struct ib_device *device) +{ + struct rdsv3_ib_device *rds_ibdev; + struct rdsv3_ib_ipaddr *i_ipaddr, *i_next; + + RDSV3_DPRINTF4("rdsv3_ib_remove_one", "device: %p", device); + + rds_ibdev = ib_get_client_data(device, &rdsv3_ib_client); + if (!rds_ibdev) + return; + + RDSV3_FOR_EACH_LIST_NODE_SAFE(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, + list) { + list_remove_node(&i_ipaddr->list); + kmem_free(i_ipaddr, sizeof (*i_ipaddr)); + } + + rdsv3_ib_destroy_conns(rds_ibdev); + + rdsv3_ib_destroy_mr_pool(rds_ibdev); + +#if 0 + while (ib_dealloc_pd(rds_ibdev->pd)) { +#ifndef __lock_lint + RDSV3_DPRINTF5("rdsv3_ib_remove_one", + "%s-%d Failed to dealloc pd %p", + __func__, __LINE__, rds_ibdev->pd); +#endif + delay(drv_usectohz(1000)); + } +#else + if (ib_dealloc_pd(rds_ibdev->pd)) { +#ifndef __lock_lint + RDSV3_DPRINTF2("rdsv3_ib_remove_one", + "%s-%d Failed to dealloc pd %p", + __func__, __LINE__, rds_ibdev->pd); +#endif + } +#endif + + list_destroy(&rds_ibdev->ipaddr_list); + list_destroy(&rds_ibdev->conn_list); + list_remove_node(&rds_ibdev->list); + kmem_free(rds_ibdev, sizeof (*rds_ibdev)); + + RDSV3_DPRINTF4("rdsv3_ib_remove_one", "Return: device: %p", device); +} + +#ifndef __lock_lint +struct ib_client rdsv3_ib_client = { + .name = 
"rdsv3_ib", + .add = rdsv3_ib_add_one, + .remove = rdsv3_ib_remove_one, + .clnt_hdl = NULL, + .state = IB_CLNT_UNINITIALIZED +}; +#else +struct ib_client rdsv3_ib_client = { + "rdsv3_ib", + rdsv3_ib_add_one, + rdsv3_ib_remove_one, + NULL, + NULL, + IB_CLNT_UNINITIALIZED +}; +#endif + +static int +rds_ib_conn_info_visitor(struct rdsv3_connection *conn, + void *buffer) +{ + struct rdsv3_info_rdma_connection *iinfo = buffer; + struct rdsv3_ib_connection *ic; + + RDSV3_DPRINTF4("rds_ib_conn_info_visitor", "conn: %p buffer: %p", + conn, buffer); + + /* We will only ever look at IB transports */ + if (conn->c_trans != &rdsv3_ib_transport) + return (0); + + iinfo->src_addr = conn->c_laddr; + iinfo->dst_addr = conn->c_faddr; + + (void) memset(&iinfo->src_gid, 0, sizeof (iinfo->src_gid)); + (void) memset(&iinfo->dst_gid, 0, sizeof (iinfo->dst_gid)); + if (rdsv3_conn_state(conn) == RDSV3_CONN_UP) { + struct rdsv3_ib_device *rds_ibdev; + struct rdma_dev_addr *dev_addr; + + ic = conn->c_transport_data; + dev_addr = &ic->i_cm_id->route.addr.dev_addr; + + ib_addr_get_sgid(dev_addr, (union ib_gid *)&iinfo->src_gid); + ib_addr_get_dgid(dev_addr, (union ib_gid *)&iinfo->dst_gid); + + rds_ibdev = ib_get_client_data(ic->i_cm_id->device, + &rdsv3_ib_client); + iinfo->max_send_wr = ic->i_send_ring.w_nr; + iinfo->max_recv_wr = ic->i_recv_ring.w_nr; + iinfo->max_send_sge = rds_ibdev->max_sge; + } + + RDSV3_DPRINTF4("rds_ib_conn_info_visitor", "conn: %p buffer: %p", + conn, buffer); + return (1); +} + +static void +rds_ib_ic_info(struct rsock *sock, unsigned int len, + struct rdsv3_info_iterator *iter, + struct rdsv3_info_lengths *lens) +{ + RDSV3_DPRINTF4("rds_ib_ic_info", "sk: %p iter: %p, lens: %p, len: %d", + sock, iter, lens, len); + + rdsv3_for_each_conn_info(sock, len, iter, lens, + rds_ib_conn_info_visitor, + sizeof (struct rdsv3_info_rdma_connection)); +} + +/* + * Early RDS/IB was built to only bind to an address if there is an IPoIB + * device with that address set. + * + * If it were me, I'd advocate for something more flexible. Sending and + * receiving should be device-agnostic. Transports would try and maintain + * connections between peers who have messages queued. Userspace would be + * allowed to influence which paths have priority. We could call userspace + * asserting this policy "routing". + */ +static int +rds_ib_laddr_check(uint32_be_t addr) +{ + int ret; + struct rdma_cm_id *cm_id; + struct sockaddr_in sin; + + RDSV3_DPRINTF4("rds_ib_laddr_check", "addr: %x", ntohl(addr)); + + /* + * Create a CMA ID and try to bind it. This catches both + * IB and iWARP capable NICs. + */ + cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP); + if (IS_ERR(cm_id)) + return (PTR_ERR(cm_id)); + + (void) memset(&sin, 0, sizeof (sin)); + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = rdsv3_scaddr_to_ibaddr(addr); + + /* rdma_bind_addr will only succeed for IB & iWARP devices */ + ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin); + /* + * due to this, we will claim to support iWARP devices unless we + * check node_type. + */ + if (ret || cm_id->device->node_type != RDMA_NODE_IB_CA) + ret = -EADDRNOTAVAIL; + + RDSV3_DPRINTF5("rds_ib_laddr_check", + "addr %u.%u.%u.%u ret %d node type %d", + NIPQUAD(addr), ret, + cm_id->device ? 
cm_id->device->node_type : -1); + + rdma_destroy_id(cm_id); + + return (ret); +} + +void +rdsv3_ib_exit(void) +{ + RDSV3_DPRINTF4("rds_ib_exit", "Enter"); + + rdsv3_info_deregister_func(RDSV3_INFO_IB_CONNECTIONS, rds_ib_ic_info); + rdsv3_ib_destroy_nodev_conns(); + ib_unregister_client(&rdsv3_ib_client); + rdsv3_ib_sysctl_exit(); + rdsv3_ib_recv_exit(); + rdsv3_trans_unregister(&rdsv3_ib_transport); + mutex_destroy(&ib_nodev_conns_lock); + list_destroy(&ib_nodev_conns); + list_destroy(&rdsv3_ib_devices); + + RDSV3_DPRINTF4("rds_ib_exit", "Return"); +} + +#ifndef __lock_lint +struct rdsv3_transport rdsv3_ib_transport = { + .laddr_check = rds_ib_laddr_check, + .xmit_complete = rdsv3_ib_xmit_complete, + .xmit = rdsv3_ib_xmit, + .xmit_cong_map = NULL, + .xmit_rdma = rdsv3_ib_xmit_rdma, + .recv = rdsv3_ib_recv, + .conn_alloc = rdsv3_ib_conn_alloc, + .conn_free = rdsv3_ib_conn_free, + .conn_connect = rdsv3_ib_conn_connect, + .conn_shutdown = rdsv3_ib_conn_shutdown, + .inc_copy_to_user = rdsv3_ib_inc_copy_to_user, + .inc_purge = rdsv3_ib_inc_purge, + .inc_free = rdsv3_ib_inc_free, + .cm_initiate_connect = rdsv3_ib_cm_initiate_connect, + .cm_handle_connect = rdsv3_ib_cm_handle_connect, + .cm_connect_complete = rdsv3_ib_cm_connect_complete, + .stats_info_copy = rdsv3_ib_stats_info_copy, + .exit = rdsv3_ib_exit, + .get_mr = rdsv3_ib_get_mr, + .sync_mr = rdsv3_ib_sync_mr, + .free_mr = rdsv3_ib_free_mr, + .flush_mrs = rdsv3_ib_flush_mrs, + .t_name = "infiniband", +}; +#else +struct rdsv3_transport rdsv3_ib_transport; +#endif + +int +rdsv3_ib_init(void) +{ + int ret; + + RDSV3_DPRINTF4("rds_ib_init", "Enter"); + + list_create(&rdsv3_ib_devices, sizeof (struct rdsv3_ib_device), + offsetof(struct rdsv3_ib_device, list)); + list_create(&ib_nodev_conns, sizeof (struct rdsv3_ib_connection), + offsetof(struct rdsv3_ib_connection, ib_node)); + mutex_init(&ib_nodev_conns_lock, NULL, MUTEX_DRIVER, NULL); + + rdsv3_ib_client.dip = rdsv3_dev_info; + ret = ib_register_client(&rdsv3_ib_client); + if (ret) + goto out; + + ret = rdsv3_ib_sysctl_init(); + if (ret) + goto out_ibreg; + + ret = rdsv3_ib_recv_init(); + if (ret) + goto out_sysctl; + + ret = rdsv3_trans_register(&rdsv3_ib_transport); + if (ret) + goto out_recv; + + rdsv3_info_register_func(RDSV3_INFO_IB_CONNECTIONS, rds_ib_ic_info); + + RDSV3_DPRINTF4("rds_ib_init", "Return"); + + return (0); + +out_recv: + rdsv3_ib_recv_exit(); +out_sysctl: + rdsv3_ib_sysctl_exit(); +out_ibreg: + ib_unregister_client(&rdsv3_ib_client); +out: + mutex_destroy(&ib_nodev_conns_lock); + list_destroy(&ib_nodev_conns); + list_destroy(&rdsv3_ib_devices); + return (ret); +} diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/ib_cm.c b/usr/src/uts/common/io/ib/clients/rdsv3/ib_cm.c new file mode 100644 index 0000000000..df1f73ca2f --- /dev/null +++ b/usr/src/uts/common/io/ib/clients/rdsv3/ib_cm.c @@ -0,0 +1,978 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <sys/rds.h>
+
+#include <sys/ib/clients/of/ofed_kernel.h>
+#include <sys/ib/clients/of/rdma/ib_addr.h>
+#include <sys/ib/clients/of/rdma/rdma_cm.h>
+
+#include <sys/ib/clients/rdsv3/rdsv3.h>
+#include <sys/ib/clients/rdsv3/ib.h>
+#include <sys/ib/clients/rdsv3/rdsv3_debug.h>
+
+extern ddi_taskq_t *rdsv3_taskq;
+
+/*
+ * Set the selected protocol version
+ */
+static void
+rdsv3_ib_set_protocol(struct rdsv3_connection *conn, unsigned int version)
+{
+	RDSV3_DPRINTF4("rdsv3_ib_set_protocol", "conn: %p version: %d",
+	    conn, version);
+	conn->c_version = version;
+}
+
+/*
+ * Set up flow control
+ */
+static void
+rdsv3_ib_set_flow_control(struct rdsv3_connection *conn, uint32_t credits)
+{
+	struct rdsv3_ib_connection *ic = conn->c_transport_data;
+
+	RDSV3_DPRINTF2("rdsv3_ib_set_flow_control",
+	    "Enter: conn: %p credits: %d", conn, credits);
+
+	if (rdsv3_ib_sysctl_flow_control && credits != 0) {
+		/* We're doing flow control */
+		ic->i_flowctl = 1;
+		rdsv3_ib_send_add_credits(conn, credits);
+	} else {
+		ic->i_flowctl = 0;
+	}
+
+	RDSV3_DPRINTF2("rdsv3_ib_set_flow_control",
+	    "Return: conn: %p credits: %d",
+	    conn, credits);
+}
+
+/*
+ * Tune RNR behavior. Without flow control, we use a rather
+ * low timeout, but not the absolute minimum - this should
+ * be tunable.
+ *
+ * We already set the RNR retry count to 7 (which is the
+ * smallest infinite number :-) above.
+ * If flow control is off, we want to change this back to 0
+ * so that we learn quickly when our credit accounting is
+ * buggy.
+ *
+ * Caller passes in a qp_attr pointer - don't waste stack space
+ * by allocating this twice.
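+ *
+ * IB_RNR_TIMER_000_32, used below, corresponds to roughly 0.32 ms,
+ * toward the low end of the timer values the IB spec allows.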
+ */ +static void +rdsv3_ib_tune_rnr(struct rdsv3_ib_connection *ic, struct ib_qp_attr *attr) +{ + int ret; + + RDSV3_DPRINTF2("rdsv3_ib_tune_rnr", "Enter ic: %p attr: %p", + ic, attr); + + attr->min_rnr_timer = IB_RNR_TIMER_000_32; + ret = ib_modify_qp(ic->i_cm_id->qp, attr, IB_QP_MIN_RNR_TIMER); + if (ret) + RDSV3_DPRINTF0("rdsv3_ib_tune_rnr", + "ib_modify_qp(IB_QP_MIN_RNR_TIMER): err=%d", -ret); +} + +/* + * Connection established. + * We get here for both outgoing and incoming connection. + */ +void +rdsv3_ib_cm_connect_complete(struct rdsv3_connection *conn, + struct rdma_cm_event *event) +{ + const struct rdsv3_ib_connect_private *dp = NULL; + struct rdsv3_ib_connection *ic = conn->c_transport_data; + struct rdsv3_ib_device *rds_ibdev; + struct ib_qp_attr qp_attr; + int err; + + RDSV3_DPRINTF2("rdsv3_ib_cm_connect_complete", + "Enter conn: %p event: %p", conn, event); + + if (event->param.conn.private_data_len >= sizeof (*dp)) { + dp = event->param.conn.private_data; + + /* make sure it isn't empty data */ + if (dp->dp_protocol_major) { + rdsv3_ib_set_protocol(conn, + RDS_PROTOCOL(dp->dp_protocol_major, + dp->dp_protocol_minor)); + rdsv3_ib_set_flow_control(conn, + ntohl(dp->dp_credit)); + } + } + + RDSV3_DPRINTF2("rdsv3_ib_cm_connect_complete", + "RDS/IB: connected to %u.%u.%u.%u version %u.%u%s", + NIPQUAD(conn->c_faddr), + RDS_PROTOCOL_MAJOR(conn->c_version), + RDS_PROTOCOL_MINOR(conn->c_version), + ic->i_flowctl ? ", flow control" : ""); + + /* + * Init rings and fill recv. this needs to wait until protocol + * negotiation + * is complete, since ring layout is different from 3.0 to 3.1. + */ + rdsv3_ib_send_init_ring(ic); + rdsv3_ib_recv_init_ring(ic); + /* + * Post receive buffers - as a side effect, this will update + * the posted credit count. + */ + (void) rdsv3_ib_recv_refill(conn, KM_NOSLEEP, 0, 1); + + /* Tune RNR behavior */ + rdsv3_ib_tune_rnr(ic, &qp_attr); + + qp_attr.qp_state = IB_QPS_RTS; + err = ib_modify_qp(ic->i_cm_id->qp, &qp_attr, IB_QP_STATE); + if (err) + RDSV3_DPRINTF0("rdsv3_ib_cm_connect_complete", + "ib_modify_qp(IB_QP_STATE, RTS): err=%d", err); + + /* update ib_device with this local ipaddr & conn */ + rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rdsv3_ib_client); + err = rdsv3_ib_update_ipaddr(rds_ibdev, conn->c_laddr); + if (err) + RDSV3_DPRINTF0("rdsv3_ib_cm_connect_complete", + "rdsv3_ib_update_ipaddr failed (%d)", err); + rdsv3_ib_add_conn(rds_ibdev, conn); + + /* + * If the peer gave us the last packet it saw, process this as if + * we had received a regular ACK. + */ + if (dp && dp->dp_ack_seq) + rdsv3_send_drop_acked(conn, ntohll(dp->dp_ack_seq), NULL); + + rdsv3_connect_complete(conn); + + RDSV3_DPRINTF2("rdsv3_ib_cm_connect_complete", + "Return conn: %p event: %p", + conn, event); +} + +static void +rdsv3_ib_cm_fill_conn_param(struct rdsv3_connection *conn, + struct rdma_conn_param *conn_param, + struct rdsv3_ib_connect_private *dp, + uint32_t protocol_version) +{ + RDSV3_DPRINTF2("rdsv3_ib_cm_fill_conn_param", + "Enter conn: %p conn_param: %p private: %p version: %d", + conn, conn_param, dp, protocol_version); + + (void) memset(conn_param, 0, sizeof (struct rdma_conn_param)); + /* XXX tune these? 
*/ + conn_param->responder_resources = 1; + conn_param->initiator_depth = 1; + conn_param->retry_count = min(rdsv3_ib_retry_count, 7); + conn_param->rnr_retry_count = 7; + + if (dp) { + struct rdsv3_ib_connection *ic = conn->c_transport_data; + + (void) memset(dp, 0, sizeof (*dp)); + dp->dp_saddr = conn->c_laddr; + dp->dp_daddr = conn->c_faddr; + dp->dp_protocol_major = RDS_PROTOCOL_MAJOR(protocol_version); + dp->dp_protocol_minor = RDS_PROTOCOL_MINOR(protocol_version); + dp->dp_protocol_minor_mask = + htons(RDSV3_IB_SUPPORTED_PROTOCOLS); + dp->dp_ack_seq = rdsv3_ib_piggyb_ack(ic); + + /* Advertise flow control */ + if (ic->i_flowctl) { + unsigned int credits; + + credits = IB_GET_POST_CREDITS( + atomic_get(&ic->i_credits)); + dp->dp_credit = htonl(credits); + atomic_add_32(&ic->i_credits, + -IB_SET_POST_CREDITS(credits)); + } + + conn_param->private_data = dp; + conn_param->private_data_len = sizeof (*dp); + } + + RDSV3_DPRINTF2("rdsv3_ib_cm_fill_conn_param", + "Return conn: %p conn_param: %p private: %p version: %d", + conn, conn_param, dp, protocol_version); +} + +static void +rdsv3_ib_cq_event_handler(struct ib_event *event, void *data) +{ + RDSV3_DPRINTF3("rdsv3_ib_cq_event_handler", "event %u data %p", + event->event, data); +} + +static void +rdsv3_ib_qp_event_handler(struct ib_event *event, void *data) +{ + struct rdsv3_connection *conn = data; + struct rdsv3_ib_connection *ic = conn->c_transport_data; + + RDSV3_DPRINTF2("rdsv3_ib_qp_event_handler", "conn %p ic %p event %u", + conn, ic, event->event); + + switch (event->event) { + case IB_EVENT_COMM_EST: + (void) rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST); + break; + default: + if (conn) { + RDSV3_DPRINTF2("rdsv3_ib_qp_event_handler", + "RDS/IB: Fatal QP Event %u - " + "connection %u.%u.%u.%u ->%u.%u.%u.%u " + "...reconnecting", + event->event, NIPQUAD(conn->c_laddr), + NIPQUAD(conn->c_faddr)); + rdsv3_conn_drop(conn); + } else { + RDSV3_DPRINTF2("rdsv3_ib_qp_event_handler", + "RDS/IB: Fatal QP Event %u - connection" + "...reconnecting", event->event); + } + break; + } + + RDSV3_DPRINTF2("rdsv3_ib_qp_event_handler", "Return conn: %p event: %p", + conn, event); +} + +extern int rdsv3_ib_alloc_hdrs(ib_device_t *dev, + struct rdsv3_ib_connection *ic); +extern void rdsv3_ib_free_hdrs(ib_device_t *dev, + struct rdsv3_ib_connection *ic); + +/* + * This needs to be very careful to not leave IS_ERR pointers around for + * cleanup to trip over. + */ +static int +rdsv3_ib_setup_qp(struct rdsv3_connection *conn) +{ + struct rdsv3_ib_connection *ic = conn->c_transport_data; + struct ib_device *dev = ic->i_cm_id->device; + struct ib_qp_init_attr attr; + struct rdsv3_ib_device *rds_ibdev; + ibt_send_wr_t *wrp; + ibt_wr_ds_t *sgl; + int ret, i; + + RDSV3_DPRINTF2("rdsv3_ib_setup_qp", "Enter conn: %p", conn); + + /* + * rdsv3_ib_add_one creates a rdsv3_ib_device object per IB device, + * and allocates a protection domain, memory range and FMR pool + * for each. If that fails for any reason, it will not register + * the rds_ibdev at all. 
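+ * A NULL lookup here therefore usually means the HCA was rejected at
+ * attach time (e.g. it lacks Reserved L_Key support), in which case
+ * the QP cannot be set up.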
+ */ + rds_ibdev = ib_get_client_data(dev, &rdsv3_ib_client); + if (rds_ibdev == NULL) { + RDSV3_DPRINTF0("rdsv3_ib_setup_qp", + "RDS/IB: No client_data for device %s", dev->name); + return (-EOPNOTSUPP); + } + + if (rds_ibdev->max_wrs < ic->i_send_ring.w_nr + 1) + rdsv3_ib_ring_resize(&ic->i_send_ring, rds_ibdev->max_wrs - 1); + if (rds_ibdev->max_wrs < ic->i_recv_ring.w_nr + 1) + rdsv3_ib_ring_resize(&ic->i_recv_ring, rds_ibdev->max_wrs - 1); + + /* Protection domain and memory range */ + ic->i_pd = rds_ibdev->pd; + + ic->i_send_cq = ib_create_cq(dev, rdsv3_ib_send_cq_comp_handler, + rdsv3_ib_cq_event_handler, conn, + ic->i_send_ring.w_nr + 1, 0); + if (IS_ERR(ic->i_send_cq)) { + ret = PTR_ERR(ic->i_send_cq); + ic->i_send_cq = NULL; + RDSV3_DPRINTF2("rdsv3_ib_setup_qp", + "ib_create_cq send failed: %d", ret); + goto out; + } + + ic->i_recv_cq = ib_create_cq(dev, rdsv3_ib_recv_cq_comp_handler, + rdsv3_ib_cq_event_handler, conn, + ic->i_recv_ring.w_nr, 0); + if (IS_ERR(ic->i_recv_cq)) { + ret = PTR_ERR(ic->i_recv_cq); + ic->i_recv_cq = NULL; + RDSV3_DPRINTF2("rdsv3_ib_setup_qp", + "ib_create_cq recv failed: %d", ret); + goto out; + } + + ret = ib_req_notify_cq(ic->i_send_cq, IB_CQ_NEXT_COMP); + if (ret) { + RDSV3_DPRINTF2("rdsv3_ib_setup_qp", + "ib_req_notify_cq send failed: %d", ret); + goto out; + } + + ret = ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED); + if (ret) { + RDSV3_DPRINTF2("rdsv3_ib_setup_qp", + "ib_req_notify_cq recv failed: %d", ret); + goto out; + } + + /* XXX negotiate max send/recv with remote? */ + (void) memset(&attr, 0, sizeof (attr)); + attr.event_handler = rdsv3_ib_qp_event_handler; + attr.qp_context = conn; + /* + 1 to allow for the single ack message */ + attr.cap.max_send_wr = ic->i_send_ring.w_nr + 1; + attr.cap.max_recv_wr = ic->i_recv_ring.w_nr + 1; + attr.cap.max_send_sge = rds_ibdev->max_sge; + attr.cap.max_recv_sge = RDSV3_IB_RECV_SGE; + attr.sq_sig_type = IB_SIGNAL_REQ_WR; + attr.qp_type = IB_QPT_RC; + attr.send_cq = ic->i_send_cq; + attr.recv_cq = ic->i_recv_cq; + + /* + * XXX this can fail if max_*_wr is too large? Are we supposed + * to back off until we get a value that the hardware can support? 
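+	 * (The ring sizes were already clamped against rds_ibdev->max_wrs
+	 * above, so we should stay within what the HCA reported in
+	 * hca_max_chan_sz.)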
+ */
+	ret = rdma_create_qp(ic->i_cm_id, ic->i_pd, &attr);
+	if (ret) {
+		RDSV3_DPRINTF2("rdsv3_ib_setup_qp",
+		    "rdma_create_qp failed: %d", ret);
+		goto out;
+	}
+
+	ret = rdsv3_ib_alloc_hdrs(dev, ic);
+	if (ret != 0) {
+		ret = -ENOMEM;
+		RDSV3_DPRINTF2("rdsv3_ib_setup_qp",
+		    "rdsv3_ib_alloc_hdrs failed: %d", ret);
+		goto out;
+	}
+
+	ic->i_sends = kmem_alloc(ic->i_send_ring.w_nr *
+	    sizeof (struct rdsv3_ib_send_work), KM_NOSLEEP);
+	if (ic->i_sends == NULL) {
+		ret = -ENOMEM;
+		RDSV3_DPRINTF2("rdsv3_ib_setup_qp",
+		    "send allocation failed: %d", ret);
+		goto out;
+	}
+	(void) memset(ic->i_sends, 0, ic->i_send_ring.w_nr *
+	    sizeof (struct rdsv3_ib_send_work));
+
+	ic->i_send_wrs =
+	    kmem_alloc(RDSV3_IB_SEND_WRS * (sizeof (ibt_send_wr_t) +
+	    RDSV3_IB_MAX_SGE * sizeof (ibt_wr_ds_t)), KM_NOSLEEP);
+	if (ic->i_send_wrs == NULL) {
+		ret = -ENOMEM;
+		RDSV3_DPRINTF2("rdsv3_ib_setup_qp",
+		    "WR allocation failed: %d", ret);
+		goto out;
+	}
+	sgl = (ibt_wr_ds_t *)((uint8_t *)ic->i_send_wrs +
+	    (RDSV3_IB_SEND_WRS * sizeof (ibt_send_wr_t)));
+	RDSV3_DPRINTF4("rdsv3_ib_setup_qp", "i_send_wrs: %p sgl: %p",
+	    ic->i_send_wrs, sgl);
+	for (i = 0; i < RDSV3_IB_SEND_WRS; i++) {
+		wrp = &ic->i_send_wrs[i];
+		wrp->wr_sgl = &sgl[i * RDSV3_IB_MAX_SGE];
+	}
+
+	ic->i_recvs = kmem_alloc(ic->i_recv_ring.w_nr *
+	    sizeof (struct rdsv3_ib_recv_work), KM_NOSLEEP);
+	if (ic->i_recvs == NULL) {
+		ret = -ENOMEM;
+		RDSV3_DPRINTF2("rdsv3_ib_setup_qp",
+		    "recv allocation failed: %d", ret);
+		goto out;
+	}
+	(void) memset(ic->i_recvs, 0, ic->i_recv_ring.w_nr *
+	    sizeof (struct rdsv3_ib_recv_work));
+
+	rdsv3_ib_recv_init_ack(ic);
+
+	RDSV3_DPRINTF2("rdsv3_ib_setup_qp", "conn %p pd %p mr %p cq %p %p",
+	    conn, ic->i_pd, ic->i_mr, ic->i_send_cq, ic->i_recv_cq);
+
+out:
+	return (ret);
+}
+
+static uint32_t
+rdsv3_ib_protocol_compatible(struct rdma_cm_event *event)
+{
+	const struct rdsv3_ib_connect_private *dp =
+	    event->param.conn.private_data;
+	uint16_t common;
+	uint32_t version = 0;
+
+	RDSV3_DPRINTF2("rdsv3_ib_protocol_compatible", "Enter event: %p",
+	    event);
+
+	/*
+	 * rdma_cm private data is odd - when there is any private data in
+	 * the request, we will be given a pretty large buffer without being
+	 * told the original size. The only way to tell the difference is by
+	 * looking at the contents, which are initialized to zero.
+	 * If the protocol version fields aren't set, this is a connection
+	 * attempt from an older version. That could be 3.0 or 2.0 -
+	 * we can't tell.
+	 * We really should have changed this for OFED 1.3 :-(
+	 */
+
+	/* Be paranoid. RDS always has privdata */
+	if (!event->param.conn.private_data_len) {
+		RDSV3_DPRINTF2("rdsv3_ib_protocol_compatible",
+		    "RDS incoming connection has no private data, rejecting");
+		return (0);
+	}
+
+	/* Even if len is crap *now* I still want to check it.
-ASG */ + if (event->param.conn.private_data_len < sizeof (*dp) || + dp->dp_protocol_major == 0) + return (RDS_PROTOCOL_3_0); + + common = ntohs(dp->dp_protocol_minor_mask) & + RDSV3_IB_SUPPORTED_PROTOCOLS; + if (dp->dp_protocol_major == 3 && common) { + version = RDS_PROTOCOL_3_0; + while ((common >>= 1) != 0) + version++; + } else { + RDSV3_DPRINTF0("rdsv3_ib_protocol_compatible", + "RDS: Connection from %u.%u.%u.%u using " + "incompatible protocol version %u.%u\n", + NIPQUAD(dp->dp_saddr), + dp->dp_protocol_major, + dp->dp_protocol_minor); + } + + RDSV3_DPRINTF2("rdsv3_ib_protocol_compatible", "Return event: %p", + event); + + return (version); +} + +int +rdsv3_ib_cm_handle_connect(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event) +{ + uint64_be_t lguid = cm_id->route.path_rec->sgid.global.interface_id; + uint64_be_t fguid = cm_id->route.path_rec->dgid.global.interface_id; + const struct rdsv3_ib_connect_private *dp = + event->param.conn.private_data; + struct rdsv3_ib_connect_private dp_rep; + struct rdsv3_connection *conn = NULL; + struct rdsv3_ib_connection *ic = NULL; + struct rdma_conn_param conn_param; + uint32_t version; + int err, destroy = 1; + boolean_t conn_created = B_FALSE; + + RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect", + "Enter cm_id: %p event: %p", cm_id, event); + + /* Check whether the remote protocol version matches ours. */ + version = rdsv3_ib_protocol_compatible(event); + if (!version) { + RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect", + "version mismatch"); + goto out; + } + + RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect", + "saddr %u.%u.%u.%u daddr %u.%u.%u.%u RDSv%d.%d lguid 0x%llx fguid " + "0x%llx", NIPQUAD(dp->dp_saddr), NIPQUAD(dp->dp_daddr), + RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version), + (unsigned long long)ntohll(lguid), + (unsigned long long)ntohll(fguid)); + + conn = rdsv3_conn_create(dp->dp_daddr, dp->dp_saddr, + &rdsv3_ib_transport, KM_NOSLEEP); + if (IS_ERR(conn)) { + RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect", + "rdsv3_conn_create failed (%ld)", PTR_ERR(conn)); + conn = NULL; + goto out; + } + + /* + * The connection request may occur while the + * previous connection exist, e.g. in case of failover. + * But as connections may be initiated simultaneously + * by both hosts, we have a random backoff mechanism - + * see the comment above rdsv3_queue_reconnect() + */ + mutex_enter(&conn->c_cm_lock); + if (!rdsv3_conn_transition(conn, RDSV3_CONN_DOWN, + RDSV3_CONN_CONNECTING)) { + if (rdsv3_conn_state(conn) == RDSV3_CONN_UP) { + RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect", + "incoming connect when connected: %p", + conn); + rdsv3_conn_drop(conn); + rdsv3_ib_stats_inc(s_ib_listen_closed_stale); + mutex_exit(&conn->c_cm_lock); + goto out; + } else if (rdsv3_conn_state(conn) == RDSV3_CONN_CONNECTING) { + /* Wait and see - our connect may still be succeeding */ + RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect", + "peer-to-peer connection request: %p, " + "lguid: 0x%llx fguid: 0x%llx", + conn, lguid, fguid); + rdsv3_ib_stats_inc(s_ib_connect_raced); + } + mutex_exit(&conn->c_cm_lock); + goto out; + } + + ic = conn->c_transport_data; + + rdsv3_ib_set_protocol(conn, version); + rdsv3_ib_set_flow_control(conn, ntohl(dp->dp_credit)); + + /* + * If the peer gave us the last packet it saw, process this as if + * we had received a regular ACK. 
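+	 * Piggybacking the ACK on the handshake avoids retransmitting
+	 * messages the peer had already received before the previous
+	 * connection went down.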
+ */ + if (dp->dp_ack_seq) + rdsv3_send_drop_acked(conn, ntohll(dp->dp_ack_seq), NULL); + + ASSERT(!cm_id->context); + ASSERT(!ic->i_cm_id); + + if (ic->i_cm_id != NULL) + RDSV3_PANIC(); + + ic->i_cm_id = cm_id; + cm_id->context = conn; + + /* + * We got halfway through setting up the ib_connection, if we + * fail now, we have to take the long route out of this mess. + */ + destroy = 0; + + err = rdsv3_ib_setup_qp(conn); + if (err) { + RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect", + "rdsv3_ib_setup_qp failed (%d)", err); + rdsv3_conn_drop(conn); + goto out; + } + + rdsv3_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version); + + /* rdma_accept() calls rdma_reject() internally if it fails */ + err = rdma_accept(cm_id, &conn_param); + mutex_exit(&conn->c_cm_lock); + if (err) { + RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect", + "rdma_accept failed (%d)", err); + rdsv3_conn_drop(conn); + goto out; + } + + RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect", + "Return cm_id: %p event: %p", cm_id, event); + + return (0); + +out: + (void) rdma_reject(cm_id, NULL, 0); + return (destroy); +} + + +int +rdsv3_ib_cm_initiate_connect(struct rdma_cm_id *cm_id) +{ + struct rdsv3_connection *conn = cm_id->context; + struct rdsv3_ib_connection *ic = conn->c_transport_data; + struct rdma_conn_param conn_param; + struct rdsv3_ib_connect_private dp; + int ret; + + RDSV3_DPRINTF2("rdsv3_ib_cm_initiate_connect", "Enter: cm_id: %p", + cm_id); + + /* + * If the peer doesn't do protocol negotiation, we must + * default to RDSv3.0 + */ + rdsv3_ib_set_protocol(conn, RDS_PROTOCOL_3_0); + ic->i_flowctl = + rdsv3_ib_sysctl_flow_control; /* advertise flow control */ + + ret = rdsv3_ib_setup_qp(conn); + if (ret) { + RDSV3_DPRINTF2("rdsv3_ib_cm_initiate_connect", + "rdsv3_ib_setup_qp failed (%d)", ret); + rdsv3_conn_drop(conn); + goto out; + } + + (void) rdsv3_ib_cm_fill_conn_param(conn, &conn_param, &dp, + RDS_PROTOCOL_VERSION); + + ret = rdma_connect(cm_id, &conn_param); + if (ret) { + RDSV3_DPRINTF2("rdsv3_ib_cm_initiate_connect", + "rdma_connect failed (%d)", ret); + rdsv3_conn_drop(conn); + } + + RDSV3_DPRINTF2("rdsv3_ib_cm_initiate_connect", + "Return: cm_id: %p", cm_id); + +out: + /* + * Beware - returning non-zero tells the rdma_cm to destroy + * the cm_id. We should certainly not do it as long as we still + * "own" the cm_id. 
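+	 * Hence the check below: if the cm_id is still attached to our
+	 * ic, we squash the error and leave the teardown to our own
+	 * shutdown path.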
+ */
+	if (ret) {
+		if (ic->i_cm_id == cm_id)
+			ret = 0;
+	}
+	return (ret);
+}
+
+int
+rdsv3_ib_conn_connect(struct rdsv3_connection *conn)
+{
+	struct rdsv3_ib_connection *ic = conn->c_transport_data;
+	struct sockaddr_in src, dest;
+	ipaddr_t laddr, faddr;
+	int ret;
+
+	RDSV3_DPRINTF2("rdsv3_ib_conn_connect", "Enter: conn: %p", conn);
+
+	/*
+	 * XXX I wonder what effect the port space has
+	 */
+	/* delegate cm event handler to rdma_transport */
+	ic->i_cm_id = rdma_create_id(rdsv3_rdma_cm_event_handler, conn,
+	    RDMA_PS_TCP);
+	if (IS_ERR(ic->i_cm_id)) {
+		ret = PTR_ERR(ic->i_cm_id);
+		ic->i_cm_id = NULL;
+		RDSV3_DPRINTF2("rdsv3_ib_conn_connect",
+		    "rdma_create_id() failed: %d", ret);
+		goto out;
+	}
+
+	RDSV3_DPRINTF3("rdsv3_ib_conn_connect",
+	    "created cm id %p for conn %p", ic->i_cm_id, conn);
+
+	/* The ipaddr should be in network order */
+	laddr = conn->c_laddr;
+	faddr = conn->c_faddr;
+	ret = rdsv3_sc_path_lookup(&laddr, &faddr);
+	if (ret == 0) {
+		RDSV3_DPRINTF2(LABEL, "Path not found (0x%x 0x%x)",
+		    ntohl(laddr), ntohl(faddr));
+	}
+
+	src.sin_family = AF_INET;
+	src.sin_addr.s_addr = (uint32_t)laddr;
+	src.sin_port = (uint16_t)htons(0);
+
+	dest.sin_family = AF_INET;
+	dest.sin_addr.s_addr = (uint32_t)faddr;
+	dest.sin_port = (uint16_t)htons(RDSV3_PORT);
+
+	ret = rdma_resolve_addr(ic->i_cm_id, (struct sockaddr *)&src,
+	    (struct sockaddr *)&dest,
+	    RDSV3_RDMA_RESOLVE_TIMEOUT_MS);
+	if (ret) {
+		RDSV3_DPRINTF2("rdsv3_ib_conn_connect",
+		    "addr resolve failed for cm id %p: %d", ic->i_cm_id, ret);
+		rdma_destroy_id(ic->i_cm_id);
+		ic->i_cm_id = NULL;
+	}
+
+	RDSV3_DPRINTF2("rdsv3_ib_conn_connect", "Return: conn: %p", conn);
+
+out:
+	return (ret);
+}
+
+/*
+ * This is careful to clean up only the resources that were actually
+ * built up, so that it can be called at any point during startup.
+ * In fact it can be called multiple times for a given connection.
+ */
+void
+rdsv3_ib_conn_shutdown(struct rdsv3_connection *conn)
+{
+	struct rdsv3_ib_connection *ic = conn->c_transport_data;
+	int err = 0;
+
+	RDSV3_DPRINTF2("rdsv3_ib_conn_shutdown",
+	    "cm %p pd %p cq %p %p qp %p", ic->i_cm_id,
+	    ic->i_pd, ic->i_send_cq, ic->i_recv_cq,
+	    ic->i_cm_id ? ic->i_cm_id->qp : NULL);
+
+	if (ic->i_cm_id) {
+		struct ib_device *dev = ic->i_cm_id->device;
+
+		RDSV3_DPRINTF2("rdsv3_ib_conn_shutdown",
+		    "disconnecting cm %p", ic->i_cm_id);
+		err = rdma_disconnect(ic->i_cm_id);
+		if (err) {
+			/*
+			 * Actually this may happen quite frequently, when
+			 * an outgoing connect raced with an incoming connect.
+			 */
+			RDSV3_DPRINTF2("rdsv3_ib_conn_shutdown",
+			    "failed to disconnect, cm: %p err %d",
+			    ic->i_cm_id, err);
+		}
+
+		if (ic->i_cm_id->qp) {
+			(void) ibt_flush_qp(
+			    ib_get_ibt_channel_hdl(ic->i_cm_id));
+
+			/* wait until all WRs are flushed */
+			rdsv3_wait_event(rdsv3_ib_ring_empty_wait,
+			    rdsv3_ib_ring_empty(&ic->i_send_ring) &&
+			    rdsv3_ib_ring_empty(&ic->i_recv_ring));
+
+			rdma_destroy_qp(ic->i_cm_id);
+		}
+
+
+		if (ic->i_mr)
+			rdsv3_ib_free_hdrs(dev, ic);
+
+		if (ic->i_sends)
+			rdsv3_ib_send_clear_ring(ic);
+		if (ic->i_recvs)
+			rdsv3_ib_recv_clear_ring(ic);
+
+		if (ic->i_send_cq)
+			(void) ib_destroy_cq(ic->i_send_cq);
+		if (ic->i_recv_cq)
+			(void) ib_destroy_cq(ic->i_recv_cq);
+		rdma_destroy_id(ic->i_cm_id);
+
+		/*
+		 * Move connection back to the nodev list.
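+		 * That way rdsv3_ib_destroy_nodev_conns() can still find
+		 * and tear it down if the module unloads before the
+		 * connection is re-established.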
+ */
+		if (ic->rds_ibdev)
+			rdsv3_ib_remove_conn(ic->rds_ibdev, conn);
+
+		ic->i_cm_id = NULL;
+		ic->i_pd = NULL;
+		ic->i_mr = NULL;
+		ic->i_send_cq = NULL;
+		ic->i_recv_cq = NULL;
+		ic->i_send_hdrs = NULL;
+		ic->i_recv_hdrs = NULL;
+		ic->i_ack = NULL;
+	}
+	ASSERT(!ic->rds_ibdev);
+
+	/* Clear pending transmit */
+	if (ic->i_rm) {
+		rdsv3_message_put(ic->i_rm);
+		ic->i_rm = NULL;
+	}
+
+	/* Clear the ACK state */
+	clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
+	ic->i_ack_next = 0;
+	ic->i_ack_recv = 0;
+
+	/* Clear flow control state */
+	ic->i_flowctl = 0;
+	ic->i_credits = 0;
+
+	rdsv3_ib_ring_init(&ic->i_send_ring, rdsv3_ib_sysctl_max_send_wr);
+	rdsv3_ib_ring_init(&ic->i_recv_ring, rdsv3_ib_sysctl_max_recv_wr);
+
+	if (ic->i_ibinc) {
+		rdsv3_inc_put(&ic->i_ibinc->ii_inc);
+		ic->i_ibinc = NULL;
+	}
+
+	if (ic->i_sends) {
+		kmem_free(ic->i_sends,
+		    ic->i_send_ring.w_nr * sizeof (struct rdsv3_ib_send_work));
+		ic->i_sends = NULL;
+	}
+	if (ic->i_send_wrs) {
+		kmem_free(ic->i_send_wrs, RDSV3_IB_SEND_WRS *
+		    (sizeof (ibt_send_wr_t) +
+		    RDSV3_IB_MAX_SGE * sizeof (ibt_wr_ds_t)));
+		ic->i_send_wrs = NULL;
+	}
+	if (ic->i_recvs) {
+		kmem_free(ic->i_recvs,
+		    ic->i_recv_ring.w_nr * sizeof (struct rdsv3_ib_recv_work));
+		ic->i_recvs = NULL;
+	}
+
+	RDSV3_DPRINTF2("rdsv3_ib_conn_shutdown", "Return conn: %p", conn);
+}
+
+/*
+ * The connection can be allocated from either rdsv3_conn_create_outgoing()
+ * or rdsv3_conn_create(), so ddi_taskq_create() could be called twice with
+ * the same string, which makes kstat print a warning on the console. To
+ * prevent that, this counter is mixed into the taskq name.
+ * Note that rdsv3_conn_create_outgoing() reads the cached value under the
+ * mutex before it allocates the connection, so the warning cannot be
+ * produced in that case (only between rdsv3_conn_create() and
+ * rdsv3_conn_create_outgoing()).
+ */
+static int conn_cnt;
+
+/* ARGSUSED */
+int
+rdsv3_ib_conn_alloc(struct rdsv3_connection *conn, int gfp)
+{
+	struct rdsv3_ib_connection *ic;
+	char tq_name[TASKQ_NAMELEN];
+
+	RDSV3_DPRINTF2("rdsv3_ib_conn_alloc", "conn: %p", conn);
+
+	/* XXX too lazy? */
+	ic = kmem_zalloc(sizeof (struct rdsv3_ib_connection), gfp);
+	if (ic == NULL)
+		return (-ENOMEM);
+
+	list_link_init(&ic->ib_node);
+	(void) snprintf(tq_name, TASKQ_NAMELEN, "RDSV3_CONN_to_%x:%u",
+	    htonl(conn->c_faddr), conn_cnt++ % 100);
+	ic->i_recv_tasklet =
+	    ddi_taskq_create(NULL, tq_name, 1, TASKQ_DEFAULTPRI, 0);
+
+
+	mutex_init(&ic->i_recv_mutex, NULL, MUTEX_DRIVER, NULL);
+	mutex_init(&ic->i_ack_lock, NULL, MUTEX_DRIVER, NULL);
+
+	/*
+	 * rdsv3_ib_conn_shutdown() waits for these to be emptied so they
+	 * must be initialized before it can be called.
+	 */
+	rdsv3_ib_ring_init(&ic->i_send_ring, rdsv3_ib_sysctl_max_send_wr);
+	rdsv3_ib_ring_init(&ic->i_recv_ring, rdsv3_ib_sysctl_max_recv_wr);
+
+	ic->conn = conn;
+	conn->c_transport_data = ic;
+
+	mutex_enter(&ib_nodev_conns_lock);
+	list_insert_tail(&ib_nodev_conns, ic);
+	mutex_exit(&ib_nodev_conns_lock);
+
+
+	RDSV3_DPRINTF2("rdsv3_ib_conn_alloc", "conn %p conn ic %p",
+	    conn, conn->c_transport_data);
+	return (0);
+}
+
+/*
+ * Free a connection. Connection must be shut down and not set for reconnect.
+ */
+void
+rdsv3_ib_conn_free(void *arg)
+{
+	struct rdsv3_ib_connection *ic = arg;
+	kmutex_t *lock_ptr;
+
+	RDSV3_DPRINTF2("rdsv3_ib_conn_free", "ic %p\n", ic);
+
+#ifndef __lock_lint
+	/*
+	 * Conn is either on a dev's list or on the nodev list.
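+	 * rds_ibdev->spinlock protects the per-device list and
+	 * ib_nodev_conns_lock the nodev list, which is why lock_ptr
+	 * below is picked based on ic->rds_ibdev.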
+ * A race with shutdown() or connect() would cause problems + * (since rds_ibdev would change) but that should never happen. + */ + lock_ptr = ic->rds_ibdev ? + &ic->rds_ibdev->spinlock : &ib_nodev_conns_lock; + + mutex_enter(lock_ptr); + list_remove_node(&ic->ib_node); + mutex_exit(lock_ptr); +#endif + + ddi_taskq_destroy(ic->i_recv_tasklet); + kmem_free(ic, sizeof (*ic)); +} + +/* + * An error occurred on the connection + */ +void +__rdsv3_ib_conn_error(struct rdsv3_connection *conn) +{ + rdsv3_conn_drop(conn); +} diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/ib_rdma.c b/usr/src/uts/common/io/ib/clients/rdsv3/ib_rdma.c new file mode 100644 index 0000000000..fce01b7b1d --- /dev/null +++ b/usr/src/uts/common/io/ib/clients/rdsv3/ib_rdma.c @@ -0,0 +1,551 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <sys/rds.h> +#include <netinet/in.h> + +#include <sys/ib/clients/rdsv3/rdsv3.h> +#include <sys/ib/clients/rdsv3/rdma.h> +#include <sys/ib/clients/rdsv3/ib.h> +#include <sys/ib/clients/rdsv3/rdsv3_debug.h> + +/* + * This is stored as mr->r_trans_private. 
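+ * It is allocated in rdsv3_ib_alloc_fmr() and reclaimed through
+ * rdsv3_ib_free_mr() once the MR is released.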
+ */
+struct rdsv3_ib_mr {
+	struct rdsv3_ib_device	*device;
+	struct rdsv3_ib_mr_pool	*pool;
+	struct ib_fmr		*fmr;
+	struct list		list;
+	unsigned int		remap_count;
+
+	struct rdsv3_scatterlist	*sg;
+	unsigned int		sg_len;
+	uint64_t		*dma;
+	int			sg_dma_len;
+
+	/* DDI pinned memory */
+	ddi_umem_cookie_t	umem_cookie;
+	/* IBTF type definitions */
+	ibt_fmr_pool_hdl_t	fmr_pool_hdl;
+	ibt_ma_hdl_t		rc_ma_hdl;
+	ibt_mr_hdl_t		rc_fmr_hdl;
+	ibt_pmr_desc_t		rc_mem_desc;
+};
+
+/*
+ * Our own little FMR pool
+ */
+struct rdsv3_ib_mr_pool {
+	struct mutex		flush_lock;	/* serialize fmr invalidate */
+	struct rdsv3_work_s	flush_worker;	/* flush worker */
+
+	kmutex_t		list_lock;	/* protect variables below */
+	atomic_t		item_count;	/* total # of MRs */
+	atomic_t		dirty_count;	/* # of dirty MRs */
+	/* MRs that have reached their max_maps limit */
+	struct list		drop_list;
+	struct list		free_list;	/* unused MRs */
+	struct list		clean_list;	/* unused & unmapped MRs */
+	atomic_t		free_pinned;	/* memory pinned by free MRs */
+	unsigned long		max_items;
+	unsigned long		max_items_soft;
+	unsigned long		max_free_pinned;
+};
+
+static int rdsv3_ib_flush_mr_pool(struct rdsv3_ib_device *rds_ibdev,
+    ibt_fmr_pool_hdl_t pool_hdl, int free_all);
+static void rdsv3_ib_teardown_mr(struct rdsv3_ib_mr *ibmr);
+static void rdsv3_ib_mr_pool_flush_worker(struct rdsv3_work_s *work);
+static struct rdsv3_ib_mr *rdsv3_ib_alloc_fmr(struct rdsv3_ib_device
+    *rds_ibdev);
+static int rdsv3_ib_map_fmr(struct rdsv3_ib_device *rds_ibdev,
+    struct rdsv3_ib_mr *ibmr, struct buf *bp, unsigned int nents);
+
+static struct rdsv3_ib_device *
+rdsv3_ib_get_device(uint32_be_t ipaddr)
+{
+	struct rdsv3_ib_device *rds_ibdev;
+	struct rdsv3_ib_ipaddr *i_ipaddr;
+
+	RDSV3_DPRINTF4("rdsv3_ib_get_device", "Enter: ipaddr: 0x%x", ipaddr);
+
+	RDSV3_FOR_EACH_LIST_NODE(rds_ibdev, &rdsv3_ib_devices, list) {
+		mutex_enter(&rds_ibdev->spinlock);
+		RDSV3_FOR_EACH_LIST_NODE(i_ipaddr, &rds_ibdev->ipaddr_list,
+		    list) {
+			if (i_ipaddr->ipaddr == ipaddr) {
+				mutex_exit(&rds_ibdev->spinlock);
+				return (rds_ibdev);
+			}
+		}
+		mutex_exit(&rds_ibdev->spinlock);
+	}
+
+	RDSV3_DPRINTF4("rdsv3_ib_get_device", "Return: ipaddr: 0x%x", ipaddr);
+
+	return (NULL);
+}
+
+static int
+rdsv3_ib_add_ipaddr(struct rdsv3_ib_device *rds_ibdev, uint32_be_t ipaddr)
+{
+	struct rdsv3_ib_ipaddr *i_ipaddr;
+
+	RDSV3_DPRINTF4("rdsv3_ib_add_ipaddr", "rds_ibdev: %p ipaddr: %x",
+	    rds_ibdev, ipaddr);
+
+	i_ipaddr = kmem_alloc(sizeof (*i_ipaddr), KM_NOSLEEP);
+	if (!i_ipaddr)
+		return (-ENOMEM);
+
+	i_ipaddr->ipaddr = ipaddr;
+
+	mutex_enter(&rds_ibdev->spinlock);
+	list_insert_tail(&rds_ibdev->ipaddr_list, i_ipaddr);
+	mutex_exit(&rds_ibdev->spinlock);
+
+	return (0);
+}
+
+static void
+rdsv3_ib_remove_ipaddr(struct rdsv3_ib_device *rds_ibdev, uint32_be_t ipaddr)
+{
+	struct rdsv3_ib_ipaddr *i_ipaddr, *next;
+
+	RDSV3_DPRINTF4("rdsv3_ib_remove_ipaddr", "rds_ibdev: %p, ipaddr: %x",
+	    rds_ibdev, ipaddr);
+
+	mutex_enter(&rds_ibdev->spinlock);
+	RDSV3_FOR_EACH_LIST_NODE_SAFE(i_ipaddr, next, &rds_ibdev->ipaddr_list,
+	    list) {
+		if (i_ipaddr->ipaddr == ipaddr) {
+			list_remove_node(&i_ipaddr->list);
+			kmem_free(i_ipaddr, sizeof (*i_ipaddr));
+			break;
+		}
+	}
+	mutex_exit(&rds_ibdev->spinlock);
+
+	RDSV3_DPRINTF4("rdsv3_ib_remove_ipaddr",
+	    "Return: rds_ibdev: %p, ipaddr: %x", rds_ibdev, ipaddr);
+}
+
+int
+rdsv3_ib_update_ipaddr(struct rdsv3_ib_device *rds_ibdev, uint32_be_t ipaddr)
+{
+	struct rdsv3_ib_device *rds_ibdev_old;
+
+	RDSV3_DPRINTF4("rdsv3_ib_update_ipaddr", "rds_ibdev: %p, ipaddr: 
%x", + rds_ibdev, ipaddr); + + rds_ibdev_old = rdsv3_ib_get_device(ipaddr); + if (rds_ibdev_old) + rdsv3_ib_remove_ipaddr(rds_ibdev_old, ipaddr); + + return (rdsv3_ib_add_ipaddr(rds_ibdev, ipaddr)); +} + +void +rdsv3_ib_add_conn(struct rdsv3_ib_device *rds_ibdev, + struct rdsv3_connection *conn) +{ + struct rdsv3_ib_connection *ic = conn->c_transport_data; + + RDSV3_DPRINTF4("rdsv3_ib_add_conn", "rds_ibdev: %p, conn: %p", + rds_ibdev, conn); + + /* conn was previously on the nodev_conns_list */ + mutex_enter(&ib_nodev_conns_lock); + ASSERT(!list_is_empty(&ib_nodev_conns)); + ASSERT(list_link_active(&ic->ib_node)); + list_remove_node(&ic->ib_node); + + mutex_enter(&rds_ibdev->spinlock); + list_insert_tail(&rds_ibdev->conn_list, ic); + mutex_exit(&rds_ibdev->spinlock); + mutex_exit(&ib_nodev_conns_lock); + + ic->rds_ibdev = rds_ibdev; +} + +void +rdsv3_ib_remove_conn(struct rdsv3_ib_device *rds_ibdev, + struct rdsv3_connection *conn) +{ + struct rdsv3_ib_connection *ic = conn->c_transport_data; + + RDSV3_DPRINTF4("rdsv3_ib_remove_conn", "rds_ibdev: %p, conn: %p", + rds_ibdev, conn); + + /* place conn on nodev_conns_list */ + mutex_enter(&ib_nodev_conns_lock); + + mutex_enter(&rds_ibdev->spinlock); + ASSERT(list_link_active(&ic->ib_node)); + list_remove_node(&ic->ib_node); + mutex_exit(&rds_ibdev->spinlock); + + list_insert_tail(&ib_nodev_conns, ic); + + mutex_exit(&ib_nodev_conns_lock); + + ic->rds_ibdev = NULL; + + RDSV3_DPRINTF4("rdsv3_ib_remove_conn", + "Return: rds_ibdev: %p, conn: %p", rds_ibdev, conn); +} + +void +__rdsv3_ib_destroy_conns(struct list *list, kmutex_t *list_lock) +{ + struct rdsv3_ib_connection *ic, *_ic; + list_t tmp_list; + + RDSV3_DPRINTF4("__rdsv3_ib_destroy_conns", "Enter: list: %p", list); + + /* avoid calling conn_destroy with irqs off */ + mutex_enter(list_lock); + list_splice(list, &tmp_list); + mutex_exit(list_lock); + + RDSV3_FOR_EACH_LIST_NODE_SAFE(ic, _ic, &tmp_list, ib_node) { + rdsv3_conn_destroy(ic->conn); + } + + RDSV3_DPRINTF4("__rdsv3_ib_destroy_conns", "Return: list: %p", list); +} + +void +rdsv3_ib_destroy_mr_pool(struct rdsv3_ib_device *rds_ibdev) +{ + RDSV3_DPRINTF4("rdsv3_ib_destroy_mr_pool", "Enter: ibdev: %p", + rds_ibdev); + + if (rds_ibdev->fmr_pool_hdl == NULL) + return; + + (void) rdsv3_ib_flush_mr_pool(rds_ibdev, rds_ibdev->fmr_pool_hdl, 1); + (void) ibt_destroy_fmr_pool(ib_get_ibt_hca_hdl(rds_ibdev->dev), + rds_ibdev->fmr_pool_hdl); +} + +#define IB_FMR_MAX_BUF_SIZE 0x1000000 /* 16MB max buf */ +int +rdsv3_ib_create_mr_pool(struct rdsv3_ib_device *rds_ibdev) +{ + uint_t h_page_sz; + ibt_fmr_pool_attr_t fmr_attr; + ibt_status_t ibt_status; + ibt_hca_hdl_t hca_hdl; + + RDSV3_DPRINTF4("rdsv3_ib_create_mr_pool", + "Enter: ibdev: %p", rds_ibdev); + + hca_hdl = ib_get_ibt_hca_hdl(rds_ibdev->dev); + /* get hca attributes */ + ibt_status = ibt_query_hca(hca_hdl, &rds_ibdev->hca_attr); + if (ibt_status != IBT_SUCCESS) { + return (-ENOMEM); + } + + /* setup FMR pool attributes */ + h_page_sz = rds_ibdev->hca_attr.hca_page_sz * 1024; + + fmr_attr.fmr_max_pages_per_fmr = (IB_FMR_MAX_BUF_SIZE / h_page_sz) + 2; + fmr_attr.fmr_pool_size = RDSV3_FMR_POOL_SIZE; + fmr_attr.fmr_dirty_watermark = 128; + fmr_attr.fmr_cache = B_FALSE; + fmr_attr.fmr_flags = IBT_MR_NOSLEEP | IBT_MR_ENABLE_LOCAL_WRITE | + IBT_MR_ENABLE_REMOTE_WRITE | IBT_MR_ENABLE_REMOTE_READ; + fmr_attr.fmr_page_sz = h_page_sz; + fmr_attr.fmr_func_hdlr = NULL; + fmr_attr.fmr_func_arg = (void *) NULL; + + /* create the FMR pool */ + ibt_status = ibt_create_fmr_pool(hca_hdl, rds_ibdev->pd->ibt_pd, 
+ &fmr_attr, &rds_ibdev->fmr_pool_hdl); + if (ibt_status != IBT_SUCCESS) { + return (-ENOMEM); + } + rds_ibdev->max_fmrs = fmr_attr.fmr_pool_size; + rds_ibdev->fmr_message_size = fmr_attr.fmr_max_pages_per_fmr; + return (0); +} + +void +rdsv3_ib_get_mr_info(struct rdsv3_ib_device *rds_ibdev, + struct rdsv3_info_rdma_connection *iinfo) +{ + iinfo->rdma_mr_max = rds_ibdev->max_fmrs; + iinfo->rdma_mr_size = rds_ibdev->fmr_message_size; +} + +static void +rdsv3_umem_cb(ddi_umem_cookie_t *umem_cookie) +{ + /* LINTED E_FUNC_SET_NOT_USED */ + ddi_umem_cookie_t *cp = umem_cookie; + RDSV3_DPRINTF5("rdsv3_umem_cb", "Enter: umem_cookie %p", umem_cookie); + /* all umem_cookies are freed at socket fd close */ + /* there should be no umem_cookies when clearing the addr space */ +} + +struct umem_callback_ops rdsv3_umem_cbops = { + UMEM_CALLBACK_VERSION, + rdsv3_umem_cb, +}; + +void * +rdsv3_ib_get_mr(struct rdsv3_iovec *args, unsigned long nents, + struct rdsv3_sock *rs, uint32_t *key_ret) +{ + struct rdsv3_ib_device *rds_ibdev; + struct rdsv3_ib_mr *ibmr = NULL; + ddi_umem_cookie_t umem_cookie; + size_t umem_len; + caddr_t umem_addr; + int umem_flags; + int ret; + struct buf *bp; + + RDSV3_DPRINTF4("rdsv3_ib_get_mr", "Enter: args.addr: %p", args->addr); + + rds_ibdev = rdsv3_ib_get_device(rs->rs_bound_addr); + + if (rds_ibdev == NULL) + return (void *)(PTR_ERR(-EFAULT)); + + ibmr = rdsv3_ib_alloc_fmr(rds_ibdev); + if (IS_ERR(ibmr)) + return (ibmr); + + /* pin user memory pages */ + umem_len = ptob(btopr(args->bytes + + ((uintptr_t)args->addr & PAGEOFFSET))); + umem_addr = (caddr_t)((uintptr_t)args->addr & ~PAGEOFFSET); + umem_flags = (DDI_UMEMLOCK_WRITE | DDI_UMEMLOCK_READ | + DDI_UMEMLOCK_LONGTERM); + ret = umem_lockmemory(umem_addr, umem_len, umem_flags, + &umem_cookie, &rdsv3_umem_cbops, NULL); + if (ret != 0) { + kmem_free((void *) ibmr, sizeof (*ibmr)); + ibmr = ERR_PTR(ret); + return (ibmr); + } + + /* transpose umem_cookie to buf structure for rdsv3_ib_map_fmr() */ + bp = ddi_umem_iosetup(umem_cookie, 0, umem_len, + B_WRITE, 0, 0, NULL, DDI_UMEM_SLEEP); + + ret = rdsv3_ib_map_fmr(rds_ibdev, ibmr, bp, nents); + freerbuf(bp); /* free bp */ + if (ret == 0) { + ibmr->umem_cookie = umem_cookie; + *key_ret = (uint32_t)ibmr->rc_mem_desc.pmd_rkey; + ibmr->device = rds_ibdev; + RDSV3_DPRINTF4("rdsv3_ib_get_mr", + "Return: ibmr: %p umem_cookie %p", ibmr, ibmr->umem_cookie); + return (ibmr); + } else { /* error return */ + RDSV3_DPRINTF1("rdsv3_ib_get_mr", "map_fmr failed (errno=%d)\n", + ret); + ddi_umem_unlock(umem_cookie); + kmem_free((void *)ibmr, sizeof (*ibmr)); + return (ERR_PTR(ret)); + } +} + +static struct rdsv3_ib_mr * +rdsv3_ib_alloc_fmr(struct rdsv3_ib_device *rds_ibdev) +{ + struct rdsv3_ib_mr *ibmr; + + RDSV3_DPRINTF4("rdsv3_ib_alloc_fmr", "Enter: ibdev: %p", rds_ibdev); + + if (rds_ibdev->fmr_pool_hdl) { + ibmr = (struct rdsv3_ib_mr *)kmem_zalloc(sizeof (*ibmr), + KM_SLEEP); + ibmr->fmr_pool_hdl = rds_ibdev->fmr_pool_hdl; + return (ibmr); + } + return (struct rdsv3_ib_mr *)(PTR_ERR(-ENOMEM)); +} + +static int +rdsv3_ib_map_fmr(struct rdsv3_ib_device *rds_ibdev, struct rdsv3_ib_mr *ibmr, + struct buf *bp, unsigned int nents) +{ + ibt_va_attr_t va_attr; + ibt_reg_req_t reg_req; + uint_t paddr_list_len; + uint_t page_sz; + ibt_status_t ibt_status; + /* LINTED E_FUNC_SET_NOT_USED */ + unsigned int l_nents = nents; + + RDSV3_DPRINTF4("rdsv3_ib_map_fmr", "Enter: ibmr: %p", ibmr); + RDSV3_DPRINTF4("rdsv3_ib_map_fmr", "buf addr: %p", bp->b_un.b_addr); + + /* setup ibt_map_mem_area attributes */ + 
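/*
+	 * Descriptive note: va_buf (set just below) carries the buf(9S)
+	 * built by ddi_umem_iosetup() above; the IBT_VA_BUF flag asks
+	 * ibt_map_mem_area() to walk that buf.
+	 */
+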
bzero(&va_attr, sizeof (ibt_va_attr_t)); + va_attr.va_buf = bp; + va_attr.va_flags = IBT_VA_FMR | IBT_VA_BUF; + + page_sz = rds_ibdev->hca_attr.hca_page_sz * 1024; /* in kbytes */ + paddr_list_len = (bp->b_bcount / page_sz) + 2; /* start + end pg */ + + /* map user buffer to HCA address */ + ibt_status = ibt_map_mem_area(ib_get_ibt_hca_hdl(rds_ibdev->dev), + &va_attr, paddr_list_len, ®_req, &ibmr->rc_ma_hdl); + if (ibt_status != IBT_SUCCESS) { + return (-ENOMEM); + } + + /* use a free entry from FMR pool to register the specified memory */ + ibt_status = ibt_register_physical_fmr( + ib_get_ibt_hca_hdl(rds_ibdev->dev), ibmr->fmr_pool_hdl, + ®_req.fn_arg, &ibmr->rc_fmr_hdl, &ibmr->rc_mem_desc); + if (ibt_status != IBT_SUCCESS) { + (void) ibt_unmap_mem_area(ib_get_ibt_hca_hdl(rds_ibdev->dev), + ibmr->rc_ma_hdl); + if (ibt_status == IBT_INSUFF_RESOURCE) { + return (-ENOBUFS); + } + return (-EINVAL); + } + RDSV3_DPRINTF4("rdsv3_ib_map_fmr", "Return: ibmr: %p rkey: 0x%x", + ibmr, (uint32_t)ibmr->rc_mem_desc.pmd_rkey); + return (0); +} + +void +rdsv3_ib_sync_mr(void *trans_private, int direction) +{ + /* LINTED E_FUNC_SET_NOT_USED */ + void *l_trans_private = trans_private; + /* LINTED E_FUNC_SET_NOT_USED */ + int l_direction = direction; + + /* FMR Sync not needed in Solaris on PCI-ex systems */ + + RDSV3_DPRINTF4("rdsv3_ib_sync_mr", "Enter:"); +} + +void +rdsv3_ib_flush_mrs(void) +{ + struct rdsv3_ib_device *rds_ibdev; + + RDSV3_DPRINTF4("rdsv3_ib_flush_mrs", "Enter:"); + + RDSV3_FOR_EACH_LIST_NODE(rds_ibdev, &rdsv3_ib_devices, list) { + if (rds_ibdev->fmr_pool_hdl) { + (void) rdsv3_ib_flush_mr_pool(rds_ibdev, + rds_ibdev->fmr_pool_hdl, 0); + } + } +} + +static void +__rdsv3_ib_teardown_mr(struct rdsv3_ib_mr *ibmr) +{ + RDSV3_DPRINTF4("__rdsv3_ib_teardown_mr", + "Enter: ibmr: %p umem_cookie %p", ibmr, ibmr->umem_cookie); + + /* unpin memory pages */ + (void) ddi_umem_unlock(ibmr->umem_cookie); +} + +void +rdsv3_ib_free_mr(void *trans_private, int invalidate) +{ + struct rdsv3_ib_mr *ibmr = trans_private; + struct rdsv3_ib_device *rds_ibdev = ibmr->device; + + RDSV3_DPRINTF4("rdsv3_ib_free_mr", "Enter: ibmr: %p inv: %d", + ibmr, invalidate); + + /* return the fmr to the IBTF pool */ + /* the final punch will come from the ibt_flush_fmr_pool() */ + (void) ibt_deregister_fmr(ib_get_ibt_hca_hdl(rds_ibdev->dev), + ibmr->rc_fmr_hdl); + (void) ibt_unmap_mem_area(ib_get_ibt_hca_hdl(rds_ibdev->dev), + ibmr->rc_ma_hdl); + __rdsv3_ib_teardown_mr(ibmr); + if (invalidate) { + rds_ibdev = ibmr->device; + (void) rdsv3_ib_flush_mr_pool(rds_ibdev, + rds_ibdev->fmr_pool_hdl, 0); + } + kmem_free((void *) ibmr, sizeof (*ibmr)); +} + +static int +rdsv3_ib_flush_mr_pool(struct rdsv3_ib_device *rds_ibdev, + ibt_fmr_pool_hdl_t pool_hdl, int free_all) +{ + /* LINTED E_FUNC_SET_NOT_USED */ + int l_free_all = free_all; + + RDSV3_DPRINTF4("rdsv3_ib_flush_mr_pool", "Enter: pool: %p", pool_hdl); + + rdsv3_ib_stats_inc(s_ib_rdma_mr_pool_flush); + + (void) ibt_flush_fmr_pool(ib_get_ibt_hca_hdl(rds_ibdev->dev), + pool_hdl); + return (0); +} diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/ib_recv.c b/usr/src/uts/common/io/ib/clients/rdsv3/ib_recv.c new file mode 100644 index 0000000000..21cbfb08f3 --- /dev/null +++ b/usr/src/uts/common/io/ib/clients/rdsv3/ib_recv.c @@ -0,0 +1,1129 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <sys/types.h> +#include <sys/kmem.h> +#include <sys/cpuvar.h> +#include <sys/rds.h> + +#include <sys/ib/clients/rdsv3/rdsv3.h> +#include <sys/ib/clients/rdsv3/ib.h> +#include <sys/ib/clients/rdsv3/rdsv3_debug.h> + +static struct kmem_cache *rdsv3_ib_incoming_slab; +static struct kmem_cache *rdsv3_ib_frag_slab; +static atomic_t rdsv3_ib_allocation = ATOMIC_INIT(0); + +static void +rdsv3_ib_frag_drop_page(struct rdsv3_page_frag *frag) +{ + RDSV3_DPRINTF5("rdsv3_ib_frag_drop_page", + "frag %p page %p offset %d", frag, frag->f_page, frag->f_offset); + kmem_free(frag->f_page, PAGE_SIZE); + frag->f_page = NULL; +} + +static void +rdsv3_ib_frag_free(struct rdsv3_page_frag *frag) +{ + RDSV3_DPRINTF5("rdsv3_ib_frag_free", "frag %p page %p", + frag, frag->f_page); + ASSERT(frag->f_page == NULL); + kmem_cache_free(rdsv3_ib_frag_slab, frag); +} + +/* + * We map a page at a time. Its fragments are posted in order. This + * is called in fragment order as the fragments get send completion events. + * Only the last frag in the page performs the unmapping. + * + * It's OK for ring cleanup to call this in whatever order it likes because + * DMA is not in flight and so we can unmap while other ring entries still + * hold page references in their frags. 
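+ * (In this port each frag records its own ibt_mi_hdl_t in f_mapped, so
+ * the unmap below simply fires for any frag still holding a handle.)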
+ */ +static void +rdsv3_ib_recv_unmap_page(struct rdsv3_ib_connection *ic, + struct rdsv3_ib_recv_work *recv) +{ + struct rdsv3_page_frag *frag = recv->r_frag; + +#if 0 + RDSV3_DPRINTF5("rdsv3_ib_recv_unmap_page", + "recv %p frag %p page %p\n", recv, frag, frag->f_page); +#endif + if (frag->f_mapped) { + (void) ibt_unmap_mem_iov( + ib_get_ibt_hca_hdl(ic->i_cm_id->device), frag->f_mapped); + frag->f_mapped = 0; + } +} + +void +rdsv3_ib_recv_init_ring(struct rdsv3_ib_connection *ic) +{ + struct rdsv3_ib_recv_work *recv; + struct rdsv3_header *hdrp; + uint32_t i; + + RDSV3_DPRINTF4("rdsv3_ib_recv_init_ring", "ic: %p", ic); + + hdrp = ic->i_recv_hdrs; + for (i = 0, recv = ic->i_recvs; i < ic->i_recv_ring.w_nr; i++, recv++) { + recv->r_ibinc = NULL; + recv->r_frag = NULL; + + recv->r_wr.recv.wr_id = i; + + /* initialize the hdr sgl permanently */ + recv->r_sge[0].ds_va = (ib_vaddr_t)(uintptr_t)hdrp++; + recv->r_sge[0].ds_len = sizeof (struct rdsv3_header); + recv->r_sge[0].ds_key = ic->i_mr->lkey; + } +} + +static void +rdsv3_ib_recv_clear_one(struct rdsv3_ib_connection *ic, + struct rdsv3_ib_recv_work *recv) +{ + RDSV3_DPRINTF4("rdsv3_ib_recv_clear_one", "ic: %p, recv: %p", + ic, recv); + + if (recv->r_ibinc) { + rdsv3_inc_put(&recv->r_ibinc->ii_inc); + recv->r_ibinc = NULL; + } + if (recv->r_frag) { + rdsv3_ib_recv_unmap_page(ic, recv); + if (recv->r_frag->f_page) + rdsv3_ib_frag_drop_page(recv->r_frag); + rdsv3_ib_frag_free(recv->r_frag); + recv->r_frag = NULL; + } + + RDSV3_DPRINTF4("rdsv3_ib_recv_clear_one", "Return: ic: %p, recv: %p", + ic, recv); +} + +void +rdsv3_ib_recv_clear_ring(struct rdsv3_ib_connection *ic) +{ + uint32_t i; + + RDSV3_DPRINTF4("rdsv3_ib_recv_clear_ring", "ic: %p", ic); + + for (i = 0; i < ic->i_recv_ring.w_nr; i++) + rdsv3_ib_recv_clear_one(ic, &ic->i_recvs[i]); + + if (ic->i_frag.f_page) + rdsv3_ib_frag_drop_page(&ic->i_frag); +} + +static int +rdsv3_ib_recv_refill_one(struct rdsv3_connection *conn, + struct rdsv3_ib_recv_work *recv, + int kptr_gfp, int page_gfp) +{ + struct rdsv3_ib_connection *ic = conn->c_transport_data; + ibt_mi_hdl_t mi_hdl; + ibt_iov_attr_t iov_attr; + ibt_iov_t iov_arr[1]; + int ret = -ENOMEM; + + RDSV3_DPRINTF5("rdsv3_ib_recv_refill_one", "conn: %p, recv: %p", + conn, recv); + + if (recv->r_ibinc == NULL) { + if (atomic_add_32_nv(&rdsv3_ib_allocation, 1) > + rdsv3_ib_sysctl_max_recv_allocation) { + atomic_add_32(&rdsv3_ib_allocation, -1); + rdsv3_ib_stats_inc(s_ib_rx_alloc_limit); + goto out; + } + recv->r_ibinc = kmem_cache_alloc(rdsv3_ib_incoming_slab, + kptr_gfp); + if (recv->r_ibinc == NULL) { + atomic_add_32(&rdsv3_ib_allocation, -1); + goto out; + } + list_create(&recv->r_ibinc->ii_frags, + sizeof (struct rdsv3_page_frag), + offsetof(struct rdsv3_page_frag, f_item)); + rdsv3_inc_init(&recv->r_ibinc->ii_inc, conn, conn->c_faddr); + } + + if (recv->r_frag == NULL) { + recv->r_frag = kmem_cache_alloc(rdsv3_ib_frag_slab, kptr_gfp); + if (recv->r_frag == NULL) + goto out; + list_link_init(&recv->r_frag->f_item); + recv->r_frag->f_page = NULL; + } + + if (ic->i_frag.f_page == NULL) { + ic->i_frag.f_page = kmem_alloc(PAGE_SIZE, page_gfp); + if (ic->i_frag.f_page == NULL) + goto out; + ic->i_frag.f_offset = 0; + } + + iov_attr.iov_as = NULL; + iov_attr.iov = &iov_arr[0]; + iov_attr.iov_buf = NULL; + iov_attr.iov_list_len = 1; + iov_attr.iov_wr_nds = 1; + iov_attr.iov_lso_hdr_sz = 0; + iov_attr.iov_flags = IBT_IOV_SLEEP | IBT_IOV_RECV; + + /* Data */ + iov_arr[0].iov_addr = ic->i_frag.f_page + ic->i_frag.f_offset; + iov_arr[0].iov_len = 
RDSV3_FRAG_SIZE; + + /* + * Header comes from pre-registered buffer, so don't map it. + * Map the data only and stick in the header sgl quietly after + * the call. + */ + recv->r_wr.recv.wr_sgl = &recv->r_sge[1]; + recv->r_wr.recv.wr_nds = 1; + + ret = ibt_map_mem_iov(ib_get_ibt_hca_hdl(ic->i_cm_id->device), + &iov_attr, &recv->r_wr, &mi_hdl); + if (ret != IBT_SUCCESS) { + RDSV3_DPRINTF2("rdsv3_ib_recv_refill_one", + "ibt_map_mem_iov failed: %d", ret); + goto out; + } + + /* stick in the header */ + recv->r_wr.recv.wr_sgl = &recv->r_sge[0]; + recv->r_wr.recv.wr_nds = RDSV3_IB_RECV_SGE; + + /* + * Once we get the RDSV3_PAGE_LAST_OFF frag then rdsv3_ib_frag_unmap() + * must be called on this recv. This happens as completions hit + * in order or on connection shutdown. + */ + recv->r_frag->f_page = ic->i_frag.f_page; + recv->r_frag->f_offset = ic->i_frag.f_offset; + recv->r_frag->f_mapped = mi_hdl; + + if (ic->i_frag.f_offset < RDSV3_PAGE_LAST_OFF) { + ic->i_frag.f_offset += RDSV3_FRAG_SIZE; + } else { + ic->i_frag.f_page = NULL; + ic->i_frag.f_offset = 0; + } + + ret = 0; + + RDSV3_DPRINTF5("rdsv3_ib_recv_refill_one", "Return: conn: %p, recv: %p", + conn, recv); +out: + return (ret); +} + +/* + * This tries to allocate and post unused work requests after making sure that + * they have all the allocations they need to queue received fragments into + * sockets. The i_recv_mutex is held here so that ring_alloc and _unalloc + * pairs don't go unmatched. + * + * -1 is returned if posting fails due to temporary resource exhaustion. + */ +int +rdsv3_ib_recv_refill(struct rdsv3_connection *conn, int kptr_gfp, + int page_gfp, int prefill) +{ + struct rdsv3_ib_connection *ic = conn->c_transport_data; + struct rdsv3_ib_recv_work *recv; + unsigned int succ_wr; + unsigned int posted = 0; + int ret = 0; + uint32_t pos; + + RDSV3_DPRINTF4("rdsv3_ib_recv_refill", "conn: %p, prefill: %d", + conn, prefill); + + while ((prefill || rdsv3_conn_up(conn)) && + rdsv3_ib_ring_alloc(&ic->i_recv_ring, 1, &pos)) { + if (pos >= ic->i_recv_ring.w_nr) { + RDSV3_DPRINTF0("rdsv3_ib_recv_refill", + "Argh - ring alloc returned pos=%u", + pos); + ret = -EINVAL; + break; + } + + recv = &ic->i_recvs[pos]; + ret = rdsv3_ib_recv_refill_one(conn, recv, kptr_gfp, page_gfp); + if (ret) { + ret = -1; + break; + } + + /* XXX when can this fail? */ + ret = ibt_post_recv(ib_get_ibt_channel_hdl(ic->i_cm_id), + &recv->r_wr.recv, 1, &succ_wr); + RDSV3_DPRINTF5("rdsv3_ib_recv_refill", + "recv %p ibinc %p frag %p ret %d\n", recv, + recv->r_ibinc, recv->r_frag, ret); + if (ret) { + RDSV3_DPRINTF2("rdsv3_ib_recv_refill", + "Return: conn: %p, posted: %d", conn, ret); + rdsv3_conn_drop(conn); + ret = -1; + break; + } + + posted++; + } + + /* We're doing flow control - update the window. 
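Each newly posted buffer is a fresh credit to advertise to the peer.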
*/ + if (ic->i_flowctl && posted) + rdsv3_ib_advertise_credits(conn, posted); + + if (ret) + rdsv3_ib_ring_unalloc(&ic->i_recv_ring, 1); + + RDSV3_DPRINTF4("rdsv3_ib_recv_refill", "Return: conn: %p, posted: %d", + conn, posted); + return (ret); +} + +void +rdsv3_ib_inc_purge(struct rdsv3_incoming *inc) +{ + struct rdsv3_ib_incoming *ibinc; + struct rdsv3_page_frag *frag; + struct rdsv3_page_frag *pos; + + RDSV3_DPRINTF4("rdsv3_ib_inc_purge", "inc: %p", inc); + + ibinc = container_of(inc, struct rdsv3_ib_incoming, ii_inc); + RDSV3_DPRINTF5("rdsv3_ib_inc_purge", + "purging ibinc %p inc %p\n", ibinc, inc); + + RDSV3_FOR_EACH_LIST_NODE_SAFE(frag, pos, &ibinc->ii_frags, f_item) { + list_remove_node(&frag->f_item); + rdsv3_ib_frag_drop_page(frag); + rdsv3_ib_frag_free(frag); + } + + RDSV3_DPRINTF4("rdsv3_ib_inc_purge", "Return: inc: %p", inc); +} + +void +rdsv3_ib_inc_free(struct rdsv3_incoming *inc) +{ + struct rdsv3_ib_incoming *ibinc; + + RDSV3_DPRINTF4("rdsv3_ib_inc_free", "inc: %p", inc); + + ibinc = container_of(inc, struct rdsv3_ib_incoming, ii_inc); + + rdsv3_ib_inc_purge(inc); + RDSV3_DPRINTF5("rdsv3_ib_inc_free", "freeing ibinc %p inc %p", + ibinc, inc); + ASSERT(list_is_empty(&ibinc->ii_frags)); + kmem_cache_free(rdsv3_ib_incoming_slab, ibinc); + atomic_dec_uint(&rdsv3_ib_allocation); + + RDSV3_DPRINTF4("rdsv3_ib_inc_free", "Return: inc: %p", inc); +} + +int +rdsv3_ib_inc_copy_to_user(struct rdsv3_incoming *inc, uio_t *uiop, + size_t size) +{ + struct rdsv3_ib_incoming *ibinc; + struct rdsv3_page_frag *frag; + unsigned long to_copy; + unsigned long frag_off = 0; + int copied = 0; + int ret; + uint32_t len; + + ibinc = container_of(inc, struct rdsv3_ib_incoming, ii_inc); + frag = list_head(&ibinc->ii_frags); + len = ntohl(inc->i_hdr.h_len); + + RDSV3_DPRINTF4("rdsv3_ib_inc_copy_to_user", "inc: %p, size: %d len: %d", + inc, size, len); + + while (copied < size && copied < len) { + if (frag_off == RDSV3_FRAG_SIZE) { + frag = list_next(&ibinc->ii_frags, frag); + frag_off = 0; + } + + to_copy = min(len - copied, RDSV3_FRAG_SIZE - frag_off); + to_copy = min(size - copied, to_copy); + + RDSV3_DPRINTF5("rdsv3_ib_inc_copy_to_user", + "%lu bytes to user %p from frag [%p, %u] + %lu", + to_copy, uiop, + frag->f_page, frag->f_offset, frag_off); + + ret = uiomove((caddr_t)(frag->f_page + + frag->f_offset + frag_off), + to_copy, UIO_READ, uiop); + if (ret) { + RDSV3_DPRINTF2("rdsv3_ib_inc_copy_to_user", + "uiomove (%d) returned: %d", to_copy, ret); + break; + } + + frag_off += to_copy; + copied += to_copy; + } + + RDSV3_DPRINTF4("rdsv3_ib_inc_copy_to_user", + "Return: inc: %p, copied: %d", inc, copied); + + return (copied); +} + +/* ic starts out kmem_zalloc()ed */ +void +rdsv3_ib_recv_init_ack(struct rdsv3_ib_connection *ic) +{ + ibt_send_wr_t *wr = &ic->i_ack_wr; + ibt_wr_ds_t *sge = &ic->i_ack_sge; + + RDSV3_DPRINTF4("rdsv3_ib_recv_init_ack", "ic: %p", ic); + + sge->ds_va = ic->i_ack_dma; + sge->ds_len = sizeof (struct rdsv3_header); + sge->ds_key = ic->i_mr->lkey; + + wr->wr_sgl = sge; + wr->wr_nds = 1; + wr->wr_opcode = IBT_WRC_SEND; + wr->wr_id = RDSV3_IB_ACK_WR_ID; + wr->wr_flags = IBT_WR_SEND_SIGNAL | IBT_WR_SEND_SOLICIT; +} + +/* + * You'd think that with reliable IB connections you wouldn't need to ack + * messages that have been received. The problem is that IB hardware generates + * an ack message before it has DMAed the message into memory. 
This creates a
+ * potential message loss if the HCA is disabled for any reason between when it
+ * sends the ack and before the message is DMAed and processed. This is only a
+ * potential issue if another HCA is available for fail-over.
+ *
+ * When the remote host receives our ack they'll free the sent message from
+ * their send queue. To decrease the latency of this we always send an ack
+ * immediately after we've received messages.
+ *
+ * For simplicity, we only have one ack in flight at a time. This puts
+ * pressure on senders to have deep enough send queues to absorb the latency of
+ * a single ack frame being in flight. This might not be good enough.
+ *
+ * This is implemented by having a long-lived send_wr and sge which point to a
+ * statically allocated ack frame. This ack wr does not fall under the ring
+ * accounting that the tx and rx wrs do. The QP attribute specifically makes
+ * room for it beyond the ring size. Send completion notices its special
+ * wr_id and avoids working with the ring in that case.
+ */
+static void
+rdsv3_ib_set_ack(struct rdsv3_ib_connection *ic, uint64_t seq,
+    int ack_required)
+{
+	RDSV3_DPRINTF4("rdsv3_ib_set_ack", "ic: %p, seq: %lld ack: %d",
+	    ic, seq, ack_required);
+
+	mutex_enter(&ic->i_ack_lock);
+	ic->i_ack_next = seq;
+	if (ack_required)
+		set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+	mutex_exit(&ic->i_ack_lock);
+}
+
+static uint64_t
+rdsv3_ib_get_ack(struct rdsv3_ib_connection *ic)
+{
+	uint64_t seq;
+
+	RDSV3_DPRINTF4("rdsv3_ib_get_ack", "ic: %p", ic);
+
+	clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+
+	mutex_enter(&ic->i_ack_lock);
+	seq = ic->i_ack_next;
+	mutex_exit(&ic->i_ack_lock);
+
+	return (seq);
+}
+
+static void
+rdsv3_ib_send_ack(struct rdsv3_ib_connection *ic, unsigned int adv_credits)
+{
+	struct rdsv3_header *hdr = ic->i_ack;
+	uint64_t seq;
+	int ret;
+
+	RDSV3_DPRINTF4("rdsv3_ib_send_ack", "ic: %p adv_credits: %d",
+	    ic, adv_credits);
+
+	seq = rdsv3_ib_get_ack(ic);
+
+	RDSV3_DPRINTF4("rdsv3_ib_send_ack", "send_ack: ic %p ack %llu",
+	    ic, (unsigned long long) seq);
+	rdsv3_message_populate_header(hdr, 0, 0, 0);
+	hdr->h_ack = htonll(seq);
+	hdr->h_credit = adv_credits;
+	rdsv3_message_make_checksum(hdr);
+	ic->i_ack_queued = jiffies;
+
+	ret = ibt_post_send(RDSV3_QP2CHANHDL(ic->i_cm_id->qp), &ic->i_ack_wr, 1,
+	    NULL);
+	if (ret) {
+		/*
+		 * Failed to send. Release the WR, and
+		 * force another ACK.
+		 */
+		clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
+		set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+		rdsv3_ib_stats_inc(s_ib_ack_send_failure);
+#if 1
+		RDSV3_DPRINTF2("rdsv3_ib_send_ack", "ibt_post_send FAIL");
+#else
+		/* Need to finesse this later. */
+		RDSV3_PANIC();
+#endif
+	} else {
+		rdsv3_ib_stats_inc(s_ib_ack_sent);
+	}
+	RDSV3_DPRINTF4("rdsv3_ib_send_ack", "Return: ic: %p adv_credits: %d",
+	    ic, adv_credits);
+}
+
+/*
+ * There are 3 ways of getting acknowledgements to the peer:
+ *	1.	We call rdsv3_ib_attempt_ack from the recv completion handler
+ *		to send an ACK-only frame.
+ *		However, there can be only one such frame in the send queue
+ *		at any time, so we may have to postpone it.
+ *	2.	When another (data) packet is transmitted while there's
+ *		an ACK in the queue, we piggyback the ACK sequence number
+ *		on the data packet.
+ *	3.	If the ACK WR is done sending, we get called from the
+ *		send queue completion handler, and check whether there's
+ *		another ACK pending (postponed because the WR was on the
+ *		queue). If so, we transmit it.
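+ *		(Case 3 is handled by rdsv3_ib_ack_send_complete() below.)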
+ * + * We maintain 2 variables: + * - i_ack_flags, which keeps track of whether the ACK WR + * is currently in the send queue or not (IB_ACK_IN_FLIGHT) + * - i_ack_next, which is the last sequence number we received + * + * Potentially, send queue and receive queue handlers can run concurrently. + * It would be nice to not have to use a spinlock to synchronize things, + * but the one problem that rules this out is that 64bit updates are + * not atomic on all platforms. Things would be a lot simpler if + * we had atomic64 or maybe cmpxchg64 everywhere. + * + * Reconnecting complicates this picture just slightly. When we + * reconnect, we may be seeing duplicate packets. The peer + * is retransmitting them, because it hasn't seen an ACK for + * them. It is important that we ACK these. + * + * ACK mitigation adds a header flag "ACK_REQUIRED"; any packet with + * this flag set *MUST* be acknowledged immediately. + */ + +/* + * When we get here, we're called from the recv queue handler. + * Check whether we ought to transmit an ACK. + */ +void +rdsv3_ib_attempt_ack(struct rdsv3_ib_connection *ic) +{ + unsigned int adv_credits; + + RDSV3_DPRINTF4("rdsv3_ib_attempt_ack", "ic: %p", ic); + + if (!test_bit(IB_ACK_REQUESTED, &ic->i_ack_flags)) + return; + + if (test_and_set_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags)) { + rdsv3_ib_stats_inc(s_ib_ack_send_delayed); + return; + } + + /* Can we get a send credit? */ + if (!rdsv3_ib_send_grab_credits(ic, 1, &adv_credits, 0, + RDSV3_MAX_ADV_CREDIT)) { + rdsv3_ib_stats_inc(s_ib_tx_throttle); + clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); + return; + } + + clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); + rdsv3_ib_send_ack(ic, adv_credits); + + RDSV3_DPRINTF4("rdsv3_ib_attempt_ack", "Return: ic: %p", ic); +} + +/* + * We get here from the send completion handler, when the + * adapter tells us the ACK frame was sent. + */ +void +rdsv3_ib_ack_send_complete(struct rdsv3_ib_connection *ic) +{ + RDSV3_DPRINTF4("rdsv3_ib_ack_send_complete", "ic: %p", ic); + clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); + rdsv3_ib_attempt_ack(ic); +} + +/* + * This is called by the regular xmit code when it wants to piggyback + * an ACK on an outgoing frame. + */ +uint64_t +rdsv3_ib_piggyb_ack(struct rdsv3_ib_connection *ic) +{ + RDSV3_DPRINTF4("rdsv3_ib_piggyb_ack", "ic: %p", ic); + if (test_and_clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags)) { + rdsv3_ib_stats_inc(s_ib_ack_send_piggybacked); + } + return (rdsv3_ib_get_ack(ic)); +} + +static struct rdsv3_header * +rdsv3_ib_get_header(struct rdsv3_connection *conn, + struct rdsv3_ib_recv_work *recv, + uint32_t data_len) +{ + struct rdsv3_ib_connection *ic = conn->c_transport_data; + void *hdr_buff = &ic->i_recv_hdrs[recv - ic->i_recvs]; + + RDSV3_DPRINTF4("rdsv3_ib_get_header", "conn: %p, recv: %p len: %d", + conn, recv, data_len); + + /* + * Support header at the front (RDS 3.1+) as well as header-at-end. + * + * Cases: + * 1) header all in header buff (great!) + * 2) header all in data page (copy all to header buff) + * 3) header split across hdr buf + data page + * (move bit in hdr buff to end before copying other bit from + * data page) + */ + if (conn->c_version > RDS_PROTOCOL_3_0 || data_len == RDSV3_FRAG_SIZE) + return (hdr_buff); + /* + * XXX - Need to discuss the support for version < RDS_PROTOCOL_3_1. 
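+	 * Until that is resolved, a 3.0 peer is handed the header buffer
+	 * unchanged and anything older gets NULL, which makes the caller
+	 * drop the connection.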
+ */ + if (conn->c_version == RDS_PROTOCOL_3_0) + return (hdr_buff); + + /* version < RDS_PROTOCOL_3_0 */ + RDSV3_DPRINTF2("rdsv3_ib_get_header", + "NULL header (version: 0x%x, data_len: %d)", conn->c_version, + data_len); + return (NULL); +} + +/* + * It's kind of lame that we're copying from the posted receive pages into + * long-lived bitmaps. We could have posted the bitmaps and rdma written into + * them. But receiving new congestion bitmaps should be a *rare* event, so + * hopefully we won't need to invest that complexity in making it more + * efficient. By copying we can share a simpler core with TCP which has to + * copy. + */ +static void +rdsv3_ib_cong_recv(struct rdsv3_connection *conn, + struct rdsv3_ib_incoming *ibinc) +{ + struct rdsv3_cong_map *map; + unsigned int map_off; + unsigned int map_page; + struct rdsv3_page_frag *frag; + unsigned long frag_off; + unsigned long to_copy; + unsigned long copied; + uint64_t uncongested = 0; + caddr_t addr; + + RDSV3_DPRINTF4("rdsv3_ib_cong_recv", "conn: %p, ibinc: %p", + conn, ibinc); + + /* catch completely corrupt packets */ + if (ntohl(ibinc->ii_inc.i_hdr.h_len) != RDSV3_CONG_MAP_BYTES) + return; + + map = conn->c_fcong; + map_page = 0; + map_off = 0; + + frag = list_head(&ibinc->ii_frags); + frag_off = 0; + + copied = 0; + + while (copied < RDSV3_CONG_MAP_BYTES) { + uint64_t *src, *dst; + unsigned int k; + + to_copy = min(RDSV3_FRAG_SIZE - frag_off, PAGE_SIZE - map_off); + ASSERT(!(to_copy & 7)); /* Must be 64bit aligned. */ + + addr = frag->f_page + frag->f_offset; + + src = (uint64_t *)(addr + frag_off); + dst = (uint64_t *)(map->m_page_addrs[map_page] + map_off); + RDSV3_DPRINTF4("rdsv3_ib_cong_recv", + "src: %p dst: %p copied: %d", src, dst, copied); + for (k = 0; k < to_copy; k += 8) { + /* + * Record ports that became uncongested, ie + * bits that changed from 0 to 1. + */ + uncongested |= ~(*src) & *dst; + *dst++ = *src++; + } + + copied += to_copy; + RDSV3_DPRINTF4("rdsv3_ib_cong_recv", + "src: %p dst: %p copied: %d", src, dst, copied); + + map_off += to_copy; + if (map_off == PAGE_SIZE) { + map_off = 0; + map_page++; + } + + frag_off += to_copy; + if (frag_off == RDSV3_FRAG_SIZE) { + frag = list_next(&ibinc->ii_frags, frag); + frag_off = 0; + } + } + +#if 0 +XXX + /* the congestion map is in little endian order */ + uncongested = le64_to_cpu(uncongested); +#endif + + rdsv3_cong_map_updated(map, uncongested); + + RDSV3_DPRINTF4("rdsv3_ib_cong_recv", "Return: conn: %p, ibinc: %p", + conn, ibinc); +} + +/* + * Rings are posted with all the allocations they'll need to queue the + * incoming message to the receiving socket so this can't fail. + * All fragments start with a header, so we can make sure we're not receiving + * garbage, and we can tell a small 8 byte fragment from an ACK frame. + */ +struct rdsv3_ib_ack_state { + uint64_t ack_next; + uint64_t ack_recv; + unsigned int ack_required:1; + unsigned int ack_next_valid:1; + unsigned int ack_recv_valid:1; +}; + +static void +rdsv3_ib_process_recv(struct rdsv3_connection *conn, + struct rdsv3_ib_recv_work *recv, uint32_t data_len, + struct rdsv3_ib_ack_state *state) +{ + struct rdsv3_ib_connection *ic = conn->c_transport_data; + struct rdsv3_ib_incoming *ibinc = ic->i_ibinc; + struct rdsv3_header *ihdr, *hdr; + + /* XXX shut down the connection if port 0,0 are seen? 
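(A 0,0 port pair with no payload is the legitimate ACK-only case handled below.)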
*/
+
+	RDSV3_DPRINTF5("rdsv3_ib_process_recv",
+	    "ic %p ibinc %p recv %p byte len %u", ic, ibinc, recv, data_len);
+
+	if (data_len < sizeof (struct rdsv3_header)) {
+		RDSV3_DPRINTF2("rdsv3_ib_process_recv",
+		    "incoming message from %u.%u.%u.%u didn't include a "
+		    "header, disconnecting and reconnecting",
+		    NIPQUAD(conn->c_faddr));
+		rdsv3_conn_drop(conn);
+		return;
+	}
+	data_len -= sizeof (struct rdsv3_header);
+
+	if ((ihdr = rdsv3_ib_get_header(conn, recv, data_len)) == NULL) {
+		RDSV3_DPRINTF2("rdsv3_ib_process_recv", "incoming message "
+		    "from %u.%u.%u.%u didn't have a proper version (0x%x) or "
+		    "data_len (0x%x), disconnecting and "
+		    "reconnecting",
+		    NIPQUAD(conn->c_faddr), conn->c_version, data_len);
+		rdsv3_conn_drop(conn);
+		return;
+	}
+
+	/* Validate the checksum. */
+	if (!rdsv3_message_verify_checksum(ihdr)) {
+		RDSV3_DPRINTF2("rdsv3_ib_process_recv", "incoming message "
+		    "from %u.%u.%u.%u has corrupted header - "
+		    "forcing a reconnect",
+		    NIPQUAD(conn->c_faddr));
+		rdsv3_conn_drop(conn);
+		rdsv3_stats_inc(s_recv_drop_bad_checksum);
+		return;
+	}
+
+	/* Process the ACK sequence which comes with every packet */
+	state->ack_recv = ntohll(ihdr->h_ack);
+	state->ack_recv_valid = 1;
+
+	/* Process the credits update if there was one */
+	if (ihdr->h_credit)
+		rdsv3_ib_send_add_credits(conn, ihdr->h_credit);
+
+	if (ihdr->h_sport == 0 && ihdr->h_dport == 0 && data_len == 0) {
+		/*
+		 * This is an ACK-only packet. It gets special treatment
+		 * here because, historically, ACKs were rather special
+		 * beasts.
+		 */
+		rdsv3_ib_stats_inc(s_ib_ack_received);
+
+		/*
+		 * Usually the frags make their way on to incs and are then
+		 * freed as the inc is freed. We don't go that route, so we
+		 * have to drop the page ref ourselves. We can't just leave
+		 * the page on the recv because that confuses the dma mapping
+		 * of pages and each recv's use of a partial page. We can
+		 * leave the frag, though, it will be reused.
+		 *
+		 * FIXME: Fold this into the code path below.
+		 */
+		rdsv3_ib_frag_drop_page(recv->r_frag);
+		return;
+	}
+
+	/*
+	 * If we don't already have an inc on the connection then this
+	 * fragment has a header and starts a message; copy its header
+	 * into the inc and save the inc so we can hang upcoming fragments
+	 * off its list.
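+	 * Later fragments of the same message are sanity-checked against
+	 * this saved header in the else branch below.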
+ */ + if (ibinc == NULL) { + ibinc = recv->r_ibinc; + recv->r_ibinc = NULL; + ic->i_ibinc = ibinc; + + hdr = &ibinc->ii_inc.i_hdr; + (void) memcpy(hdr, ihdr, sizeof (*hdr)); + ic->i_recv_data_rem = ntohl(hdr->h_len); + + RDSV3_DPRINTF5("rdsv3_ib_process_recv", + "ic %p ibinc %p rem %u flag 0x%x", ic, ibinc, + ic->i_recv_data_rem, hdr->h_flags); + } else { + hdr = &ibinc->ii_inc.i_hdr; + /* + * We can't just use memcmp here; fragments of a + * single message may carry different ACKs + */ + if (hdr->h_sequence != ihdr->h_sequence || + hdr->h_len != ihdr->h_len || + hdr->h_sport != ihdr->h_sport || + hdr->h_dport != ihdr->h_dport) { + RDSV3_DPRINTF2("rdsv3_ib_process_recv", + "fragment header mismatch; forcing reconnect"); + rdsv3_conn_drop(conn); + return; + } + } + + list_insert_tail(&ibinc->ii_frags, recv->r_frag); + recv->r_frag = NULL; + + if (ic->i_recv_data_rem > RDSV3_FRAG_SIZE) + ic->i_recv_data_rem -= RDSV3_FRAG_SIZE; + else { + ic->i_recv_data_rem = 0; + ic->i_ibinc = NULL; + + if (ibinc->ii_inc.i_hdr.h_flags == RDSV3_FLAG_CONG_BITMAP) + rdsv3_ib_cong_recv(conn, ibinc); + else { + rdsv3_recv_incoming(conn, conn->c_faddr, conn->c_laddr, + &ibinc->ii_inc, KM_NOSLEEP); + state->ack_next = ntohll(hdr->h_sequence); + state->ack_next_valid = 1; + } + + /* + * Evaluate the ACK_REQUIRED flag *after* we received + * the complete frame, and after bumping the next_rx + * sequence. + */ + if (hdr->h_flags & RDSV3_FLAG_ACK_REQUIRED) { + rdsv3_stats_inc(s_recv_ack_required); + state->ack_required = 1; + } + + rdsv3_inc_put(&ibinc->ii_inc); + } + + RDSV3_DPRINTF4("rdsv3_ib_process_recv", + "Return: conn: %p recv: %p len: %d state: %p", + conn, recv, data_len, state); +} + +/* + * Plucking the oldest entry from the ring can be done concurrently with + * the thread refilling the ring. Each ring operation is protected by + * spinlocks and the transient state of refilling doesn't change the + * recording of which entry is oldest. + * + * This relies on IB only calling one cq comp_handler for each cq so that + * there will only be one caller of rdsv3_recv_incoming() per RDS connection. + */ + +void +rdsv3_ib_recv_cq_comp_handler(struct ib_cq *cq, void *context) +{ + struct rdsv3_connection *conn = context; + struct rdsv3_ib_connection *ic = conn->c_transport_data; + + RDSV3_DPRINTF4("rdsv3_ib_recv_cq_comp_handler", + "Enter(conn: %p cq: %p)", conn, cq); + + rdsv3_ib_stats_inc(s_ib_rx_cq_call); + + (void) ddi_taskq_dispatch(ic->i_recv_tasklet, rdsv3_ib_recv_tasklet_fn, + (void *)ic, DDI_SLEEP); +} + +static inline void +rdsv3_poll_cq(struct rdsv3_ib_connection *ic, struct rdsv3_ib_ack_state *state) +{ + struct rdsv3_connection *conn = ic->conn; + ibt_wc_t wc; + struct rdsv3_ib_recv_work *recv; + uint_t polled; + + while (ibt_poll_cq(RDSV3_CQ2CQHDL(ic->i_recv_cq), &wc, 1, &polled) == + IBT_SUCCESS) { + RDSV3_DPRINTF5("rdsv3_ib_recv_cq_comp_handler", + "rwc wr_id 0x%llx status %u byte_len %u imm_data %u\n", + (unsigned long long)wc.wc_id, wc.wc_status, + wc.wc_bytes_xfer, ntohl(wc.wc_immed_data)); + rdsv3_ib_stats_inc(s_ib_rx_cq_event); + + recv = &ic->i_recvs[rdsv3_ib_ring_oldest(&ic->i_recv_ring)]; + + rdsv3_ib_recv_unmap_page(ic, recv); + + /* + * Also process recvs in connecting state because it is possible + * to get a recv completion _before_ the rdmacm ESTABLISHED + * event is processed. 
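+		 * Once the connection is neither up nor connecting, the
+		 * completion is skipped and only the ring entry is reclaimed.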
+ */ + if (rdsv3_conn_up(conn) || rdsv3_conn_connecting(conn)) { + /* + * We expect errors as the qp is drained during + * shutdown + */ + if (wc.wc_status == IBT_WC_SUCCESS) { + rdsv3_ib_process_recv(conn, recv, + wc.wc_bytes_xfer, state); + } else { + RDSV3_DPRINTF2("rdsv3_ib_recv_cq_comp_handler", + "recv completion on " + "%u.%u.%u.%u had status %u, " + "disconnecting and reconnecting\n", + NIPQUAD(conn->c_faddr), + wc.wc_status); + rdsv3_conn_drop(conn); + } + } + + rdsv3_ib_ring_free(&ic->i_recv_ring, 1); + } +} + +static processorid_t rdsv3_taskq_bind_cpuid = 0; +void +rdsv3_ib_recv_tasklet_fn(void *data) +{ + struct rdsv3_ib_connection *ic = (struct rdsv3_ib_connection *)data; + struct rdsv3_connection *conn = ic->conn; + struct rdsv3_ib_ack_state state = { 0, }; + cpu_t *cp; + + RDSV3_DPRINTF4("rdsv3_ib_recv_tasklet_fn", "Enter: ic: %p", ic); + + /* If not already bound, bind this thread to a CPU */ + if (ic->i_recv_tasklet_cpuid != rdsv3_taskq_bind_cpuid) { + cp = cpu[rdsv3_taskq_bind_cpuid]; + mutex_enter(&cpu_lock); + if (cpu_is_online(cp)) { + if (ic->i_recv_tasklet_cpuid >= 0) + thread_affinity_clear(curthread); + thread_affinity_set(curthread, rdsv3_taskq_bind_cpuid); + ic->i_recv_tasklet_cpuid = rdsv3_taskq_bind_cpuid; + } + mutex_exit(&cpu_lock); + } + + rdsv3_poll_cq(ic, &state); + (void) ibt_enable_cq_notify(RDSV3_CQ2CQHDL(ic->i_recv_cq), + IBT_NEXT_SOLICITED); + rdsv3_poll_cq(ic, &state); + + if (state.ack_next_valid) + rdsv3_ib_set_ack(ic, state.ack_next, state.ack_required); + if (state.ack_recv_valid && state.ack_recv > ic->i_ack_recv) { + rdsv3_send_drop_acked(conn, state.ack_recv, NULL); + ic->i_ack_recv = state.ack_recv; + } + if (rdsv3_conn_up(conn)) + rdsv3_ib_attempt_ack(ic); + + /* + * If we ever end up with a really empty receive ring, we're + * in deep trouble, as the sender will definitely see RNR + * timeouts. + */ + if (rdsv3_ib_ring_empty(&ic->i_recv_ring)) + rdsv3_ib_stats_inc(s_ib_rx_ring_empty); + + /* + * If the ring is running low, then schedule the thread to refill. + */ + if (rdsv3_ib_ring_low(&ic->i_recv_ring) && + (rdsv3_conn_up(conn) || rdsv3_conn_connecting(conn))) + rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_recv_w, 0); + + RDSV3_DPRINTF4("rdsv3_ib_recv_tasklet_fn", "Return: ic: %p", ic); +} + +int +rdsv3_ib_recv(struct rdsv3_connection *conn) +{ + struct rdsv3_ib_connection *ic = conn->c_transport_data; + int ret = 0; + + RDSV3_DPRINTF4("rdsv3_ib_recv", "conn %p\n", conn); + + /* + * If we get a temporary posting failure in this context then + * we're really low and we want the caller to back off for a bit. 
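+	 * The -ENOMEM return below is that back-off signal.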
+ */ + mutex_enter(&ic->i_recv_mutex); + if (rdsv3_ib_recv_refill(conn, KM_NOSLEEP, 0, 0)) + ret = -ENOMEM; + else + rdsv3_ib_stats_inc(s_ib_rx_refill_from_thread); + mutex_exit(&ic->i_recv_mutex); + + if (rdsv3_conn_up(conn)) + rdsv3_ib_attempt_ack(ic); + + RDSV3_DPRINTF4("rdsv3_ib_recv", "Return: conn: %p", conn); + + return (ret); +} + +uint_t MaxRecvMemory = 128 * 1024 * 1024; + +int +rdsv3_ib_recv_init(void) +{ + int ret = -ENOMEM; + + RDSV3_DPRINTF4("rdsv3_ib_recv_init", "Enter"); + + /* XXX - hard code it to 128 MB */ + rdsv3_ib_sysctl_max_recv_allocation = MaxRecvMemory / RDSV3_FRAG_SIZE; + + rdsv3_ib_incoming_slab = kmem_cache_create("rdsv3_ib_incoming", + sizeof (struct rdsv3_ib_incoming), 0, NULL, NULL, NULL, + NULL, NULL, 0); + if (rdsv3_ib_incoming_slab == NULL) + goto out; + + rdsv3_ib_frag_slab = kmem_cache_create("rdsv3_ib_frag", + sizeof (struct rdsv3_page_frag), + 0, NULL, NULL, NULL, NULL, NULL, 0); + if (rdsv3_ib_frag_slab == NULL) + kmem_cache_destroy(rdsv3_ib_incoming_slab); + else + ret = 0; + + RDSV3_DPRINTF4("rdsv3_ib_recv_init", "Return"); +out: + return (ret); +} + +void +rdsv3_ib_recv_exit(void) +{ + RDSV3_DPRINTF4("rdsv3_ib_recv_exit", "Enter"); + kmem_cache_destroy(rdsv3_ib_incoming_slab); + kmem_cache_destroy(rdsv3_ib_frag_slab); + RDSV3_DPRINTF4("rdsv3_ib_recv_exit", "Return"); +} diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/ib_ring.c b/usr/src/uts/common/io/ib/clients/rdsv3/ib_ring.c new file mode 100644 index 0000000000..889cc016d8 --- /dev/null +++ b/usr/src/uts/common/io/ib/clients/rdsv3/ib_ring.c @@ -0,0 +1,208 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <sys/rds.h> + +#include <sys/ib/clients/rdsv3/rdsv3.h> +#include <sys/ib/clients/rdsv3/ib.h> +#include <sys/ib/clients/rdsv3/rdsv3_debug.h> + +/* + * Locking for IB rings. + * We assume that allocation is always protected by a mutex + * in the caller (this is a valid assumption for the current + * implementation). + * + * Freeing always happens in an interrupt, and hence only + * races with allocations, but not with other free()s. + * + * The interaction between allocation and freeing is that + * the alloc code has to determine the number of free entries. + * To this end, we maintain two counters; an allocation counter + * and a free counter. Both are allowed to run freely, and wrap + * around. + * The number of used entries is always (alloc_ctr - free_ctr) % NR. + * + * The current implementation makes free_ctr atomic. When the + * caller finds an allocation fails, it should set an "alloc fail" + * bit and retry the allocation. The "alloc fail" bit essentially tells + * the CQ completion handlers to wake it up after freeing some + * more entries. + */ + +/* + * This only happens on shutdown. + */ +rdsv3_wait_queue_t rdsv3_ib_ring_empty_wait; + +void +rdsv3_ib_ring_init(struct rdsv3_ib_work_ring *ring, uint32_t nr) +{ + (void) memset(ring, 0, sizeof (*ring)); + ring->w_nr = nr; + RDSV3_DPRINTF5("rdsv3_ib_ring_init", "ring %p nr %u", ring, ring->w_nr); +} + +static inline uint32_t +__rdsv3_ib_ring_used(struct rdsv3_ib_work_ring *ring) +{ + uint32_t diff; + + /* This assumes that atomic_t has at least as many bits as uint32_t */ + diff = ring->w_alloc_ctr - (uint32_t)atomic_get(&ring->w_free_ctr); + ASSERT(diff <= ring->w_nr); + + return (diff); +} + +void +rdsv3_ib_ring_resize(struct rdsv3_ib_work_ring *ring, uint32_t nr) +{ + /* + * We only ever get called from the connection setup code, + * prior to creating the QP. 
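+	 * The ring must be idle at that point, hence the ASSERT below.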
+ */ + ASSERT(!__rdsv3_ib_ring_used(ring)); + ring->w_nr = nr; +} + +static int +__rdsv3_ib_ring_empty(struct rdsv3_ib_work_ring *ring) +{ + return (__rdsv3_ib_ring_used(ring) == 0); +} + +uint32_t +rdsv3_ib_ring_alloc(struct rdsv3_ib_work_ring *ring, uint32_t val, + uint32_t *pos) +{ + uint32_t ret = 0, avail; + + avail = ring->w_nr - __rdsv3_ib_ring_used(ring); + + RDSV3_DPRINTF5("rdsv3_ib_ring_alloc", + "ring %p val %u next %u free %u", ring, val, + ring->w_alloc_ptr, avail); + + if (val && avail) { + ret = min(val, avail); + *pos = ring->w_alloc_ptr; + + ring->w_alloc_ptr = (ring->w_alloc_ptr + ret) % ring->w_nr; + ring->w_alloc_ctr += ret; + } + + return (ret); +} + +void +rdsv3_ib_ring_free(struct rdsv3_ib_work_ring *ring, uint32_t val) +{ + ring->w_free_ptr = (ring->w_free_ptr + val) % ring->w_nr; + atomic_add_32(&ring->w_free_ctr, val); + + if (__rdsv3_ib_ring_empty(ring)) + rdsv3_wake_up(&rdsv3_ib_ring_empty_wait); +} + +void +rdsv3_ib_ring_unalloc(struct rdsv3_ib_work_ring *ring, uint32_t val) +{ + ring->w_alloc_ptr = (ring->w_alloc_ptr - val) % ring->w_nr; + ring->w_alloc_ctr -= val; +} + +int +rdsv3_ib_ring_empty(struct rdsv3_ib_work_ring *ring) +{ + return (__rdsv3_ib_ring_empty(ring)); +} + +int +rdsv3_ib_ring_low(struct rdsv3_ib_work_ring *ring) +{ + return (__rdsv3_ib_ring_used(ring) <= (ring->w_nr >> 1)); +} + +/* + * returns the oldest alloced ring entry. This will be the next one + * freed. This can't be called if there are none allocated. + */ +uint32_t +rdsv3_ib_ring_oldest(struct rdsv3_ib_work_ring *ring) +{ + return (ring->w_free_ptr); +} + +/* + * returns the number of completed work requests. + */ + +uint32_t +rdsv3_ib_ring_completed(struct rdsv3_ib_work_ring *ring, + uint32_t wr_id, uint32_t oldest) +{ + uint32_t ret; + + if (oldest <= (unsigned long long)wr_id) + ret = (unsigned long long)wr_id - oldest + 1; + else + ret = ring->w_nr - oldest + (unsigned long long)wr_id + 1; + + RDSV3_DPRINTF5("rdsv3_ib_ring_completed", + "ring %p ret %u wr_id %u oldest %u", ring, ret, wr_id, oldest); + return (ret); +} diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/ib_send.c b/usr/src/uts/common/io/ib/clients/rdsv3/ib_send.c new file mode 100644 index 0000000000..9a8ba2fd6c --- /dev/null +++ b/usr/src/uts/common/io/ib/clients/rdsv3/ib_send.c @@ -0,0 +1,1148 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <sys/rds.h> + +#include <sys/ib/clients/rdsv3/rdsv3.h> +#include <sys/ib/clients/rdsv3/rdma.h> +#include <sys/ib/clients/rdsv3/ib.h> +#include <sys/ib/clients/rdsv3/rdsv3_debug.h> + +static void +rdsv3_ib_send_rdma_complete(struct rdsv3_message *rm, + int wc_status) +{ + int notify_status; + + RDSV3_DPRINTF4("rdsv3_ib_send_rdma_complete", "rm: %p, wc_status: %d", + rm, wc_status); + + switch (wc_status) { + case IBT_WC_WR_FLUSHED_ERR: + return; + + case IBT_WC_SUCCESS: + notify_status = RDSV3_RDMA_SUCCESS; + break; + + case IBT_WC_REMOTE_ACCESS_ERR: + notify_status = RDSV3_RDMA_REMOTE_ERROR; + break; + + default: + notify_status = RDSV3_RDMA_OTHER_ERROR; + break; + } + rdsv3_rdma_send_complete(rm, notify_status); + + RDSV3_DPRINTF4("rdsv3_ib_send_rdma_complete", "rm: %p, wc_status: %d", + rm, wc_status); +} + +static void rdsv3_ib_dma_unmap_sg_rdma(struct ib_device *dev, + uint_t num, struct rdsv3_rdma_sg scat[]); + +void +rdsv3_ib_send_unmap_rdma(struct rdsv3_ib_connection *ic, + struct rdsv3_rdma_op *op) +{ + RDSV3_DPRINTF4("rdsv3_ib_send_unmap_rdma", "ic: %p, op: %p", ic, op); + if (op->r_mapped) { + op->r_mapped = 0; + if (ic->i_cm_id) { + rdsv3_ib_dma_unmap_sg_rdma(ic->i_cm_id->device, + op->r_nents, op->r_rdma_sg); + } else { + rdsv3_ib_dma_unmap_sg_rdma((struct ib_device *)NULL, + op->r_nents, op->r_rdma_sg); + } + } +} + +static void +rdsv3_ib_send_unmap_rm(struct rdsv3_ib_connection *ic, + struct rdsv3_ib_send_work *send, + int wc_status) +{ + struct rdsv3_message *rm = send->s_rm; + + RDSV3_DPRINTF4("rdsv3_ib_send_unmap_rm", "ic %p send %p rm %p\n", + ic, send, rm); + + rdsv3_ib_dma_unmap_sg(ic->i_cm_id->device, + rm->m_sg, rm->m_nents); + + if (rm->m_rdma_op != NULL) { + rdsv3_ib_send_unmap_rdma(ic, rm->m_rdma_op); + + /* + * If the user asked for a completion notification on this + * message, we can implement three different semantics: + * 1. Notify when we received the ACK on the RDS message + * that was queued with the RDMA. This provides reliable + * notification of RDMA status at the expense of a one-way + * packet delay. + * 2. Notify when the IB stack gives us the completion + * event for the RDMA operation. + * 3. Notify when the IB stack gives us the completion + * event for the accompanying RDS messages. 
Here, we implement approach #3. To implement approach #2,
+		 * call rdsv3_rdma_send_complete from the cq_handler.
+		 * To implement #1, don't call rdsv3_rdma_send_complete
+		 * at all, and fall back to the notify handling in the
+		 * ACK processing code.
+		 *
+		 * Note: There's no need to explicitly sync any RDMA buffers
+		 * using ib_dma_sync_sg_for_cpu - the completion for the RDMA
+		 * operation itself unmapped the RDMA buffers, which takes care
+		 * of synching.
+		 */
+		rdsv3_ib_send_rdma_complete(rm, wc_status);
+
+		if (rm->m_rdma_op->r_write)
+			rdsv3_stats_add(s_send_rdma_bytes,
+			    rm->m_rdma_op->r_bytes);
+		else
+			rdsv3_stats_add(s_recv_rdma_bytes,
+			    rm->m_rdma_op->r_bytes);
+	}
+
+	/*
+	 * If anyone waited for this message to get flushed out, wake
+	 * them up now
+	 */
+	rdsv3_message_unmapped(rm);
+
+	rdsv3_message_put(rm);
+	send->s_rm = NULL;
+}
+
+void
+rdsv3_ib_send_init_ring(struct rdsv3_ib_connection *ic)
+{
+	struct rdsv3_ib_send_work *send;
+	uint32_t i;
+
+	RDSV3_DPRINTF4("rdsv3_ib_send_init_ring", "ic: %p", ic);
+
+	for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) {
+		send->s_rm = NULL;
+		send->s_op = NULL;
+	}
+}
+
+void
+rdsv3_ib_send_clear_ring(struct rdsv3_ib_connection *ic)
+{
+	struct rdsv3_ib_send_work *send;
+	uint32_t i;
+
+	RDSV3_DPRINTF4("rdsv3_ib_send_clear_ring", "ic: %p", ic);
+
+	for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) {
+		if (send->s_opcode == 0xdd)
+			continue;
+		if (send->s_rm)
+			rdsv3_ib_send_unmap_rm(ic, send, IBT_WC_WR_FLUSHED_ERR);
+		if (send->s_op)
+			rdsv3_ib_send_unmap_rdma(ic, send->s_op);
+	}
+
+	RDSV3_DPRINTF4("rdsv3_ib_send_clear_ring", "Return: ic: %p", ic);
+}
+
+/*
+ * The _oldest/_free ring operations here race cleanly with the alloc/unalloc
+ * operations performed in the send path. As the sender allocs and potentially
+ * unallocs the next free entry in the ring it doesn't alter which is
+ * the next to be freed, which is what this is concerned with.
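+ * Entries are therefore freed strictly in completion (wr_id) order,
+ * oldest first.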
+ */
+void
+rdsv3_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
+{
+	struct rdsv3_connection *conn = context;
+	struct rdsv3_ib_connection *ic = conn->c_transport_data;
+	ibt_wc_t wc;
+	struct rdsv3_ib_send_work *send;
+	uint32_t completed, polled;
+	uint32_t oldest;
+	uint32_t i = 0;
+	int ret;
+
+	RDSV3_DPRINTF4("rdsv3_ib_send_cq_comp_handler", "conn: %p cq: %p",
+	    conn, cq);
+
+	rdsv3_ib_stats_inc(s_ib_tx_cq_call);
+	ret = ibt_enable_cq_notify(RDSV3_CQ2CQHDL(cq), IBT_NEXT_COMPLETION);
+	if (ret)
+		RDSV3_DPRINTF2("rdsv3_ib_send_cq_comp_handler",
+		    "ib_req_notify_cq send failed: %d", ret);
+
+	while (ibt_poll_cq(RDSV3_CQ2CQHDL(cq), &wc, 1, &polled) ==
+	    IBT_SUCCESS) {
+		RDSV3_DPRINTF5("rdsv3_ib_send_cq_comp_handler",
+		    "swc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
+		    (unsigned long long)wc.wc_id, wc.wc_status,
+		    wc.wc_bytes_xfer, ntohl(wc.wc_immed_data));
+		rdsv3_ib_stats_inc(s_ib_tx_cq_event);
+
+		if (wc.wc_id == RDSV3_IB_ACK_WR_ID) {
+			if (ic->i_ack_queued + HZ/2 < jiffies)
+				rdsv3_ib_stats_inc(s_ib_tx_stalled);
+			rdsv3_ib_ack_send_complete(ic);
+			continue;
+		}
+
+		oldest = rdsv3_ib_ring_oldest(&ic->i_send_ring);
+
+		completed = rdsv3_ib_ring_completed(&ic->i_send_ring,
+		    wc.wc_id, oldest);
+
+		for (i = 0; i < completed; i++) {
+			send = &ic->i_sends[oldest];
+
+			/*
+			 * In the error case, wc.opcode sometimes contains
+			 * garbage
+			 */
+			switch (send->s_opcode) {
+			case IBT_WRC_SEND:
+				if (send->s_rm)
+					rdsv3_ib_send_unmap_rm(ic, send,
+					    wc.wc_status);
+				break;
+			case IBT_WRC_RDMAW:
+			case IBT_WRC_RDMAR:
+				/*
+				 * Nothing to be done - the SG list will be
+				 * unmapped when the SEND completes.
+				 */
+				break;
+			default:
+#ifndef __lock_lint
+				RDSV3_DPRINTF0("rdsv3_ib_send_cq_comp_handler",
+				    "RDS/IB: %s: unexpected opcode "
+				    "0x%x in WR!",
+				    __func__, send->s_opcode);
+#endif
+				break;
+			}
+
+			send->s_opcode = 0xdd;
+			if (send->s_queued + HZ/2 < jiffies)
+				rdsv3_ib_stats_inc(s_ib_tx_stalled);
+
+			/*
+			 * If an RDMA operation produced an error, signal
+			 * this right away. If we don't, the subsequent SEND
+			 * that goes with this RDMA will be canceled with
+			 * ERR_WFLUSH, and the application never learns that
+			 * the RDMA failed.
+			 */
+			if (wc.wc_status ==
+			    IBT_WC_REMOTE_ACCESS_ERR && send->s_op) {
+				struct rdsv3_message *rm;
+
+				rm = rdsv3_send_get_message(conn, send->s_op);
+				if (rm) {
+					if (rm->m_rdma_op != NULL)
+						rdsv3_ib_send_unmap_rdma(ic,
+						    rm->m_rdma_op);
+					rdsv3_ib_send_rdma_complete(rm,
+					    wc.wc_status);
+					rdsv3_message_put(rm);
+				}
+			}
+
+			oldest = (oldest + 1) % ic->i_send_ring.w_nr;
+		}
+
+		RDSV3_DPRINTF4("rdsv3_ib_send_cq_comp_handler", "compl: %d",
+		    completed);
+		rdsv3_ib_ring_free(&ic->i_send_ring, completed);
+
+		if (test_and_clear_bit(RDSV3_LL_SEND_FULL, &conn->c_flags) ||
+		    test_bit(0, &conn->c_map_queued))
+			rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_send_w, 0);
+
+		/* We expect errors as the qp is drained during shutdown */
+		if (wc.wc_status != IBT_WC_SUCCESS && rdsv3_conn_up(conn)) {
+			RDSV3_DPRINTF2("rdsv3_ib_send_cq_comp_handler",
+			    "send completion on %u.%u.%u.%u "
+			    "had status %u, disconnecting and reconnecting\n",
+			    NIPQUAD(conn->c_faddr), wc.wc_status);
+			rdsv3_conn_drop(conn);
+		}
+	}
+
+	RDSV3_DPRINTF4("rdsv3_ib_send_cq_comp_handler",
+	    "Return: conn: %p, cq: %p", conn, cq);
+}
+
+/*
+ * This is the main function for allocating credits when sending
+ * messages.
+ *
+ * Conceptually, we have two counters:
+ *	- send credits: this tells us how many WRs we're allowed
+ *	  to submit without overrunning the receiver's queue.
For + * each SEND WR we post, we decrement this by one. + * + * - posted credits: this tells us how many WRs we recently + * posted to the receive queue. This value is transferred + * to the peer as a "credit update" in a RDS header field. + * Every time we transmit credits to the peer, we subtract + * the amount of transferred credits from this counter. + * + * It is essential that we avoid situations where both sides have + * exhausted their send credits, and are unable to send new credits + * to the peer. We achieve this by requiring that we send at least + * one credit update to the peer before exhausting our credits. + * When new credits arrive, we subtract one credit that is withheld + * until we've posted new buffers and are ready to transmit these + * credits (see rdsv3_ib_send_add_credits below). + * + * The RDS send code is essentially single-threaded; rdsv3_send_xmit + * grabs c_send_lock to ensure exclusive access to the send ring. + * However, the ACK sending code is independent and can race with + * message SENDs. + * + * In the send path, we need to update the counters for send credits + * and the counter of posted buffers atomically - when we use the + * last available credit, we cannot allow another thread to race us + * and grab the posted credits counter. Hence, we have to use a + * spinlock to protect the credit counter, or use atomics. + * + * Spinlocks shared between the send and the receive path are bad, + * because they create unnecessary delays. An early implementation + * using a spinlock showed a 5% degradation in throughput at some + * loads. + * + * This implementation avoids spinlocks completely, putting both + * counters into a single atomic, and updating that atomic using + * atomic_add (in the receive path, when receiving fresh credits), + * and using atomic_cmpxchg when updating the two counters. + */ +int +rdsv3_ib_send_grab_credits(struct rdsv3_ib_connection *ic, + uint32_t wanted, uint32_t *adv_credits, int need_posted, int max_posted) +{ + unsigned int avail, posted, got = 0, advertise; + long oldval, newval; + + RDSV3_DPRINTF4("rdsv3_ib_send_grab_credits", "ic: %p, %d %d %d %d", + ic, wanted, *adv_credits, need_posted, max_posted); + + *adv_credits = 0; + if (!ic->i_flowctl) + return (wanted); + +try_again: + advertise = 0; + oldval = newval = atomic_get(&ic->i_credits); + posted = IB_GET_POST_CREDITS(oldval); + avail = IB_GET_SEND_CREDITS(oldval); + + RDSV3_DPRINTF5("rdsv3_ib_send_grab_credits", + "wanted (%u): credits=%u posted=%u\n", wanted, avail, posted); + + /* The last credit must be used to send a credit update. */ + if (avail && !posted) + avail--; + + if (avail < wanted) { + struct rdsv3_connection *conn = ic->i_cm_id->context; + + /* Oops, there aren't that many credits left! */ + set_bit(RDSV3_LL_SEND_FULL, &conn->c_flags); + got = avail; + } else { + /* Sometimes you get what you want, lalala. */ + got = wanted; + } + newval -= IB_SET_SEND_CREDITS(got); + + /* + * If need_posted is non-zero, then the caller wants + * the posted regardless of whether any send credits are + * available. 
+ */ + if (posted && (got || need_posted)) { + advertise = min(posted, max_posted); + newval -= IB_SET_POST_CREDITS(advertise); + } + + /* Finally bill everything */ + if (atomic_cmpxchg(&ic->i_credits, oldval, newval) != oldval) + goto try_again; + + *adv_credits = advertise; + + RDSV3_DPRINTF4("rdsv3_ib_send_grab_credits", "ic: %p, %d %d %d %d", + ic, got, *adv_credits, need_posted, max_posted); + return (got); +} + +void +rdsv3_ib_send_add_credits(struct rdsv3_connection *conn, unsigned int credits) +{ + struct rdsv3_ib_connection *ic = conn->c_transport_data; + + if (credits == 0) + return; + + RDSV3_DPRINTF5("rdsv3_ib_send_add_credits", + "credits (%u): current=%u%s\n", + credits, + IB_GET_SEND_CREDITS(atomic_get(&ic->i_credits)), + test_bit(RDSV3_LL_SEND_FULL, &conn->c_flags) ? + ", ll_send_full" : ""); + + atomic_add_32(&ic->i_credits, IB_SET_SEND_CREDITS(credits)); + if (test_and_clear_bit(RDSV3_LL_SEND_FULL, &conn->c_flags)) + rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_send_w, 0); + + ASSERT(!(IB_GET_SEND_CREDITS(credits) >= 16384)); + + rdsv3_ib_stats_inc(s_ib_rx_credit_updates); + + RDSV3_DPRINTF4("rdsv3_ib_send_add_credits", + "Return: conn: %p, credits: %d", + conn, credits); +} + +void +rdsv3_ib_advertise_credits(struct rdsv3_connection *conn, unsigned int posted) +{ + struct rdsv3_ib_connection *ic = conn->c_transport_data; + + RDSV3_DPRINTF4("rdsv3_ib_advertise_credits", "conn: %p, posted: %d", + conn, posted); + + if (posted == 0) + return; + + atomic_add_32(&ic->i_credits, IB_SET_POST_CREDITS(posted)); + + /* + * Decide whether to send an update to the peer now. + * If we would send a credit update for every single buffer we + * post, we would end up with an ACK storm (ACK arrives, + * consumes buffer, we refill the ring, send ACK to remote + * advertising the newly posted buffer... ad inf) + * + * Performance pretty much depends on how often we send + * credit updates - too frequent updates mean lots of ACKs. + * Too infrequent updates, and the peer will run out of + * credits and has to throttle. + * For the time being, 16 seems to be a good compromise. + */ + if (IB_GET_POST_CREDITS(atomic_get(&ic->i_credits)) >= 16) + set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); +} + +static inline void +rdsv3_ib_xmit_populate_wr(struct rdsv3_ib_connection *ic, + ibt_send_wr_t *wr, unsigned int pos, + struct rdsv3_scatterlist *scat, unsigned int off, unsigned int length, + int send_flags) +{ + ibt_wr_ds_t *sge; + + RDSV3_DPRINTF4("rdsv3_ib_xmit_populate_wr", + "ic: %p, wr: %p scat: %p %d %d %d %d", + ic, wr, scat, pos, off, length, send_flags); + + wr->wr_id = pos; + wr->wr_trans = IBT_RC_SRV; + wr->wr_flags = send_flags; + wr->wr_opcode = IBT_WRC_SEND; + + if (length != 0) { + int ix, len, assigned; + ibt_wr_ds_t *sgl; + + ASSERT(length <= scat->length - off); + + sgl = scat->sgl; + if (off != 0) { + /* find the right sgl to begin with */ + while (sgl->ds_len <= off) { + off -= sgl->ds_len; + sgl++; + } + } + + ix = 1; /* first data sgl is at 1 */ + assigned = 0; + len = length; + do { + sge = &wr->wr_sgl[ix++]; + sge->ds_va = sgl->ds_va + off; + assigned = min(len, sgl->ds_len - off); + sge->ds_len = assigned; + sge->ds_key = sgl->ds_key; + len -= assigned; + if (len != 0) { + sgl++; + off = 0; + } + } while (len > 0); + + wr->wr_nds = ix; + } else { + /* + * We're sending a packet with no payload. 
There is only + * one SGE + */ + wr->wr_nds = 1; + } + + sge = &wr->wr_sgl[0]; + sge->ds_va = ic->i_send_hdrs_dma + (pos * sizeof (struct rdsv3_header)); + sge->ds_len = sizeof (struct rdsv3_header); + sge->ds_key = ic->i_mr->lkey; + + RDSV3_DPRINTF4("rdsv3_ib_xmit_populate_wr", + "Return: ic: %p, wr: %p scat: %p", ic, wr, scat); +} + +/* + * This can be called multiple times for a given message. The first time + * we see a message we map its scatterlist into the IB device so that + * we can provide that mapped address to the IB scatter gather entries + * in the IB work requests. We translate the scatterlist into a series + * of work requests that fragment the message. These work requests complete + * in order so we pass ownership of the message to the completion handler + * once we send the final fragment. + * + * The RDS core uses the c_send_lock to only enter this function once + * per connection. This makes sure that the tx ring alloc/unalloc pairs + * don't get out of sync and confuse the ring. + */ +int +rdsv3_ib_xmit(struct rdsv3_connection *conn, struct rdsv3_message *rm, + unsigned int hdr_off, unsigned int sg, unsigned int off) +{ + struct rdsv3_ib_connection *ic = conn->c_transport_data; + struct ib_device *dev = ic->i_cm_id->device; + struct rdsv3_ib_send_work *send = NULL; + struct rdsv3_ib_send_work *first; + struct rdsv3_ib_send_work *prev; + ibt_send_wr_t *wr; + struct rdsv3_scatterlist *scat; + uint32_t pos; + uint32_t i; + uint32_t work_alloc; + uint32_t credit_alloc; + uint32_t posted; + uint32_t adv_credits = 0; + int send_flags = 0; + int sent; + int ret; + int flow_controlled = 0; + + RDSV3_DPRINTF4("rdsv3_ib_xmit", "conn: %p, rm: %p", conn, rm); + + ASSERT(!(off % RDSV3_FRAG_SIZE)); + ASSERT(!(hdr_off != 0 && hdr_off != sizeof (struct rdsv3_header))); + + /* Do not send cong updates to IB loopback */ + if (conn->c_loopback && + rm->m_inc.i_hdr.h_flags & RDSV3_FLAG_CONG_BITMAP) { + rdsv3_cong_map_updated(conn->c_fcong, ~(uint64_t)0); + return (sizeof (struct rdsv3_header) + RDSV3_CONG_MAP_BYTES); + } + +#ifndef __lock_lint + /* FIXME we may overallocate here */ + if (ntohl(rm->m_inc.i_hdr.h_len) == 0) + i = 1; + else + i = ceil(ntohl(rm->m_inc.i_hdr.h_len), RDSV3_FRAG_SIZE); +#endif + + work_alloc = rdsv3_ib_ring_alloc(&ic->i_send_ring, i, &pos); + if (work_alloc == 0) { + set_bit(RDSV3_LL_SEND_FULL, &conn->c_flags); + rdsv3_ib_stats_inc(s_ib_tx_ring_full); + ret = -ENOMEM; + goto out; + } + + credit_alloc = work_alloc; + if (ic->i_flowctl) { + credit_alloc = rdsv3_ib_send_grab_credits(ic, work_alloc, + &posted, 0, RDSV3_MAX_ADV_CREDIT); + adv_credits += posted; + if (credit_alloc < work_alloc) { + rdsv3_ib_ring_unalloc(&ic->i_send_ring, + work_alloc - credit_alloc); + work_alloc = credit_alloc; + flow_controlled++; + } + if (work_alloc == 0) { + set_bit(RDSV3_LL_SEND_FULL, &conn->c_flags); + rdsv3_ib_stats_inc(s_ib_tx_throttle); + ret = -ENOMEM; + goto out; + } + } + + /* map the message the first time we see it */ + if (ic->i_rm == NULL) { + /* + * printk(KERN_NOTICE + * "rdsv3_ib_xmit prep msg dport=%u flags=0x%x len=%d\n", + * be16_to_cpu(rm->m_inc.i_hdr.h_dport), + * rm->m_inc.i_hdr.h_flags, + * be32_to_cpu(rm->m_inc.i_hdr.h_len)); + */ + if (rm->m_nents) { + rm->m_count = rdsv3_ib_dma_map_sg(dev, + rm->m_sg, rm->m_nents); + RDSV3_DPRINTF5("rdsv3_ib_xmit", + "ic %p mapping rm %p: %d\n", ic, rm, rm->m_count); + if (rm->m_count == 0) { + rdsv3_ib_stats_inc(s_ib_tx_sg_mapping_failure); + rdsv3_ib_ring_unalloc(&ic->i_send_ring, + work_alloc); + ret = -ENOMEM; /* XXX 
? */ + RDSV3_DPRINTF2("rdsv3_ib_xmit", + "fail: ic %p mapping rm %p: %d\n", + ic, rm, rm->m_count); + goto out; + } + } else { + rm->m_count = 0; + } + + ic->i_unsignaled_wrs = rdsv3_ib_sysctl_max_unsig_wrs; + ic->i_unsignaled_bytes = rdsv3_ib_sysctl_max_unsig_bytes; + rdsv3_message_addref(rm); + ic->i_rm = rm; + + /* Finalize the header */ + if (test_bit(RDSV3_MSG_ACK_REQUIRED, &rm->m_flags)) + rm->m_inc.i_hdr.h_flags |= RDSV3_FLAG_ACK_REQUIRED; + if (test_bit(RDSV3_MSG_RETRANSMITTED, &rm->m_flags)) + rm->m_inc.i_hdr.h_flags |= RDSV3_FLAG_RETRANSMITTED; + + /* + * If it has a RDMA op, tell the peer we did it. This is + * used by the peer to release use-once RDMA MRs. + */ + if (rm->m_rdma_op) { + struct rdsv3_ext_header_rdma ext_hdr; + + ext_hdr.h_rdma_rkey = htonl(rm->m_rdma_op->r_key); + (void) rdsv3_message_add_extension(&rm->m_inc.i_hdr, + RDSV3_EXTHDR_RDMA, &ext_hdr, + sizeof (ext_hdr)); + } + if (rm->m_rdma_cookie) { + (void) rdsv3_message_add_rdma_dest_extension( + &rm->m_inc.i_hdr, + rdsv3_rdma_cookie_key(rm->m_rdma_cookie), + rdsv3_rdma_cookie_offset(rm->m_rdma_cookie)); + } + + /* + * Note - rdsv3_ib_piggyb_ack clears the ACK_REQUIRED bit, so + * we should not do this unless we have a chance of at least + * sticking the header into the send ring. Which is why we + * should call rdsv3_ib_ring_alloc first. + */ + rm->m_inc.i_hdr.h_ack = htonll(rdsv3_ib_piggyb_ack(ic)); + rdsv3_message_make_checksum(&rm->m_inc.i_hdr); + + /* + * Update adv_credits since we reset the ACK_REQUIRED bit. + */ + (void) rdsv3_ib_send_grab_credits(ic, 0, &posted, 1, + RDSV3_MAX_ADV_CREDIT - adv_credits); + adv_credits += posted; + ASSERT(adv_credits <= 255); + } else if (ic->i_rm != rm) + RDSV3_PANIC(); + + send = &ic->i_sends[pos]; + first = send; + prev = NULL; + scat = &rm->m_sg[sg]; + sent = 0; + i = 0; + + /* + * Sometimes you want to put a fence between an RDMA + * READ and the following SEND. + * We could either do this all the time + * or when requested by the user. Right now, we let + * the application choose. + */ + if (rm->m_rdma_op && rm->m_rdma_op->r_fence) + send_flags = IBT_WR_SEND_FENCE; + + /* + * We could be copying the header into the unused tail of the page. + * That would need to be changed in the future when those pages might + * be mapped userspace pages or page cache pages. So instead we always + * use a second sge and our long-lived ring of mapped headers. We send + * the header after the data so that the data payload can be aligned on + * the receiver. + */ + + /* handle a 0-len message */ + if (ntohl(rm->m_inc.i_hdr.h_len) == 0) { + wr = &ic->i_send_wrs[0]; + rdsv3_ib_xmit_populate_wr(ic, wr, pos, NULL, 0, 0, send_flags); + send->s_queued = jiffies; + send->s_op = NULL; + send->s_opcode = wr->wr_opcode; + goto add_header; + } + + /* if there's data reference it with a chain of work reqs */ + for (; i < work_alloc && scat != &rm->m_sg[rm->m_count]; i++) { + unsigned int len; + + send = &ic->i_sends[pos]; + + wr = &ic->i_send_wrs[i]; + len = min(RDSV3_FRAG_SIZE, + rdsv3_ib_sg_dma_len(dev, scat) - off); + rdsv3_ib_xmit_populate_wr(ic, wr, pos, scat, off, len, + send_flags); + send->s_queued = jiffies; + send->s_op = NULL; + send->s_opcode = wr->wr_opcode; + + /* + * We want to delay signaling completions just enough to get + * the batching benefits but not so much that we create dead + * time + * on the wire. 
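+ *
+ * (Editorial note, not part of the original change: with the
+ * default rdsv3_ib_sysctl_max_unsig_wrs of 16, roughly every 16th
+ * WR -- or any WR that exhausts the unsignaled-bytes budget --
+ * carries IBT_WR_SEND_SIGNAL, bounding completion latency while
+ * still amortizing CQ events.)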
+ */ + if (ic->i_unsignaled_wrs-- == 0) { + ic->i_unsignaled_wrs = rdsv3_ib_sysctl_max_unsig_wrs; + wr->wr_flags |= + IBT_WR_SEND_SIGNAL | IBT_WR_SEND_SOLICIT; + } + + ic->i_unsignaled_bytes -= len; + if (ic->i_unsignaled_bytes <= 0) { + ic->i_unsignaled_bytes = + rdsv3_ib_sysctl_max_unsig_bytes; + wr->wr_flags |= + IBT_WR_SEND_SIGNAL | IBT_WR_SEND_SOLICIT; + } + + /* + * Always signal the last one if we're stopping due to flow + * control. + */ + if (flow_controlled && i == (work_alloc-1)) { + wr->wr_flags |= + IBT_WR_SEND_SIGNAL | IBT_WR_SEND_SOLICIT; + } + + RDSV3_DPRINTF5("rdsv3_ib_xmit", "send %p wr %p num_sge %u \n", + send, wr, wr->wr_nds); + + sent += len; + off += len; + if (off == rdsv3_ib_sg_dma_len(dev, scat)) { + scat++; + off = 0; + } + +add_header: + /* + * Tack on the header after the data. The header SGE + * should already + * have been set up to point to the right header buffer. + */ + (void) memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, + sizeof (struct rdsv3_header)); + + if (0) { + struct rdsv3_header *hdr = &ic->i_send_hdrs[pos]; + + RDSV3_DPRINTF0("rdsv3_ib_xmit", + "send WR dport=%u flags=0x%x len=%d", + ntohs(hdr->h_dport), + hdr->h_flags, + ntohl(hdr->h_len)); + } + if (adv_credits) { + struct rdsv3_header *hdr = &ic->i_send_hdrs[pos]; + + /* add credit and redo the header checksum */ + hdr->h_credit = adv_credits; + rdsv3_message_make_checksum(hdr); + adv_credits = 0; + rdsv3_ib_stats_inc(s_ib_tx_credit_updates); + } + + prev = send; + + pos = (pos + 1) % ic->i_send_ring.w_nr; + } + + /* + * Account the RDS header in the number of bytes we sent, but just once. + * The caller has no concept of fragmentation. + */ + if (hdr_off == 0) + sent += sizeof (struct rdsv3_header); + + /* if we finished the message then send completion owns it */ + if (scat == &rm->m_sg[rm->m_count]) { + prev->s_rm = ic->i_rm; + wr->wr_flags |= IBT_WR_SEND_SIGNAL | IBT_WR_SEND_SOLICIT; + ic->i_rm = NULL; + } + + if (i < work_alloc) { + rdsv3_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i); + work_alloc = i; + } + if (ic->i_flowctl && i < credit_alloc) + rdsv3_ib_send_add_credits(conn, credit_alloc - i); + + /* XXX need to worry about failed_wr and partial sends. 
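(editorial note, not part of the original change: a short post is currently only logged via the posted != i check below; unposted ring entries are reclaimed only when ibt_post_send() itself fails)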
*/ + ret = ibt_post_send(ib_get_ibt_channel_hdl(ic->i_cm_id), + ic->i_send_wrs, i, &posted); + if (posted != i) { + RDSV3_DPRINTF1("rdsv3_ib_xmit", + "ic %p first %p nwr: %d ret %d:%d", + ic, first, i, ret, posted); + } + if (ret) { + RDSV3_DPRINTF0("rdsv3_ib_xmit", + "RDS/IB: ib_post_send to %u.%u.%u.%u " + "returned %d\n", NIPQUAD(conn->c_faddr), ret); + rdsv3_ib_ring_unalloc(&ic->i_send_ring, work_alloc); + if (prev->s_rm) { + ic->i_rm = prev->s_rm; + prev->s_rm = NULL; + } +#if 1 + RDSV3_DPRINTF2("rdsv3_ib_xmit", "ibt_post_send FAIL"); + ret = -EAGAIN; +#else + /* Finesse this later */ + RDSV3_PANIC(); +#endif + goto out; + } + + ret = sent; + + RDSV3_DPRINTF4("rdsv3_ib_xmit", "Return: conn: %p, rm: %p", conn, rm); +out: + ASSERT(!adv_credits); + return (ret); +} + +static void +rdsv3_ib_dma_unmap_sg_rdma(struct ib_device *dev, uint_t num, + struct rdsv3_rdma_sg scat[]) +{ + ibt_hca_hdl_t hca_hdl; + int i; + int num_sgl; + + RDSV3_DPRINTF4("rdsv3_ib_dma_unmap_sg", "rdma_sg: %p", scat); + + if (dev) { + hca_hdl = ib_get_ibt_hca_hdl(dev); + } else { + hca_hdl = scat[0].hca_hdl; + RDSV3_DPRINTF2("rdsv3_ib_dma_unmap_sg_rdma", + "NULL dev use cached hca_hdl %p", hca_hdl); + } + + if (hca_hdl == NULL) + return; + scat[0].hca_hdl = NULL; + + for (i = 0; i < num; i++) { + if (scat[i].mihdl != NULL) { + num_sgl = (scat[i].iovec.bytes / PAGESIZE) + 2; + kmem_free(scat[i].swr.wr_sgl, + (num_sgl * sizeof (ibt_wr_ds_t))); + scat[i].swr.wr_sgl = NULL; + (void) ibt_unmap_mem_iov(hca_hdl, scat[i].mihdl); + scat[i].mihdl = NULL; + } else + break; + } +} + +/* ARGSUSED */ +uint_t +rdsv3_ib_dma_map_sg_rdma(struct ib_device *dev, struct rdsv3_rdma_sg scat[], + uint_t num, struct rdsv3_scatterlist **scatl) +{ + ibt_hca_hdl_t hca_hdl; + ibt_iov_attr_t iov_attr; + struct buf *bp; + uint_t i, j, k; + uint_t count; + struct rdsv3_scatterlist *sg; + int ret; + + RDSV3_DPRINTF4("rdsv3_ib_dma_map_sg_rdma", "scat: %p, num: %d", + scat, num); + + hca_hdl = ib_get_ibt_hca_hdl(dev); + scat[0].hca_hdl = hca_hdl; + bzero(&iov_attr, sizeof (ibt_iov_attr_t)); + iov_attr.iov_flags = IBT_IOV_BUF; + iov_attr.iov_lso_hdr_sz = 0; + + for (i = 0, count = 0; i < num; i++) { + /* transpose umem_cookie to buf structure */ + bp = ddi_umem_iosetup(scat[i].umem_cookie, + scat[i].iovec.addr & PAGEOFFSET, scat[i].iovec.bytes, + B_WRITE, 0, 0, NULL, DDI_UMEM_SLEEP); + if (bp == NULL) { + /* free resources and return error */ + goto out; + } + /* setup ibt_map_mem_iov() attributes */ + iov_attr.iov_buf = bp; + iov_attr.iov_wr_nds = (scat[i].iovec.bytes / PAGESIZE) + 2; + scat[i].swr.wr_sgl = + kmem_zalloc(iov_attr.iov_wr_nds * sizeof (ibt_wr_ds_t), + KM_SLEEP); + + ret = ibt_map_mem_iov(hca_hdl, &iov_attr, + (ibt_all_wr_t *)&scat[i].swr, &scat[i].mihdl); + freerbuf(bp); + if (ret != IBT_SUCCESS) { + RDSV3_DPRINTF2("rdsv3_ib_dma_map_sg_rdma", + "ibt_map_mem_iov returned: %d", ret); + /* free resources and return error */ + kmem_free(scat[i].swr.wr_sgl, + iov_attr.iov_wr_nds * sizeof (ibt_wr_ds_t)); + goto out; + } + count += scat[i].swr.wr_nds; + +#ifdef DEBUG + for (j = 0; j < scat[i].swr.wr_nds; j++) { + RDSV3_DPRINTF5("rdsv3_ib_dma_map_sg_rdma", + "sgl[%d] va %llx len %x", j, + scat[i].swr.wr_sgl[j].ds_va, + scat[i].swr.wr_sgl[j].ds_len); + } +#endif + RDSV3_DPRINTF4("rdsv3_ib_dma_map_sg_rdma", + "iovec.bytes: 0x%x scat[%d]swr.wr_nds: %d", + scat[i].iovec.bytes, i, scat[i].swr.wr_nds); + } + + count = ((count - 1) / RDSV3_IB_MAX_SGE) + 1; + RDSV3_DPRINTF4("rdsv3_ib_dma_map_sg_rdma", "Ret: num: %d", count); + return (count); + +out: + 
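/* editorial note, not in the original change: this unwind is safe on a partially built scat[], since rdsv3_ib_dma_unmap_sg_rdma() stops at the first entry whose mihdl is NULL */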
rdsv3_ib_dma_unmap_sg_rdma(dev, num, scat); + return (0); +} + +int +rdsv3_ib_xmit_rdma(struct rdsv3_connection *conn, struct rdsv3_rdma_op *op) +{ + struct rdsv3_ib_connection *ic = conn->c_transport_data; + struct rdsv3_ib_send_work *send = NULL; + struct rdsv3_rdma_sg *scat; + uint64_t remote_addr; + uint32_t pos; + uint32_t work_alloc; + uint32_t i, j, k, idx; + uint32_t left, count; + uint32_t posted; + int sent; + ibt_status_t status; + ibt_send_wr_t *wr; + ibt_wr_ds_t *sge; + + RDSV3_DPRINTF4("rdsv3_ib_xmit_rdma", "rdsv3_ib_conn: %p", ic); + + /* map the message the first time we see it */ + if (!op->r_mapped) { + op->r_count = rdsv3_ib_dma_map_sg_rdma(ic->i_cm_id->device, + op->r_rdma_sg, op->r_nents, &op->r_sg); + RDSV3_DPRINTF5("rdsv3_ib_xmit_rdma", "ic %p mapping op %p: %d", + ic, op, op->r_count); + if (op->r_count == 0) { + rdsv3_ib_stats_inc(s_ib_tx_sg_mapping_failure); + RDSV3_DPRINTF2("rdsv3_ib_xmit_rdma", + "fail: ic %p mapping op %p: %d", + ic, op, op->r_count); + return (-ENOMEM); /* XXX ? */ + } + op->r_mapped = 1; + } + + /* + * Instead of knowing how to return a partial rdma read/write + * we insist that there + * be enough work requests to send the entire message. + */ + work_alloc = rdsv3_ib_ring_alloc(&ic->i_send_ring, op->r_count, &pos); + if (work_alloc != op->r_count) { + rdsv3_ib_ring_unalloc(&ic->i_send_ring, work_alloc); + rdsv3_ib_stats_inc(s_ib_tx_ring_full); + return (-ENOMEM); + } + + /* + * take the scatter list and transpose into a list of + * send wr's each with a scatter list of RDSV3_IB_MAX_SGE + */ + scat = &op->r_rdma_sg[0]; + sent = 0; + remote_addr = op->r_remote_addr; + + for (i = 0, k = 0; i < op->r_nents; i++) { + left = scat[i].swr.wr_nds; + for (idx = 0; left > 0; k++) { + send = &ic->i_sends[pos]; + send->s_queued = jiffies; + send->s_opcode = op->r_write ? IBT_WRC_RDMAW : + IBT_WRC_RDMAR; + send->s_op = op; + + wr = &ic->i_send_wrs[k]; + wr->wr_flags = 0; + wr->wr_id = pos; + wr->wr_trans = IBT_RC_SRV; + wr->wr_opcode = op->r_write ? IBT_WRC_RDMAW : + IBT_WRC_RDMAR; + wr->wr.rc.rcwr.rdma.rdma_raddr = remote_addr; + wr->wr.rc.rcwr.rdma.rdma_rkey = op->r_key; + + if (left > RDSV3_IB_MAX_SGE) { + count = RDSV3_IB_MAX_SGE; + left -= RDSV3_IB_MAX_SGE; + } else { + count = left; + left = 0; + } + wr->wr_nds = count; + + for (j = 0; j < count; j++) { + sge = &wr->wr_sgl[j]; + *sge = scat[i].swr.wr_sgl[idx]; + remote_addr += scat[i].swr.wr_sgl[idx].ds_len; + sent += scat[i].swr.wr_sgl[idx].ds_len; + idx++; + RDSV3_DPRINTF4("xmit_rdma", + "send_wrs[%d]sgl[%d] va %llx len %x", + k, j, sge->ds_va, sge->ds_len); + } + RDSV3_DPRINTF4("rdsv3_ib_xmit_rdma", + "wr[%d] %p key: %x code: %d tlen: %d", + k, wr, wr->wr.rc.rcwr.rdma.rdma_rkey, + wr->wr_opcode, sent); + + /* + * We want to delay signaling completions just enough + * to get the batching benefits but not so much that + * we create dead time on the wire. 
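+ *
+ * (Editorial note, not part of the original change: unlike
+ * rdsv3_ib_xmit() above, this path assigns wr_flags outright
+ * rather than OR-ing the signal bit in, and it does not request
+ * IBT_WR_SEND_SOLICIT for RDMA work requests.)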
+ */ + if (ic->i_unsignaled_wrs-- == 0) { + ic->i_unsignaled_wrs = + rdsv3_ib_sysctl_max_unsig_wrs; + wr->wr_flags = IBT_WR_SEND_SIGNAL; + } + + pos = (pos + 1) % ic->i_send_ring.w_nr; + } + } + + status = ibt_post_send(ib_get_ibt_channel_hdl(ic->i_cm_id), + ic->i_send_wrs, k, &posted); + if (status != IBT_SUCCESS) { + RDSV3_DPRINTF0("rdsv3_ib_xmit_rdma", + "RDS/IB: rdma ib_post_send returned %d", status); + rdsv3_ib_ring_unalloc(&ic->i_send_ring, work_alloc); + } + return (status); +} + +void +rdsv3_ib_xmit_complete(struct rdsv3_connection *conn) +{ + struct rdsv3_ib_connection *ic = conn->c_transport_data; + + RDSV3_DPRINTF4("rdsv3_ib_xmit_complete", "conn: %p", conn); + + /* + * We may have a pending ACK or window update we were unable + * to send previously (due to flow control). Try again. + */ + rdsv3_ib_attempt_ack(ic); +} diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/ib_stats.c b/usr/src/uts/common/io/ib/clients/rdsv3/ib_stats.c new file mode 100644 index 0000000000..2abdc26d49 --- /dev/null +++ b/usr/src/uts/common/io/ib/clients/rdsv3/ib_stats.c @@ -0,0 +1,125 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ +#include <sys/rds.h> + +#include <sys/ib/clients/rdsv3/rdsv3.h> +#include <sys/ib/clients/rdsv3/ib.h> +#include <sys/ib/clients/rdsv3/rdsv3_debug.h> + +RDSV3_DEFINE_PER_CPU(struct rdsv3_ib_statistics, rdsv3_ib_stats); + +static char *rdsv3_ib_stat_names[] = { + "ib_connect_raced", + "ib_listen_closed_stale", + "ib_tx_cq_call", + "ib_tx_cq_event", + "ib_tx_ring_full", + "ib_tx_throttle", + "ib_tx_sg_mapping_failure", + "ib_tx_stalled", + "ib_tx_credit_updates", + "ib_rx_cq_call", + "ib_rx_cq_event", + "ib_rx_ring_empty", + "ib_rx_refill_from_cq", + "ib_rx_refill_from_thread", + "ib_rx_alloc_limit", + "ib_rx_credit_updates", + "ib_ack_sent", + "ib_ack_send_failure", + "ib_ack_send_delayed", + "ib_ack_send_piggybacked", + "ib_ack_received", + "ib_rdma_mr_alloc", + "ib_rdma_mr_free", + "ib_rdma_mr_used", + "ib_rdma_mr_pool_flush", + "ib_rdma_mr_pool_wait", + "ib_rdma_mr_pool_depleted", +}; + +unsigned int +rdsv3_ib_stats_info_copy(struct rdsv3_info_iterator *iter, + unsigned int avail) +{ + struct rdsv3_ib_statistics stats = {0, }; + uint64_t *src; + uint64_t *sum; + size_t i; + int cpu; + + RDSV3_DPRINTF4("rdsv3_ib_stats_info_copy", "iter: %p, avail: %d", + iter, avail); + + if (avail < ARRAY_SIZE(rdsv3_ib_stat_names)) + goto out; + + for (cpu = 0; cpu < NR_CPUS; cpu++) { + src = (uint64_t *)&(rdsv3_per_cpu(rdsv3_ib_stats, cpu)); + sum = (uint64_t *)&stats; + for (i = 0; i < sizeof (stats) / sizeof (uint64_t); i++) + *(sum++) += *(src++); + } + + rdsv3_stats_info_copy(iter, (uint64_t *)&stats, rdsv3_ib_stat_names, + ARRAY_SIZE(rdsv3_ib_stat_names)); + + RDSV3_DPRINTF4("rdsv3_ib_stats_info_copy", + "Return: iter: %p, avail: %d", iter, avail); +out: + return (ARRAY_SIZE(rdsv3_ib_stat_names)); +} diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/ib_sysctl.c b/usr/src/uts/common/io/ib/clients/rdsv3/ib_sysctl.c new file mode 100644 index 0000000000..27bceddb48 --- /dev/null +++ b/usr/src/uts/common/io/ib/clients/rdsv3/ib_sysctl.c @@ -0,0 +1,90 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <sys/ib/clients/rdsv3/ib.h> + +unsigned long rdsv3_ib_sysctl_max_send_wr = RDSV3_IB_DEFAULT_SEND_WR; +unsigned long rdsv3_ib_sysctl_max_recv_wr = RDSV3_IB_DEFAULT_RECV_WR; +unsigned long rdsv3_ib_sysctl_max_recv_allocation = + (128 * 1024 * 1024) / RDSV3_FRAG_SIZE; +/* hardware will fail CQ creation long before this */ + +unsigned long rdsv3_ib_sysctl_max_unsig_wrs = 16; + +unsigned long rdsv3_ib_sysctl_max_unsig_bytes = (16 << 20); + +/* + * This sysctl does nothing. + * + * Backwards compatibility with RDS 3.0 wire protocol + * disables initial FC credit exchange. + * If it's ever possible to drop 3.0 support, + * setting this to 1 and moving init/refill of send/recv + * rings from ib_cm_connect_complete() back into ib_setup_qp() + * will cause credits to be added before protocol negotiation. + */ +unsigned int rdsv3_ib_sysctl_flow_control = 0; + +void +rdsv3_ib_sysctl_exit(void) +{ +} + +int +rdsv3_ib_sysctl_init(void) +{ + return (0); +} diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/info.c b/usr/src/uts/common/io/ib/clients/rdsv3/info.c new file mode 100644 index 0000000000..88dc4ecb6d --- /dev/null +++ b/usr/src/uts/common/io/ib/clients/rdsv3/info.c @@ -0,0 +1,155 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <sys/rds.h> + +#include <sys/ib/clients/rdsv3/rdsv3.h> + +/* + * This file implements a getsockopt() call which copies a set of fixed + * sized structs into a user-specified buffer as a means of providing + * read-only information about RDS. + * + * For a given information source there are a given number of fixed sized + * structs at a given time. The structs are only copied if the user-specified + * buffer is big enough. The destination pages that make up the buffer + * are pinned for the duration of the copy. + * + * This gives us the following benefits: + * + * - simple implementation, no copy "position" across multiple calls + * - consistent snapshot of an info source + * - atomic copy works well with whatever locking info source has + * - one portable tool to get rds info across implementations + * - long-lived tool can get info without allocating + * + * at the following costs: + * + * - info source copy must be pinned, may be "large" + */ + +static kmutex_t rdsv3_info_lock; +static rdsv3_info_func rdsv3_info_funcs[RDSV3_INFO_LAST - RDSV3_INFO_FIRST + 1]; + +void +rdsv3_info_register_func(int optname, rdsv3_info_func func) +{ + int offset = optname - RDSV3_INFO_FIRST; + + ASSERT(optname >= RDSV3_INFO_FIRST && optname <= RDSV3_INFO_LAST); + + mutex_enter(&rdsv3_info_lock); + rdsv3_info_funcs[offset] = func; + mutex_exit(&rdsv3_info_lock); +} + +/* ARGSUSED */ +void +rdsv3_info_deregister_func(int optname, rdsv3_info_func func) +{ + int offset = optname - RDSV3_INFO_FIRST; + + ASSERT(optname >= RDSV3_INFO_FIRST && optname <= RDSV3_INFO_LAST); + + mutex_enter(&rdsv3_info_lock); + rdsv3_info_funcs[offset] = NULL; + mutex_exit(&rdsv3_info_lock); +} + +/* + * @optval points to the userspace buffer that the information snapshot + * will be copied into. + * + * @optlen on input is the size of the buffer in userspace. @optlen + * on output is the size of the requested snapshot in bytes. + * + * This function returns -errno if there is a failure, particularly -ENOSPC + * if the given userspace buffer was not large enough to fit the snapshot. + * On success it returns the positive number of bytes of each array element + * in the snapshot. 
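+ *
+ * (Editorial note, not part of the original change: in this port,
+ * rdsv3_info_getsockopt() below returns 0 or a negative errno; a
+ * caller whose *optlen equals sizeof (struct rdsv3_info_lengths)
+ * receives the lengths structure instead of the snapshot, which is
+ * how the two-step sizing handshake is performed here.)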
+ */ +int +rdsv3_info_getsockopt(struct rsock *sock, int optname, char *optval, + socklen_t *optlen) +{ + struct rdsv3_info_iterator iter; + struct rdsv3_info_lengths lens; + rdsv3_info_func func; + + func = rdsv3_info_funcs[optname - RDSV3_INFO_FIRST]; + if (func == NULL) { + return (-ENOPROTOOPT); + } + + if (*optlen == sizeof (struct rdsv3_info_lengths)) { + iter.addr = NULL; + } else { + iter.addr = optval; + } + + iter.offset = 0; + + func(sock, *optlen, &iter, &lens); + ASSERT(lens.each != 0); + + if (iter.addr == NULL) { + bcopy(&lens, optval, sizeof (struct rdsv3_info_lengths)); + } else { + *optlen = lens.nr * lens.each; + } + + return (0); +} diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/loop.c b/usr/src/uts/common/io/ib/clients/rdsv3/loop.c new file mode 100644 index 0000000000..8ae25caae0 --- /dev/null +++ b/usr/src/uts/common/io/ib/clients/rdsv3/loop.c @@ -0,0 +1,242 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <sys/rds.h> + +#include <sys/ib/clients/rdsv3/rdsv3.h> +#include <sys/ib/clients/rdsv3/loop.h> +#include <sys/ib/clients/rdsv3/rdsv3_debug.h> + +kmutex_t loop_conns_lock; +list_t loop_conns; + +/* + * This 'loopback' transport is a special case for flows that originate + * and terminate on the same machine. 
+ * + * Connection build-up notices if the destination address is thought of + * as a local address by a transport. At that time it decides to use the + * loopback transport instead of the bound transport of the sending socket. + * + * The loopback transport's sending path just hands the sent rds_message + * straight to the receiving path via an embedded rds_incoming. + */ + +/* + * Usually a message transits both the sender and receiver's conns as it + * flows to the receiver. In the loopback case, though, the receive path + * is handed the sending conn so the sense of the addresses is reversed. + */ +static int +rdsv3_loop_xmit(struct rdsv3_connection *conn, struct rdsv3_message *rm, + unsigned int hdr_off, unsigned int sg, + unsigned int off) +{ + ASSERT(!(hdr_off || sg || off)); + + RDSV3_DPRINTF4("rdsv3_loop_xmit", "Enter(conn: %p, rm: %p)", conn, rm); + + rdsv3_inc_init(&rm->m_inc, conn, conn->c_laddr); + rdsv3_message_addref(rm); /* for the inc */ + + rdsv3_recv_incoming(conn, conn->c_laddr, conn->c_faddr, &rm->m_inc, + KM_NOSLEEP); + + rdsv3_send_drop_acked(conn, ntohll(rm->m_inc.i_hdr.h_sequence), + NULL); + + rdsv3_inc_put(&rm->m_inc); + + RDSV3_DPRINTF4("rdsv3_loop_xmit", "Return(conn: %p, rm: %p)", conn, rm); + + return (sizeof (struct rdsv3_header) + + ntohl(rm->m_inc.i_hdr.h_len)); +} + +static int +rdsv3_loop_xmit_cong_map(struct rdsv3_connection *conn, + struct rdsv3_cong_map *map, + unsigned long offset) +{ + RDSV3_DPRINTF4("rdsv3_loop_xmit_cong_map", "Enter(conn: %p)", conn); + + ASSERT(!offset); + ASSERT(map == conn->c_lcong); + + rdsv3_cong_map_updated(conn->c_fcong, ~(uint64_t)0); + + RDSV3_DPRINTF4("rdsv3_loop_xmit_cong_map", "Return(conn: %p)", conn); + + return (sizeof (struct rdsv3_header) + RDSV3_CONG_MAP_BYTES); +} + +/* we need to at least give the thread something to succeed */ +/* ARGSUSED */ +static int +rdsv3_loop_recv(struct rdsv3_connection *conn) +{ + return (0); +} + +struct rdsv3_loop_connection +{ + struct list_node loop_node; + struct rdsv3_connection *conn; +}; + +/* + * Even the loopback transport needs to keep track of its connections, + * so it can call rdsv3_conn_destroy() on them on exit. N.B. there are + * 1+ loopback addresses (127.*.*.*) so it's not a bug to have + * multiple loopback conns allocated, although rather useless. 
+ */ +/* ARGSUSED */ +static int +rdsv3_loop_conn_alloc(struct rdsv3_connection *conn, int gfp) +{ + struct rdsv3_loop_connection *lc; + + RDSV3_DPRINTF4("rdsv3_loop_conn_alloc", "Enter(conn: %p)", conn); + + lc = kmem_zalloc(sizeof (struct rdsv3_loop_connection), KM_NOSLEEP); + if (lc == NULL) + return (-ENOMEM); + + list_link_init(&lc->loop_node); + lc->conn = conn; + conn->c_transport_data = lc; + + mutex_enter(&loop_conns_lock); + list_insert_tail(&loop_conns, lc); + mutex_exit(&loop_conns_lock); + + RDSV3_DPRINTF4("rdsv3_loop_conn_alloc", "Return(conn: %p)", conn); + + return (0); +} + +static void +rdsv3_loop_conn_free(void *arg) +{ + struct rdsv3_loop_connection *lc = arg; + RDSV3_DPRINTF5("rdsv3_loop_conn_free", "lc %p\n", lc); + list_remove_node(&lc->loop_node); + kmem_free(lc, sizeof (struct rdsv3_loop_connection)); +} + +static int +rdsv3_loop_conn_connect(struct rdsv3_connection *conn) +{ + rdsv3_connect_complete(conn); + return (0); +} + +/* ARGSUSED */ +static void +rdsv3_loop_conn_shutdown(struct rdsv3_connection *conn) +{ +} + +void +rdsv3_loop_exit(void) +{ + struct rdsv3_loop_connection *lc, *_lc; + list_t tmp_list; + + RDSV3_DPRINTF4("rdsv3_loop_exit", "Enter"); + + list_create(&tmp_list, sizeof (struct rdsv3_loop_connection), + offsetof(struct rdsv3_loop_connection, loop_node)); + + /* avoid calling conn_destroy with irqs off */ + mutex_enter(&loop_conns_lock); + list_splice(&loop_conns, &tmp_list); + mutex_exit(&loop_conns_lock); + + RDSV3_FOR_EACH_LIST_NODE_SAFE(lc, _lc, &tmp_list, loop_node) { + ASSERT(!lc->conn->c_passive); + rdsv3_conn_destroy(lc->conn); + } + + list_destroy(&loop_conns); + mutex_destroy(&loop_conns_lock); + + RDSV3_DPRINTF4("rdsv3_loop_exit", "Return"); +} + +/* + * This is missing .xmit_* because loop doesn't go through generic + * rdsv3_send_xmit() and doesn't call rdsv3_recv_incoming(). .listen_stop and + * .laddr_check are missing because transport.c doesn't iterate over + * rdsv3_loop_transport. + */ +#ifndef __lock_lint +struct rdsv3_transport rdsv3_loop_transport = { + .xmit = rdsv3_loop_xmit, + .xmit_cong_map = rdsv3_loop_xmit_cong_map, + .recv = rdsv3_loop_recv, + .conn_alloc = rdsv3_loop_conn_alloc, + .conn_free = rdsv3_loop_conn_free, + .conn_connect = rdsv3_loop_conn_connect, + .conn_shutdown = rdsv3_loop_conn_shutdown, + .inc_copy_to_user = rdsv3_message_inc_copy_to_user, + .inc_purge = rdsv3_message_inc_purge, + .inc_free = rdsv3_message_inc_free, + .t_name = "loopback", +}; +#else +struct rdsv3_transport rdsv3_loop_transport; +#endif diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/message.c b/usr/src/uts/common/io/ib/clients/rdsv3/message.c new file mode 100644 index 0000000000..470219c2da --- /dev/null +++ b/usr/src/uts/common/io/ib/clients/rdsv3/message.c @@ -0,0 +1,473 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <sys/rds.h> + +#include <sys/ib/clients/rdsv3/rdsv3.h> +#include <sys/ib/clients/rdsv3/rdma.h> +#include <sys/ib/clients/rdsv3/rdsv3_debug.h> + +static rdsv3_wait_queue_t rdsv3_message_flush_waitq; + +#ifndef __lock_lint +static unsigned int rdsv3_exthdr_size[__RDSV3_EXTHDR_MAX] = { +[RDSV3_EXTHDR_NONE] = 0, +[RDSV3_EXTHDR_VERSION] = sizeof (struct rdsv3_ext_header_version), +[RDSV3_EXTHDR_RDMA] = sizeof (struct rdsv3_ext_header_rdma), +[RDSV3_EXTHDR_RDMA_DEST] = sizeof (struct rdsv3_ext_header_rdma_dest), +}; +#else +static unsigned int rdsv3_exthdr_size[__RDSV3_EXTHDR_MAX] = { + 0, + sizeof (struct rdsv3_ext_header_version), + sizeof (struct rdsv3_ext_header_rdma), + sizeof (struct rdsv3_ext_header_rdma_dest), +}; +#endif + +void +rdsv3_message_addref(struct rdsv3_message *rm) +{ + RDSV3_DPRINTF5("rdsv3_message_addref", "addref rm %p ref %d", + rm, atomic_get(&rm->m_refcount)); + atomic_add_32(&rm->m_refcount, 1); +} + +/* + * This relies on dma_map_sg() not touching sg[].page during merging. 
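+ *
+ * (Editorial note, not part of the original change: in this port
+ * the scatterlist entries point at plain kmem allocations made by
+ * rdsv3_page_remainder_alloc(), so the purge path below releases
+ * them with kmem_free() rather than dropping page references.)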
+ */ +static void +rdsv3_message_purge(struct rdsv3_message *rm) +{ + unsigned long i; + + RDSV3_DPRINTF4("rdsv3_message_purge", "Enter(rm: %p)", rm); + + if (test_bit(RDSV3_MSG_PAGEVEC, &rm->m_flags)) + return; + + for (i = 0; i < rm->m_nents; i++) { + RDSV3_DPRINTF5("rdsv3_message_purge", "putting data page %p\n", + (void *)rdsv3_sg_page(&rm->m_sg[i])); + /* XXX will have to put_page for page refs */ + kmem_free(rdsv3_sg_page(&rm->m_sg[i]), + rdsv3_sg_len(&rm->m_sg[i])); + } + + if (rm->m_rdma_op) + rdsv3_rdma_free_op(rm->m_rdma_op); + if (rm->m_rdma_mr) { + struct rdsv3_mr *mr = rm->m_rdma_mr; + if (mr->r_refcount == 0) { + RDSV3_DPRINTF4("rdsv3_message_purge ASSERT 0", + "rm %p mr %p", rm, mr); + return; + } + if (mr->r_refcount == 0xdeadbeef) { + RDSV3_DPRINTF4("rdsv3_message_purge ASSERT deadbeef", + "rm %p mr %p", rm, mr); + return; + } + if (atomic_dec_and_test(&mr->r_refcount)) { + rm->m_rdma_mr = NULL; + __rdsv3_put_mr_final(mr); + } + } + + RDSV3_DPRINTF4("rdsv3_message_purge", "Return(rm: %p)", rm); + +} + +void +rdsv3_message_inc_purge(struct rdsv3_incoming *inc) +{ + struct rdsv3_message *rm = + container_of(inc, struct rdsv3_message, m_inc); + rdsv3_message_purge(rm); +} + +void +rdsv3_message_put(struct rdsv3_message *rm) +{ + RDSV3_DPRINTF5("rdsv3_message_put", + "put rm %p ref %d\n", rm, atomic_get(&rm->m_refcount)); + + if (atomic_dec_and_test(&rm->m_refcount)) { + ASSERT(!list_link_active(&rm->m_sock_item)); + ASSERT(!list_link_active(&rm->m_conn_item)); + rdsv3_message_purge(rm); + + kmem_free(rm, sizeof (struct rdsv3_message) + + (rm->m_nents * sizeof (struct rdsv3_scatterlist))); + } +} + +void +rdsv3_message_inc_free(struct rdsv3_incoming *inc) +{ + struct rdsv3_message *rm = + container_of(inc, struct rdsv3_message, m_inc); + rdsv3_message_put(rm); +} + +void +rdsv3_message_populate_header(struct rdsv3_header *hdr, uint16_be_t sport, + uint16_be_t dport, uint64_t seq) +{ + hdr->h_flags = 0; + hdr->h_sport = sport; + hdr->h_dport = dport; + hdr->h_sequence = htonll(seq); + hdr->h_exthdr[0] = RDSV3_EXTHDR_NONE; +} + +int +rdsv3_message_add_extension(struct rdsv3_header *hdr, + unsigned int type, const void *data, unsigned int len) +{ + unsigned int ext_len = sizeof (uint8_t) + len; + unsigned char *dst; + + RDSV3_DPRINTF4("rdsv3_message_add_extension", "Enter"); + + /* For now, refuse to add more than one extension header */ + if (hdr->h_exthdr[0] != RDSV3_EXTHDR_NONE) + return (0); + + if (type >= __RDSV3_EXTHDR_MAX || + len != rdsv3_exthdr_size[type]) + return (0); + + if (ext_len >= RDSV3_HEADER_EXT_SPACE) + return (0); + dst = hdr->h_exthdr; + + *dst++ = type; + (void) memcpy(dst, data, len); + + dst[len] = RDSV3_EXTHDR_NONE; + + RDSV3_DPRINTF4("rdsv3_message_add_extension", "Return"); + return (1); +} + +/* + * If a message has extension headers, retrieve them here. + * Call like this: + * + * unsigned int pos = 0; + * + * while (1) { + * buflen = sizeof(buffer); + * type = rdsv3_message_next_extension(hdr, &pos, buffer, &buflen); + * if (type == RDSV3_EXTHDR_NONE) + * break; + * ... + * } + */ +int +rdsv3_message_next_extension(struct rdsv3_header *hdr, + unsigned int *pos, void *buf, unsigned int *buflen) +{ + unsigned int offset, ext_type, ext_len; + uint8_t *src = hdr->h_exthdr; + + RDSV3_DPRINTF4("rdsv3_message_next_extension", "Enter"); + + offset = *pos; + if (offset >= RDSV3_HEADER_EXT_SPACE) + goto none; + + /* + * Get the extension type and length. For now, the + * length is implied by the extension type. 
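+ *
+ * (Editorial sketch, not part of the original change: h_exthdr[]
+ * is a byte stream of { type byte, rdsv3_exthdr_size[type] payload
+ * bytes } records terminated by an RDSV3_EXTHDR_NONE type byte,
+ * which is the format rdsv3_message_add_extension() above writes.)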
+ */ + ext_type = src[offset++]; + + if (ext_type == RDSV3_EXTHDR_NONE || ext_type >= __RDSV3_EXTHDR_MAX) + goto none; + ext_len = rdsv3_exthdr_size[ext_type]; + if (offset + ext_len > RDSV3_HEADER_EXT_SPACE) + goto none; + + *pos = offset + ext_len; + if (ext_len < *buflen) + *buflen = ext_len; + (void) memcpy(buf, src + offset, *buflen); + return (ext_type); + +none: + *pos = RDSV3_HEADER_EXT_SPACE; + *buflen = 0; + return (RDSV3_EXTHDR_NONE); +} + +int +rdsv3_message_add_version_extension(struct rdsv3_header *hdr, + unsigned int version) +{ + struct rdsv3_ext_header_version ext_hdr; + + ext_hdr.h_version = htonl(version); + return (rdsv3_message_add_extension(hdr, RDSV3_EXTHDR_VERSION, + &ext_hdr, sizeof (ext_hdr))); +} + +int +rdsv3_message_get_version_extension(struct rdsv3_header *hdr, + unsigned int *version) +{ + struct rdsv3_ext_header_version ext_hdr; + unsigned int pos = 0, len = sizeof (ext_hdr); + + RDSV3_DPRINTF4("rdsv3_message_get_version_extension", "Enter"); + + /* + * We assume the version extension is the only one present + */ + if (rdsv3_message_next_extension(hdr, &pos, &ext_hdr, &len) != + RDSV3_EXTHDR_VERSION) + return (0); + *version = ntohl(ext_hdr.h_version); + return (1); +} + +int +rdsv3_message_add_rdma_dest_extension(struct rdsv3_header *hdr, uint32_t r_key, + uint32_t offset) +{ + struct rdsv3_ext_header_rdma_dest ext_hdr; + + ext_hdr.h_rdma_rkey = htonl(r_key); + ext_hdr.h_rdma_offset = htonl(offset); + return (rdsv3_message_add_extension(hdr, RDSV3_EXTHDR_RDMA_DEST, + &ext_hdr, sizeof (ext_hdr))); +} + +struct rdsv3_message * +rdsv3_message_alloc(unsigned int nents, int gfp) +{ + struct rdsv3_message *rm; + + RDSV3_DPRINTF4("rdsv3_message_alloc", "Enter(nents: %d)", nents); + + rm = kmem_zalloc(sizeof (struct rdsv3_message) + + (nents * sizeof (struct rdsv3_scatterlist)), gfp); + if (!rm) + goto out; + + rm->m_refcount = 1; + list_link_init(&rm->m_sock_item); + list_link_init(&rm->m_conn_item); + mutex_init(&rm->m_rs_lock, NULL, MUTEX_DRIVER, NULL); + + RDSV3_DPRINTF4("rdsv3_message_alloc", "Return(rm: %p)", rm); +out: + return (rm); +} + +struct rdsv3_message * +rdsv3_message_map_pages(unsigned long *page_addrs, unsigned int total_len) +{ + struct rdsv3_message *rm; + unsigned int i; + + RDSV3_DPRINTF4("rdsv3_message_map_pages", "Enter(len: %d)", total_len); + +#ifndef __lock_lint + rm = rdsv3_message_alloc(ceil(total_len, PAGE_SIZE), KM_NOSLEEP); +#else + rm = NULL; +#endif + if (rm == NULL) + return (ERR_PTR(-ENOMEM)); + + set_bit(RDSV3_MSG_PAGEVEC, &rm->m_flags); + rm->m_inc.i_hdr.h_len = htonl(total_len); +#ifndef __lock_lint + rm->m_nents = ceil(total_len, PAGE_SIZE); +#else + rm->m_nents = 0; +#endif + + for (i = 0; i < rm->m_nents; ++i) { + rdsv3_sg_set_page(&rm->m_sg[i], + page_addrs[i], + PAGE_SIZE, 0); + } + + return (rm); +} + +struct rdsv3_message * +rdsv3_message_copy_from_user(struct uio *uiop, + size_t total_len) +{ + struct rdsv3_message *rm; + struct rdsv3_scatterlist *sg; + int ret; + + RDSV3_DPRINTF4("rdsv3_message_copy_from_user", "Enter: %d", total_len); + +#ifndef __lock_lint + rm = rdsv3_message_alloc(ceil(total_len, PAGE_SIZE), KM_NOSLEEP); +#else + rm = NULL; +#endif + if (rm == NULL) { + ret = -ENOMEM; + goto out; + } + + rm->m_inc.i_hdr.h_len = htonl(total_len); + + /* + * now allocate and copy in the data payload. 
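+ *
+ * (Editorial note, not part of the original change: pages are
+ * attached lazily -- each scatterlist entry is allocated by
+ * rdsv3_page_remainder_alloc() and then filled with uiomove();
+ * on any failure the whole message is dropped via
+ * rdsv3_message_put() in the out: path.)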
+ */ + sg = rm->m_sg; + + while (total_len) { + if (rdsv3_sg_page(sg) == NULL) { + ret = rdsv3_page_remainder_alloc(sg, total_len, 0); + if (ret) + goto out; + rm->m_nents++; + } + + ret = uiomove(rdsv3_sg_page(sg), rdsv3_sg_len(sg), UIO_WRITE, + uiop); + if (ret) + goto out; + + total_len -= rdsv3_sg_len(sg); + sg++; + } + + ret = 0; +out: + if (ret) { + if (rm) + rdsv3_message_put(rm); + rm = ERR_PTR(ret); + } + return (rm); +} + +int +rdsv3_message_inc_copy_to_user(struct rdsv3_incoming *inc, + uio_t *uiop, size_t size) +{ + struct rdsv3_message *rm; + struct rdsv3_scatterlist *sg; + unsigned long to_copy; + unsigned long vec_off; + int copied; + int ret; + uint32_t len; + + rm = container_of(inc, struct rdsv3_message, m_inc); + len = ntohl(rm->m_inc.i_hdr.h_len); + + RDSV3_DPRINTF4("rdsv3_message_inc_copy_to_user", + "Enter(rm: %p, len: %d)", rm, len); + + sg = rm->m_sg; + vec_off = 0; + copied = 0; + + while (copied < size && copied < len) { + + to_copy = min(len - copied, sg->length - vec_off); + to_copy = min(size - copied, to_copy); + + RDSV3_DPRINTF5("rdsv3_message_inc_copy_to_user", + "copying %lu bytes to user iov %p from sg [%p, %u] + %lu\n", + to_copy, uiop, + rdsv3_sg_page(sg), sg->length, vec_off); + + ret = uiomove(rdsv3_sg_page(sg), to_copy, UIO_READ, uiop); + if (ret) + break; + + vec_off += to_copy; + copied += to_copy; + + if (vec_off == sg->length) { + vec_off = 0; + sg++; + } + } + + return (copied); +} + +/* + * If the message is still on the send queue, wait until the transport + * is done with it. This is particularly important for RDMA operations. + */ +void +rdsv3_message_wait(struct rdsv3_message *rm) +{ + rdsv3_wait_event(rdsv3_message_flush_waitq, + !test_bit(RDSV3_MSG_MAPPED, &rm->m_flags)); +} + +void +rdsv3_message_unmapped(struct rdsv3_message *rm) +{ + clear_bit(RDSV3_MSG_MAPPED, &rm->m_flags); + rdsv3_wake_up_all(&rdsv3_message_flush_waitq); +} diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/page.c b/usr/src/uts/common/io/ib/clients/rdsv3/page.c new file mode 100644 index 0000000000..356917c711 --- /dev/null +++ b/usr/src/uts/common/io/ib/clients/rdsv3/page.c @@ -0,0 +1,102 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <sys/rds.h>
+
+#include <sys/ib/clients/rdsv3/rdsv3.h>
+#include <sys/ib/clients/rdsv3/rdsv3_debug.h>
+
+/*
+ * @bytes - the number of bytes needed.
+ *
+ * XXX - This is different from Linux.
+ */
+int
+rdsv3_page_remainder_alloc(struct rdsv3_scatterlist *scat, unsigned long bytes,
+ int gfp)
+{
+ caddr_t page;
+ int ret;
+
+ ASSERT(rdsv3_sg_page(scat) == NULL);
+
+ if (bytes >= PAGE_SIZE) {
+ page = kmem_alloc(PAGE_SIZE, gfp);
+ if (page == NULL) {
+ ret = -ENOMEM;
+ } else {
+ rdsv3_sg_set_page(scat, page, PAGE_SIZE, 0);
+ ret = 0;
+ }
+ goto out;
+ }
+
+ /*
+ * XXX - This is not the same as Linux.
+ */
+ page = kmem_alloc(bytes, KM_NOSLEEP);
+ if (page == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ rdsv3_sg_set_page(scat, page, bytes, 0);
+ ret = 0;
+out:
+ RDSV3_DPRINTF5("rdsv3_page_remainder_alloc", "bytes %lu %p %u",
+ bytes, rdsv3_sg_page(scat), scat->length);
+ return (ret);
+}
diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/rdma.c b/usr/src/uts/common/io/ib/clients/rdsv3/rdma.c
new file mode 100644
index 0000000000..8f795120cd
--- /dev/null
+++ b/usr/src/uts/common/io/ib/clients/rdsv3/rdma.c
@@ -0,0 +1,672 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+/*
+ * Copyright (c) 2007 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <sys/ib/clients/of/rdma/ib_verbs.h> +#include <sys/ib/clients/of/rdma/ib_addr.h> +#include <sys/ib/clients/of/rdma/rdma_cm.h> + +#include <sys/ib/clients/rdsv3/ib.h> +#include <sys/ib/clients/rdsv3/rdma.h> +#include <sys/ib/clients/rdsv3/rdsv3_debug.h> + +#define DMA_TO_DEVICE 0 +#define DMA_FROM_DEVICE 1 +#define RB_CLEAR_NODE(nodep) AVL_SETPARENT(nodep, nodep); + +/* + * XXX + * - build with sparse + * - should we limit the size of a mr region? let transport return failure? + * - should we detect duplicate keys on a socket? hmm. + * - an rdma is an mlock, apply rlimit? + */ + +/* + * get the number of pages by looking at the page indices that the start and + * end addresses fall in. + * + * Returns 0 if the vec is invalid. It is invalid if the number of bytes + * causes the address to wrap or overflows an unsigned int. This comes + * from being stored in the 'length' member of 'struct rdsv3_scatterlist'. + */ +static unsigned int +rdsv3_pages_in_vec(struct rdsv3_iovec *vec) +{ + if ((vec->addr + vec->bytes <= vec->addr) || + (vec->bytes > (uint64_t)UINT_MAX)) { + return (0); + } + + return (((vec->addr + vec->bytes + PAGESIZE - 1) >> + PAGESHIFT) - (vec->addr >> PAGESHIFT)); +} + +static struct rdsv3_mr * +rdsv3_mr_tree_walk(struct avl_tree *root, uint32_t key, + struct rdsv3_mr *insert) +{ + struct rdsv3_mr *mr; + avl_index_t where; + + mr = avl_find(root, &key, &where); + if ((mr == NULL) && (insert != NULL)) { + avl_insert(root, (void *)insert, where); + atomic_add_32(&insert->r_refcount, 1); + return (NULL); + } + + return (mr); +} + +/* + * Destroy the transport-specific part of a MR. 
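+ * The MR is first marked RDSV3_MR_DEAD, then unlinked from the
+ * socket's AVL tree under rs_rdma_lock; the transport-private state
+ * is handed to free_mr() only after the lock has been dropped.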
+ */ +static void +rdsv3_destroy_mr(struct rdsv3_mr *mr) +{ + struct rdsv3_sock *rs = mr->r_sock; + void *trans_private = NULL; + avl_node_t *np; + + RDSV3_DPRINTF5("rdsv3_destroy_mr", + "RDS: destroy mr key is %x refcnt %u", + mr->r_key, atomic_get(&mr->r_refcount)); + + if (test_and_set_bit(RDSV3_MR_DEAD, &mr->r_state)) + return; + + mutex_enter(&rs->rs_rdma_lock); + np = &mr->r_rb_node; + if (AVL_XPARENT(np) != np) + avl_remove(&rs->rs_rdma_keys, mr); + trans_private = mr->r_trans_private; + mr->r_trans_private = NULL; + mutex_exit(&rs->rs_rdma_lock); + + if (trans_private) + mr->r_trans->free_mr(trans_private, mr->r_invalidate); +} + +void +__rdsv3_put_mr_final(struct rdsv3_mr *mr) +{ + rdsv3_destroy_mr(mr); + kmem_free(mr, sizeof (*mr)); +} + +/* + * By the time this is called we can't have any more ioctls called on + * the socket so we don't need to worry about racing with others. + */ +void +rdsv3_rdma_drop_keys(struct rdsv3_sock *rs) +{ + struct rdsv3_mr *mr; + struct avl_node *node; + + /* Release any MRs associated with this socket */ + mutex_enter(&rs->rs_rdma_lock); + while ((node = avl_first(&rs->rs_rdma_keys))) { + mr = container_of(node, struct rdsv3_mr, r_rb_node); + if (mr->r_trans == rs->rs_transport) + mr->r_invalidate = 0; + avl_remove(&rs->rs_rdma_keys, &mr->r_rb_node); + RB_CLEAR_NODE(&mr->r_rb_node) + mutex_exit(&rs->rs_rdma_lock); + rdsv3_destroy_mr(mr); + rdsv3_mr_put(mr); + mutex_enter(&rs->rs_rdma_lock); + } + mutex_exit(&rs->rs_rdma_lock); + + if (rs->rs_transport && rs->rs_transport->flush_mrs) + rs->rs_transport->flush_mrs(); +} + +/* + * Helper function to pin user pages. + */ +#if 0 +static int +rds_pin_pages(unsigned long user_addr, unsigned int nr_pages, + struct page **pages, int write) +{ + unsigned long l_user_addr = user_addr; + unsigned int l_nr_pages = nr_pages; + struct page **l_pages = pages; + int l_write = write; + + /* memory pin in rds_ib_get_mr() */ + return (0); +} +#endif + +static int +__rdsv3_rdma_map(struct rdsv3_sock *rs, struct rdsv3_get_mr_args *args, + uint64_t *cookie_ret, struct rdsv3_mr **mr_ret) +{ + struct rdsv3_mr *mr = NULL, *found; + void *trans_private; + rdsv3_rdma_cookie_t cookie; + unsigned int nents = 0; + int ret; + + if (rs->rs_bound_addr == 0) { + ret = -ENOTCONN; /* XXX not a great errno */ + goto out; + } + + if (rs->rs_transport->get_mr == NULL) { + ret = -EOPNOTSUPP; + goto out; + } + + mr = kmem_zalloc(sizeof (struct rdsv3_mr), KM_NOSLEEP); + if (mr == NULL) { + ret = -ENOMEM; + goto out; + } + + mr->r_refcount = 1; + RB_CLEAR_NODE(&mr->r_rb_node); + mr->r_trans = rs->rs_transport; + mr->r_sock = rs; + + if (args->flags & RDSV3_RDMA_USE_ONCE) + mr->r_use_once = 1; + if (args->flags & RDSV3_RDMA_INVALIDATE) + mr->r_invalidate = 1; + if (args->flags & RDSV3_RDMA_READWRITE) + mr->r_write = 1; + + /* + * Obtain a transport specific MR. If this succeeds, the + * s/g list is now owned by the MR. + * Note that dma_map() implies that pending writes are + * flushed to RAM, so no dma_sync is needed here. + */ + trans_private = rs->rs_transport->get_mr(&args->vec, nents, rs, + &mr->r_key); + + if (IS_ERR(trans_private)) { + ret = PTR_ERR(trans_private); + goto out; + } + + mr->r_trans_private = trans_private; + + /* + * The user may pass us an unaligned address, but we can only + * map page aligned regions. So we keep the offset, and build + * a 64bit cookie containing <R_Key, offset> and pass that + * around. 
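+	 * A sketch of the layout assumed for the cookie built by
+	 * rdsv3_rdma_make_cookie() and taken apart again by
+	 * rdsv3_rdma_cookie_key()/rdsv3_rdma_cookie_offset()
+	 * (R_Key in the low half, in-page offset in the high half):
+	 *
+	 *	63            32 31             0
+	 *	+---------------+---------------+
+	 *	|    offset     |     R_Key     |
+	 *	+---------------+---------------+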
+ */ + cookie = rdsv3_rdma_make_cookie(mr->r_key, args->vec.addr & ~PAGEMASK); + if (cookie_ret) + *cookie_ret = cookie; + + /* + * copy value of cookie to user address at args->cookie_addr + */ + if (args->cookie_addr) { + ret = ddi_copyout((void *)&cookie, + (void *)((intptr_t)args->cookie_addr), + sizeof (rdsv3_rdma_cookie_t), 0); + if (ret != 0) { + ret = -EFAULT; + goto out; + } + } + + RDSV3_DPRINTF5("__rdsv3_rdma_map", + "RDS: get_mr mr 0x%p addr 0x%llx key 0x%x", + mr, args->vec.addr, mr->r_key); + /* + * Inserting the new MR into the rbtree bumps its + * reference count. + */ + mutex_enter(&rs->rs_rdma_lock); + found = rdsv3_mr_tree_walk(&rs->rs_rdma_keys, mr->r_key, mr); + mutex_exit(&rs->rs_rdma_lock); + + ASSERT(!(found && found != mr)); + + if (mr_ret) { + atomic_add_32(&mr->r_refcount, 1); + *mr_ret = mr; + } + + ret = 0; +out: + if (mr) + rdsv3_mr_put(mr); + return (ret); +} + +int +rdsv3_get_mr(struct rdsv3_sock *rs, const void *optval, int optlen) +{ + struct rdsv3_get_mr_args args; + + if (optlen != sizeof (struct rdsv3_get_mr_args)) + return (-EINVAL); + +#if 1 + bcopy((struct rdsv3_get_mr_args *)optval, &args, + sizeof (struct rdsv3_get_mr_args)); +#else + if (ddi_copyin(optval, &args, optlen, 0)) + return (-EFAULT); +#endif + + return (__rdsv3_rdma_map(rs, &args, NULL, NULL)); +} + +/* + * Free the MR indicated by the given R_Key + */ +int +rdsv3_free_mr(struct rdsv3_sock *rs, const void *optval, int optlen) +{ + struct rdsv3_free_mr_args args; + struct rdsv3_mr *mr; + + if (optlen != sizeof (struct rdsv3_free_mr_args)) + return (-EINVAL); + +#if 1 + bcopy((struct rdsv3_free_mr_args *)optval, &args, + sizeof (struct rdsv3_free_mr_args)); +#else + if (ddi_copyin((struct rdsv3_free_mr_args *)optval, &args, + sizeof (struct rdsv3_free_mr_args), 0)) + return (-EFAULT); +#endif + + /* Special case - a null cookie means flush all unused MRs */ + if (args.cookie == 0) { + if (!rs->rs_transport || !rs->rs_transport->flush_mrs) + return (-EINVAL); + rs->rs_transport->flush_mrs(); + return (0); + } + + /* + * Look up the MR given its R_key and remove it from the rbtree + * so nobody else finds it. + * This should also prevent races with rdsv3_rdma_unuse. + */ + mutex_enter(&rs->rs_rdma_lock); + mr = rdsv3_mr_tree_walk(&rs->rs_rdma_keys, + rdsv3_rdma_cookie_key(args.cookie), NULL); + if (mr) { + avl_remove(&rs->rs_rdma_keys, &mr->r_rb_node); + RB_CLEAR_NODE(&mr->r_rb_node); + if (args.flags & RDSV3_RDMA_INVALIDATE) + mr->r_invalidate = 1; + } + mutex_exit(&rs->rs_rdma_lock); + + if (!mr) + return (-EINVAL); + + /* + * call rdsv3_destroy_mr() ourselves so that we're sure it's done + * by time we return. If we let rdsv3_mr_put() do it it might not + * happen until someone else drops their ref. + */ + rdsv3_destroy_mr(mr); + rdsv3_mr_put(mr); + return (0); +} + +/* + * This is called when we receive an extension header that + * tells us this MR was used. It allows us to implement + * use_once semantics + */ +void +rdsv3_rdma_unuse(struct rdsv3_sock *rs, uint32_t r_key, int force) +{ + struct rdsv3_mr *mr; + int zot_me = 0; + + RDSV3_DPRINTF4("rdsv3_rdma_unuse", "Enter rkey: 0x%x", r_key); + + mutex_enter(&rs->rs_rdma_lock); + mr = rdsv3_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL); + if (mr && (mr->r_use_once || force)) { + avl_remove(&rs->rs_rdma_keys, &mr->r_rb_node); + RB_CLEAR_NODE(&mr->r_rb_node); + zot_me = 1; + } else if (mr) + atomic_add_32(&mr->r_refcount, 1); + mutex_exit(&rs->rs_rdma_lock); + + /* + * May have to issue a dma_sync on this memory region. 
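+	 * The sync makes data the peer placed in the region via RDMA
+	 * visible to the CPU before the application reads the buffer.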
+ * Note we could avoid this if the operation was a RDMA READ, + * but at this point we can't tell. + */ + if (mr != NULL) { + RDSV3_DPRINTF4("rdsv3_rdma_unuse", "mr: %p zot_me %d", + mr, zot_me); + if (mr->r_trans->sync_mr) + mr->r_trans->sync_mr(mr->r_trans_private, + DMA_FROM_DEVICE); + + /* + * If the MR was marked as invalidate, this will + * trigger an async flush. + */ + if (zot_me) + rdsv3_destroy_mr(mr); + rdsv3_mr_put(mr); + } + RDSV3_DPRINTF4("rdsv3_rdma_unuse", "Return"); +} + +void +rdsv3_rdma_free_op(struct rdsv3_rdma_op *ro) +{ + unsigned int i; + + /* deallocate RDMA resources on rdsv3_message */ + + for (i = 0; i < ro->r_nents; i++) { + ddi_umem_unlock(ro->r_rdma_sg[i].umem_cookie); + } + + if (ro->r_notifier) + kmem_free(ro->r_notifier, sizeof (*ro->r_notifier)); + kmem_free(ro, sizeof (*ro)); +} + +extern struct umem_callback_ops rdsv3_umem_cbops; +/* + * args is a pointer to an in-kernel copy in the sendmsg cmsg. + */ +static struct rdsv3_rdma_op * +rdsv3_rdma_prepare(struct rdsv3_sock *rs, struct rdsv3_rdma_args *args) +{ + struct rdsv3_iovec vec; + struct rdsv3_rdma_op *op = NULL; + unsigned int nr_bytes; + struct rdsv3_iovec *local_vec; + unsigned int nr; + unsigned int i; + ddi_umem_cookie_t umem_cookie; + size_t umem_len; + caddr_t umem_addr; + int umem_flags; + int ret; + + if (rs->rs_bound_addr == 0) { + ret = -ENOTCONN; /* XXX not a great errno */ + goto out; + } + + if (args->nr_local > (uint64_t)UINT_MAX) { + ret = -EMSGSIZE; + goto out; + } + + op = kmem_zalloc(offsetof(struct rdsv3_rdma_op, + r_rdma_sg[args->nr_local]), KM_NOSLEEP); + if (op == NULL) { + ret = -ENOMEM; + goto out; + } + + op->r_write = !!(args->flags & RDSV3_RDMA_READWRITE); + op->r_fence = !!(args->flags & RDSV3_RDMA_FENCE); + op->r_notify = !!(args->flags & RDSV3_RDMA_NOTIFY_ME); + op->r_recverr = rs->rs_recverr; + + if (op->r_notify || op->r_recverr) { + /* + * We allocate an uninitialized notifier here, because + * we don't want to do that in the completion handler. We + * would have to use GFP_ATOMIC there, and don't want to deal + * with failed allocations. + */ + op->r_notifier = kmem_alloc(sizeof (struct rdsv3_notifier), + KM_NOSLEEP); + if (!op->r_notifier) { + ret = -ENOMEM; + goto out; + } + op->r_notifier->n_user_token = args->user_token; + op->r_notifier->n_status = RDSV3_RDMA_SUCCESS; + } + + /* + * The cookie contains the R_Key of the remote memory region, and + * optionally an offset into it. This is how we implement RDMA into + * unaligned memory. 
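+	 * Only the offset within the first page travels in the cookie;
+	 * the R_Key already identifies the page-aligned mapping.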
+ * When setting up the RDMA, we need to add that offset to the + * destination address (which is really an offset into the MR) + * FIXME: We may want to move this into ib_rdma.c + */ + op->r_key = rdsv3_rdma_cookie_key(args->cookie); + op->r_remote_addr = args->remote_vec.addr + + rdsv3_rdma_cookie_offset(args->cookie); + + nr_bytes = 0; + + RDSV3_DPRINTF5("rdsv3_rdma_prepare", + "RDS: rdma prepare nr_local %llu rva %llx rkey %x", + (unsigned long long)args->nr_local, + (unsigned long long)args->remote_vec.addr, + op->r_key); + + local_vec = (struct rdsv3_iovec *)(unsigned long) args->local_vec_addr; + + /* pin the scatter list of user buffers */ + for (i = 0; i < args->nr_local; i++) { + if (ddi_copyin(&local_vec[i], &vec, + sizeof (struct rdsv3_iovec), 0)) { + ret = -EFAULT; + goto out; + } + + nr = rdsv3_pages_in_vec(&vec); + if (nr == 0) { + RDSV3_DPRINTF2("rdsv3_rdma_prepare", + "rdsv3_pages_in_vec returned 0"); + ret = -EINVAL; + goto out; + } + + rs->rs_user_addr = vec.addr; + rs->rs_user_bytes = vec.bytes; + + /* pin user memory pages */ + umem_len = ptob(btopr(vec.bytes + + ((uintptr_t)vec.addr & PAGEOFFSET))); + umem_addr = (caddr_t)((uintptr_t)vec.addr & ~PAGEOFFSET); + umem_flags = (DDI_UMEMLOCK_WRITE | DDI_UMEMLOCK_READ | + DDI_UMEMLOCK_LONGTERM); + ret = umem_lockmemory(umem_addr, umem_len, umem_flags, + &umem_cookie, &rdsv3_umem_cbops, NULL); + if (ret != 0) { + RDSV3_DPRINTF2("rdsv3_rdma_prepare", + "umem_lockmemory() returned %d", ret); + ret = -EFAULT; + goto out; + } + op->r_rdma_sg[i].umem_cookie = umem_cookie; + op->r_rdma_sg[i].iovec = vec; + nr_bytes += vec.bytes; + + RDSV3_DPRINTF5("rdsv3_rdma_prepare", + "RDS: nr_bytes %u nr %u vec.bytes %llu vec.addr %llx", + nr_bytes, nr, vec.bytes, vec.addr); + } + op->r_nents = i; + + if (nr_bytes > args->remote_vec.bytes) { + RDSV3_DPRINTF2("rdsv3_rdma_prepare", + "RDS nr_bytes %u remote_bytes %u do not match", + nr_bytes, (unsigned int) args->remote_vec.bytes); + ret = -EINVAL; + goto out; + } + op->r_bytes = nr_bytes; + + ret = 0; +out: + if (ret) { + if (op) + rdsv3_rdma_free_op(op); + op = ERR_PTR(ret); + } + return (op); +} + +/* + * The application asks for a RDMA transfer. + * Extract all arguments and set up the rdma_op + */ +int +rdsv3_cmsg_rdma_args(struct rdsv3_sock *rs, struct rdsv3_message *rm, + struct cmsghdr *cmsg) +{ + struct rdsv3_rdma_op *op; + struct rdsv3_rdma_args *ap; + + if (cmsg->cmsg_len < CMSG_LEN(sizeof (struct rdsv3_rdma_args)) || + rm->m_rdma_op != NULL) + return (-EINVAL); + + /* uint64_t alignment on struct rdsv3_get_mr_args */ + ap = (struct rdsv3_rdma_args *)kmem_alloc(cmsg->cmsg_len, KM_SLEEP); + bcopy(CMSG_DATA(cmsg), ap, cmsg->cmsg_len); + op = rdsv3_rdma_prepare(rs, ap); + kmem_free(ap, cmsg->cmsg_len); + if (IS_ERR(op)) + return (PTR_ERR(op)); + rdsv3_stats_inc(s_send_rdma); + rm->m_rdma_op = op; + return (0); +} + +/* + * The application wants us to pass an RDMA destination (aka MR) + * to the remote + */ +int +rdsv3_cmsg_rdma_dest(struct rdsv3_sock *rs, struct rdsv3_message *rm, + struct cmsghdr *cmsg) +{ + struct rdsv3_mr *mr; + uint32_t r_key; + int err = 0; + + if (cmsg->cmsg_len < CMSG_LEN(sizeof (rdsv3_rdma_cookie_t)) || + rm->m_rdma_cookie != 0) + return (-EINVAL); + + (void) memcpy(&rm->m_rdma_cookie, CMSG_DATA(cmsg), + sizeof (rm->m_rdma_cookie)); + + /* + * We are reusing a previously mapped MR here. Most likely, the + * application has written to the buffer, so we need to explicitly + * flush those writes to RAM. 
Otherwise the HCA may not see them + * when doing a DMA from that buffer. + */ + r_key = rdsv3_rdma_cookie_key(rm->m_rdma_cookie); + + mutex_enter(&rs->rs_rdma_lock); + mr = rdsv3_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL); + if (mr == NULL) + err = -EINVAL; /* invalid r_key */ + else + atomic_add_32(&mr->r_refcount, 1); + mutex_exit(&rs->rs_rdma_lock); + + if (mr) { + mr->r_trans->sync_mr(mr->r_trans_private, DMA_TO_DEVICE); + rm->m_rdma_mr = mr; + } + return (err); +} + +/* + * The application passes us an address range it wants to enable RDMA + * to/from. We map the area, and save the <R_Key,offset> pair + * in rm->m_rdma_cookie. This causes it to be sent along to the peer + * in an extension header. + */ +int +rdsv3_cmsg_rdma_map(struct rdsv3_sock *rs, struct rdsv3_message *rm, + struct cmsghdr *cmsg) +{ + struct rdsv3_get_mr_args *mrp; + int status; + + if (cmsg->cmsg_len < CMSG_LEN(sizeof (struct rdsv3_get_mr_args)) || + rm->m_rdma_cookie != 0) + return (-EINVAL); + + /* uint64_t alignment on struct rdsv3_get_mr_args */ + mrp = (struct rdsv3_get_mr_args *)kmem_alloc(cmsg->cmsg_len, KM_SLEEP); + bcopy(CMSG_DATA(cmsg), mrp, cmsg->cmsg_len); + status = __rdsv3_rdma_map(rs, mrp, &rm->m_rdma_cookie, &rm->m_rdma_mr); + kmem_free(mrp, cmsg->cmsg_len); + return (status); +} diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/rdma_transport.c b/usr/src/uts/common/io/ib/clients/rdsv3/rdma_transport.c new file mode 100644 index 0000000000..1c87e52cdf --- /dev/null +++ b/usr/src/uts/common/io/ib/clients/rdsv3/rdma_transport.c @@ -0,0 +1,292 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +/* + * Copyright (c) 2009 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <sys/ib/clients/of/rdma/ib_verbs.h> +#include <sys/ib/clients/of/rdma/ib_addr.h> +#include <sys/ib/clients/of/rdma/rdma_cm.h> + +#include <sys/ib/clients/rdsv3/ib.h> +#include <sys/ib/clients/rdsv3/rdma_transport.h> +#include <sys/ib/clients/rdsv3/rdsv3_debug.h> + +kmutex_t rdsv3_rdma_listen_id_lock; +struct rdma_cm_id *rdsv3_rdma_listen_id = NULL; + +int +rdsv3_rdma_cm_event_handler(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event) +{ + /* this can be null in the listening path */ + struct rdsv3_connection *conn = cm_id->context; + struct rdsv3_transport *trans; + int ret = 0; + + RDSV3_DPRINTF2("rdsv3_rdma_cm_event_handler", + "conn %p id %p handling event %u", conn, cm_id, event->event); + + trans = &rdsv3_ib_transport; + + /* + * Prevent shutdown from tearing down the connection + * while we're executing. + */ + if (conn) { + mutex_enter(&conn->c_cm_lock); + + /* + * If the connection is being shut down, bail out + * right away. We return 0 so cm_id doesn't get + * destroyed prematurely + */ + if (rdsv3_conn_state(conn) == RDSV3_CONN_DISCONNECTING) { + /* + * Reject incoming connections while we're tearing + * down an existing one. + */ + if (event->event == RDMA_CM_EVENT_CONNECT_REQUEST) + ret = 1; + RDSV3_DPRINTF2("rdsv3_rdma_cm_event_handler", + "conn %p id %p incoming event %u when " + "disconnecting", conn, cm_id, event->event); + goto out; + } + } + + switch (event->event) { + case RDMA_CM_EVENT_CONNECT_REQUEST: + ret = trans->cm_handle_connect(cm_id, event); + break; + + case RDMA_CM_EVENT_ADDR_RESOLVED: + /* XXX do we need to clean up if this fails? */ + ret = rdma_resolve_route(cm_id, + RDSV3_RDMA_RESOLVE_TIMEOUT_MS); + break; + + case RDMA_CM_EVENT_ROUTE_RESOLVED: + /* XXX worry about racing with listen acceptance */ + ret = trans->cm_initiate_connect(cm_id); + break; + + case RDMA_CM_EVENT_ESTABLISHED: + trans->cm_connect_complete(conn, event); + break; + + case RDMA_CM_EVENT_ADDR_ERROR: + case RDMA_CM_EVENT_ROUTE_ERROR: + case RDMA_CM_EVENT_CONNECT_ERROR: + case RDMA_CM_EVENT_UNREACHABLE: + case RDMA_CM_EVENT_REJECTED: + case RDMA_CM_EVENT_DEVICE_REMOVAL: + case RDMA_CM_EVENT_ADDR_CHANGE: + if (conn) + rdsv3_conn_drop(conn); + break; + + case RDMA_CM_EVENT_DISCONNECTED: + RDSV3_DPRINTF2("rdsv3_rdma_cm_event_handler", + "RDS/RDMA: DISCONNECT event - dropping connection " + "cm_id: %p", cm_id); + if (conn) { + RDSV3_DPRINTF0("rdsv3_rdma_cm_event_handler", + "RDS/RDMA: DISCONNECT event - dropping connection " + "%u.%u.%u.%u ->%u.%u.%u.%u", NIPQUAD(conn->c_laddr), + NIPQUAD(conn->c_faddr)); + rdsv3_conn_drop(conn); + } + break; + + default: + /* things like device disconnect? 
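+		 * Anything not handled above is unexpected, so it is
+		 * logged and RDSV3_PANIC() is called.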
*/
+		RDSV3_DPRINTF0("rdsv3_rdma_cm_event_handler",
+		    "unknown event %u\n", event->event);
+		RDSV3_PANIC();
+		break;
+	}
+
+out:
+	if (conn) {
+#ifndef __lock_lint
+		// struct rds_iw_connection *ic = conn->c_transport_data;
+
+		/* If we return non-zero, we must hang on to the cm_id */
+		// BUG_ON(ic->i_cm_id == cm_id && ret);
+#endif
+
+		mutex_exit(&conn->c_cm_lock);
+	}
+
+	RDSV3_DPRINTF2("rdsv3_rdma_cm_event_handler",
+	    "id %p event %u handling ret %d", cm_id, event->event, ret);
+
+	return (ret);
+}
+
+static int
+rdsv3_rdma_listen_init(void)
+{
+	struct sockaddr_in sin;
+	struct rdma_cm_id *cm_id;
+	int ret;
+
+	RDSV3_DPRINTF2("rdsv3_rdma_listen_init", "Enter");
+
+	cm_id = rdma_create_id(rdsv3_rdma_cm_event_handler, NULL, RDMA_PS_TCP);
+	if (IS_ERR(cm_id)) {
+		ret = PTR_ERR(cm_id);
+		RDSV3_DPRINTF0("rdsv3_rdma_listen_init",
+		    "RDS/RDMA: failed to setup listener, "
+		    "rdma_create_id() returned %d", ret);
+		goto out;
+	}
+
+	sin.sin_family = PF_INET;
+	sin.sin_addr.s_addr = (uint32_t)htonl(INADDR_ANY);
+	sin.sin_port = (uint16_t)htons(RDSV3_PORT);
+
+	/*
+	 * XXX I bet this binds the cm_id to a device. If we want to support
+	 * fail-over we'll have to take this into consideration.
+	 */
+	ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
+	if (ret) {
+		RDSV3_DPRINTF0("rdsv3_rdma_listen_init",
+		    "RDS/RDMA: failed to setup listener, "
+		    "rdma_bind_addr() returned %d", ret);
+		goto out;
+	}
+
+	ret = rdma_listen(cm_id, 128);
+	if (ret) {
+		RDSV3_DPRINTF0("rdsv3_rdma_listen_init",
+		    "RDS/RDMA: failed to setup listener, "
+		    "rdma_listen() returned %d", ret);
+		goto out;
+	}
+
+	RDSV3_DPRINTF5("rdsv3_rdma_listen_init",
+	    "cm %p listening on port %u", cm_id, RDSV3_PORT);
+
+	rdsv3_rdma_listen_id = cm_id;
+	cm_id = NULL;
+
+	RDSV3_DPRINTF2("rdsv3_rdma_listen_init",
+	    "Return: rdsv3_rdma_listen_id: %p", rdsv3_rdma_listen_id);
+out:
+	if (cm_id)
+		rdma_destroy_id(cm_id);
+	return (ret);
+}
+
+static void
+rdsv3_rdma_listen_stop(void)
+{
+	RDSV3_DPRINTF2("rdsv3_rdma_listen_stop", "cm %p", rdsv3_rdma_listen_id);
+	rdma_destroy_id(rdsv3_rdma_listen_id);
+
+	RDSV3_DPRINTF2("rdsv3_rdma_listen_stop", "Return");
+}
+
+/*
+ * This function can be called via two routes.
+ * 1. During attach on a worker thread.
+ * 2. From rdsv3_create() for 1st socket.
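+ * Both callers can race with each other, so the listen id is created
+ * under rdsv3_rdma_listen_id_lock and rechecked once the lock is held.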
+ */ +void +rdsv3_rdma_init() +{ + int ret; + + RDSV3_DPRINTF2("rdsv3_rdma_init", "Enter"); + + mutex_enter(&rdsv3_rdma_listen_id_lock); + if (rdsv3_rdma_listen_id != NULL) { + RDSV3_DPRINTF2("rdsv3_rdma_init", + "rdsv3_rdma_listen_id is already initialized: %p", + rdsv3_rdma_listen_id); + mutex_exit(&rdsv3_rdma_listen_id_lock); + return; + } + + ret = rdsv3_rdma_listen_init(); + if (ret) { + mutex_exit(&rdsv3_rdma_listen_id_lock); + return; + } + + ret = rdsv3_ib_init(); + if (ret) { + rdsv3_rdma_listen_stop(); + } + mutex_exit(&rdsv3_rdma_listen_id_lock); + + RDSV3_DPRINTF2("rdsv3_rdma_init", "Return"); +} + +/*ARGSUSED*/ +void +rdsv3_rdma_exit(void *arg) +{ + RDSV3_DPRINTF2("rdsv3_rdma_exit", "Enter"); + + /* stop listening first to ensure no new connections are attempted */ + if (rdsv3_rdma_listen_id) { + rdsv3_rdma_listen_stop(); + rdsv3_ib_exit(); + rdsv3_rdma_listen_id = NULL; + } + + RDSV3_DPRINTF2("rdsv3_rdma_exit", "Return"); +} diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/rds_recv.c b/usr/src/uts/common/io/ib/clients/rdsv3/rds_recv.c new file mode 100644 index 0000000000..59ffba52a8 --- /dev/null +++ b/usr/src/uts/common/io/ib/clients/rdsv3/rds_recv.c @@ -0,0 +1,677 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ +#include <sys/rds.h> + +#include <sys/ib/clients/rdsv3/rdsv3.h> +#include <sys/ib/clients/rdsv3/rdma.h> +#include <sys/ib/clients/rdsv3/rdsv3_debug.h> + +void +rdsv3_inc_init(struct rdsv3_incoming *inc, struct rdsv3_connection *conn, + uint32_be_t saddr) +{ + RDSV3_DPRINTF5("rdsv3_inc_init", "Enter(inc: %p, conn: %p)", inc, conn); + inc->i_refcount = 1; + list_link_init(&inc->i_item); + inc->i_conn = conn; + inc->i_saddr = saddr; + inc->i_rdma_cookie = 0; +} + +void +rdsv3_inc_addref(struct rdsv3_incoming *inc) +{ + RDSV3_DPRINTF4("rdsv3_inc_addref", + "addref inc %p ref %d", inc, atomic_get(&inc->i_refcount)); + atomic_add_32(&inc->i_refcount, 1); +} + +void +rdsv3_inc_put(struct rdsv3_incoming *inc) +{ + RDSV3_DPRINTF4("rdsv3_inc_put", "put inc %p ref %d", + inc, atomic_get(&inc->i_refcount)); + if (atomic_dec_and_test(&inc->i_refcount)) { + ASSERT(!list_link_active(&inc->i_item)); + + inc->i_conn->c_trans->inc_free(inc); + } +} + +/*ARGSUSED*/ +static void +rdsv3_recv_rcvbuf_delta(struct rdsv3_sock *rs, struct rsock *sk, + struct rdsv3_cong_map *map, + int delta, uint16_be_t port) +{ + int now_congested; + + RDSV3_DPRINTF4("rdsv3_recv_rcvbuf_delta", + "Enter(rs: %p, map: %p, delta: %d, port: %d)", + rs, map, delta, port); + + if (delta == 0) + return; + + rs->rs_rcv_bytes += delta; + now_congested = rs->rs_rcv_bytes > rdsv3_sk_rcvbuf(rs); + + RDSV3_DPRINTF5("rdsv3_recv_rcvbuf_delta", + "rs %p (%u.%u.%u.%u:%u) recv bytes %d buf %d " + "now_cong %d delta %d", + rs, NIPQUAD(rs->rs_bound_addr), + (int)ntohs(rs->rs_bound_port), rs->rs_rcv_bytes, + rdsv3_sk_rcvbuf(rs), now_congested, delta); + + /* wasn't -> am congested */ + if (!rs->rs_congested && now_congested) { + rs->rs_congested = 1; + rdsv3_cong_set_bit(map, port); + rdsv3_cong_queue_updates(map); + } + /* was -> aren't congested */ + /* + * Require more free space before reporting uncongested to prevent + * bouncing cong/uncong state too often + */ + else if (rs->rs_congested && + (rs->rs_rcv_bytes < (rdsv3_sk_rcvbuf(rs)/2))) { + rs->rs_congested = 0; + rdsv3_cong_clear_bit(map, port); + rdsv3_cong_queue_updates(map); + } + + /* do nothing if no change in cong state */ + + RDSV3_DPRINTF4("rdsv3_recv_rcvbuf_delta", "Return(rs: %p)", rs); +} + +/* + * Process all extension headers that come with this message. + */ +static void +rdsv3_recv_incoming_exthdrs(struct rdsv3_incoming *inc, struct rdsv3_sock *rs) +{ + struct rdsv3_header *hdr = &inc->i_hdr; + unsigned int pos = 0, type, len; + union { + struct rdsv3_ext_header_version version; + struct rdsv3_ext_header_rdma rdma; + struct rdsv3_ext_header_rdma_dest rdma_dest; + } buffer; + + RDSV3_DPRINTF4("rdsv3_recv_incoming_exthdrs", "Enter"); + while (1) { + len = sizeof (buffer); + type = rdsv3_message_next_extension(hdr, &pos, &buffer, &len); + if (type == RDSV3_EXTHDR_NONE) + break; + RDSV3_DPRINTF4("recv_incoming_exthdrs", "type %d", type); + /* Process extension header here */ + switch (type) { + case RDSV3_EXTHDR_RDMA: + rdsv3_rdma_unuse(rs, ntohl(buffer.rdma.h_rdma_rkey), + 0); + break; + + case RDSV3_EXTHDR_RDMA_DEST: + /* + * We ignore the size for now. We could stash it + * somewhere and use it for error checking. + */ + inc->i_rdma_cookie = rdsv3_rdma_make_cookie( + ntohl(buffer.rdma_dest.h_rdma_rkey), + ntohl(buffer.rdma_dest.h_rdma_offset)); + + break; + } + } + RDSV3_DPRINTF4("rdsv3_recv_incoming_exthdrs", "Return"); +} + +/* + * The transport must make sure that this is serialized against other + * rx and conn reset on this specific conn. 
+ * + * We currently assert that only one fragmented message will be sent + * down a connection at a time. This lets us reassemble in the conn + * instead of per-flow which means that we don't have to go digging through + * flows to tear down partial reassembly progress on conn failure and + * we save flow lookup and locking for each frag arrival. It does mean + * that small messages will wait behind large ones. Fragmenting at all + * is only to reduce the memory consumption of pre-posted buffers. + * + * The caller passes in saddr and daddr instead of us getting it from the + * conn. This lets loopback, who only has one conn for both directions, + * tell us which roles the addrs in the conn are playing for this message. + */ +/* ARGSUSED */ +void +rdsv3_recv_incoming(struct rdsv3_connection *conn, uint32_be_t saddr, + uint32_be_t daddr, struct rdsv3_incoming *inc, int gfp) +{ + struct rdsv3_sock *rs = NULL; + struct rsock *sk; + + inc->i_conn = conn; + inc->i_rx_jiffies = jiffies; + + RDSV3_DPRINTF5("rdsv3_recv_incoming", + "conn %p next %llu inc %p seq %llu len %u sport %u dport %u " + "flags 0x%x rx_jiffies %lu", conn, + (unsigned long long)conn->c_next_rx_seq, + inc, + (unsigned long long)ntohll(inc->i_hdr.h_sequence), + ntohl(inc->i_hdr.h_len), + ntohs(inc->i_hdr.h_sport), + ntohs(inc->i_hdr.h_dport), + inc->i_hdr.h_flags, + inc->i_rx_jiffies); + + /* + * Sequence numbers should only increase. Messages get their + * sequence number as they're queued in a sending conn. They + * can be dropped, though, if the sending socket is closed before + * they hit the wire. So sequence numbers can skip forward + * under normal operation. They can also drop back in the conn + * failover case as previously sent messages are resent down the + * new instance of a conn. We drop those, otherwise we have + * to assume that the next valid seq does not come after a + * hole in the fragment stream. + * + * The headers don't give us a way to realize if fragments of + * a message have been dropped. We assume that frags that arrive + * to a flow are part of the current message on the flow that is + * being reassembled. This means that senders can't drop messages + * from the sending conn until all their frags are sent. + * + * XXX we could spend more on the wire to get more robust failure + * detection, arguably worth it to avoid data corruption. + */ + if (ntohll(inc->i_hdr.h_sequence) < conn->c_next_rx_seq && + (inc->i_hdr.h_flags & RDSV3_FLAG_RETRANSMITTED)) { + rdsv3_stats_inc(s_recv_drop_old_seq); + goto out; + } + conn->c_next_rx_seq = ntohll(inc->i_hdr.h_sequence) + 1; + + if (rdsv3_sysctl_ping_enable && inc->i_hdr.h_dport == 0) { + rdsv3_stats_inc(s_recv_ping); + (void) rdsv3_send_pong(conn, inc->i_hdr.h_sport); + goto out; + } + + rs = rdsv3_find_bound(daddr, inc->i_hdr.h_dport); + if (rs == NULL) { + rdsv3_stats_inc(s_recv_drop_no_sock); + goto out; + } + + /* Process extension headers */ + rdsv3_recv_incoming_exthdrs(inc, rs); + + /* We can be racing with rdsv3_release() which marks the socket dead. 
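+	 * Taking rs_recv_lock as writer below makes the SOCK_DEAD check
+	 * and the queue insertion atomic with respect to sock_orphan().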
*/ + sk = rdsv3_rs_to_sk(rs); + + /* serialize with rdsv3_release -> sock_orphan */ + rw_enter(&rs->rs_recv_lock, RW_WRITER); + if (!rdsv3_sk_sock_flag(sk, SOCK_DEAD)) { + int error, bytes; + RDSV3_DPRINTF5("rdsv3_recv_incoming", + "adding inc %p to rs %p's recv queue", inc, rs); + rdsv3_stats_inc(s_recv_queued); + rdsv3_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong, + ntohl(inc->i_hdr.h_len), + inc->i_hdr.h_dport); + rdsv3_inc_addref(inc); + list_insert_tail(&rs->rs_recv_queue, inc); + bytes = rs->rs_rcv_bytes; + rw_exit(&rs->rs_recv_lock); + + __rdsv3_wake_sk_sleep(sk); + + /* wake up anyone waiting in poll */ + sk->sk_upcalls->su_recv(sk->sk_upper_handle, NULL, + bytes, 0, &error, NULL); + if (error != 0) { + RDSV3_DPRINTF2("rdsv3_recv_incoming", + "su_recv returned: %d", error); + } + } else { + rdsv3_stats_inc(s_recv_drop_dead_sock); + rw_exit(&rs->rs_recv_lock); + } + +out: + if (rs) + rdsv3_sock_put(rs); +} + +/* + * be very careful here. This is being called as the condition in + * wait_event_*() needs to cope with being called many times. + */ +static int +rdsv3_next_incoming(struct rdsv3_sock *rs, struct rdsv3_incoming **inc) +{ + if (*inc == NULL) { + rw_enter(&rs->rs_recv_lock, RW_READER); + if (!list_is_empty(&rs->rs_recv_queue)) { + *inc = list_head(&rs->rs_recv_queue); + rdsv3_inc_addref(*inc); + } + rw_exit(&rs->rs_recv_lock); + } + + return (*inc != NULL); +} + +static int +rdsv3_still_queued(struct rdsv3_sock *rs, struct rdsv3_incoming *inc, + int drop) +{ + struct rsock *sk = rdsv3_rs_to_sk(rs); + int ret = 0; + + RDSV3_DPRINTF4("rdsv3_still_queued", "Enter rs: %p inc: %p drop: %d", + rs, inc, drop); + + rw_enter(&rs->rs_recv_lock, RW_WRITER); + if (list_link_active(&inc->i_item)) { + ret = 1; + if (drop) { + /* XXX make sure this i_conn is reliable */ + rdsv3_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong, + -ntohl(inc->i_hdr.h_len), + inc->i_hdr.h_dport); + list_remove_node(&inc->i_item); + rdsv3_inc_put(inc); + } + } + rw_exit(&rs->rs_recv_lock); + + RDSV3_DPRINTF5("rdsv3_still_queued", + "inc %p rs %p still %d dropped %d", inc, rs, ret, drop); + return (ret); +} + +/* + * Pull errors off the error queue. + * If msghdr is NULL, we will just purge the error queue. + */ +int +rdsv3_notify_queue_get(struct rdsv3_sock *rs, struct msghdr *msghdr) +{ + struct rdsv3_notifier *notifier; + struct rdsv3_rdma_notify cmsg; + unsigned int count = 0, max_messages = ~0U; + list_t copy; + int err = 0; + + RDSV3_DPRINTF4("rdsv3_notify_queue_get", "Enter(rs: %p)", rs); + + list_create(©, sizeof (struct rdsv3_notifier), + offsetof(struct rdsv3_notifier, n_list)); + + + /* + * put_cmsg copies to user space and thus may sleep. We can't do this + * with rs_lock held, so first grab as many notifications as we can + * stuff + * in the user provided cmsg buffer. We don't try to copy more, to avoid + * losing notifications - except when the buffer is so small that + * it wouldn't + * even hold a single notification. Then we give him as much of this + * single + * msg as we can squeeze in, and set MSG_CTRUNC. 
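+	 *
+	 * For example, a msg_controllen with room for three notifications
+	 * dequeues at most three of them; one with room for none still
+	 * forces max_messages to 1 and the truncated copy is flagged
+	 * with MSG_CTRUNC.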
+ */ + if (msghdr) { + max_messages = + msghdr->msg_controllen / CMSG_SPACE(sizeof (cmsg)); + if (!max_messages) + max_messages = 1; + } + + mutex_enter(&rs->rs_lock); + while (!list_is_empty(&rs->rs_notify_queue) && count < max_messages) { + notifier = list_remove_head(&rs->rs_notify_queue); + list_insert_tail(©, notifier); + count++; + } + mutex_exit(&rs->rs_lock); + + if (!count) + return (0); + + while (!list_is_empty(©)) { + notifier = list_remove_head(©); + + if (msghdr) { + cmsg.user_token = notifier->n_user_token; + cmsg.status = notifier->n_status; + + err = rdsv3_put_cmsg(msghdr, SOL_RDS, + RDSV3_CMSG_RDMA_STATUS, sizeof (cmsg), &cmsg); + if (err) + break; + } + + kmem_free(notifier, sizeof (struct rdsv3_notifier)); + } + + /* + * If we bailed out because of an error in put_cmsg, + * we may be left with one or more notifications that we + * didn't process. Return them to the head of the list. + */ + if (!list_is_empty(©)) { + mutex_enter(&rs->rs_lock); + list_splice(©, &rs->rs_notify_queue); + mutex_exit(&rs->rs_lock); + } + + RDSV3_DPRINTF4("rdsv3_notify_queue_get", "Return(rs: %p)", rs); + + return (err); +} + +/* + * Queue a congestion notification + */ +static int +rdsv3_notify_cong(struct rdsv3_sock *rs, struct msghdr *msghdr) +{ + uint64_t notify = rs->rs_cong_notify; + int err; + + err = rdsv3_put_cmsg(msghdr, SOL_RDS, RDSV3_CMSG_CONG_UPDATE, + sizeof (notify), ¬ify); + if (err) + return (err); + + mutex_enter(&rs->rs_lock); + rs->rs_cong_notify &= ~notify; + mutex_exit(&rs->rs_lock); + + return (0); +} + +/* + * Receive any control messages. + */ +static int +rdsv3_cmsg_recv(struct rdsv3_incoming *inc, struct msghdr *msg) +{ + return (rdsv3_put_cmsg(msg, SOL_RDS, RDSV3_CMSG_RDMA_DEST, + sizeof (inc->i_rdma_cookie), &inc->i_rdma_cookie)); +} + +int +rdsv3_recvmsg(struct rdsv3_sock *rs, uio_t *uio, + struct nmsghdr *msg, size_t size, int msg_flags) +{ + struct rsock *sk = rdsv3_rs_to_sk(rs); + long timeo; + int ret = 0, nonblock = msg_flags & MSG_DONTWAIT; + struct sockaddr_in *sin = NULL; + struct rdsv3_incoming *inc = NULL; + + RDSV3_DPRINTF4("rdsv3_recvmsg", + "Enter(rs: %p size: %d msg_flags: 0x%x)", rs, size, msg_flags); + + /* udp_recvmsg()->sock_recvtimeo() gets away without locking too.. 
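+	 * The timeout is computed once up front; a nonblocking caller
+	 * never sleeps and returns EAGAIN below when the queue is empty.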
*/ + timeo = rdsv3_rcvtimeo(sk, nonblock); + + if (msg_flags & MSG_OOB) + goto out; + + /* mark the first cmsg position */ + if (msg) { + msg->msg_control = NULL; + } + + while (1) { + /* + * If there are pending notifications, do those - + * and nothing else + */ + if (!list_is_empty(&rs->rs_notify_queue)) { + ret = rdsv3_notify_queue_get(rs, msg); + + if (msg && msg->msg_namelen) { + sin = kmem_zalloc(sizeof (struct sockaddr_in), + KM_SLEEP); + sin->sin_family = AF_INET_OFFLOAD; + if (inc) { + sin->sin_port = inc->i_hdr.h_sport; + sin->sin_addr.s_addr = inc->i_saddr; + } + msg->msg_namelen = sizeof (struct sockaddr_in); + msg->msg_name = sin; + } + break; + } + + if (rs->rs_cong_notify) { + ret = rdsv3_notify_cong(rs, msg); + goto out; + } + + if (!rdsv3_next_incoming(rs, &inc)) { + if (nonblock) { + ret = -EAGAIN; + break; + } + + RDSV3_DPRINTF3("rdsv3_recvmsg", + "Before wait (rs: %p)", rs); + + mutex_enter(&sk->sk_sleep->waitq_mutex); + while ((list_is_empty(&rs->rs_notify_queue) && + !rs->rs_cong_notify && + !rdsv3_next_incoming(rs, &inc))) { +#if 0 + ret = cv_timedwait_sig(&sk->sk_sleep->waitq_cv, + &sk->sk_sleep->waitq_mutex, + timeo * drv_usectohz(1000000) + + ddi_get_lbolt()); + if (ret <= 0) { + /* signal/timeout pending */ + RDSV3_DPRINTF2("rdsv3_recvmsg", + "woke due to signal/timeout: %d", + ret); + ret = (ret == 0) ? -ERESTART : + -ETIMEDOUT; + break; + } +#else + ret = cv_wait_sig(&sk->sk_sleep->waitq_cv, + &sk->sk_sleep->waitq_mutex); + if (ret == 0) { + /* signal/timeout pending */ + RDSV3_DPRINTF2("rdsv3_recvmsg", + "woke due to signal"); + ret = -ERESTART; + break; + } +#endif + } + mutex_exit(&sk->sk_sleep->waitq_mutex); + + RDSV3_DPRINTF5("rdsv3_recvmsg", + "recvmsg woke rs: %p inc %p ret %d", + rs, inc, -ret); + + if (ret < 0) + break; + + /* + * if the wakeup was due to rs_notify_queue or + * rs_cong_notify then we need to handle those first. + */ + continue; + } + + RDSV3_DPRINTF5("rdsv3_recvmsg", + "copying inc %p from %u.%u.%u.%u:%u to user", inc, + NIPQUAD(inc->i_conn->c_faddr), + ntohs(inc->i_hdr.h_sport)); + ret = inc->i_conn->c_trans->inc_copy_to_user(inc, uio, size); + if (ret < 0) + break; + + /* + * if the message we just copied isn't at the head of the + * recv queue then someone else raced us to return it, try + * to get the next message. + */ + if (!rdsv3_still_queued(rs, inc, !(msg_flags & MSG_PEEK))) { + rdsv3_inc_put(inc); + inc = NULL; + rdsv3_stats_inc(s_recv_deliver_raced); + continue; + } + + if (ret < ntohl(inc->i_hdr.h_len)) { + if (msg_flags & MSG_TRUNC) + ret = ntohl(inc->i_hdr.h_len); + msg->msg_flags |= MSG_TRUNC; + } + + if (rdsv3_cmsg_recv(inc, msg)) { + ret = -EFAULT; + goto out; + } + + rdsv3_stats_inc(s_recv_delivered); + + if (msg->msg_namelen) { + sin = kmem_alloc(sizeof (struct sockaddr_in), KM_SLEEP); + sin->sin_family = AF_INET_OFFLOAD; + sin->sin_port = inc->i_hdr.h_sport; + sin->sin_addr.s_addr = inc->i_saddr; + (void) memset(sin->sin_zero, 0, + sizeof (sin->sin_zero)); + msg->msg_namelen = sizeof (struct sockaddr_in); + msg->msg_name = sin; + } + break; + } + + if (inc) + rdsv3_inc_put(inc); + +out: + RDSV3_DPRINTF4("rdsv3_recvmsg", "Return(rs: %p, ret: %d)", rs, ret); + + return (ret); +} + +/* + * The socket is being shut down and we're asked to drop messages that were + * queued for recvmsg. The caller has unbound the socket so the receive path + * won't queue any more incoming fragments or messages on the socket. 
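+ * Holding rs_recv_lock as writer, each queued inc gives back its
+ * rcvbuf accounting and drops the reference the queue held on it.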
+ */ +void +rdsv3_clear_recv_queue(struct rdsv3_sock *rs) +{ + struct rsock *sk = rdsv3_rs_to_sk(rs); + struct rdsv3_incoming *inc, *tmp; + + RDSV3_DPRINTF4("rdsv3_clear_recv_queue", "Enter(rs: %p)", rs); + + rw_enter(&rs->rs_recv_lock, RW_WRITER); + RDSV3_FOR_EACH_LIST_NODE_SAFE(inc, tmp, &rs->rs_recv_queue, i_item) { + rdsv3_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong, + -ntohl(inc->i_hdr.h_len), + inc->i_hdr.h_dport); + list_remove_node(&inc->i_item); + rdsv3_inc_put(inc); + } + rw_exit(&rs->rs_recv_lock); + + RDSV3_DPRINTF4("rdsv3_clear_recv_queue", "Return(rs: %p)", rs); +} + +/* + * inc->i_saddr isn't used here because it is only set in the receive + * path. + */ +void +rdsv3_inc_info_copy(struct rdsv3_incoming *inc, + struct rdsv3_info_iterator *iter, + uint32_be_t saddr, uint32_be_t daddr, int flip) +{ + struct rdsv3_info_message minfo; + + minfo.seq = ntohll(inc->i_hdr.h_sequence); + minfo.len = ntohl(inc->i_hdr.h_len); + + if (flip) { + minfo.laddr = daddr; + minfo.faddr = saddr; + minfo.lport = inc->i_hdr.h_dport; + minfo.fport = inc->i_hdr.h_sport; + } else { + minfo.laddr = saddr; + minfo.faddr = daddr; + minfo.lport = inc->i_hdr.h_sport; + minfo.fport = inc->i_hdr.h_dport; + } + + rdsv3_info_copy(iter, &minfo, sizeof (minfo)); +} diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/rdsv3.conf b/usr/src/uts/common/io/ib/clients/rdsv3/rdsv3.conf new file mode 100644 index 0000000000..c17689cf40 --- /dev/null +++ b/usr/src/uts/common/io/ib/clients/rdsv3/rdsv3.conf @@ -0,0 +1,25 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. +# +name="rdsv3" parent="ib" unit-address="0"; +ddi-forceattach=1; diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/rdsv3_ddi.c b/usr/src/uts/common/io/ib/clients/rdsv3/rdsv3_ddi.c new file mode 100644 index 0000000000..82417cba04 --- /dev/null +++ b/usr/src/uts/common/io/ib/clients/rdsv3/rdsv3_ddi.c @@ -0,0 +1,303 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/conf.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/modctl.h> +#include <sys/strsubr.h> +#include <sys/socketvar.h> +#include <sys/rds.h> + +#include <sys/ib/ibtl/ibti.h> +#include <sys/ib/clients/rdsv3/rdsv3.h> +#include <sys/ib/clients/rdsv3/rdsv3_debug.h> + +extern int rdsv3_init(void); +extern void rdsv3_exit(void); +extern void rdsv3_cong_init(void); +extern void rdsv3_cong_exit(void); +extern void rdsv3_trans_init(void); +extern void rdsv3_trans_exit(void); +extern int rdsv3_sock_init(void); +extern void rdsv3_sock_exit(void); + +/* global */ +dev_info_t *rdsv3_dev_info = NULL; +kmem_cache_t *rdsv3_alloc_cache = NULL; + +extern kmutex_t rdsv3_rdma_listen_id_lock; +extern struct rdma_cm_id *rdsv3_rdma_listen_id; + +extern kmutex_t rdsv3_sock_lock; +extern list_t rdsv3_sock_list; + +extern void rdsv3_bind_tree_init(); +extern void rdsv3_bind_tree_exit(); + +int +rdsv3_sock_init() +{ + RDSV3_DPRINTF4("rdsv3_sock_init", "Enter"); + + rdsv3_alloc_cache = kmem_cache_create("rdsv3_alloc_cache", + sizeof (struct rsock) + sizeof (struct rdsv3_sock), 0, NULL, + NULL, NULL, NULL, NULL, 0); + if (rdsv3_alloc_cache == NULL) { + RDSV3_DPRINTF1("rdsv3_alloc_cache", + "kmem_cache_create(rdsv3_alloc_cache) failed"); + return (-1); + } + rdsv3_bind_tree_init(); + + mutex_init(&rdsv3_sock_lock, NULL, MUTEX_DRIVER, NULL); + list_create(&rdsv3_sock_list, sizeof (struct rdsv3_sock), + offsetof(struct rdsv3_sock, rs_item)); + + RDSV3_DPRINTF4("rdsv3_sock_init", "Return"); + + return (0); +} + +void +rdsv3_sock_exit() +{ + RDSV3_DPRINTF2("rdsv3_sock_exit", "Enter"); + + rdsv3_bind_tree_exit(); + + kmem_cache_destroy(rdsv3_alloc_cache); + + list_destroy(&rdsv3_sock_list); + mutex_destroy(&rdsv3_sock_lock); + + RDSV3_DPRINTF2("rdsv3_sock_exit", "Return"); +} + +static int +rdsv3_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + int ret; + + RDSV3_DPRINTF2("rdsv3_attach", "Enter (dip: %p)", dip); + + if (cmd != DDI_ATTACH) + return (DDI_FAILURE); + + if (rdsv3_dev_info != NULL) { + RDSV3_DPRINTF1("rdsv3_attach", "Multiple RDS instances are" + " not supported (rdsv3_dev_info: 0x%p)", rdsv3_dev_info); + return (DDI_FAILURE); + } + rdsv3_dev_info = dip; + + mutex_init(&rdsv3_rdma_listen_id_lock, NULL, MUTEX_DRIVER, NULL); + rdsv3_rdma_listen_id = NULL; + + rdsv3_trans_init(); + ret = rdsv3_init(); + if (ret) { + RDSV3_DPRINTF1("rdsv3_attach", "rdsv3_init failed: %d", ret); + rdsv3_trans_exit(); + mutex_destroy(&rdsv3_rdma_listen_id_lock); + rdsv3_dev_info = NULL; + return (DDI_FAILURE); + } + + ret = rdsv3_sock_init(); + if (ret) { + rdsv3_exit(); + rdsv3_trans_exit(); + mutex_destroy(&rdsv3_rdma_listen_id_lock); + rdsv3_dev_info = NULL; + return (DDI_FAILURE); + } + + ret = ddi_create_minor_node(dip, "rdsv3", S_IFCHR, 0, DDI_PSEUDO, 0); + if (ret != DDI_SUCCESS) { + cmn_err(CE_CONT, "ddi_create_minor_node failed: %d", ret); + rdsv3_sock_exit(); + rdsv3_exit(); + rdsv3_trans_exit(); + mutex_destroy(&rdsv3_rdma_listen_id_lock); + rdsv3_dev_info = NULL; + return (DDI_FAILURE); + } + + RDSV3_DPRINTF2("rdsv3_attach", "Return"); + + return (DDI_SUCCESS); +} + +static int +rdsv3_detach(dev_info_t *dip, ddi_detach_cmd_t 
cmd) +{ + RDSV3_DPRINTF2("rdsv3_detach", "Enter (dip: %p)", dip); + + if (cmd != DDI_DETACH) + return (DDI_FAILURE); + + rdsv3_sock_exit(); + rdsv3_exit(); + rdsv3_trans_exit(); + ddi_remove_minor_node(dip, "rdsv3"); + rdsv3_dev_info = NULL; + + RDSV3_DPRINTF2("rdsv3_detach", "Return"); + + return (DDI_SUCCESS); +} + +/* ARGSUSED */ +static int +rdsv3_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result) +{ + int ret = DDI_FAILURE; + + RDSV3_DPRINTF2("rdsv3_info", "Enter (dip: %p, cmd: %d)", dip, cmd); + + switch (cmd) { + case DDI_INFO_DEVT2DEVINFO: + if (rdsv3_dev_info != NULL) { + *result = (void *)rdsv3_dev_info; + ret = DDI_SUCCESS; + } + break; + + case DDI_INFO_DEVT2INSTANCE: + *result = NULL; + ret = DDI_SUCCESS; + break; + + default: + break; + } + + RDSV3_DPRINTF4("rdsv3_info", "Return"); + + return (ret); +} + +/* Driver entry points */ +static struct cb_ops rdsv3_cb_ops = { + nulldev, /* open */ + nulldev, /* close */ + nodev, /* strategy */ + nodev, /* print */ + nodev, /* dump */ + nodev, /* read */ + nodev, /* write */ + nodev, /* ioctl */ + nodev, /* devmap */ + nodev, /* mmap */ + nodev, /* segmap */ + nochpoll, /* poll */ + ddi_prop_op, /* prop_op */ + NULL, /* stream */ + D_MP, /* cb_flag */ + CB_REV, /* rev */ + nodev, /* int (*cb_aread)() */ + nodev, /* int (*cb_awrite)() */ +}; + +/* Device options */ +static struct dev_ops rdsv3_ops = { + DEVO_REV, /* devo_rev, */ + 0, /* refcnt */ + rdsv3_info, /* info */ + nulldev, /* identify */ + nulldev, /* probe */ + rdsv3_attach, /* attach */ + rdsv3_detach, /* detach */ + nodev, /* reset */ + &rdsv3_cb_ops, /* driver ops - devctl interfaces */ + NULL, /* bus operations */ + NULL, /* power */ + ddi_quiesce_not_needed /* quiesce */ +}; + +/* + * Module linkage information. + */ +#define RDSV3_DEVDESC "RDSv3 IB transport driver" +static struct modldrv rdsv3_modldrv = { + &mod_driverops, /* Driver module */ + RDSV3_DEVDESC, /* Driver name and version */ + &rdsv3_ops, /* Driver ops */ +}; + +static struct modlinkage rdsv3_modlinkage = { + MODREV_1, + (void *)&rdsv3_modldrv, + NULL +}; + +int +_init(void) +{ + int ret; + + if (ibt_hw_is_present() == 0) { + return (ENODEV); + } + + /* Initialize logging */ + rdsv3_logging_initialization(); + + ret = mod_install(&rdsv3_modlinkage); + if (ret != 0) { + /* + * Could not load module + */ + rdsv3_logging_destroy(); + return (ret); + } + + return (0); +} + +int +_fini() +{ + int ret; + + /* + * Remove module + */ + if ((ret = mod_remove(&rdsv3_modlinkage)) != 0) { + return (ret); + } + + /* Stop logging */ + rdsv3_logging_destroy(); + + return (0); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&rdsv3_modlinkage, modinfop)); +} diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/rdsv3_debug.c b/usr/src/uts/common/io/ib/clients/rdsv3/rdsv3_debug.c new file mode 100644 index 0000000000..8327b5b866 --- /dev/null +++ b/usr/src/uts/common/io/ib/clients/rdsv3/rdsv3_debug.c @@ -0,0 +1,348 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+#include <sys/types.h>
+#include <sys/varargs.h>
+#include <sys/cmn_err.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/ib/clients/rdsv3/rdsv3_debug.h>
+
+/*
+ * This file contains the debug defines and routines.
+ * Debugging information is collected in a circular kernel buffer. Debug
+ * messages of lower priority than rdsv3dbglvl (i.e. with a numerically
+ * higher level) are ignored. The size of the debug buffer can be changed
+ * by setting 'rdsv3_debug_buf_size' (in bytes) in /etc/system.
+ *
+ * The debug buffer can be cleared by setting 'rdsv3_clear_debug_buf_flag = 1'
+ * on a running system.
+ */
+
+#define	RDSV3_DEBUG_SIZE_EXTRA_ALLOC	8
+#define	RDSV3_MIN_DEBUG_BUF_SIZE	0x1000
+#define	RDSV3_FUNCNAME_LEN	40
+#define	RDSV3_PRINTBUF_LEN	4096
+#ifdef	DEBUG
+#define	RDSV3_DEBUG_BUF_SIZE	0x200000	/* 2M size */
+#else
+#define	RDSV3_DEBUG_BUF_SIZE	0x2000
+#endif	/* DEBUG */
+
+/* Max length of a debug statement */
+#define	RDSV3_PRINT_BUF_LEN	4096
+
+static int	rdsv3_suppress_dprintf;	/* Suppress debug printing */
+static int	rdsv3_buffer_dprintf = 1;	/* Use debug buffer (0 == console) */
+static int	rdsv3_debug_buf_size = RDSV3_DEBUG_BUF_SIZE;	/* Sz of Debug buf */
+static int	rdsv3_allow_intr_msgs = 0;	/* log "intr" messages */
+char	*rdsv3_debug_buf = NULL;	/* The Debug Buf */
+char	*rdsv3_buf_sptr, *rdsv3_buf_eptr;	/* debug buffer temp pointer */
+int	rdsv3_clear_debug_buf_flag = 0;	/* Clear debug buffer */
+uint_t	rdsv3dbglvl = RDSV3_LOG_L4;
+
+/*
+ * Print Buffer protected by mutex for debug stuff. The mutex also
+ * ensures serializing debug messages.
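+ * The debug buffer itself is written as a ring: a message that would
+ * run past the end wraps around to the start, and a ">>>>" marker is
+ * kept just after the most recently written entry.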
+ */
+static kmutex_t	rdsv3_debug_mutex;
+static char	rdsv3_print_buf[RDSV3_PRINT_BUF_LEN];
+
+/* Function Prototypes */
+static void	rdsv3_clear_print_buf();
+
+/* RDS logging init */
+void
+rdsv3_logging_initialization()
+{
+	boolean_t flag = B_FALSE;
+
+	mutex_init(&rdsv3_debug_mutex, NULL, MUTEX_DRIVER, NULL);
+	mutex_enter(&rdsv3_debug_mutex);
+
+	if (rdsv3_debug_buf_size <= RDSV3_DEBUG_SIZE_EXTRA_ALLOC) {
+		rdsv3_debug_buf_size = RDSV3_MIN_DEBUG_BUF_SIZE;
+		flag = B_TRUE;
+	}
+
+	/* if it is less than RDSV3_MIN_DEBUG_BUF_SIZE, adjust it */
+	rdsv3_debug_buf_size = max(RDSV3_MIN_DEBUG_BUF_SIZE,
+	    rdsv3_debug_buf_size);
+
+	rdsv3_debug_buf = (char *)kmem_alloc(rdsv3_debug_buf_size, KM_SLEEP);
+	rdsv3_clear_print_buf();
+	mutex_exit(&rdsv3_debug_mutex);
+
+	if (flag == B_TRUE) {
+		RDSV3_DPRINTF2("RDS", "rdsv3_debug_buf_size was too small, "
+		    "adjusted to %x", rdsv3_debug_buf_size);
+	}
+}
+
+
+/* RDS logging destroy */
+void
+rdsv3_logging_destroy()
+{
+	mutex_enter(&rdsv3_debug_mutex);
+	if (rdsv3_debug_buf) {
+		kmem_free(rdsv3_debug_buf, rdsv3_debug_buf_size);
+		rdsv3_debug_buf = NULL;
+	}
+	mutex_exit(&rdsv3_debug_mutex);
+	mutex_destroy(&rdsv3_debug_mutex);
+}
+
+
+/*
+ * debug, log, and console message handling
+ */
+
+/*
+ * clear the RDS debug buffer
+ */
+static void
+rdsv3_clear_print_buf()
+{
+	ASSERT(MUTEX_HELD(&rdsv3_debug_mutex));
+	if (rdsv3_debug_buf) {
+		rdsv3_buf_sptr = rdsv3_debug_buf;
+		rdsv3_buf_eptr = rdsv3_debug_buf + rdsv3_debug_buf_size -
+		    RDSV3_DEBUG_SIZE_EXTRA_ALLOC;
+
+		bzero(rdsv3_debug_buf, rdsv3_debug_buf_size);
+	}
+}
+
+
+static void
+rdsv3_vlog(char *name, uint_t level, char *fmt, va_list ap)
+{
+	char	*label = (name == NULL) ? "rds" : name;
+	char	*msg_ptr;
+	size_t	len;
+
+	mutex_enter(&rdsv3_debug_mutex);
+
+	/* if not using the logging scheme, quit */
+	if (rdsv3_suppress_dprintf || (rdsv3_debug_buf == NULL)) {
+		mutex_exit(&rdsv3_debug_mutex);
+		return;
+	}
+
+	/* If the user requests to clear the debug buffer, go ahead */
+	if (rdsv3_clear_debug_buf_flag != 0) {
+		rdsv3_clear_print_buf();
+		rdsv3_clear_debug_buf_flag = 0;
+	}
+
+	/*
+	 * put "label" into the buffer
+	 */
+	len = snprintf(rdsv3_print_buf, RDSV3_FUNCNAME_LEN, "%s:\t", label);
+
+	msg_ptr = rdsv3_print_buf + len;
+	len += vsnprintf(msg_ptr, RDSV3_PRINT_BUF_LEN - len - 2, fmt, ap);
+
+	len = min(len, RDSV3_PRINT_BUF_LEN - 2);
+	ASSERT(len == strlen(rdsv3_print_buf));
+	rdsv3_print_buf[len++] = '\n';
+	rdsv3_print_buf[len] = '\0';
+
+	/*
+	 * stuff the message in the debug buf
+	 */
+	if (rdsv3_buffer_dprintf) {
+
+		/*
+		 * overwrite the ">>>>" marker that might run over the
+		 * end of the buffer
+		 */
+		*rdsv3_buf_sptr = '\0';
+
+		if (rdsv3_buf_sptr + len > rdsv3_buf_eptr) {
+			size_t left = (uintptr_t)rdsv3_buf_eptr -
+			    (uintptr_t)rdsv3_buf_sptr;
+
+			bcopy((caddr_t)rdsv3_print_buf,
+			    (caddr_t)rdsv3_buf_sptr, left);
+			bcopy((caddr_t)rdsv3_print_buf + left,
+			    (caddr_t)rdsv3_debug_buf, len - left);
+			rdsv3_buf_sptr = rdsv3_debug_buf + len - left;
+		} else {
+			bcopy((caddr_t)rdsv3_print_buf, rdsv3_buf_sptr, len);
+			rdsv3_buf_sptr += len;
+		}
+
+		/* add marker */
+		(void) sprintf(rdsv3_buf_sptr, ">>>>");
+	}
+
+	/*
+	 * LINTR and L5-L2 messages may go to the rdsv3_debug_buf.
+	 * L1 messages will go to /var/adm/messages (debug & non-debug).
+	 * L0 messages will go to the console (debug & non-debug).
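+	 * Messages with a level above rdsv3dbglvl never reach this
+	 * point; the rdsv3_dprintf2()..rdsv3_dprintf5() wrappers filter
+	 * them out first.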
+ */ + switch (level) { + case RDSV3_LOG_LINTR: + case RDSV3_LOG_L5: + case RDSV3_LOG_L4: + case RDSV3_LOG_L3: + case RDSV3_LOG_L2: + if (!rdsv3_buffer_dprintf) { + cmn_err(CE_CONT, "^%s", rdsv3_print_buf); + } + break; + case RDSV3_LOG_L1: + if (!rdsv3_buffer_dprintf) { + cmn_err(CE_CONT, "^%s", rdsv3_print_buf); + } else { + /* go to messages file */ + cmn_err(CE_CONT, "!%s", rdsv3_print_buf); + } + break; + case RDSV3_LOG_L0: + /* Strip the "\n" added earlier */ + if (rdsv3_print_buf[len - 1] == '\n') { + rdsv3_print_buf[len - 1] = '\0'; + } + if (msg_ptr[len - 1] == '\n') { + msg_ptr[len - 1] = '\0'; + } + /* go to console */ + cmn_err(CE_CONT, "^%s", rdsv3_print_buf); + break; + } + + mutex_exit(&rdsv3_debug_mutex); +} + +void +rdsv3_dprintf_intr(char *name, char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + rdsv3_vlog(name, RDSV3_LOG_LINTR, fmt, ap); + va_end(ap); +} + +/* + * Check individual subsystem err levels + */ +#define RDSV3_CHECK_ERR_LEVEL(level) \ + if (rdsv3dbglvl < level) \ + return; \ + +void +rdsv3_dprintf5(char *name, char *fmt, ...) +{ + va_list ap; + + RDSV3_CHECK_ERR_LEVEL(RDSV3_LOG_L5); + + va_start(ap, fmt); + rdsv3_vlog(name, RDSV3_LOG_L5, fmt, ap); + va_end(ap); +} + +void +rdsv3_dprintf4(char *name, char *fmt, ...) +{ + va_list ap; + + RDSV3_CHECK_ERR_LEVEL(RDSV3_LOG_L4); + + va_start(ap, fmt); + rdsv3_vlog(name, RDSV3_LOG_L4, fmt, ap); + va_end(ap); +} + +void +rdsv3_dprintf3(char *name, char *fmt, ...) +{ + va_list ap; + + RDSV3_CHECK_ERR_LEVEL(RDSV3_LOG_L3); + + va_start(ap, fmt); + rdsv3_vlog(name, RDSV3_LOG_L3, fmt, ap); + va_end(ap); +} + +void +rdsv3_dprintf2(char *name, char *fmt, ...) +{ + va_list ap; + + RDSV3_CHECK_ERR_LEVEL(RDSV3_LOG_L2); + + va_start(ap, fmt); + rdsv3_vlog(name, RDSV3_LOG_L2, fmt, ap); + va_end(ap); +} + +void +rdsv3_dprintf1(char *name, char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + rdsv3_vlog(name, RDSV3_LOG_L1, fmt, ap); + va_end(ap); +} + + +/* + * Function: + * rdsv3_dprintf0 + * Input: + * name - Name of the function generating the debug message + * fmt - The message to be displayed. + * Output: + * none + * Returns: + * none + * Description: + * A generic log function to display RDS debug messages. + */ +void +rdsv3_dprintf0(char *name, char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + rdsv3_vlog(name, RDSV3_LOG_L0, fmt, ap); + va_end(ap); +} + +/* For ofed rdstrace */ +void +rdsv3_trace(char *name, uint8_t lvl, char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + rdsv3_vlog(name, lvl, fmt, ap); + va_end(ap); +} diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/rdsv3_impl.c b/usr/src/uts/common/io/ib/clients/rdsv3/rdsv3_impl.c new file mode 100644 index 0000000000..9ea3b28bec --- /dev/null +++ b/usr/src/uts/common/io/ib/clients/rdsv3/rdsv3_impl.c @@ -0,0 +1,1294 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ +#include <sys/types.h> +#include <sys/stream.h> +#include <sys/dlpi.h> +#include <sys/stropts.h> +#include <sys/strsun.h> +#include <sys/sysmacros.h> +#include <sys/strlog.h> +#include <sys/ddi.h> +#include <sys/cmn_err.h> +#include <sys/socket.h> +#include <net/if.h> +#include <net/if_types.h> +#include <netinet/in.h> +#include <sys/ethernet.h> +#include <inet/arp.h> +#include <inet/ip.h> +#include <inet/ip6.h> +#include <inet/ip_ire.h> +#include <inet/ip_if.h> +#include <inet/ip_ftable.h> + +#include <sys/sunddi.h> +#include <sys/ksynch.h> + +#include <sys/rds.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/sockio.h> +#include <sys/sysmacros.h> +#include <inet/common.h> +#include <inet/ip.h> +#include <net/if_types.h> + +#include <sys/ib/clients/rdsv3/rdsv3.h> +#include <sys/ib/clients/rdsv3/rdma.h> +#include <sys/ib/clients/rdsv3/ib.h> +#include <sys/ib/clients/rdsv3/rdsv3_impl.h> +#include <sys/ib/clients/rdsv3/rdsv3_debug.h> + +#include <sys/dls.h> +#include <sys/mac.h> +#include <sys/mac_client.h> +#include <sys/mac_provider.h> +#include <sys/mac_client_priv.h> + +ddi_taskq_t *rdsv3_taskq = NULL; +extern kmem_cache_t *rdsv3_alloc_cache; + +extern unsigned int ip_ocsum(ushort_t *address, int halfword_count, + unsigned int sum); + +/* + * Check if the IP interface named by `lifrp' is RDS-capable. + */ +boolean_t +rdsv3_capable_interface(struct lifreq *lifrp) +{ + char ifname[LIFNAMSIZ]; + char drv[MAXLINKNAMELEN]; + uint_t ppa; + char *cp; + + RDSV3_DPRINTF4("rdsv3_capable_interface", "Enter"); + + if (lifrp->lifr_type == IFT_IB) + return (B_TRUE); + + /* + * Strip off the logical interface portion before getting + * intimate with the name. 
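+ *
+ * (Editor's example: a logical interface name such as "ibd0:1" shares
+ * its physical device with "ibd0", so only the "ibd0" portion is
+ * handed to ddi_parse() below.)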
+ */ + (void) strlcpy(ifname, lifrp->lifr_name, LIFNAMSIZ); + if ((cp = strchr(ifname, ':')) != NULL) + *cp = '\0'; + + if (strcmp("lo0", ifname) == 0) { + /* + * loopback is considered RDS-capable + */ + return (B_TRUE); + } + + return (ddi_parse(ifname, drv, &ppa) == DDI_SUCCESS && + rdsv3_if_lookup_by_name(drv)); +} + +int +rdsv3_do_ip_ioctl(ksocket_t so4, void **ipaddrs, int *size, int *nifs) +{ + struct lifnum lifn; + struct lifconf lifc; + struct lifreq *lp, *rlp, lifr; + int rval = 0; + int numifs; + int bufsize, rbufsize; + void *buf, *rbuf; + int i, j, n, rc; + + *ipaddrs = NULL; + *size = 0; + *nifs = 0; + + RDSV3_DPRINTF4("rdsv3_do_ip_ioctl", "Enter"); + +retry_count: + /* snapshot the current number of interfaces */ + lifn.lifn_family = PF_UNSPEC; + lifn.lifn_flags = LIFC_NOXMIT | LIFC_TEMPORARY | LIFC_ALLZONES; + lifn.lifn_count = 0; + rval = ksocket_ioctl(so4, SIOCGLIFNUM, (intptr_t)&lifn, &rval, + CRED()); + if (rval != 0) { + RDSV3_DPRINTF2("rdsv3_do_ip_ioctl", + "ksocket_ioctl returned: %d", rval); + return (rval); + } + + numifs = lifn.lifn_count; + if (numifs <= 0) { + RDSV3_DPRINTF2("rdsv3_do_ip_ioctl", "No interfaces found"); + return (0); + } + + /* allocate extra room in case more interfaces appear */ + numifs += 10; + + /* get the interface names and ip addresses */ + bufsize = numifs * sizeof (struct lifreq); + buf = kmem_alloc(bufsize, KM_SLEEP); + + lifc.lifc_family = AF_UNSPEC; + lifc.lifc_flags = LIFC_NOXMIT | LIFC_TEMPORARY | LIFC_ALLZONES; + lifc.lifc_len = bufsize; + lifc.lifc_buf = buf; + rc = ksocket_ioctl(so4, SIOCGLIFCONF, (intptr_t)&lifc, &rval, CRED()); + if (rc != 0) { + RDSV3_DPRINTF2("rdsv3_do_ip_ioctl", "SIOCGLIFCONF failed"); + kmem_free(buf, bufsize); + return (rc); + } + /* if our extra room is used up, try again */ + if (bufsize <= lifc.lifc_len) { + kmem_free(buf, bufsize); + buf = NULL; + goto retry_count; + } + /* calc actual number of ifconfs */ + n = lifc.lifc_len / sizeof (struct lifreq); + + /* + * Count the RDS interfaces + */ + for (i = 0, j = 0, lp = lifc.lifc_req; i < n; i++, lp++) { + + /* + * Copy as the SIOCGLIFFLAGS ioctl is destructive + */ + bcopy(lp, &lifr, sizeof (struct lifreq)); + /* + * fetch the flags using the socket of the correct family + */ + switch (lifr.lifr_addr.ss_family) { + case AF_INET: + rc = ksocket_ioctl(so4, SIOCGLIFFLAGS, (intptr_t)&lifr, + &rval, CRED()); + break; + default: + continue; + } + + if (rc != 0) continue; + + /* + * If we got the flags, skip uninteresting + * interfaces based on flags + */ + if ((lifr.lifr_flags & IFF_UP) != IFF_UP) + continue; + if (lifr.lifr_flags & + (IFF_ANYCAST|IFF_NOLOCAL|IFF_DEPRECATED)) + continue; + if (!rdsv3_capable_interface(&lifr)) + continue; + j++; + } + + if (j <= 0) { + RDSV3_DPRINTF2("rdsv3_do_ip_ioctl", "No RDS interfaces"); + kmem_free(buf, bufsize); + return (rval); + } + + numifs = j; + + /* This is the buffer we pass back */ + rbufsize = numifs * sizeof (struct lifreq); + rbuf = kmem_alloc(rbufsize, KM_SLEEP); + rlp = (struct lifreq *)rbuf; + + /* + * Examine the array of interfaces and filter uninteresting ones + */ + for (i = 0, lp = lifc.lifc_req; i < n; i++, lp++) { + + /* + * Copy the address as the SIOCGLIFFLAGS ioctl is destructive + */ + bcopy(lp, &lifr, sizeof (struct lifreq)); + /* + * fetch the flags using the socket of the correct family + */ + switch (lifr.lifr_addr.ss_family) { + case AF_INET: + rc = ksocket_ioctl(so4, SIOCGLIFFLAGS, (intptr_t)&lifr, + &rval, CRED()); + break; + default: + continue; + } + + + if (rc != 0) { + 
RDSV3_DPRINTF2("rdsv3_do_ip_ioctl", + "ksocket_ioctl failed" " for %s", lifr.lifr_name); + continue; + } + + /* + * If we got the flags, skip uninteresting + * interfaces based on flags + */ + if ((lifr.lifr_flags & IFF_UP) != IFF_UP) + continue; + if (lifr.lifr_flags & + (IFF_ANYCAST|IFF_NOLOCAL|IFF_DEPRECATED)) + continue; + if (!rdsv3_capable_interface(&lifr)) + continue; + + /* save the record */ + bcopy(lp, rlp, sizeof (struct lifreq)); + rlp++; + } + + kmem_free(buf, bufsize); + + *ipaddrs = rbuf; + *size = rbufsize; + *nifs = numifs; + + RDSV3_DPRINTF4("rdsv3_do_ip_ioctl", "Return"); + + return (rval); +} + +/* + * Check if the IP interface named by `ifrp' is RDS-capable. + */ +boolean_t +rdsv3_capable_interface_old(struct ifreq *ifrp) +{ + char ifname[IFNAMSIZ]; + char drv[MAXLINKNAMELEN]; + uint_t ppa; + char *cp; + + RDSV3_DPRINTF4("rdsv3_capable_interface_old", "Enter"); + + /* + * Strip off the logical interface portion before getting + * intimate with the name. + */ + (void) strlcpy(ifname, ifrp->ifr_name, IFNAMSIZ); + if ((cp = strchr(ifname, ':')) != NULL) + *cp = '\0'; + + RDSV3_DPRINTF4("rdsv3_capable_interface_old", "ifname: %s", ifname); + + if ((strcmp("lo0", ifname) == 0) || + (strncmp("ibd", ifname, 3) == 0)) { + /* + * loopback and IB are considered RDS-capable + */ + return (B_TRUE); + } + + return (ddi_parse(ifname, drv, &ppa) == DDI_SUCCESS && + rdsv3_if_lookup_by_name(drv)); +} + +int +rdsv3_do_ip_ioctl_old(ksocket_t so4, void **ipaddrs, int *size, int *nifs) +{ + uint_t ifn; + struct ifconf ifc; + struct ifreq *lp, *rlp, ifr; + int rval = 0; + int numifs; + int bufsize, rbufsize; + void *buf, *rbuf; + int i, j, n, rc; + + *ipaddrs = NULL; + *size = 0; + *nifs = 0; + + RDSV3_DPRINTF4("rdsv3_do_ip_ioctl_old", "Enter"); + +retry_count: + rval = ksocket_ioctl(so4, SIOCGIFNUM, (intptr_t)&ifn, &rval, + CRED()); + if (rval != 0) { + RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old", + "ksocket_ioctl(SIOCGIFNUM) returned: %d", rval); + return (rval); + } + + numifs = ifn; + if (numifs <= 0) { + RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old", "No interfaces found"); + return (0); + } + + /* allocate extra room in case more interfaces appear */ + numifs += 10; + + /* get the interface names and ip addresses */ + bufsize = numifs * sizeof (struct ifreq); + buf = kmem_alloc(bufsize, KM_SLEEP); + + ifc.ifc_len = bufsize; + ifc.ifc_buf = buf; + rc = ksocket_ioctl(so4, SIOCGIFCONF, (intptr_t)&ifc, &rval, CRED()); + if (rc != 0) { + RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old", + "SIOCGLIFCONF failed: %d", rc); + kmem_free(buf, bufsize); + return (rc); + } + /* if our extra room is used up, try again */ + if (bufsize <= ifc.ifc_len) { + kmem_free(buf, bufsize); + buf = NULL; + goto retry_count; + } + /* calc actual number of ifconfs */ + n = ifc.ifc_len / sizeof (struct ifreq); + + /* + * Count the RDS interfaces + */ + for (i = 0, j = 0, lp = ifc.ifc_req; i < n; i++, lp++) { + + /* + * Copy as the SIOCGIFFLAGS ioctl is destructive + */ + bcopy(lp, &ifr, sizeof (struct ifreq)); + /* + * fetch the flags using the socket of the correct family + */ + switch (ifr.ifr_addr.sa_family) { + case AF_INET: + rc = ksocket_ioctl(so4, SIOCGIFFLAGS, (intptr_t)&ifr, + &rval, CRED()); + break; + default: + continue; + } + + if (rc != 0) continue; + + RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old", + "1. 
ifr_name: %s, flags: %d", ifr.ifr_name, + (ushort_t)ifr.ifr_flags); + + /* + * If we got the flags, skip uninteresting + * interfaces based on flags + */ + if ((((ushort_t)ifr.ifr_flags) & IFF_UP) != IFF_UP) + continue; + RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old", + "2. ifr_name: %s, flags: %d", ifr.ifr_name, + (ushort_t)ifr.ifr_flags); + if (((ushort_t)ifr.ifr_flags) & + (IFF_ANYCAST|IFF_NOLOCAL|IFF_DEPRECATED)) + continue; + RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old", + "3. ifr_name: %s, flags: %d", ifr.ifr_name, + (ushort_t)ifr.ifr_flags); + if (!rdsv3_capable_interface_old(&ifr)) + continue; + RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old", + "4. ifr_name: %s, flags: %d", ifr.ifr_name, + (ushort_t)ifr.ifr_flags); + j++; + } + + if (j <= 0) { + RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old", "No RDS interfaces"); + kmem_free(buf, bufsize); + return (rval); + } + + numifs = j; + + /* This is the buffer we pass back */ + rbufsize = numifs * sizeof (struct ifreq); + rbuf = kmem_alloc(rbufsize, KM_SLEEP); + rlp = (struct ifreq *)rbuf; + + /* + * Examine the array of interfaces and filter uninteresting ones + */ + for (i = 0, lp = ifc.ifc_req; i < n; i++, lp++) { + + /* + * Copy the address as the SIOCGIFFLAGS ioctl is destructive + */ + bcopy(lp, &ifr, sizeof (struct ifreq)); + /* + * fetch the flags using the socket of the correct family + */ + switch (ifr.ifr_addr.sa_family) { + case AF_INET: + rc = ksocket_ioctl(so4, SIOCGIFFLAGS, (intptr_t)&ifr, + &rval, CRED()); + break; + default: + continue; + } + + + if (rc != 0) { + RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old", + "ksocket_ioctl failed: %d for %s", + rc, ifr.ifr_name); + continue; + } + + /* + * If we got the flags, skip uninteresting + * interfaces based on flags + */ + if ((((ushort_t)ifr.ifr_flags) & IFF_UP) != IFF_UP) + continue; + if (((ushort_t)ifr.ifr_flags) & + (IFF_ANYCAST|IFF_NOLOCAL|IFF_DEPRECATED)) + continue; + if (!rdsv3_capable_interface_old(&ifr)) + continue; + + /* save the record */ + bcopy(lp, rlp, sizeof (struct ifreq)); + rlp++; + } + + kmem_free(buf, bufsize); + + *ipaddrs = rbuf; + *size = rbufsize; + *nifs = numifs; + + RDSV3_DPRINTF4("rdsv3_do_ip_ioctl_old", "Return"); + + return (rval); +} + +boolean_t +rdsv3_isloopback(ipaddr_t addr) +{ + ip_stack_t *ipst; + + ipst = netstack_find_by_zoneid(GLOBAL_ZONEID)->netstack_ip; + ASSERT(ipst != NULL); + if (ip_type_v4(addr, ipst) != IRE_LOOPBACK) { + netstack_rele(ipst->ips_netstack); + return (B_FALSE); + } + netstack_rele(ipst->ips_netstack); + return (B_TRUE); +} + +/* + * Work Queue Implementation + */ + +#define RDSV3_WQ_THREAD_IDLE 0 +#define RDSV3_WQ_THREAD_RUNNING 1 +#define RDSV3_WQ_THREAD_FLUSHING 2 +#define RDSV3_WQ_THREAD_EXITING 3 + +/* worker thread */ +void +rdsv3_worker_thread(void *arg) +{ + rdsv3_workqueue_struct_t *wq = arg; + rdsv3_work_t *work; + + RDSV3_DPRINTF4("rdsv3_worker_thread", "Enter(wq: 0x%p)", wq); + + mutex_enter(&wq->wq_lock); + work = list_remove_head(&wq->wq_queue); + while (work) { + mutex_exit(&wq->wq_lock); + + /* process work */ + work->func(work); + + mutex_enter(&wq->wq_lock); + work = list_remove_head(&wq->wq_queue); + } + + /* No more work, go home, until called again */ + if (wq->wq_state != RDSV3_WQ_THREAD_EXITING) { + wq->wq_state = RDSV3_WQ_THREAD_IDLE; + } + mutex_exit(&wq->wq_lock); + + RDSV3_DPRINTF4("rdsv3_worker_thread", "Return(wq: 0x%p)", wq); +} + +/* XXX */ +void +rdsv3_flush_workqueue(rdsv3_workqueue_struct_t *wq) +{ + RDSV3_DPRINTF4("rdsv3_flush_workqueue", "Enter(wq: %p)", wq); + + mutex_enter(&wq->wq_lock); + switch (wq->wq_state) { 
+ case RDSV3_WQ_THREAD_IDLE: + /* nothing to do */ + ASSERT(list_is_empty(&wq->wq_queue)); + break; + + case RDSV3_WQ_THREAD_RUNNING: + wq->wq_state = RDSV3_WQ_THREAD_FLUSHING; + /* FALLTHRU */ + case RDSV3_WQ_THREAD_FLUSHING: + /* already flushing, wait until the flushing is complete */ + do { + mutex_exit(&wq->wq_lock); + delay(drv_usectohz(1000000)); + mutex_enter(&wq->wq_lock); + } while (wq->wq_state == RDSV3_WQ_THREAD_FLUSHING); + break; + case RDSV3_WQ_THREAD_EXITING: + mutex_exit(&wq->wq_lock); + rdsv3_worker_thread(wq); + return; + } + mutex_exit(&wq->wq_lock); + + RDSV3_DPRINTF4("rdsv3_flush_workqueue", "Return(wq: %p)", wq); +} + +void +rdsv3_queue_work(rdsv3_workqueue_struct_t *wq, rdsv3_work_t *wp) +{ + RDSV3_DPRINTF4("rdsv3_queue_work", "Enter(wq: %p, wp: %p)", wq, wp); + + mutex_enter(&wq->wq_lock); + + if (list_link_active(&wp->work_item)) { + /* This is already in the queue, ignore this call */ + mutex_exit(&wq->wq_lock); + RDSV3_DPRINTF3("rdsv3_queue_work", "already queued: %p", wp); + return; + } + + switch (wq->wq_state) { + case RDSV3_WQ_THREAD_RUNNING: + list_insert_tail(&wq->wq_queue, wp); + mutex_exit(&wq->wq_lock); + break; + + case RDSV3_WQ_THREAD_FLUSHING: + do { + mutex_exit(&wq->wq_lock); + delay(drv_usectohz(1000000)); + mutex_enter(&wq->wq_lock); + } while (wq->wq_state == RDSV3_WQ_THREAD_FLUSHING); + + if (wq->wq_state == RDSV3_WQ_THREAD_RUNNING) { + list_insert_tail(&wq->wq_queue, wp); + mutex_exit(&wq->wq_lock); + break; + } + /* FALLTHRU */ + + case RDSV3_WQ_THREAD_IDLE: + list_insert_tail(&wq->wq_queue, wp); + wq->wq_state = RDSV3_WQ_THREAD_RUNNING; + mutex_exit(&wq->wq_lock); + + (void) ddi_taskq_dispatch(rdsv3_taskq, rdsv3_worker_thread, wq, + DDI_SLEEP); + break; + + case RDSV3_WQ_THREAD_EXITING: + mutex_exit(&wq->wq_lock); + break; + } + + RDSV3_DPRINTF4("rdsv3_queue_work", "Return(wq: %p, wp: %p)", wq, wp); +} + +/* timeout handler for delayed work queuing */ +void +rdsv3_work_timeout_handler(void *arg) +{ + rdsv3_delayed_work_t *dwp = (rdsv3_delayed_work_t *)arg; + + RDSV3_DPRINTF4("rdsv3_work_timeout_handler", + "Enter(wq: %p, wp: %p)", dwp->wq, &dwp->work); + + mutex_enter(&dwp->lock); + dwp->timeid = 0; + mutex_exit(&dwp->lock); + + mutex_enter(&dwp->wq->wq_lock); + dwp->wq->wq_pending--; + if (dwp->wq->wq_state == RDSV3_WQ_THREAD_EXITING) { + mutex_exit(&dwp->wq->wq_lock); + return; + } + mutex_exit(&dwp->wq->wq_lock); + + rdsv3_queue_work(dwp->wq, &dwp->work); + + RDSV3_DPRINTF4("rdsv3_work_timeout_handler", + "Return(wq: %p, wp: %p)", dwp->wq, &dwp->work); +} + +void +rdsv3_queue_delayed_work(rdsv3_workqueue_struct_t *wq, + rdsv3_delayed_work_t *dwp, uint_t delay) +{ + RDSV3_DPRINTF4("rdsv3_queue_delayed_work", + "Enter(wq: %p, wp: %p)", wq, dwp); + + if (delay == 0) { + rdsv3_queue_work(wq, &dwp->work); + return; + } + + mutex_enter(&wq->wq_lock); + if (wq->wq_state == RDSV3_WQ_THREAD_EXITING) { + mutex_exit(&wq->wq_lock); + RDSV3_DPRINTF4("rdsv3_queue_delayed_work", + "WQ exiting - don't queue (wq: %p, wp: %p)", wq, dwp); + return; + } + wq->wq_pending++; + mutex_exit(&wq->wq_lock); + + mutex_enter(&dwp->lock); + if (dwp->timeid == 0) { + dwp->wq = wq; + dwp->timeid = timeout(rdsv3_work_timeout_handler, dwp, + jiffies + (delay * rdsv3_one_sec_in_hz)); + mutex_exit(&dwp->lock); + } else { + mutex_exit(&dwp->lock); + RDSV3_DPRINTF4("rdsv3_queue_delayed_work", "Already queued: %p", + dwp); + mutex_enter(&wq->wq_lock); + wq->wq_pending--; + mutex_exit(&wq->wq_lock); + } + + RDSV3_DPRINTF4("rdsv3_queue_delayed_work", + "Return(wq: %p, wp: %p)", 
wq, dwp); +} + +void +rdsv3_cancel_delayed_work(rdsv3_delayed_work_t *dwp) +{ + RDSV3_DPRINTF4("rdsv3_cancel_delayed_work", + "Enter(wq: %p, dwp: %p)", dwp->wq, dwp); + + mutex_enter(&dwp->lock); + if (dwp->timeid != 0) { + (void) untimeout(dwp->timeid); + dwp->timeid = 0; + } else { + RDSV3_DPRINTF4("rdsv3_cancel_delayed_work", + "Nothing to cancel (wq: %p, dwp: %p)", dwp->wq, dwp); + mutex_exit(&dwp->lock); + return; + } + mutex_exit(&dwp->lock); + + mutex_enter(&dwp->wq->wq_lock); + dwp->wq->wq_pending--; + mutex_exit(&dwp->wq->wq_lock); + + RDSV3_DPRINTF4("rdsv3_cancel_delayed_work", + "Return(wq: %p, dwp: %p)", dwp->wq, dwp); +} + +void +rdsv3_destroy_task_workqueue(rdsv3_workqueue_struct_t *wq) +{ + RDSV3_DPRINTF2("rdsv3_destroy_workqueue", "Enter"); + + ASSERT(wq); + + mutex_enter(&wq->wq_lock); + wq->wq_state = RDSV3_WQ_THREAD_EXITING; + + while (wq->wq_pending > 0) { + mutex_exit(&wq->wq_lock); + delay(drv_usectohz(1000000)); + mutex_enter(&wq->wq_lock); + }; + mutex_exit(&wq->wq_lock); + + rdsv3_flush_workqueue(wq); + + list_destroy(&wq->wq_queue); + mutex_destroy(&wq->wq_lock); + kmem_free(wq, sizeof (rdsv3_workqueue_struct_t)); + + ASSERT(rdsv3_taskq); + ddi_taskq_destroy(rdsv3_taskq); + + wq = NULL; + rdsv3_taskq = NULL; + + RDSV3_DPRINTF2("rdsv3_destroy_workqueue", "Return"); +} + +/* ARGSUSED */ +void +rdsv3_rdma_init_worker(struct rdsv3_work_s *work) +{ + rdsv3_rdma_init(); +} + +#define RDSV3_NUM_TASKQ_THREADS 4 +rdsv3_workqueue_struct_t * +rdsv3_create_task_workqueue(char *name) +{ + rdsv3_workqueue_struct_t *wq; + + RDSV3_DPRINTF2("create_singlethread_workqueue", "Enter (dip: %p)", + rdsv3_dev_info); + + rdsv3_taskq = ddi_taskq_create(rdsv3_dev_info, name, + RDSV3_NUM_TASKQ_THREADS, TASKQ_DEFAULTPRI, 0); + if (rdsv3_taskq == NULL) { + RDSV3_DPRINTF1(__FILE__, + "ddi_taskq_create failed for rdsv3_taskq"); + return (NULL); + } + + wq = kmem_zalloc(sizeof (rdsv3_workqueue_struct_t), KM_NOSLEEP); + if (wq == NULL) { + RDSV3_DPRINTF1(__FILE__, "kmem_zalloc failed for wq"); + ddi_taskq_destroy(rdsv3_taskq); + return (NULL); + } + + list_create(&wq->wq_queue, sizeof (struct rdsv3_work_s), + offsetof(struct rdsv3_work_s, work_item)); + mutex_init(&wq->wq_lock, NULL, MUTEX_DRIVER, NULL); + wq->wq_state = RDSV3_WQ_THREAD_IDLE; + wq->wq_pending = 0; + rdsv3_one_sec_in_hz = drv_usectohz(1000000); + + RDSV3_DPRINTF2("create_singlethread_workqueue", "Return"); + + return (wq); +} + +/* + * Implementation for struct sock + */ + +void +rdsv3_sock_exit_data(struct rsock *sk) +{ + struct rdsv3_sock *rs = sk->sk_protinfo; + + RDSV3_DPRINTF4("rdsv3_sock_exit_data", "rs: %p sk: %p", rs, sk); + + ASSERT(rs != NULL); + ASSERT(rdsv3_sk_sock_flag(sk, SOCK_DEAD)); + + rs->rs_sk = NULL; + + list_destroy(&rs->rs_send_queue); + list_destroy(&rs->rs_notify_queue); + list_destroy(&rs->rs_recv_queue); + + rw_destroy(&rs->rs_recv_lock); + mutex_destroy(&rs->rs_lock); + + mutex_destroy(&rs->rs_rdma_lock); + avl_destroy(&rs->rs_rdma_keys); + + rdsv3_exit_waitqueue(sk->sk_sleep); + kmem_free(sk->sk_sleep, sizeof (rdsv3_wait_queue_t)); + mutex_destroy(&sk->sk_lock); + + kmem_cache_free(rdsv3_alloc_cache, sk); + RDSV3_DPRINTF4("rdsv3_sock_exit_data", "rs: %p sk: %p", rs, sk); +} + +/* XXX - figure out right values */ +#define RDSV3_RECV_HIWATER (256 * 1024) +#define RDSV3_RECV_LOWATER 128 +#define RDSV3_XMIT_HIWATER (256 * 1024) +#define RDSV3_XMIT_LOWATER 1024 + +struct rsock * +rdsv3_sk_alloc() +{ + struct rsock *sk; + + sk = kmem_cache_alloc(rdsv3_alloc_cache, KM_SLEEP); + if (sk == NULL) { + 
RDSV3_DPRINTF2("rdsv3_create", "kmem_cache_alloc failed"); + return (NULL); + } + + bzero(sk, sizeof (struct rsock) + sizeof (struct rdsv3_sock)); + return (sk); +} + +void +rdsv3_sock_init_data(struct rsock *sk) +{ + sk->sk_sleep = kmem_zalloc(sizeof (rdsv3_wait_queue_t), KM_SLEEP); + rdsv3_init_waitqueue(sk->sk_sleep); + + mutex_init(&sk->sk_lock, NULL, MUTEX_DRIVER, NULL); + sk->sk_refcount = 1; + sk->sk_protinfo = (struct rdsv3_sock *)(sk + 1); + sk->sk_sndbuf = RDSV3_XMIT_HIWATER; + sk->sk_rcvbuf = RDSV3_RECV_HIWATER; +} + +/* XXX - not complete */ +void +rdsv3_poll_wait(struct rsock *sk, rdsv3_wait_queue_t *waitq, short events) +{ + struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); + + if (events & POLLIN) { + rw_enter(&rs->rs_recv_lock, RW_READER); + while (list_is_empty(&rs->rs_recv_queue) && + list_is_empty(&rs->rs_notify_queue)) { + rw_exit(&rs->rs_recv_lock); + mutex_enter(&waitq->waitq_mutex); + (void) cv_wait_sig(&waitq->waitq_cv, + &waitq->waitq_mutex); + mutex_exit(&waitq->waitq_mutex); + rw_enter(&rs->rs_recv_lock, RW_READER); + } + rw_exit(&rs->rs_recv_lock); + } +} + +/* + * Connection cache + */ +/* ARGSUSED */ +int +rdsv3_conn_constructor(void *buf, void *arg, int kmflags) +{ + struct rdsv3_connection *conn = buf; + + bzero(conn, sizeof (struct rdsv3_connection)); + + conn->c_next_tx_seq = 1; + mutex_init(&conn->c_lock, NULL, MUTEX_DRIVER, NULL); + mutex_init(&conn->c_send_lock, NULL, MUTEX_DRIVER, NULL); + list_create(&conn->c_send_queue, sizeof (struct rdsv3_message), + offsetof(struct rdsv3_message, m_conn_item)); + list_create(&conn->c_retrans, sizeof (struct rdsv3_message), + offsetof(struct rdsv3_message, m_conn_item)); + return (0); +} + +/* ARGSUSED */ +void +rdsv3_conn_destructor(void *buf, void *arg) +{ + struct rdsv3_connection *conn = buf; + + ASSERT(list_is_empty(&conn->c_send_queue)); + ASSERT(list_is_empty(&conn->c_retrans)); + list_destroy(&conn->c_send_queue); + list_destroy(&conn->c_retrans); + mutex_destroy(&conn->c_send_lock); + mutex_destroy(&conn->c_lock); +} + +int +rdsv3_conn_compare(const void *conn1, const void *conn2) +{ + uint32_be_t laddr1, faddr1, laddr2, faddr2; + + laddr1 = ((rdsv3_conn_info_t *)conn1)->c_laddr; + laddr2 = ((struct rdsv3_connection *)conn2)->c_laddr; + + if (laddr1 == laddr2) { + faddr1 = ((rdsv3_conn_info_t *)conn1)->c_faddr; + faddr2 = ((struct rdsv3_connection *)conn2)->c_faddr; + if (faddr1 == faddr2) + return (0); + if (faddr1 < faddr2) + return (-1); + return (1); + } + + if (laddr1 < laddr2) + return (-1); + + return (1); +} + +/* loop.c */ +extern kmutex_t loop_conns_lock; +extern list_t loop_conns; + +struct rdsv3_loop_connection +{ + struct list_node loop_node; + struct rdsv3_connection *conn; +}; + +void +rdsv3_loop_init(void) +{ + list_create(&loop_conns, sizeof (struct rdsv3_loop_connection), + offsetof(struct rdsv3_loop_connection, loop_node)); + mutex_init(&loop_conns_lock, NULL, MUTEX_DRIVER, NULL); +} + +/* rdma.c */ +/* IB Rkey is used here for comparison */ +int +rdsv3_mr_compare(const void *mr1, const void *mr2) +{ + uint32_t key1 = *(uint32_t *)mr1; + uint32_t key2 = ((struct rdsv3_mr *)mr2)->r_key; + + if (key1 < key2) + return (-1); + if (key1 > key2) + return (1); + return (0); +} + +/* transport.c */ +extern list_t transports; +extern krwlock_t trans_sem; + +void +rdsv3_trans_exit(void) +{ + struct rdsv3_transport *trans; + + RDSV3_DPRINTF2("rdsv3_trans_exit", "Enter"); + + /* currently, only IB transport */ + rw_enter(&trans_sem, RW_READER); + if (!list_is_empty(&transports)) + trans = 
list_head(&transports); + else + trans = NULL; + rw_exit(&trans_sem); + + /* trans->exit() will remove the trans from the list */ + if (trans) + trans->exit(); + + list_destroy(&transports); + rw_destroy(&trans_sem); + + RDSV3_DPRINTF2("rdsv3_trans_exit", "Return"); +} + +void +rdsv3_trans_init() +{ + RDSV3_DPRINTF2("rdsv3_trans_init", "Enter"); + + list_create(&transports, sizeof (struct rdsv3_transport), + offsetof(struct rdsv3_transport, t_item)); + rw_init(&trans_sem, NULL, RW_DRIVER, NULL); + + RDSV3_DPRINTF2("rdsv3_trans_init", "Return"); +} + +int +rdsv3_put_cmsg(struct nmsghdr *msg, int level, int type, size_t size, + void *payload) +{ + struct cmsghdr *cp; + char *bp; + size_t cmlen; + size_t cmspace; + size_t bufsz; + + RDSV3_DPRINTF4("rdsv3_put_cmsg", + "Enter(msg: %p level: %d type: %d sz: %d)", + msg, level, type, size); + + if (msg == NULL || msg->msg_controllen == 0 || payload == NULL) { + return (0); + } + /* check for first cmsg or this is another cmsg to be appended */ + if (msg->msg_control == NULL) + msg->msg_controllen = 0; + + cmlen = CMSG_LEN(size); + cmspace = CMSG_SPACE(size); + bufsz = msg->msg_controllen + cmspace; + + /* extend the existing cmsg to append the next cmsg */ + bp = kmem_alloc(bufsz, KM_SLEEP); + if (msg->msg_control) { + bcopy(msg->msg_control, bp, msg->msg_controllen); + kmem_free(msg->msg_control, (size_t)msg->msg_controllen); + } + + /* assign payload the proper cmsg location */ + cp = (struct cmsghdr *)(bp + msg->msg_controllen); + cp->cmsg_len = cmlen; + cp->cmsg_level = level; + cp->cmsg_type = type; + + bcopy(payload, CMSG_DATA(cp), cmlen - + (unsigned int)_CMSG_DATA_ALIGN(sizeof (struct cmsghdr))); + + msg->msg_control = bp; + msg->msg_controllen = bufsz; + + RDSV3_DPRINTF4("rdsv3_put_cmsg", "Return(cmsg_len: %d)", cp->cmsg_len); + + return (0); +} + +/* bind.c */ +extern kmutex_t rdsv3_bind_lock; +extern avl_tree_t rdsv3_bind_tree; + +/* ARGSUSED */ +int +rdsv3_verify_bind_address(ipaddr_t addr) +{ + return (1); +} + +/* XXX - need to enhance to compare IP address and port */ +int +rdsv3_bind_node_compare(const void *a, const void *b) +{ + uint16_be_t port = *(in_port_t *)a; + struct rdsv3_sock *rs = (struct rdsv3_sock *)b; + + RDSV3_DPRINTF5("rdsv3_bind_node_compare", "Enter (%x %x)", port, + rs->rs_bound_port); + + if (port > rs->rs_bound_port) + return (+1); + else if (port < rs->rs_bound_port) + return (-1); + + return (0); +} + +void +rdsv3_bind_tree_init() +{ + RDSV3_DPRINTF4("rdsv3_bind_tree_init", "Enter"); + + mutex_init(&rdsv3_bind_lock, NULL, MUTEX_DRIVER, NULL); + avl_create(&rdsv3_bind_tree, rdsv3_bind_node_compare, + sizeof (struct rdsv3_sock), + offsetof(struct rdsv3_sock, rs_bound_node)); + + RDSV3_DPRINTF4("rdsv3_bind_tree_init", "Return"); +} + +void +rdsv3_bind_tree_exit() +{ + RDSV3_DPRINTF2("rdsv3_bind_tree_exit", "Enter"); + + ASSERT(avl_is_empty(&rdsv3_bind_tree)); + avl_destroy(&rdsv3_bind_tree); + mutex_destroy(&rdsv3_bind_lock); + + RDSV3_DPRINTF2("rdsv3_bind_tree_exit", "Return"); +} + +/* checksum */ +uint16_t +rdsv3_ip_fast_csum(void *hdr, size_t length) +{ + return (0xffff & + (uint16_t)(~ip_ocsum((ushort_t *)hdr, (int)length <<1, 0))); +} + +/* scatterlist implementation */ +/* ARGSUSED */ +caddr_t +rdsv3_ib_sg_dma_address(ib_device_t *dev, struct rdsv3_scatterlist *scat, + uint_t offset) +{ + return (0); +} + +uint_t +rdsv3_ib_dma_map_sg(struct ib_device *dev, struct rdsv3_scatterlist *scat, + uint_t num) +{ + struct rdsv3_scatterlist *s, *first; + ibt_iov_t *iov; + ibt_wr_ds_t *sgl; + ibt_iov_attr_t 
iov_attr; + ibt_send_wr_t swr; + uint_t i; + + RDSV3_DPRINTF4("rdsv3_ib_dma_map_sg", "scat %p, num: %d", scat, num); + + s = first = &scat[0]; + ASSERT(first->mihdl == NULL); + + iov = kmem_alloc(num * sizeof (ibt_iov_t), KM_SLEEP); + sgl = kmem_zalloc((num * 2) * sizeof (ibt_wr_ds_t), KM_SLEEP); + + for (i = 0; i < num; i++, s++) { + iov[i].iov_addr = s->vaddr; + iov[i].iov_len = s->length; + } + + iov_attr.iov_as = NULL; + iov_attr.iov = iov; + iov_attr.iov_buf = NULL; + iov_attr.iov_list_len = num; + iov_attr.iov_wr_nds = num * 2; + iov_attr.iov_lso_hdr_sz = 0; + iov_attr.iov_flags = IBT_IOV_SLEEP; + + swr.wr_sgl = sgl; + + i = ibt_map_mem_iov(ib_get_ibt_hca_hdl(dev), + &iov_attr, (ibt_all_wr_t *)&swr, &first->mihdl); + kmem_free(iov, num * sizeof (ibt_iov_t)); + if (i != IBT_SUCCESS) { + RDSV3_DPRINTF2("rdsv3_ib_dma_map_sg", + "ibt_map_mem_iov returned: %d", i); + return (0); + } + + s = first; + for (i = 0; i < num; i++, s++, sgl++) { + s->sgl = sgl; + } + + return (num); +} + +void +rdsv3_ib_dma_unmap_sg(ib_device_t *dev, struct rdsv3_scatterlist *scat, + uint_t num) +{ + /* Zero length messages have no scatter gather entries */ + if (num != 0) { + ASSERT(scat->mihdl != NULL); + ASSERT(scat->sgl != NULL); + + (void) ibt_unmap_mem_iov(ib_get_ibt_hca_hdl(dev), scat->mihdl); + + kmem_free(scat->sgl, (num * 2) * sizeof (ibt_wr_ds_t)); + scat->sgl = NULL; + scat->mihdl = NULL; + } +} + +int +rdsv3_ib_alloc_hdrs(ib_device_t *dev, struct rdsv3_ib_connection *ic) +{ + caddr_t addr; + size_t size; + ibt_mr_attr_t mr_attr; + ibt_mr_desc_t mr_desc; + ibt_mr_hdl_t mr_hdl; + int ret; + + RDSV3_DPRINTF4("rdsv3_ib_alloc_hdrs", "Enter(dev: %p)", dev); + + ASSERT(ic->i_mr == NULL); + + size = (ic->i_send_ring.w_nr + ic->i_recv_ring.w_nr + 1) * + sizeof (struct rdsv3_header); + + addr = kmem_zalloc(size, KM_NOSLEEP); + if (addr == NULL) + return (-1); + + mr_attr.mr_vaddr = (ib_vaddr_t)(uintptr_t)addr; + mr_attr.mr_len = size; + mr_attr.mr_as = NULL; + mr_attr.mr_flags = IBT_MR_ENABLE_LOCAL_WRITE; + ret = ibt_register_mr(ib_get_ibt_hca_hdl(dev), RDSV3_PD2PDHDL(ic->i_pd), + &mr_attr, &mr_hdl, &mr_desc); + if (ret != IBT_SUCCESS) { + RDSV3_DPRINTF2("rdsv3_ib_alloc_hdrs", + "ibt_register_mr returned: " "%d", ret); + return (-1); + } + + ic->i_mr = + (struct rdsv3_hdrs_mr *)kmem_alloc(sizeof (struct rdsv3_hdrs_mr), + KM_SLEEP); + ic->i_mr->addr = addr; + ic->i_mr->size = size; + ic->i_mr->hdl = mr_hdl; + ic->i_mr->lkey = mr_desc.md_lkey; + + ic->i_send_hdrs = (struct rdsv3_header *)addr; + ic->i_send_hdrs_dma = (uint64_t)(uintptr_t)addr; + + ic->i_recv_hdrs = (struct rdsv3_header *)(addr + + (ic->i_send_ring.w_nr * sizeof (struct rdsv3_header))); + ic->i_recv_hdrs_dma = (uint64_t)(uintptr_t)(addr + + (ic->i_send_ring.w_nr * sizeof (struct rdsv3_header))); + ic->i_recv_tasklet_cpuid = -1; + + ic->i_ack = (struct rdsv3_header *)(addr + + ((ic->i_send_ring.w_nr + ic->i_recv_ring.w_nr) * + sizeof (struct rdsv3_header))); + ic->i_ack_dma = (uint64_t)(uintptr_t)(addr + + ((ic->i_send_ring.w_nr + ic->i_recv_ring.w_nr) * + sizeof (struct rdsv3_header))); + + RDSV3_DPRINTF4("rdsv3_ib_alloc_hdrs", "Return(dev: %p)", dev); + + return (0); +} + +void +rdsv3_ib_free_hdrs(ib_device_t *dev, struct rdsv3_ib_connection *ic) +{ + RDSV3_DPRINTF4("rdsv3_ib_free_hdrs", "Enter(dev: %p)", dev); + ASSERT(ic->i_mr != NULL); + + ic->i_send_hdrs = NULL; + ic->i_send_hdrs_dma = NULL; + + ic->i_recv_hdrs = NULL; + ic->i_recv_hdrs_dma = NULL; + + ic->i_ack = NULL; + ic->i_ack_dma = NULL; + + (void) 
ibt_deregister_mr(ib_get_ibt_hca_hdl(dev), ic->i_mr->hdl); + + kmem_free(ic->i_mr->addr, ic->i_mr->size); + kmem_free(ic->i_mr, sizeof (struct rdsv3_hdrs_mr)); + + ic->i_mr = NULL; + RDSV3_DPRINTF4("rdsv3_ib_free_hdrs", "Return(dev: %p)", dev); +} diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/rdsv3_sc.c b/usr/src/uts/common/io/ib/clients/rdsv3/rdsv3_sc.c new file mode 100644 index 0000000000..8510746b9e --- /dev/null +++ b/usr/src/uts/common/io/ib/clients/rdsv3/rdsv3_sc.c @@ -0,0 +1,395 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ +#include <sys/types.h> +#include <sys/sunddi.h> +#include <sys/dlpi.h> +#include <sys/ib/clients/rdsv3/rdsv3_sc.h> +#include <sys/ib/clients/rdsv3/rdsv3_debug.h> + +/* + * RDS Path MAP + * + * N - Node record, P - Path record + * + * rds_path_map - + * | + * v + * --------- --------- --------- + * | N |------>| N |------>| N |------> NULL + * NULL <-------| |<------| |<------| | + * --------- --------- --------- + * | | | + * | | | + * v v v + * -------- --------- --------- + * | P | | P | | P | + * -------- --------- --------- + * | ^ | ^ | ^ + * | | | | | | + * v | v | v | + * -------- -------- --------- + * | P | | P | | P | + * -------- -------- --------- + * o o o + * o o o + * o o o + */ + +typedef struct rds_path_record_s { + ipaddr_t libd_ip; + ipaddr_t ribd_ip; + struct rds_path_record_s *up; + struct rds_path_record_s *downp; + char lifname[MAXNAMELEN]; + char rifname[MAXNAMELEN]; +} rds_path_record_t; + +typedef struct rds_node_record_s { + struct rds_node_record_s *nextp; + ipaddr_t lnode_ip; /* local ip */ + ipaddr_t rnode_ip; /* remote ip */ + struct rds_path_record_s *downp; + struct rds_node_record_s *prevp; +} rds_node_record_t; + +static char sc_device_name[MAXNAMELEN] = "NotInitialized"; +static kmutex_t rdsv3_pathmap_lock; +static rds_node_record_t *rdsv3_pathmap = NULL; + +#define RDS_VALIDATE_PATH(p) \ + if ((p->local.iftype != DL_IB) || (p->remote.iftype != DL_IB)) \ + return + +#define isalpha(ch) (((ch) >= 'a' && (ch) <= 'z') || \ + ((ch) >= 'A' && (ch) <= 'Z')) + +/* + * Called by SC to register the Sun Cluster device name + */ +void +rdsv3_clif_name(char *name) +{ + int i; + + ASSERT(name != NULL); + + mutex_enter(&rdsv3_pathmap_lock); + + /* extract the device name from the interface name */ + i = strlen(name) - 1; + while ((i >= 0) && (!isalpha(name[i]))) i--; + if (i >= 0) { + (void) strncpy(sc_device_name, name, i + 1); + sc_device_name[i + 1] = '\0'; + } + + mutex_exit(&rdsv3_pathmap_lock); +} + +/* + * Called by SC on discovering a new path + */ +void +rdsv3_path_up(rds_path_t *path) +{ + rds_node_record_t *p; + rds_path_record_t *p1; + + ASSERT(path != 
NULL); + + /* ignore if the end points are not of type DL_IB */ + RDS_VALIDATE_PATH(path); + + mutex_enter(&rdsv3_pathmap_lock); + + p = rdsv3_pathmap; + while ((p) && ((p->lnode_ip != path->local.node_ipaddr) || + (p->rnode_ip != path->remote.node_ipaddr))) { + p = p->nextp; + } + + if (p == NULL) { + p = (rds_node_record_t *)kmem_alloc(sizeof (rds_node_record_t), + KM_SLEEP); + p1 = (rds_path_record_t *)kmem_alloc( + sizeof (rds_path_record_t), KM_SLEEP); + + p->nextp = NULL; + p->lnode_ip = path->local.node_ipaddr; + p->rnode_ip = path->remote.node_ipaddr; + p->downp = p1; + p->prevp = NULL; + + p1->libd_ip = path->local.ipaddr; + p1->ribd_ip = path->remote.ipaddr; + p1->up = NULL; + p1->downp = NULL; + (void) strcpy(p1->lifname, path->local.ifname); + (void) strcpy(p1->rifname, path->remote.ifname); + + if (rdsv3_pathmap == NULL) { + rdsv3_pathmap = p; + } else { + /* insert this node at the head */ + rdsv3_pathmap->prevp = p; + p->nextp = rdsv3_pathmap; + rdsv3_pathmap = p; + } + } else { + /* we found a match */ + p1 = (rds_path_record_t *)kmem_alloc( + sizeof (rds_path_record_t), KM_SLEEP); + + p1->libd_ip = path->local.ipaddr; + p1->ribd_ip = path->remote.ipaddr; + p1->downp = p->downp; + p->downp->up = p1; + p1->up = NULL; + p->downp = p1; + (void) strcpy(p1->lifname, path->local.ifname); + (void) strcpy(p1->rifname, path->remote.ifname); + } + + mutex_exit(&rdsv3_pathmap_lock); +} + +/* + * Called by SC to delete a path + */ +void +rdsv3_path_down(rds_path_t *path) +{ + rds_node_record_t *p; + rds_path_record_t *p1, *p1up, *p1downp; + + ASSERT(path != NULL); + + /* ignore if the end points are not of type DL_IB */ + RDS_VALIDATE_PATH(path); + + mutex_enter(&rdsv3_pathmap_lock); + + p = rdsv3_pathmap; + while ((p) && ((p->lnode_ip != path->local.node_ipaddr) || + (p->rnode_ip != path->remote.node_ipaddr))) { + p = p->nextp; + } + + if (p == NULL) { + /* no match */ + RDSV3_DPRINTF2("rdsv3_path_down", "Node record not found " + "(0x%x <-> 0x%x)", path->local.node_ipaddr, + path->remote.node_ipaddr); + mutex_exit(&rdsv3_pathmap_lock); + return; + } + + p1 = p->downp; + while ((p1) && ((p1->libd_ip != path->local.ipaddr) || + (p1->ribd_ip != path->remote.ipaddr))) { + p1 = p1->downp; + } + + if (p1 == NULL) { + /* no match */ + RDSV3_DPRINTF2("rdsv3_path_down", "Path record not found " + "(0x%x <-> 0x%x)", path->local.ipaddr, path->remote.ipaddr); + mutex_exit(&rdsv3_pathmap_lock); + return; + } + + /* we found the record, remove it */ + p1up = p1->up; + p1downp = p1->downp; + + if (p1up) { + p1up->downp = p1downp; + } else { + /* this is the first path record */ + p->downp = p1downp; + } + + if (p1downp) { + p1downp->up = p1up; + } + + kmem_free(p1, sizeof (rds_path_record_t)); + + /* remove the node record if there are no path records */ + if (p->downp == NULL) { + if (p->prevp) { + p->prevp->nextp = p->nextp; + } else { + /* this is the first node record */ + ASSERT(p == rdsv3_pathmap); + rdsv3_pathmap = p->nextp; + } + + if (p->nextp) { + p->nextp->prevp = p->prevp; + } + + kmem_free(p, sizeof (rds_node_record_t)); + } + + mutex_exit(&rdsv3_pathmap_lock); +} + +int +rdsv3_sc_path_lookup(ipaddr_t *localip, ipaddr_t *remip) +{ + rds_node_record_t *p; + rds_path_record_t *p1, *p1downp; + + mutex_enter(&rdsv3_pathmap_lock); + + p = rdsv3_pathmap; + while ((p) && ((p->lnode_ip != *localip) || (p->rnode_ip != *remip))) { + p = p->nextp; + } + + if (p == NULL) { + /* no match */ + RDSV3_DPRINTF2("rdsv3_sc_path_lookup", "Node record not found " + "(0x%x <-> 0x%x)", *localip, *remip); + 
mutex_exit(&rdsv3_pathmap_lock);
+ return (0);
+ }
+
+ /* found a path */
+ p1 = p->downp;
+ *localip = p1->libd_ip;
+ *remip = p1->ribd_ip;
+
+ /*
+ * But next time, we want to use a different path record so move this
+ * path record to the end.
+ */
+ p1downp = p1->downp;
+ if (p1downp != NULL) {
+ p->downp = p1downp;
+ p1downp->up = NULL;
+
+ /* walk down to the last path record */
+ while (p1downp->downp != NULL) {
+ p1downp = p1downp->downp;
+ }
+
+ /* attach the first path record to the end */
+ p1downp->downp = p1;
+ p1->up = p1downp;
+ p1->downp = NULL;
+ }
+
+ mutex_exit(&rdsv3_pathmap_lock);
+
+ return (1);
+}
+
+boolean_t
+rdsv3_if_lookup_by_name(char *devname)
+{
+ mutex_enter(&rdsv3_pathmap_lock);
+
+ /*
+ * Sun Cluster always names its interconnect virtual network interface
+ * clprivnetx, so return B_TRUE if there is at least one node record
+ * and the given name matches the registered clprivnet device name.
+ */
+ if (strcmp(devname, sc_device_name) == 0) {
+ /* clprivnet address */
+ mutex_exit(&rdsv3_pathmap_lock);
+ return (B_TRUE);
+ }
+
+ mutex_exit(&rdsv3_pathmap_lock);
+ return (B_FALSE);
+}
+
+boolean_t
+rdsv3_if_lookup_by_addr(ipaddr_t addr)
+{
+ rds_node_record_t *p;
+ rds_path_record_t *p1;
+
+ mutex_enter(&rdsv3_pathmap_lock);
+
+ p = rdsv3_pathmap;
+ while ((p) && (p->lnode_ip != addr)) {
+ p1 = p->downp;
+ while ((p1) && (p1->libd_ip != addr)) {
+ p1 = p1->downp;
+ }
+
+ /* we found a match */
+ if (p1 != NULL)
+ break;
+
+ /* go to the next node record */
+ p = p->nextp;
+ }
+
+ mutex_exit(&rdsv3_pathmap_lock);
+ if (p == NULL) {
+ /* no match */
+ RDSV3_DPRINTF2("rds_if_lookup_by_addr",
+ "Addr: 0x%x not found", addr);
+ return (B_FALSE);
+ }
+
+ /* Found a matching node record */
+ return (B_TRUE);
+}
+
+/*
+ * If SC is configured then addr would be a clprivnet address. Find the
+ * node record and return the first IB address. If the node record is not
+ * found, then return addr as-is.
+ */
+ipaddr_t
+rdsv3_scaddr_to_ibaddr(ipaddr_t addr)
+{
+ rds_node_record_t *p;
+ rds_path_record_t *p1;
+ ipaddr_t ret = addr;
+
+ mutex_enter(&rdsv3_pathmap_lock);
+
+ p = rdsv3_pathmap;
+ while ((p) && (p->lnode_ip != addr)) {
+ /* go to the next node record */
+ p = p->nextp;
+ }
+
+ if (p != NULL) {
+ p1 = p->downp;
+ ret = p1->libd_ip;
+ RDSV3_DPRINTF3("rds_scaddr_to_ibaddr",
+ "Addr: 0x%x found: 0x%x", addr, p1->libd_ip);
+ }
+ mutex_exit(&rdsv3_pathmap_lock);
+
+ /* return the IB address if a node record was found, else addr as-is */
+ return (ret);
+}
diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/send.c b/usr/src/uts/common/io/ib/clients/rdsv3/send.c
new file mode 100644
index 0000000000..8d5d0f7fa4
--- /dev/null
+++ b/usr/src/uts/common/io/ib/clients/rdsv3/send.c
@@ -0,0 +1,1178 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <sys/stropts.h> +#include <sys/systm.h> + +#include <sys/rds.h> +#include <sys/socket.h> +#include <sys/socketvar.h> + +#include <sys/ib/clients/rdsv3/rdsv3.h> +#include <sys/ib/clients/rdsv3/rdma.h> +#include <sys/ib/clients/rdsv3/rdsv3_debug.h> + +/* + * When transmitting messages in rdsv3_send_xmit, we need to emerge from + * time to time and briefly release the CPU. Otherwise the softlock watchdog + * will kick our shin. + * Also, it seems fairer to not let one busy connection stall all the + * others. + * + * send_batch_count is the number of times we'll loop in send_xmit. Setting + * it to 0 will restore the old behavior (where we looped until we had + * drained the queue). + */ +static int send_batch_count = 64; + +extern void rdsv3_ib_send_unmap_rdma(void *ic, struct rdsv3_rdma_op *op); +/* + * Reset the send state. Caller must hold c_send_lock when calling here. + */ +void +rdsv3_send_reset(struct rdsv3_connection *conn) +{ + struct rdsv3_message *rm, *tmp; + struct rdsv3_rdma_op *ro; + + RDSV3_DPRINTF4("rdsv3_send_reset", "Enter(conn: %p)", conn); + + if (conn->c_xmit_rm) { + rm = conn->c_xmit_rm; + ro = rm->m_rdma_op; + if (ro && ro->r_mapped) { + RDSV3_DPRINTF2("rdsv3_send_reset", + "rm %p mflg 0x%x map %d mihdl %p sgl %p", + rm, rm->m_flags, ro->r_mapped, + ro->r_rdma_sg[0].mihdl, + ro->r_rdma_sg[0].swr.wr_sgl); + rdsv3_ib_send_unmap_rdma(conn->c_transport_data, ro); + } + /* + * Tell the user the RDMA op is no longer mapped by the + * transport. 
This isn't entirely true (it's flushed out
+ * independently) but as the connection is down, there's
+ * no ongoing RDMA to/from that memory
+ */
+ rdsv3_message_unmapped(conn->c_xmit_rm);
+ rdsv3_message_put(conn->c_xmit_rm);
+ conn->c_xmit_rm = NULL;
+ }
+ conn->c_xmit_sg = 0;
+ conn->c_xmit_hdr_off = 0;
+ conn->c_xmit_data_off = 0;
+ conn->c_xmit_rdma_sent = 0;
+
+ conn->c_map_queued = 0;
+
+ conn->c_unacked_packets = rdsv3_sysctl_max_unacked_packets;
+ conn->c_unacked_bytes = rdsv3_sysctl_max_unacked_bytes;
+
+ /* Mark messages as retransmissions, and move them to the send q */
+ mutex_enter(&conn->c_lock);
+ RDSV3_FOR_EACH_LIST_NODE_SAFE(rm, tmp, &conn->c_retrans, m_conn_item) {
+ set_bit(RDSV3_MSG_ACK_REQUIRED, &rm->m_flags);
+ set_bit(RDSV3_MSG_RETRANSMITTED, &rm->m_flags);
+ if (rm->m_rdma_op && rm->m_rdma_op->r_mapped) {
+ RDSV3_DPRINTF4("_send_reset",
+ "RT rm %p mflg 0x%x sgl %p",
+ rm, rm->m_flags,
+ rm->m_rdma_op->r_rdma_sg[0].swr.wr_sgl);
+ }
+ }
+ list_move_tail(&conn->c_send_queue, &conn->c_retrans);
+ mutex_exit(&conn->c_lock);
+
+ RDSV3_DPRINTF4("rdsv3_send_reset", "Return(conn: %p)", conn);
+}
+
+/*
+ * We're making the conscious trade-off here to only send one message
+ * down the connection at a time.
+ * Pro:
+ * - tx queueing is a simple fifo list
+ * - reassembly is optional and easily done by transports per conn
+ * - no per flow rx lookup at all, straight to the socket
+ * - less per-frag memory and wire overhead
+ * Con:
+ * - queued acks can be delayed behind large messages
+ * Depends:
+ * - small message latency is higher behind queued large messages
+ * - large message latency isn't starved by intervening small sends
+ */
+int
+rdsv3_send_xmit(struct rdsv3_connection *conn)
+{
+ struct rdsv3_message *rm;
+ unsigned int tmp;
+ unsigned int send_quota = send_batch_count;
+ struct rdsv3_scatterlist *sg;
+ int ret = 0;
+ int was_empty = 0;
+ list_t to_be_dropped;
+
+ RDSV3_DPRINTF4("rdsv3_send_xmit", "Enter(conn: %p)", conn);
+
+ list_create(&to_be_dropped, sizeof (struct rdsv3_message),
+ offsetof(struct rdsv3_message, m_conn_item));
+
+ /*
+ * sendmsg calls here after having queued its message on the send
+ * queue. We only have one task feeding the connection at a time. If
+ * another thread is already feeding the queue then we back off. This
+ * avoids blocking the caller and trading per-connection data between
+ * caches per message.
+ *
+ * The sem holder will issue a retry if they notice that someone queued
+ * a message after they stopped walking the send queue but before they
+ * dropped the sem.
+ */
+ if (!mutex_tryenter(&conn->c_send_lock)) {
+ RDSV3_DPRINTF4("rdsv3_send_xmit",
+ "Another thread running(conn: %p)", conn);
+ rdsv3_stats_inc(s_send_sem_contention);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (conn->c_trans->xmit_prepare)
+ conn->c_trans->xmit_prepare(conn);
+
+ /*
+ * spin trying to push headers and data down the connection until
+ * the connection doesn't make forward progress.
+ */
+ while (--send_quota) {
+ /*
+ * See if we need to send a congestion map update if we're
+ * between sending messages. The send_sem protects our sole
+ * use of c_map_offset and _bytes.
+ * Note this is used only by transports that define a special
+ * xmit_cong_map function. For all others, we allocate
+ * a cong_map message and treat it just like any other send.
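+ *
+ * (Editor's sketch of the loop that follows: drain any pending
+ * cong-map bytes first, then finish or fetch the next message,
+ * then push its RDMA op, header bytes and data fragments,
+ * stopping as soon as the transport stops making forward
+ * progress.)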
+ */
+ if (conn->c_map_bytes) {
+ ret = conn->c_trans->xmit_cong_map(conn, conn->c_lcong,
+ conn->c_map_offset);
+ if (ret <= 0)
+ break;
+
+ conn->c_map_offset += ret;
+ conn->c_map_bytes -= ret;
+ if (conn->c_map_bytes)
+ continue;
+ }
+
+ /*
+ * If we're done sending the current message, clear the
+ * offset and S/G temporaries.
+ */
+ rm = conn->c_xmit_rm;
+ if (rm != NULL &&
+ conn->c_xmit_hdr_off == sizeof (struct rdsv3_header) &&
+ conn->c_xmit_sg == rm->m_nents) {
+ conn->c_xmit_rm = NULL;
+ conn->c_xmit_sg = 0;
+ conn->c_xmit_hdr_off = 0;
+ conn->c_xmit_data_off = 0;
+ conn->c_xmit_rdma_sent = 0;
+
+ /* Release the reference to the previous message. */
+ rdsv3_message_put(rm);
+ rm = NULL;
+ }
+
+ /* If we're asked to send a cong map update, do so. */
+ if (rm == NULL && test_and_clear_bit(0, &conn->c_map_queued)) {
+ if (conn->c_trans->xmit_cong_map != NULL) {
+ conn->c_map_offset = 0;
+ conn->c_map_bytes =
+ sizeof (struct rdsv3_header) +
+ RDSV3_CONG_MAP_BYTES;
+ continue;
+ }
+
+ rm = rdsv3_cong_update_alloc(conn);
+ if (IS_ERR(rm)) {
+ ret = PTR_ERR(rm);
+ break;
+ }
+
+ conn->c_xmit_rm = rm;
+ }
+
+ /*
+ * Grab the next message from the send queue, if there is one.
+ *
+ * c_xmit_rm holds a ref while we're sending this message down
+ * the connection. We can use this ref while holding the
+ * send_sem; rdsv3_send_reset() is serialized with it.
+ */
+ if (rm == NULL) {
+ unsigned int len;
+
+ mutex_enter(&conn->c_lock);
+
+ if (!list_is_empty(&conn->c_send_queue)) {
+ rm = list_remove_head(&conn->c_send_queue);
+ rdsv3_message_addref(rm);
+
+ /*
+ * Move the message from the send queue to
+ * the retransmit list right away.
+ */
+ list_insert_tail(&conn->c_retrans, rm);
+ }
+
+ mutex_exit(&conn->c_lock);
+
+ if (rm == NULL) {
+ was_empty = 1;
+ break;
+ }
+
+ /*
+ * Unfortunately, the way Infiniband deals with
+ * RDMA to a bad MR key is by moving the entire
+ * queue pair to error state. We could possibly
+ * recover from that, but right now we drop the
+ * connection.
+ * Therefore, we never retransmit messages with
+ * RDMA ops.
+ */
+ if (rm->m_rdma_op &&
+ test_bit(RDSV3_MSG_RETRANSMITTED, &rm->m_flags)) {
+ mutex_enter(&conn->c_lock);
+ if (test_and_clear_bit(RDSV3_MSG_ON_CONN,
+ &rm->m_flags))
+ list_remove_node(&rm->m_conn_item);
+ list_insert_tail(&to_be_dropped, rm);
+ mutex_exit(&conn->c_lock);
+ rdsv3_message_put(rm);
+ continue;
+ }
+
+ /* Require an ACK every once in a while */
+ len = ntohl(rm->m_inc.i_hdr.h_len);
+ if (conn->c_unacked_packets == 0 ||
+ conn->c_unacked_bytes < len) {
+ set_bit(RDSV3_MSG_ACK_REQUIRED, &rm->m_flags);
+
+ conn->c_unacked_packets =
+ rdsv3_sysctl_max_unacked_packets;
+ conn->c_unacked_bytes =
+ rdsv3_sysctl_max_unacked_bytes;
+ rdsv3_stats_inc(s_send_ack_required);
+ } else {
+ conn->c_unacked_bytes -= len;
+ conn->c_unacked_packets--;
+ }
+
+ conn->c_xmit_rm = rm;
+ }
+
+ /*
+ * Try and send an rdma message. Let's see if we can
+ * keep this simple and require that the transport either
+ * send the whole rdma or none of it.
+ */
+ if (rm->m_rdma_op && !conn->c_xmit_rdma_sent) {
+ ret = conn->c_trans->xmit_rdma(conn, rm->m_rdma_op);
+ if (ret)
+ break;
+ conn->c_xmit_rdma_sent = 1;
+ /*
+ * The transport owns the mapped memory for now.
+ * You can't unmap it while it's on the send queue + */ + set_bit(RDSV3_MSG_MAPPED, &rm->m_flags); + } + + if (conn->c_xmit_hdr_off < sizeof (struct rdsv3_header) || + conn->c_xmit_sg < rm->m_nents) { + ret = conn->c_trans->xmit(conn, rm, + conn->c_xmit_hdr_off, + conn->c_xmit_sg, + conn->c_xmit_data_off); + if (ret <= 0) + break; + + if (conn->c_xmit_hdr_off < + sizeof (struct rdsv3_header)) { + tmp = min(ret, + sizeof (struct rdsv3_header) - + conn->c_xmit_hdr_off); + conn->c_xmit_hdr_off += tmp; + ret -= tmp; + } + + sg = &rm->m_sg[conn->c_xmit_sg]; + while (ret) { + tmp = min(ret, rdsv3_sg_len(sg) - + conn->c_xmit_data_off); + conn->c_xmit_data_off += tmp; + ret -= tmp; + if (conn->c_xmit_data_off == rdsv3_sg_len(sg)) { + conn->c_xmit_data_off = 0; + sg++; + conn->c_xmit_sg++; + ASSERT(!(ret != 0 && + conn->c_xmit_sg == rm->m_nents)); + } + } + } + } + + /* Nuke any messages we decided not to retransmit. */ + if (!list_is_empty(&to_be_dropped)) + rdsv3_send_remove_from_sock(&to_be_dropped, RDSV3_RDMA_DROPPED); + + if (conn->c_trans->xmit_complete) + conn->c_trans->xmit_complete(conn); + + /* + * We might be racing with another sender who queued a message but + * backed off on noticing that we held the c_send_lock. If we check + * for queued messages after dropping the sem then either we'll + * see the queued message or the queuer will get the sem. If we + * notice the queued message then we trigger an immediate retry. + * + * We need to be careful only to do this when we stopped processing + * the send queue because it was empty. It's the only way we + * stop processing the loop when the transport hasn't taken + * responsibility for forward progress. + */ + mutex_exit(&conn->c_send_lock); + + if (conn->c_map_bytes || (send_quota == 0 && !was_empty)) { + /* + * We exhausted the send quota, but there's work left to + * do. Return and (re-)schedule the send worker. + */ + ret = -EAGAIN; + } + + if (ret == 0 && was_empty) { + /* + * A simple bit test would be way faster than taking the + * spin lock + */ + mutex_enter(&conn->c_lock); + if (!list_is_empty(&conn->c_send_queue)) { + rdsv3_stats_inc(s_send_sem_queue_raced); + ret = -EAGAIN; + } + mutex_exit(&conn->c_lock); + } + +out: + RDSV3_DPRINTF4("rdsv3_send_xmit", "Return(conn: %p, ret: %d)", + conn, ret); + return (ret); +} + +static void +rdsv3_send_sndbuf_remove(struct rdsv3_sock *rs, struct rdsv3_message *rm) +{ + uint32_t len = ntohl(rm->m_inc.i_hdr.h_len); + + ASSERT(mutex_owned(&rs->rs_lock)); + + ASSERT(rs->rs_snd_bytes >= len); + rs->rs_snd_bytes -= len; + + if (rs->rs_snd_bytes == 0) + rdsv3_stats_inc(s_send_queue_empty); +} + +static inline int +rdsv3_send_is_acked(struct rdsv3_message *rm, uint64_t ack, + is_acked_func is_acked) +{ + if (is_acked) + return (is_acked(rm, ack)); + return (ntohll(rm->m_inc.i_hdr.h_sequence) <= ack); +} + +/* + * Returns true if there are no messages on the send and retransmit queues + * which have a sequence number greater than or equal to the given sequence + * number. 
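+ *
+ * (Editor's note: the checks below actually answer "has everything with
+ * a sequence number below 'seq' been acked?" For example, with sequences
+ * {3, 4} still on the retransmit queue, a call with seq == 3 returns 1,
+ * while a call with seq == 5 returns 0 because messages 3 and 4 are
+ * still outstanding.)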
+ */ +int +rdsv3_send_acked_before(struct rdsv3_connection *conn, uint64_t seq) +{ + struct rdsv3_message *rm; + int ret = 1; + + RDSV3_DPRINTF4("rdsv3_send_acked_before", "Enter(conn: %p)", conn); + + mutex_enter(&conn->c_lock); + + /* XXX - original code spits out warning */ + rm = list_head(&conn->c_retrans); + if (ntohll(rm->m_inc.i_hdr.h_sequence) < seq) + ret = 0; + + /* XXX - original code spits out warning */ + rm = list_head(&conn->c_send_queue); + if (ntohll(rm->m_inc.i_hdr.h_sequence) < seq) + ret = 0; + + mutex_exit(&conn->c_lock); + + RDSV3_DPRINTF4("rdsv3_send_acked_before", "Return(conn: %p)", conn); + + return (ret); +} + +/* + * This is pretty similar to what happens below in the ACK + * handling code - except that we call here as soon as we get + * the IB send completion on the RDMA op and the accompanying + * message. + */ +void +rdsv3_rdma_send_complete(struct rdsv3_message *rm, int status) +{ + struct rdsv3_sock *rs = NULL; + struct rdsv3_rdma_op *ro; + struct rdsv3_notifier *notifier; + + RDSV3_DPRINTF4("rdsv3_rdma_send_complete", "Enter(rm: %p)", rm); + + mutex_enter(&rm->m_rs_lock); + + ro = rm->m_rdma_op; + if (test_bit(RDSV3_MSG_ON_SOCK, &rm->m_flags) && + ro && ro->r_notify && + (notifier = ro->r_notifier) != NULL) { + ro->r_notifier = NULL; + rs = rm->m_rs; + rdsv3_sk_sock_hold(rdsv3_rs_to_sk(rs)); + + notifier->n_status = status; + mutex_enter(&rs->rs_lock); + list_insert_tail(&rs->rs_notify_queue, notifier); + mutex_exit(&rs->rs_lock); + } + + mutex_exit(&rm->m_rs_lock); + + if (rs) { + rdsv3_wake_sk_sleep(rs); + rdsv3_sk_sock_put(rdsv3_rs_to_sk(rs)); + } + + RDSV3_DPRINTF4("rdsv3_rdma_send_complete", "Return(rm: %p)", rm); +} + +/* + * This is the same as rdsv3_rdma_send_complete except we + * don't do any locking - we have all the ingredients (message, + * socket, socket lock) and can just move the notifier. + */ +static inline void +__rdsv3_rdma_send_complete(struct rdsv3_sock *rs, struct rdsv3_message *rm, + int status) +{ + struct rdsv3_rdma_op *ro; + void *ic; + + RDSV3_DPRINTF4("__rdsv3_rdma_send_complete", + "Enter(rs: %p, rm: %p)", rs, rm); + + ro = rm->m_rdma_op; + if (ro && ro->r_notify && ro->r_notifier) { + ro->r_notifier->n_status = status; + list_insert_tail(&rs->rs_notify_queue, ro->r_notifier); + ro->r_notifier = NULL; + } + + /* No need to wake the app - caller does this */ +} + +/* + * This is called from the IB send completion when we detect + * a RDMA operation that failed with remote access error. + * So speed is not an issue here. + */ +struct rdsv3_message * +rdsv3_send_get_message(struct rdsv3_connection *conn, + struct rdsv3_rdma_op *op) +{ + struct rdsv3_message *rm, *tmp, *found = NULL; + + RDSV3_DPRINTF4("rdsv3_send_get_message", "Enter(conn: %p)", conn); + + mutex_enter(&conn->c_lock); + + RDSV3_FOR_EACH_LIST_NODE_SAFE(rm, tmp, &conn->c_retrans, m_conn_item) { + if (rm->m_rdma_op == op) { + atomic_add_32(&rm->m_refcount, 1); + found = rm; + goto out; + } + } + + RDSV3_FOR_EACH_LIST_NODE_SAFE(rm, tmp, &conn->c_send_queue, + m_conn_item) { + if (rm->m_rdma_op == op) { + atomic_add_32(&rm->m_refcount, 1); + found = rm; + break; + } + } + +out: + mutex_exit(&conn->c_lock); + + return (found); +} + +/* + * This removes messages from the socket's list if they're on it. The list + * argument must be private to the caller, we must be able to modify it + * without locks. The messages must have a reference held for their + * position on the list. 
This function will drop that reference after + * removing the messages from the 'messages' list regardless of if it found + * the messages on the socket list or not. + */ +void +rdsv3_send_remove_from_sock(struct list *messages, int status) +{ + struct rdsv3_sock *rs = NULL; + struct rdsv3_message *rm; + + RDSV3_DPRINTF4("rdsv3_send_remove_from_sock", "Enter"); + + while (!list_is_empty(messages)) { + rm = list_remove_head(messages); + + /* + * If we see this flag cleared then we're *sure* that someone + * else beat us to removing it from the sock. If we race + * with their flag update we'll get the lock and then really + * see that the flag has been cleared. + * + * The message spinlock makes sure nobody clears rm->m_rs + * while we're messing with it. It does not prevent the + * message from being removed from the socket, though. + */ + mutex_enter(&rm->m_rs_lock); + if (!test_bit(RDSV3_MSG_ON_SOCK, &rm->m_flags)) + goto unlock_and_drop; + + if (rs != rm->m_rs) { + if (rs) { + rdsv3_wake_sk_sleep(rs); + rdsv3_sk_sock_put(rdsv3_rs_to_sk(rs)); + } + rs = rm->m_rs; + rdsv3_sk_sock_hold(rdsv3_rs_to_sk(rs)); + } + + mutex_enter(&rs->rs_lock); + if (test_and_clear_bit(RDSV3_MSG_ON_SOCK, &rm->m_flags)) { + struct rdsv3_rdma_op *ro = rm->m_rdma_op; + struct rdsv3_notifier *notifier; + + list_remove_node(&rm->m_sock_item); + rdsv3_send_sndbuf_remove(rs, rm); + + if (ro && + (notifier = ro->r_notifier) != NULL && + (status || ro->r_notify)) { + list_insert_tail(&rs->rs_notify_queue, + notifier); + if (!notifier->n_status) + notifier->n_status = status; + rm->m_rdma_op->r_notifier = NULL; + } + rdsv3_message_put(rm); + rm->m_rs = NULL; + } + mutex_exit(&rs->rs_lock); + +unlock_and_drop: + mutex_exit(&rm->m_rs_lock); + rdsv3_message_put(rm); + } + + if (rs) { + rdsv3_wake_sk_sleep(rs); + rdsv3_sk_sock_put(rdsv3_rs_to_sk(rs)); + } + + RDSV3_DPRINTF4("rdsv3_send_remove_from_sock", "Return"); +} + +/* + * Transports call here when they've determined that the receiver queued + * messages up to, and including, the given sequence number. Messages are + * moved to the retrans queue when rdsv3_send_xmit picks them off the send + * queue. This means that in the TCP case, the message may not have been + * assigned the m_ack_seq yet - but that's fine as long as tcp_is_acked + * checks the RDSV3_MSG_HAS_ACK_SEQ bit. + * + * XXX It's not clear to me how this is safely serialized with socket + * destruction. Maybe it should bail if it sees SOCK_DEAD. 
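+ *
+ * For illustration (a sketch under the assumptions stated above;
+ * my_trans_is_acked() is a hypothetical callback): a transport-supplied
+ * is_acked function would look roughly like
+ *
+ *	static int
+ *	my_trans_is_acked(struct rdsv3_message *rm, uint64_t ack)
+ *	{
+ *		if (!test_bit(RDSV3_MSG_HAS_ACK_SEQ, &rm->m_flags))
+ *			return (0);
+ *		return (rm->m_ack_seq <= ack);
+ *	}
+ *
+ * with m_ack_seq assigned by the transport when it takes the message.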
+ */ +void +rdsv3_send_drop_acked(struct rdsv3_connection *conn, uint64_t ack, + is_acked_func is_acked) +{ + struct rdsv3_message *rm, *tmp; + list_t list; + + RDSV3_DPRINTF4("rdsv3_send_drop_acked", "Enter(conn: %p)", conn); + + list_create(&list, sizeof (struct rdsv3_message), + offsetof(struct rdsv3_message, m_conn_item)); + + mutex_enter(&conn->c_lock); + + RDSV3_FOR_EACH_LIST_NODE_SAFE(rm, tmp, &conn->c_retrans, m_conn_item) { + if (!rdsv3_send_is_acked(rm, ack, is_acked)) + break; + + list_remove_node(&rm->m_conn_item); + list_insert_tail(&list, rm); + clear_bit(RDSV3_MSG_ON_CONN, &rm->m_flags); + } + +#if 0 +XXX + /* order flag updates with spin locks */ + if (!list_is_empty(&list)) + smp_mb__after_clear_bit(); +#endif + + mutex_exit(&conn->c_lock); + + /* now remove the messages from the sock list as needed */ + rdsv3_send_remove_from_sock(&list, RDSV3_RDMA_SUCCESS); + + RDSV3_DPRINTF4("rdsv3_send_drop_acked", "Return(conn: %p)", conn); +} + +void +rdsv3_send_drop_to(struct rdsv3_sock *rs, struct sockaddr_in *dest) +{ + struct rdsv3_message *rm, *tmp; + struct rdsv3_connection *conn; + list_t list; + int wake = 0; + + RDSV3_DPRINTF4("rdsv3_send_drop_to", "Enter(rs: %p)", rs); + + list_create(&list, sizeof (struct rdsv3_message), + offsetof(struct rdsv3_message, m_sock_item)); + + /* get all the messages we're dropping under the rs lock */ + mutex_enter(&rs->rs_lock); + + RDSV3_FOR_EACH_LIST_NODE_SAFE(rm, tmp, &rs->rs_send_queue, + m_sock_item) { + if (dest && (dest->sin_addr.s_addr != rm->m_daddr || + dest->sin_port != rm->m_inc.i_hdr.h_dport)) + continue; + + wake = 1; + list_remove(&rs->rs_send_queue, rm); + list_insert_tail(&list, rm); + rdsv3_send_sndbuf_remove(rs, rm); + clear_bit(RDSV3_MSG_ON_SOCK, &rm->m_flags); + } + + mutex_exit(&rs->rs_lock); + + conn = NULL; + + /* now remove the messages from the conn list as needed */ + RDSV3_FOR_EACH_LIST_NODE(rm, &list, m_sock_item) { + /* + * We do this here rather than in the loop above, so that + * we don't have to nest m_rs_lock under rs->rs_lock + */ + mutex_enter(&rm->m_rs_lock); + /* If this is a RDMA operation, notify the app. */ + __rdsv3_rdma_send_complete(rs, rm, RDSV3_RDMA_CANCELED); + rm->m_rs = NULL; + mutex_exit(&rm->m_rs_lock); + + /* + * If we see this flag cleared then we're *sure* that someone + * else beat us to removing it from the conn. If we race + * with their flag update we'll get the lock and then really + * see that the flag has been cleared. + */ + if (!test_bit(RDSV3_MSG_ON_CONN, &rm->m_flags)) + continue; + + if (conn != rm->m_inc.i_conn) { + if (conn) + mutex_exit(&conn->c_lock); + conn = rm->m_inc.i_conn; + mutex_enter(&conn->c_lock); + } + + if (test_and_clear_bit(RDSV3_MSG_ON_CONN, &rm->m_flags)) { + list_remove_node(&rm->m_conn_item); + rdsv3_message_put(rm); + } + } + + if (conn) + mutex_exit(&conn->c_lock); + + if (wake) + rdsv3_wake_sk_sleep(rs); + + while (!list_is_empty(&list)) { + rm = list_remove_head(&list); + + rdsv3_message_wait(rm); + rdsv3_message_put(rm); + } + + RDSV3_DPRINTF4("rdsv3_send_drop_to", "Return(rs: %p)", rs); +} + +/* + * we only want this to fire once so we use the callers 'queued'. It's + * possible that another thread can race with us and remove the + * message from the flow with RDSV3_CANCEL_SENT_TO. 
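+ *
+ * The intended caller pattern, sketched (this mirrors the loop in
+ * rdsv3_sendmsg() below; wait_for_sndbuf() stands in for its
+ * cv_wait_sig() loop):
+ *
+ *	int queued = 0;
+ *
+ *	while (!rdsv3_send_queue_rm(rs, conn, rm, sport, dport, &queued))
+ *		wait_for_sndbuf(rs);
+ *
+ * Once *queued is set, later calls return immediately without queueing
+ * the message a second time.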
+ */ +static int +rdsv3_send_queue_rm(struct rdsv3_sock *rs, struct rdsv3_connection *conn, + struct rdsv3_message *rm, uint16_be_t sport, + uint16_be_t dport, int *queued) +{ + uint32_t len; + + RDSV3_DPRINTF4("rdsv3_send_queue_rm", "Enter(rs: %p, rm: %p)", rs, rm); + + if (*queued) + goto out; + + len = ntohl(rm->m_inc.i_hdr.h_len); + + /* + * this is the only place which holds both the socket's rs_lock + * and the connection's c_lock + */ + mutex_enter(&rs->rs_lock); + + /* + * If there is a little space in sndbuf, we don't queue anything, + * and userspace gets -EAGAIN. But poll() indicates there's send + * room. This can lead to bad behavior (spinning) if snd_bytes isn't + * freed up by incoming acks. So we check the *old* value of + * rs_snd_bytes here to allow the last msg to exceed the buffer, + * and poll() now knows no more data can be sent. + */ + if (rs->rs_snd_bytes < rdsv3_sk_sndbuf(rs)) { + rs->rs_snd_bytes += len; + + /* + * let recv side know we are close to send space exhaustion. + * This is probably not the optimal way to do it, as this + * means we set the flag on *all* messages as soon as our + * throughput hits a certain threshold. + */ + if (rs->rs_snd_bytes >= rdsv3_sk_sndbuf(rs) / 2) + set_bit(RDSV3_MSG_ACK_REQUIRED, &rm->m_flags); + + list_insert_tail(&rs->rs_send_queue, rm); + set_bit(RDSV3_MSG_ON_SOCK, &rm->m_flags); + + rdsv3_message_addref(rm); + rm->m_rs = rs; + + /* + * The code ordering is a little weird, but we're + * trying to minimize the time we hold c_lock + */ + rdsv3_message_populate_header(&rm->m_inc.i_hdr, sport, + dport, 0); + rm->m_inc.i_conn = conn; + rdsv3_message_addref(rm); /* XXX - called twice */ + + mutex_enter(&conn->c_lock); + rm->m_inc.i_hdr.h_sequence = htonll(conn->c_next_tx_seq++); + list_insert_tail(&conn->c_send_queue, rm); + set_bit(RDSV3_MSG_ON_CONN, &rm->m_flags); + mutex_exit(&conn->c_lock); + + RDSV3_DPRINTF5("rdsv3_send_queue_rm", + "queued msg %p len %d, rs %p bytes %d seq %llu", + rm, len, rs, rs->rs_snd_bytes, + (unsigned long long)ntohll( + rm->m_inc.i_hdr.h_sequence)); + + *queued = 1; + } + + mutex_exit(&rs->rs_lock); + + RDSV3_DPRINTF4("rdsv3_send_queue_rm", "Return(rs: %p)", rs); +out: + return (*queued); +} + +static int +rdsv3_cmsg_send(struct rdsv3_sock *rs, struct rdsv3_message *rm, + struct msghdr *msg, int *allocated_mr) +{ + struct cmsghdr *cmsg; + int ret = 0; + + RDSV3_DPRINTF4("rdsv3_cmsg_send", "Enter(rs: %p)", rs); + + for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) { + + if (cmsg->cmsg_level != SOL_RDS) + continue; + + RDSV3_DPRINTF4("rdsv3_cmsg_send", "cmsg(%p, %p) type %d", + cmsg, rm, cmsg->cmsg_type); + /* + * As a side effect, RDMA_DEST and RDMA_MAP will set + * rm->m_rdma_cookie and rm->m_rdma_mr. 
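+	 *
+	 * For reference, a userland sender would attach such a control
+	 * message roughly as follows (illustrative sketch only; the
+	 * rds_rdma_args payload type is an assumption here):
+	 *
+	 *	struct msghdr msg;
+	 *	char cbuf[CMSG_SPACE(sizeof (struct rds_rdma_args))];
+	 *	struct cmsghdr *cmp = (struct cmsghdr *)cbuf;
+	 *
+	 *	cmp->cmsg_level = SOL_RDS;
+	 *	cmp->cmsg_type = RDSV3_CMSG_RDMA_ARGS;
+	 *	cmp->cmsg_len = CMSG_LEN(sizeof (struct rds_rdma_args));
+	 *	msg.msg_control = cbuf;
+	 *	msg.msg_controllen = sizeof (cbuf);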
+ */ + switch (cmsg->cmsg_type) { + case RDSV3_CMSG_RDMA_ARGS: + ret = rdsv3_cmsg_rdma_args(rs, rm, cmsg); + break; + + case RDSV3_CMSG_RDMA_DEST: + ret = rdsv3_cmsg_rdma_dest(rs, rm, cmsg); + break; + + case RDSV3_CMSG_RDMA_MAP: + ret = rdsv3_cmsg_rdma_map(rs, rm, cmsg); + if (ret) + *allocated_mr = 1; + break; + + default: + return (-EINVAL); + } + + if (ret) + break; + } + + RDSV3_DPRINTF4("rdsv3_cmsg_send", "Return(rs: %p)", rs); + + return (ret); +} + +int +rdsv3_sendmsg(struct rdsv3_sock *rs, uio_t *uio, struct nmsghdr *msg, + size_t payload_len) +{ + struct rsock *sk = rdsv3_rs_to_sk(rs); + struct sockaddr_in *usin = (struct sockaddr_in *)msg->msg_name; + uint32_be_t daddr; + uint16_be_t dport; + struct rdsv3_message *rm = NULL; + struct rdsv3_connection *conn; + int ret = 0; + int queued = 0, allocated_mr = 0; + int nonblock = msg->msg_flags & MSG_DONTWAIT; + long timeo = rdsv3_rcvtimeo(sk, nonblock); + + RDSV3_DPRINTF4("rdsv3_sendmsg", "Enter(rs: %p)", rs); + + if (msg->msg_namelen) { + /* XXX fail non-unicast destination IPs? */ + if (msg->msg_namelen < sizeof (*usin) || + usin->sin_family != AF_INET_OFFLOAD) { + ret = -EINVAL; + RDSV3_DPRINTF2("rdsv3_sendmsg", "returning: %d", -ret); + goto out; + } + daddr = usin->sin_addr.s_addr; + dport = usin->sin_port; + } else { + /* We only care about consistency with ->connect() */ + mutex_enter(&sk->sk_lock); + daddr = rs->rs_conn_addr; + dport = rs->rs_conn_port; + mutex_exit(&sk->sk_lock); + } + + /* racing with another thread binding seems ok here */ + if (daddr == 0 || rs->rs_bound_addr == 0) { + ret = -ENOTCONN; /* XXX not a great errno */ + RDSV3_DPRINTF2("rdsv3_sendmsg", "returning: %d", -ret); + goto out; + } + + rm = rdsv3_message_copy_from_user(uio, payload_len); + if (IS_ERR(rm)) { + ret = PTR_ERR(rm); + RDSV3_DPRINTF2("rdsv3_sendmsg", + "rdsv3_message_copy_from_user failed %d", -ret); + rm = NULL; + goto out; + } + + rm->m_daddr = daddr; + + /* + * rdsv3_conn_create has a spinlock that runs with IRQ off. + * Caching the conn in the socket helps a lot. + */ + mutex_enter(&rs->rs_conn_lock); + if (rs->rs_conn && rs->rs_conn->c_faddr == daddr) { + conn = rs->rs_conn; + } else { + conn = rdsv3_conn_create_outgoing(rs->rs_bound_addr, + daddr, rs->rs_transport, KM_NOSLEEP); + if (IS_ERR(conn)) { + mutex_exit(&rs->rs_conn_lock); + ret = PTR_ERR(conn); + RDSV3_DPRINTF2("rdsv3_sendmsg", + "rdsv3_conn_create_outgoing failed %d", + -ret); + goto out; + } + rs->rs_conn = conn; + } + mutex_exit(&rs->rs_conn_lock); + + /* Parse any control messages the user may have included. */ + ret = rdsv3_cmsg_send(rs, rm, msg, &allocated_mr); + if (ret) { + RDSV3_DPRINTF2("rdsv3_sendmsg", + "rdsv3_cmsg_send(rs: %p rm: %p msg: %p) returned: %d", + rs, rm, msg, ret); + goto out; + } + + if ((rm->m_rdma_cookie || rm->m_rdma_op) && + conn->c_trans->xmit_rdma == NULL) { + RDSV3_DPRINTF0("rdsv3_sendmsg", "rdma_op %p conn xmit_rdma %p", + rm->m_rdma_op, conn->c_trans->xmit_rdma); + ret = -EOPNOTSUPP; + goto out; + } + + /* + * If the connection is down, trigger a connect. We may + * have scheduled a delayed reconnect however - in this case + * we should not interfere. 
+ */ + if (rdsv3_conn_state(conn) == RDSV3_CONN_DOWN && + !test_and_set_bit(RDSV3_RECONNECT_PENDING, &conn->c_flags)) + rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_conn_w, 0); + + ret = rdsv3_cong_wait(conn->c_fcong, dport, nonblock, rs); + if (ret) { + RDSV3_DPRINTF2("rdsv3_sendmsg", + "rdsv3_cong_wait (dport: %d) returned: %d", dport, ret); + goto out; + } + + (void) rdsv3_send_queue_rm(rs, conn, rm, rs->rs_bound_port, dport, + &queued); + if (!queued) { + /* rdsv3_stats_inc(s_send_queue_full); */ + /* XXX make sure this is reasonable */ + if (payload_len > rdsv3_sk_sndbuf(rs)) { + ret = -EMSGSIZE; + RDSV3_DPRINTF2("rdsv3_sendmsg", + "msgsize(%d) too big, returning: %d", + payload_len, -ret); + goto out; + } + if (nonblock) { + ret = -EAGAIN; + RDSV3_DPRINTF3("rdsv3_sendmsg", + "send queue full (%d), returning: %d", + payload_len, -ret); + goto out; + } + + mutex_enter(&sk->sk_sleep->waitq_mutex); + while (!rdsv3_send_queue_rm(rs, conn, rm, rs->rs_bound_port, + dport, &queued)) { +#if 0 + ret = cv_timedwait_sig(&sk->sk_sleep->waitq_cv, + &sk->sk_sleep->waitq_mutex, + timeo * drv_usectohz(1000000) + ddi_get_lbolt()); + if (ret <= 0) { + /* signal/timeout pending */ + RDSV3_DPRINTF2("rdsv3_sendmsg", + "woke due to signal/timeout: %d", + ret); + ret = (ret == 0) ? -ERESTART : -ETIMEDOUT; + mutex_exit(&sk->sk_sleep->waitq_mutex); + goto out; + } +#else + ret = cv_wait_sig(&sk->sk_sleep->waitq_cv, + &sk->sk_sleep->waitq_mutex); + if (ret == 0) { + /* signal/timeout pending */ + RDSV3_DPRINTF2("rdsv3_sendmsg", + "woke due to signal: %d", + ret); + ret = -ERESTART; + mutex_exit(&sk->sk_sleep->waitq_mutex); + goto out; + } +#endif + } + mutex_exit(&sk->sk_sleep->waitq_mutex); + + RDSV3_DPRINTF5("rdsv3_sendmsg", "sendmsg woke queued %d", + queued); + + ASSERT(queued); + ret = 0; + } + + /* + * By now we've committed to the send. We reuse rdsv3_send_worker() + * to retry sends in the rds thread if the transport asks us to. + */ + rdsv3_stats_inc(s_send_queued); + + if (!test_bit(RDSV3_LL_SEND_FULL, &conn->c_flags)) + rdsv3_send_worker(&conn->c_send_w.work); + + rdsv3_message_put(rm); + RDSV3_DPRINTF4("rdsv3_sendmsg", "Return(rs: %p, len: %d)", + rs, payload_len); + return (payload_len); + +out: + /* + * If the user included a RDMA_MAP cmsg, we allocated a MR on the fly. + * If the sendmsg goes through, we keep the MR. If it fails with EAGAIN + * or in any other way, we need to destroy the MR again + */ + if (allocated_mr) + rdsv3_rdma_unuse(rs, rdsv3_rdma_cookie_key(rm->m_rdma_cookie), + 1); + + if (rm) + rdsv3_message_put(rm); + return (ret); +} + +/* + * Reply to a ping packet. + */ +int +rdsv3_send_pong(struct rdsv3_connection *conn, uint16_be_t dport) +{ + struct rdsv3_message *rm; + int ret = 0; + + RDSV3_DPRINTF4("rdsv3_send_pong", "Enter(conn: %p)", conn); + + rm = rdsv3_message_alloc(0, KM_NOSLEEP); + if (rm == NULL) { + ret = -ENOMEM; + goto out; + } + + rm->m_daddr = conn->c_faddr; + + /* + * If the connection is down, trigger a connect. We may + * have scheduled a delayed reconnect however - in this case + * we should not interfere. 
+ */ + if (rdsv3_conn_state(conn) == RDSV3_CONN_DOWN && + !test_and_set_bit(RDSV3_RECONNECT_PENDING, &conn->c_flags)) + rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_conn_w, 0); + + ret = rdsv3_cong_wait(conn->c_fcong, dport, 1, NULL); + if (ret) + goto out; + + mutex_enter(&conn->c_lock); + list_insert_tail(&conn->c_send_queue, rm); + set_bit(RDSV3_MSG_ON_CONN, &rm->m_flags); + rdsv3_message_addref(rm); + rm->m_inc.i_conn = conn; + + rdsv3_message_populate_header(&rm->m_inc.i_hdr, 0, dport, + conn->c_next_tx_seq); + conn->c_next_tx_seq++; + mutex_exit(&conn->c_lock); + + rdsv3_stats_inc(s_send_queued); + rdsv3_stats_inc(s_send_pong); + + rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_send_w, 0); + rdsv3_message_put(rm); + + RDSV3_DPRINTF4("rdsv3_send_pong", "Return(conn: %p)", conn); + return (0); + +out: + if (rm) + rdsv3_message_put(rm); + return (ret); +} diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/stats.c b/usr/src/uts/common/io/ib/clients/rdsv3/stats.c new file mode 100644 index 0000000000..0082657127 --- /dev/null +++ b/usr/src/uts/common/io/ib/clients/rdsv3/stats.c @@ -0,0 +1,174 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ +#include <sys/rds.h> + +#include <sys/ib/clients/rdsv3/rdsv3.h> + +RDSV3_DEFINE_PER_CPU(struct rdsv3_statistics, rdsv3_stats); + +static char *rdsv3_stat_names[] = { + "conn_reset", + "recv_drop_bad_checksum", + "recv_drop_old_seq", + "recv_drop_no_sock", + "recv_drop_dead_sock", + "recv_deliver_raced", + "recv_delivered", + "recv_queued", + "recv_immediate_retry", + "recv_delayed_retry", + "recv_ack_required", + "recv_rdma_bytes", + "recv_ping", + "send_queue_empty", + "send_queue_full", + "send_sem_contention", + "send_sem_queue_raced", + "send_immediate_retry", + "send_delayed_retry", + "send_drop_acked", + "send_ack_required", + "send_queued", + "send_rdma", + "send_rdma_bytes", + "send_pong", + "page_remainder_hit", + "page_remainder_miss", + "copy_to_user", + "copy_from_user", + "cong_update_queued", + "cong_update_received", + "cong_send_error", + "cong_send_blocked", +}; + +void +rdsv3_stats_info_copy(struct rdsv3_info_iterator *iter, + uint64_t *values, char **names, size_t nr) +{ + struct rdsv3_info_counter ctr; + size_t i; + + for (i = 0; i < nr; i++) { + ASSERT(!(strlen(names[i]) >= sizeof (ctr.name))); + (void) strncpy((char *)ctr.name, names[i], + sizeof (ctr.name) - 1); + ctr.value = values[i]; + + rdsv3_info_copy(iter, &ctr, sizeof (ctr)); + } +} + +/* + * This gives global counters across all the transports. The strings + * are copied in so that the tool doesn't need knowledge of the specific + * stats that we're exporting. Some are pretty implementation dependent + * and may change over time. That doesn't stop them from being useful. + * + * This is the only function in the chain that knows about the byte granular + * length in userspace. It converts it to number of stat entries that the + * rest of the functions operate in. + */ +/* ARGSUSED */ +static void +rdsv3_stats_info(struct rsock *sock, unsigned int len, + struct rdsv3_info_iterator *iter, + struct rdsv3_info_lengths *lens) +{ + struct rdsv3_statistics stats = {0, }; + uint64_t *src; + uint64_t *sum; + size_t i; + int cpu; + unsigned int avail; + + avail = len / sizeof (struct rdsv3_info_counter); + + if (avail < ARRAY_SIZE(rdsv3_stat_names)) { + avail = 0; + goto trans; + } + + for (cpu = 0; cpu < NR_CPUS; cpu++) { + src = (uint64_t *)&(rdsv3_per_cpu(rdsv3_stats, cpu)); + sum = (uint64_t *)&stats; + for (i = 0; i < sizeof (stats) / sizeof (uint64_t); i++) + *(sum++) += *(src++); + } + + rdsv3_stats_info_copy(iter, (uint64_t *)&stats, rdsv3_stat_names, + ARRAY_SIZE(rdsv3_stat_names)); + avail -= ARRAY_SIZE(rdsv3_stat_names); + +trans: + lens->each = sizeof (struct rdsv3_info_counter); + lens->nr = rdsv3_trans_stats_info_copy(iter, avail) + + ARRAY_SIZE(rdsv3_stat_names); +} + +void +rdsv3_stats_exit(void) +{ + rdsv3_info_deregister_func(RDSV3_INFO_COUNTERS, rdsv3_stats_info); +} + +int +rdsv3_stats_init(void) +{ + rdsv3_info_register_func(RDSV3_INFO_COUNTERS, rdsv3_stats_info); + return (0); +} diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/sysctl.c b/usr/src/uts/common/io/ib/clients/rdsv3/sysctl.c new file mode 100644 index 0000000000..3115394d0e --- /dev/null +++ b/usr/src/uts/common/io/ib/clients/rdsv3/sysctl.c @@ -0,0 +1,86 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#define HZ 100 +#define msecs_to_jiffies(a) a + +static unsigned long rdsv3_sysctl_reconnect_min = 1; + +unsigned long rdsv3_sysctl_reconnect_min_jiffies; +unsigned long rdsv3_sysctl_reconnect_max_jiffies = HZ; + +unsigned int rdsv3_sysctl_max_unacked_packets = 8; +unsigned int rdsv3_sysctl_max_unacked_bytes = (16 << 20); + +unsigned int rdsv3_sysctl_ping_enable = 1; + +unsigned long rdsv3_sysctl_trace_flags = 0; +unsigned int rdsv3_sysctl_trace_level = 0; + +void +rdsv3_sysctl_exit(void) +{ +} + +int +rdsv3_sysctl_init(void) +{ + rdsv3_sysctl_reconnect_min = msecs_to_jiffies(1); + rdsv3_sysctl_reconnect_min_jiffies = rdsv3_sysctl_reconnect_min; + + return (0); +} diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/threads.c b/usr/src/uts/common/io/ib/clients/rdsv3/threads.c new file mode 100644 index 0000000000..3b3bceee96 --- /dev/null +++ b/usr/src/uts/common/io/ib/clients/rdsv3/threads.c @@ -0,0 +1,356 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <sys/rds.h> +#include <sys/sunddi.h> + +#include <sys/ib/clients/rdsv3/rdsv3.h> +#include <sys/ib/clients/rdsv3/rdsv3_debug.h> + +/* + * All of connection management is simplified by serializing it through + * work queues that execute in a connection managing thread. + * + * TCP wants to send acks through sendpage() in response to data_ready(), + * but it needs a process context to do so. + * + * The receive paths need to allocate but can't drop packets (!) so we have + * a thread around to block allocating if the receive fast path sees an + * allocation failure. + */ + +/* + * Grand Unified Theory of connection life cycle: + * At any point in time, the connection can be in one of these states: + * DOWN, CONNECTING, UP, DISCONNECTING, ERROR + * + * The following transitions are possible: + * ANY -> ERROR + * UP -> DISCONNECTING + * ERROR -> DISCONNECTING + * DISCONNECTING -> DOWN + * DOWN -> CONNECTING + * CONNECTING -> UP + * + * Transition to state DISCONNECTING/DOWN: + * - Inside the shutdown worker; synchronizes with xmit path + * through c_send_lock, and with connection management callbacks + * via c_cm_lock. + * + * For receive callbacks, we rely on the underlying transport + * (TCP, IB/RDMA) to provide the necessary synchronisation. 
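+ *
+ * Each edge is claimed with a compare-and-swap on the connection state;
+ * sketched (illustrative pattern only, mirroring the workers below):
+ *
+ *	if (rdsv3_conn_transition(conn, RDSV3_CONN_DOWN,
+ *	    RDSV3_CONN_CONNECTING)) {
+ *		ret = conn->c_trans->conn_connect(conn);
+ *		...
+ *	}
+ *
+ * so at most one thread can own any given transition at a time.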
+ */ +struct rdsv3_workqueue_struct_s *rdsv3_wq; + +void +rdsv3_connect_complete(struct rdsv3_connection *conn) +{ + RDSV3_DPRINTF4("rdsv3_connect_complete", "Enter(conn: %p)", conn); + + if (!rdsv3_conn_transition(conn, RDSV3_CONN_CONNECTING, + RDSV3_CONN_UP)) { +#ifndef __lock_lint + RDSV3_DPRINTF0("rdsv3_connect_complete", + "%s: Cannot transition to state UP, " + "current state is %d", + __func__, + atomic_get(&conn->c_state)); +#endif + conn->c_state = RDSV3_CONN_ERROR; + rdsv3_queue_work(rdsv3_wq, &conn->c_down_w); + return; + } + + RDSV3_DPRINTF2("rdsv3_connect_complete", + "conn %p for %u.%u.%u.%u to %u.%u.%u.%u complete", + conn, NIPQUAD(conn->c_laddr), NIPQUAD(conn->c_faddr)); + + conn->c_reconnect_jiffies = 0; + set_bit(0, &conn->c_map_queued); + rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_send_w, 0); + rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_recv_w, 0); + + RDSV3_DPRINTF4("rdsv3_connect_complete", "Return(conn: %p)", conn); +} + +/* + * This random exponential backoff is relied on to eventually resolve racing + * connects. + * + * If connect attempts race then both parties drop both connections and come + * here to wait for a random amount of time before trying again. Eventually + * the backoff range will be so much greater than the time it takes to + * establish a connection that one of the pair will establish the connection + * before the other's random delay fires. + * + * Connection attempts that arrive while a connection is already established + * are also considered to be racing connects. This lets a connection from + * a rebooted machine replace an existing stale connection before the transport + * notices that the connection has failed. + * + * We should *always* start with a random backoff; otherwise a broken connection + * will always take several iterations to be re-established. 
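+ *
+ * Worked example with the defaults from sysctl.c in this change
+ * (HZ == 100): the first retry fires after
+ * rdsv3_sysctl_reconnect_min_jiffies (1 jiffy); each later retry is
+ * delayed by a random value modulo the current backoff, and the backoff
+ * doubles each round, capped at rdsv3_sysctl_reconnect_max_jiffies,
+ * giving delay ceilings of roughly 1, 2, 4, 8, ... 100 jiffies.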
+ */ +static void +rdsv3_queue_reconnect(struct rdsv3_connection *conn) +{ + unsigned long rand; + + RDSV3_DPRINTF2("rdsv3_queue_reconnect", + "conn %p for %u.%u.%u.%u to %u.%u.%u.%u reconnect jiffies %lu", + conn, NIPQUAD(conn->c_laddr), NIPQUAD(conn->c_faddr), + conn->c_reconnect_jiffies); + + set_bit(RDSV3_RECONNECT_PENDING, &conn->c_flags); + if (conn->c_reconnect_jiffies == 0) { + conn->c_reconnect_jiffies = rdsv3_sysctl_reconnect_min_jiffies; + rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_conn_w, 0); + return; + } + + (void) random_get_pseudo_bytes((uint8_t *)&rand, sizeof (rand)); + RDSV3_DPRINTF5("rdsv3", + "%lu delay %lu ceil conn %p for %u.%u.%u.%u -> %u.%u.%u.%u", + rand % conn->c_reconnect_jiffies, conn->c_reconnect_jiffies, + conn, NIPQUAD(conn->c_laddr), NIPQUAD(conn->c_faddr)); + rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_conn_w, + rand % conn->c_reconnect_jiffies); + + conn->c_reconnect_jiffies = min(conn->c_reconnect_jiffies * 2, + rdsv3_sysctl_reconnect_max_jiffies); +} + +void +rdsv3_connect_worker(struct rdsv3_work_s *work) +{ + struct rdsv3_connection *conn = container_of(work, + struct rdsv3_connection, c_conn_w.work); + int ret; + + RDSV3_DPRINTF2("rdsv3_connect_worker", "Enter(work: %p)", work); + + clear_bit(RDSV3_RECONNECT_PENDING, &conn->c_flags); + if (rdsv3_conn_transition(conn, RDSV3_CONN_DOWN, + RDSV3_CONN_CONNECTING)) { + ret = conn->c_trans->conn_connect(conn); + RDSV3_DPRINTF5("rdsv3", + "connect conn %p for %u.%u.%u.%u -> %u.%u.%u.%u " + "ret %d", conn, NIPQUAD(conn->c_laddr), + NIPQUAD(conn->c_faddr), ret); + RDSV3_DPRINTF2("rdsv3_connect_worker", + "conn %p for %u.%u.%u.%u to %u.%u.%u.%u dispatched, ret %d", + conn, NIPQUAD(conn->c_laddr), NIPQUAD(conn->c_faddr), ret); + + if (ret) { + if (rdsv3_conn_transition(conn, RDSV3_CONN_CONNECTING, + RDSV3_CONN_DOWN)) + rdsv3_queue_reconnect(conn); + else { + RDSV3_DPRINTF2("rdsv3_connect_worker", + "RDS: connect failed: %p", conn); + rdsv3_conn_drop(conn); + } + } + } + + RDSV3_DPRINTF2("rdsv3_connect_worker", "Return(work: %p)", work); +} + +extern struct avl_tree rdsv3_conn_hash; + +void +rdsv3_shutdown_worker(struct rdsv3_work_s *work) +{ + struct rdsv3_connection *conn = container_of(work, + struct rdsv3_connection, c_down_w); + struct rdsv3_conn_info_s conn_info; + + RDSV3_DPRINTF2("rdsv3_shutdown_worker", "Enter(work: %p)", work); + + /* shut it down unless it's down already */ + if (!rdsv3_conn_transition(conn, RDSV3_CONN_DOWN, RDSV3_CONN_DOWN)) { + /* + * Quiesce the connection mgmt handlers before we start tearing + * things down. We don't hold the mutex for the entire + * duration of the shutdown operation, else we may be + * deadlocking with the CM handler. Instead, the CM event + * handler is supposed to check for state DISCONNECTING + */ + mutex_enter(&conn->c_cm_lock); + if (!rdsv3_conn_transition(conn, RDSV3_CONN_UP, + RDSV3_CONN_DISCONNECTING) && + !rdsv3_conn_transition(conn, RDSV3_CONN_ERROR, + RDSV3_CONN_DISCONNECTING)) { + RDSV3_DPRINTF2("rdsv3_shutdown_worker", + "RDS: connect failed: conn: %p, state: %d", + conn, atomic_get(&conn->c_state)); + rdsv3_conn_drop(conn); + mutex_exit(&conn->c_cm_lock); + return; + } + mutex_exit(&conn->c_cm_lock); + + mutex_enter(&conn->c_send_lock); + conn->c_trans->conn_shutdown(conn); + rdsv3_conn_reset(conn); + mutex_exit(&conn->c_send_lock); + + if (!rdsv3_conn_transition(conn, RDSV3_CONN_DISCONNECTING, + RDSV3_CONN_DOWN)) { + /* + * This can happen - eg when we're in the middle of + * tearing down the connection, and someone unloads + * the rds module. 
Quite reproducible with loopback
+			 * connections.  Mostly harmless.
+			 */
+#ifndef __lock_lint
+			RDSV3_DPRINTF2("rdsv3_shutdown_worker",
+			    "failed to transition to state DOWN, "
+			    "current state is: %d conn: %p",
+			    atomic_get(&conn->c_state), conn);
+			rdsv3_conn_drop(conn);
+#endif
+			return;
+		}
+	}
+
+	/*
+	 * Then reconnect if it's still live.
+	 * The passive side of an IB loopback connection is never added
+	 * to the conn hash, so we never trigger a reconnect on this
+	 * conn - the reconnect is always triggered by the active peer.
+	 */
+	rdsv3_cancel_delayed_work(&conn->c_conn_w);
+
+	conn_info.c_laddr = conn->c_laddr;
+	conn_info.c_faddr = conn->c_faddr;
+	if (avl_find(&rdsv3_conn_hash, &conn_info, NULL) == conn)
+		rdsv3_queue_reconnect(conn);
+
+	RDSV3_DPRINTF2("rdsv3_shutdown_worker", "Return(work: %p)", work);
+}
+
+void
+rdsv3_send_worker(struct rdsv3_work_s *work)
+{
+	struct rdsv3_connection *conn = container_of(work,
+	    struct rdsv3_connection, c_send_w.work);
+	int ret;
+
+	RDSV3_DPRINTF4("rdsv3_send_worker", "Enter(work: %p)", work);
+
+	if (rdsv3_conn_state(conn) == RDSV3_CONN_UP) {
+		ret = rdsv3_send_xmit(conn);
+		RDSV3_DPRINTF5("rdsv3", "conn %p ret %d", conn, ret);
+		switch (ret) {
+		case -EAGAIN:
+			rdsv3_stats_inc(s_send_immediate_retry);
+			rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_send_w, 0);
+			break;
+		case -ENOMEM:
+			rdsv3_stats_inc(s_send_delayed_retry);
+			rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_send_w, 2);
+			/* FALLTHRU */
+		default:
+			break;
+		}
+	}
+
+	RDSV3_DPRINTF4("rdsv3_send_worker", "Return(work: %p)", work);
+}
+
+void
+rdsv3_recv_worker(struct rdsv3_work_s *work)
+{
+	struct rdsv3_connection *conn = container_of(work,
+	    struct rdsv3_connection, c_recv_w.work);
+	int ret;
+
+	RDSV3_DPRINTF4("rdsv3_recv_worker", "Enter(work: %p)", work);
+
+	if (rdsv3_conn_state(conn) == RDSV3_CONN_UP) {
+		ret = conn->c_trans->recv(conn);
+		RDSV3_DPRINTF5("rdsv3", "conn %p ret %d", conn, ret);
+		switch (ret) {
+		case -EAGAIN:
+			rdsv3_stats_inc(s_recv_immediate_retry);
+			rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_recv_w, 0);
+			break;
+		case -ENOMEM:
+			rdsv3_stats_inc(s_recv_delayed_retry);
+			rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_recv_w, 2);
+			/* FALLTHRU */
+		default:
+			break;
+		}
+	}
+
+	RDSV3_DPRINTF4("rdsv3_recv_worker", "Return(work: %p)", work);
+}
+
+void
+rdsv3_threads_exit(void)
+{
+	rdsv3_destroy_task_workqueue(rdsv3_wq);
+}
+
+int
+rdsv3_threads_init(void)
+{
+	rdsv3_wq = rdsv3_create_task_workqueue("krdsd");
+	if (rdsv3_wq == NULL)
+		return (-ENOMEM);
+
+	return (0);
+}
diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/transport.c b/usr/src/uts/common/io/ib/clients/rdsv3/transport.c
new file mode 100644
index 0000000000..519442a392
--- /dev/null
+++ b/usr/src/uts/common/io/ib/clients/rdsv3/transport.c
@@ -0,0 +1,142 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <sys/ksynch.h> +#include <sys/list.h> +#include <sys/rds.h> +#include <sys/sysmacros.h> + +#include <sys/ib/clients/rdsv3/rdsv3.h> +#include <sys/ib/clients/rdsv3/loop.h> +#include <sys/ib/clients/rdsv3/rdsv3_impl.h> +#include <sys/ib/clients/rdsv3/rdsv3_debug.h> + +list_t transports; +krwlock_t trans_sem; /* this was a semaphore */ + +int +rdsv3_trans_register(struct rdsv3_transport *trans) +{ + RDSV3_DPRINTF4("rdsv3_trans_register", "Enter(trans: %p)", trans); + + rw_enter(&trans_sem, RW_WRITER); + + list_insert_tail(&transports, trans); + + rw_exit(&trans_sem); + + RDSV3_DPRINTF4("rdsv3_trans_register", "Return(trans: %p)", trans); + + return (0); +} + +void +rdsv3_trans_unregister(struct rdsv3_transport *trans) +{ + RDSV3_DPRINTF4("rdsv3_trans_register", "Enter(trans: %p)", trans); + + rw_enter(&trans_sem, RW_WRITER); + + list_remove(&transports, trans); + + rw_exit(&trans_sem); + + RDSV3_DPRINTF4("rdsv3_trans_register", "Return(trans: %p)", trans); +} + +struct rdsv3_transport * +rdsv3_trans_get_preferred(uint32_be_t addr) +{ + struct rdsv3_transport *trans; + struct rdsv3_transport *ret = NULL; + + RDSV3_DPRINTF4("rdsv3_trans_get_preferred", "Enter(addr: %x)", + ntohl(addr)); + + if (rdsv3_isloopback(addr)) + return (&rdsv3_loop_transport); + + rw_enter(&trans_sem, RW_READER); + RDSV3_FOR_EACH_LIST_NODE(trans, &transports, t_item) { + if (trans->laddr_check(addr) == 0) { + ret = trans; + break; + } + } + rw_exit(&trans_sem); + + RDSV3_DPRINTF4("rdsv3_trans_get_preferred", + "Return(addr: %x, ret: %p)", ntohl(addr), ret); + + return (ret); +} + +/* + * This returns the number of stats entries in the snapshot and only + * copies them using the iter if there is enough space for them. The + * caller passes in the global stats so that we can size and copy while + * holding the lock. 
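+ *
+ * A rough sketch of what the ported version might do (illustrative
+ * only; the per-transport stats_info_copy hook is an assumed name, and
+ * the real work is deferred until info.c is ported):
+ *
+ *	struct rdsv3_transport *trans;
+ *	unsigned int total = 0;
+ *
+ *	rw_enter(&trans_sem, RW_READER);
+ *	RDSV3_FOR_EACH_LIST_NODE(trans, &transports, t_item) {
+ *		if (trans->stats_info_copy)
+ *			total += trans->stats_info_copy(iter, avail - total);
+ *	}
+ *	rw_exit(&trans_sem);
+ *	return (total);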
+ */ +/* ARGSUSED */ +unsigned int +rdsv3_trans_stats_info_copy(struct rdsv3_info_iterator *iter, + unsigned int avail) +{ + /* + * XXX - Add this when we port info (info.c) + */ + return (0); +} diff --git a/usr/src/uts/common/io/warlock/rdsv3.wlcmd b/usr/src/uts/common/io/warlock/rdsv3.wlcmd new file mode 100644 index 0000000000..8b50fb6de5 --- /dev/null +++ b/usr/src/uts/common/io/warlock/rdsv3.wlcmd @@ -0,0 +1,365 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. +# + +# entry points +root _init +root _info +root _fini +root __rdsv3_conn_create +root __rdsv3_conn_error +root __rdsv3_ib_conn_error +root __rdsv3_ib_destroy_conns +root __rdsv3_ib_ring_empty +root __rdsv3_ib_ring_used +root __rdsv3_ib_teardown_mr +root __rdsv3_put_mr_final +root __rdsv3_rdma_map +root __rdsv3_rdma_send_complete +root __rdsv3_wake_sk_sleep +root _fini +root _info +root _init +root ib_addr_get_dgid +root ib_addr_get_mgid +root ib_addr_get_pkey +root ib_addr_get_sgid +root ib_addr_set_dgid +root ib_addr_set_pkey +root ib_addr_set_sgid +root ib_width_enum_to_int +root init_genlist +root ip_addr_size +root rdsv3_activate +root rdsv3_add_bound +root rdsv3_attach +root rdsv3_bind +root rdsv3_bind_node_compare +root rdsv3_bind_tree_exit +root rdsv3_bind_tree_init +root rdsv3_bind_tree_walk +root rdsv3_cancel_delayed_work +root rdsv3_capable_interface +root rdsv3_clear_print_buf +root rdsv3_clear_recv_queue +root rds_clif_name +root rdsv3_clrflowctrl +root rdsv3_cmsg_rdma_args +root rdsv3_cmsg_rdma_dest +root rdsv3_cmsg_rdma_map +root rdsv3_cmsg_recv +root rdsv3_cmsg_send +root rdsv3_cong_add_conn +root rdsv3_cong_add_socket +root rdsv3_cong_clear_bit +root rdsv3_cong_compare +root rdsv3_cong_exit +root rdsv3_cong_from_addr +root rdsv3_cong_get_maps +root rdsv3_cong_init +root rdsv3_cong_map_updated +root rdsv3_cong_monitor +root rdsv3_cong_queue_updates +root rdsv3_cong_remove_conn +root rdsv3_cong_remove_socket +root rdsv3_cong_set_bit +root rdsv3_cong_test_bit +root rdsv3_cong_tree_walk +root rdsv3_cong_update_alloc +root rdsv3_cong_updated_since +root rdsv3_cong_wait +root rdsv3_conn_compare +root rdsv3_conn_constructor +root rdsv3_conn_create +root rdsv3_conn_create_outgoing +root rdsv3_conn_destroy +root rdsv3_conn_destructor +root rdsv3_conn_drop +root rdsv3_conn_exit +root rdsv3_conn_info +root rdsv3_conn_info_visitor +root rdsv3_conn_init +root rdsv3_conn_is_sending +root rdsv3_conn_lookup +root rdsv3_conn_message_info +root rdsv3_conn_message_info_retrans +root rdsv3_conn_message_info_send +root rdsv3_conn_transition +root rdsv3_connect +root rdsv3_connect_complete +root rdsv3_connect_worker +root rdsv3_create_singlethread_workqueue +root rdsv3_destroy_mr +root 
rdsv3_destroy_workqueue +root rdsv3_detach +root rdsv3_do_ip_ioctl +root rdsv3_dprintf_intr +root rdsv3_dprintf0 +root rdsv3_dprintf1 +root rdsv3_dprintf2 +root rdsv3_dprintf3 +root rdsv3_dprintf4 +root rdsv3_dprintf5 +root rdsv3_exit +root rdsv3_fast_ip_csum +root rdsv3_find_bound +root rdsv3_flush_workqueue +root rdsv3_for_each_conn_info +root rdsv3_free_mr +root rdsv3_get_mr +root rdsv3_getname +root rdsv3_getpeername +root rdsv3_getsockopt +root rdsv3_ib_ack_send_complete +root rdsv3_ib_add_conn +root rdsv3_ib_add_ipaddr +root rdsv3_ib_add_one +root rdsv3_ib_advertise_credits +root rdsv3_ib_alloc_fmr +root rdsv3_ib_alloc_hdrs +root rdsv3_ib_attempt_ack +root rdsv3_ib_cm_connect_complete +root rdsv3_ib_cm_fill_conn_param +root rdsv3_ib_cm_handle_connect +root rdsv3_ib_cm_initiate_connect +root rdsv3_ib_cong_recv +root rdsv3_ib_conn_alloc +root rdsv3_ib_conn_connect +root rdsv3_ib_conn_free +root rdsv3_ib_conn_info_visitor +root rdsv3_ib_conn_shutdown +root rdsv3_ib_cq_event_handler +root rdsv3_ib_destroy_conns +root rdsv3_ib_destroy_nodev_conns +root rdsv3_ib_dma_map_sg +root rdsv3_ib_dma_map_sg_rdma +root rdsv3_ib_dma_unmap_sg +root rdsv3_ib_dma_unmap_sg_rdma +root rdsv3_ib_exit +root rdsv3_ib_flush_mr_pool +root rdsv3_ib_flush_mrs +root rdsv3_ib_frag_drop_page +root rdsv3_ib_frag_free +root rdsv3_ib_free_hdrs +root rdsv3_ib_free_mr +root rdsv3_ib_get_ack +root rdsv3_ib_get_device +root rdsv3_ib_get_header +root rdsv3_ib_get_mr +root rdsv3_ib_get_mr_info +root rdsv3_ib_ic_info +root rdsv3_ib_inc_copy_to_user +root rdsv3_ib_inc_free +root rdsv3_ib_inc_purge +root rdsv3_ib_init +root rdsv3_ib_laddr_check +root rdsv3_ib_map_fmr +root rdsv3_ib_piggyb_ack +root rdsv3_ib_process_recv +root rdsv3_ib_protocol_compatible +root rdsv3_ib_qp_event_handler +root rdsv3_ib_recv +root rdsv3_ib_recv_clear_one +root rdsv3_ib_recv_clear_ring +root rdsv3_ib_recv_cq_comp_handler +root rdsv3_ib_recv_exit +root rdsv3_ib_recv_init +root rdsv3_ib_recv_init_ack +root rdsv3_ib_recv_init_ring +root rdsv3_ib_recv_refill +root rdsv3_ib_recv_refill_one +root rdsv3_ib_recv_unmap_page +root rdsv3_ib_remove_conn +root rdsv3_ib_remove_ipaddr +root rdsv3_ib_remove_one +root rdsv3_ib_ring_alloc +root rdsv3_ib_ring_completed +root rdsv3_ib_ring_empty +root rdsv3_ib_ring_free +root rdsv3_ib_ring_init +root rdsv3_ib_ring_low +root rdsv3_ib_ring_oldest +root rdsv3_ib_ring_resize +root rdsv3_ib_ring_unalloc +root rdsv3_ib_send_ack +root rdsv3_ib_send_add_credits +root rdsv3_ib_send_clear_ring +root rdsv3_ib_send_cq_comp_handler +root rdsv3_ib_send_grab_credits +root rdsv3_ib_send_init_ring +root rdsv3_ib_send_rdma_complete +root rdsv3_ib_send_unmap_rdma +root rdsv3_ib_send_unmap_rm +root rdsv3_ib_set_ack +root rdsv3_ib_set_flow_control +root rdsv3_ib_set_protocol +root rdsv3_ib_setup_qp +root rdsv3_ib_sg_dma_address +root rdsv3_ib_stats_info_copy +root rdsv3_ib_sync_mr +root rdsv3_ib_sysctl_exit +root rdsv3_ib_sysctl_init +root rdsv3_ib_tune_rnr +root rdsv3_ib_update_ipaddr +root rdsv3_ib_xmit +root rdsv3_ib_xmit_complete +root rdsv3_ib_xmit_populate_wr +root rdsv3_ib_xmit_rdma +root rdsv3_if_lookup_by_addr +root rdsv3_if_lookup_by_name +root rdsv3_inc_addref +root rdsv3_inc_info_copy +root rdsv3_inc_init +root rdsv3_inc_put +root rdsv3_info +root rdsv3_info_deregister_func +root rdsv3_info_getsockopt +root rdsv3_info_register_func +root rdsv3_init +root rdsv3_ioctl +root rdsv3_logging_destroy +root rdsv3_logging_initialization +root rdsv3_loop_conn_alloc +root rdsv3_loop_conn_connect +root rdsv3_loop_conn_free +root 
rdsv3_loop_conn_shutdown +root rdsv3_loop_exit +root rdsv3_loop_init +root rdsv3_loop_recv +root rdsv3_loop_xmit +root rdsv3_loop_xmit_cong_map +root rdsv3_message_add_extension +root rdsv3_message_add_rdma_dest_extension +root rdsv3_message_add_version_extension +root rdsv3_message_addref +root rdsv3_message_alloc +root rdsv3_message_copy_from_user +root rdsv3_message_get_version_extension +root rdsv3_message_inc_copy_to_user +root rdsv3_message_inc_free +root rdsv3_message_inc_purge +root rdsv3_message_map_pages +root rdsv3_message_next_extension +root rdsv3_message_populate_header +root rdsv3_message_purge +root rdsv3_message_put +root rdsv3_message_unmapped +root rdsv3_message_wait +root rdsv3_mr_compare +root rdsv3_mr_put +root rdsv3_mr_tree_walk +root rdsv3_next_incoming +root rdsv3_notify_cong +root rdsv3_notify_queue_get +root rdsv3_ntop +root rdsv3_page_remainder_alloc +root rdsv3_pages_in_vec +root rds_path_down +root rds_path_up +root rdsv3_poll +root rdsv3_poll_wait +root rdsv3_put_cmsg +root rdsv3_queue_delayed_work +root rdsv3_queue_reconnect +root rdsv3_queue_work +root rdsv3_rdma_cm_event_handler +root rdsv3_rdma_drop_keys +root rdsv3_rdma_exit +root rdsv3_rdma_free_op +root rdsv3_rdma_init +root rdsv3_rdma_listen_init +root rdsv3_rdma_listen_stop +root rdsv3_rdma_prepare +root rdsv3_rdma_send_complete +root rdsv3_rdma_unuse +root rdsv3_recv_incoming +root rdsv3_recv_incoming_exthdrs +root rdsv3_recv_rcvbuf_delta +root rdsv3_recv_uio +root rdsv3_recv_worker +root rdsv3_recvmsg +root rdsv3_release +root rdsv3_remove_bound +root rds_sc_path_lookup +root rdsv3_scaddr_to_ibaddr +root rdsv3_send_acked_before +root rdsv3_send_drop_acked +root rdsv3_send_drop_to +root rdsv3_send_get_message +root rdsv3_send_is_acked +root rdsv3_send_pong +root rdsv3_send_queue_rm +root rdsv3_send_remove_from_sock +root rdsv3_send_reset +root rdsv3_send_sndbuf_remove +root rdsv3_send_uio +root rdsv3_send_worker +root rdsv3_send_xmit +root rdsv3_sendmsg +root rdsv3_set_bool_option +root rdsv3_setsockopt +root rdsv3_shutdown +root rdsv3_shutdown_worker +root rdsv3_sk_alloc +root rdsv3_sock_addref +root rdsv3_sock_exit +root rdsv3_sock_exit_data +root rdsv3_sock_inc_info +root rdsv3_sock_info +root rdsv3_sock_init +root rdsv3_sock_init_data +root rdsv3_sock_put +root rdsv3_stats_exit +root rdsv3_stats_info +root rdsv3_stats_info_copy +root rdsv3_stats_init +root rdsv3_still_queued +root rdsv3_sysctl_exit +root rdsv3_sysctl_init +root rdsv3_threads_exit +root rdsv3_threads_init +root rdsv3_trace +root rdsv3_trans_exit +root rdsv3_trans_get_preferred +root rdsv3_trans_init +root rdsv3_trans_register +root rdsv3_trans_stats_info_copy +root rdsv3_trans_unregister +root rdsv3_umem_cb +root rdsv3_verify_bind_address +root rdsv3_vlog +root rdsv3_vprintk +root rdsv3_wake_sk_sleep +root rdsv3_work_timeout_handler +root rdsv3_worker_thread +root rdsv3_create +root rdsv3_isloopback + +add bus_ops::bus_add_eventcall targets warlock_dummy +add bus_ops::bus_config targets warlock_dummy +add bus_ops::bus_get_eventcookie targets warlock_dummy +add bus_ops::bus_intr_ctl targets warlock_dummy +add bus_ops::bus_post_event targets warlock_dummy +add bus_ops::bus_remove_eventcall targets warlock_dummy +add bus_ops::bus_unconfig targets warlock_dummy + + diff --git a/usr/src/uts/common/sys/Makefile b/usr/src/uts/common/sys/Makefile index 677a328a49..a130c54ac0 100644 --- a/usr/src/uts/common/sys/Makefile +++ b/usr/src/uts/common/sys/Makefile @@ -465,6 +465,7 @@ CHKHDRS= \ random.h \ rctl.h \ rctl_impl.h \ + rds.h \ 
reboot.h \ refstr.h \ refstr_impl.h \ @@ -754,7 +755,8 @@ SOL_UCMAHDRS= \ SOL_OFSHDRS= \ sol_cma.h \ sol_ib_cma.h \ - sol_ofs_common.h + sol_ofs_common.h \ + sol_kverb_impl.h TAVORHDRS= \ tavor_ioctl.h diff --git a/usr/src/uts/common/sys/ib/clients/of/rdma/ib_verbs.h b/usr/src/uts/common/sys/ib/clients/of/rdma/ib_verbs.h index 4f6c5f6829..0106d80848 100644 --- a/usr/src/uts/common/sys/ib/clients/of/rdma/ib_verbs.h +++ b/usr/src/uts/common/sys/ib/clients/of/rdma/ib_verbs.h @@ -649,6 +649,135 @@ typedef struct ib_client { } state; } ib_client_t; +int ib_register_client(struct ib_client *client); +void ib_unregister_client(struct ib_client *client); + +void *ib_get_client_data(struct ib_device *device, struct ib_client *client); +void ib_set_client_data(struct ib_device *device, struct ib_client *client, + void *data); + +int ib_query_device(struct ib_device *device, + struct ib_device_attr *device_attr); + +/* + * ib_alloc_pd - Allocates an unused protection domain. + * @device: The device on which to allocate the protection domain. + * + * A protection domain object provides an association between QPs, shared + * receive queues, address handles, memory regions, and memory windows. + */ +struct ib_pd *ib_alloc_pd(struct ib_device *device); + +/* + * ib_dealloc_pd - Deallocates a protection domain. + * @pd: The protection domain to deallocate. + */ +int ib_dealloc_pd(struct ib_pd *pd); + +/* + * ib_create_qp - Creates a QP associated with the specified protection + * domain. + * @pd: The protection domain associated with the QP. + * @qp_init_attr: A list of initial attributes required to create the + * QP. If QP creation succeeds, then the attributes are updated to + * the actual capabilities of the created QP. + */ +struct ib_qp *ib_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *qp_init_attr); + +/* + * ib_modify_qp - Modifies the attributes for the specified QP and then + * transitions the QP to the given state. + * @qp: The QP to modify. + * @qp_attr: On input, specifies the QP attributes to modify. On output, + * the current values of selected QP attributes are returned. + * @qp_attr_mask: A bit-mask used to specify which attributes of the QP + * are being modified. + */ +int ib_modify_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, + int qp_attr_mask); + +/* + * ib_destroy_qp - Destroys the specified QP. + * @qp: The QP to destroy. + */ +int ib_destroy_qp(struct ib_qp *qp); + +/* + * ib_create_cq - Creates a CQ on the specified device. + * @device: The device on which to create the CQ. + * @comp_handler: A user-specified callback that is invoked when a + * completion event occurs on the CQ. + * @event_handler: A user-specified callback that is invoked when an + * asynchronous event not associated with a completion occurs on the CQ. + * @cq_context: Context associated with the CQ returned to the user via + * the associated completion and event handlers. + * @cqe: The minimum size of the CQ. + * @comp_vector - Completion vector used to signal completion events. + * Must be >= 0 and < context->num_comp_vectors. + * + * Users can examine the cq structure to determine the actual CQ size. + */ +struct ib_cq *ib_create_cq(struct ib_device *device, + ib_comp_handler comp_handler, + void (*event_handler)(struct ib_event *, void *), + void *cq_context, int cqe, int comp_vector); + +/* + * ib_destroy_cq - Destroys the specified CQ. + * @cq: The CQ to destroy. 
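+ *
+ * Lifecycle sketch (illustrative; my_comp_handler and my_ctx are
+ * caller-supplied assumptions):
+ *
+ *	struct ib_cq *cq;
+ *
+ *	cq = ib_create_cq(dev, my_comp_handler, NULL, my_ctx, 256, 0);
+ *	...
+ *	(void) ib_destroy_cq(cq);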
+ */ +int ib_destroy_cq(struct ib_cq *cq); + +/* + * ib_poll_cq - poll a CQ for completion(s) + * @cq:the CQ being polled + * @num_entries:maximum number of completions to return + * @wc:array of at least @num_entries &struct ib_wc where completions + * will be returned + * + * Poll a CQ for (possibly multiple) completions. If the return value + * is < 0, an error occurred. If the return value is >= 0, it is the + * number of completions returned. If the return value is + * non-negative and < num_entries, then the CQ was emptied. + */ +int ib_poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc); + +/* + * ib_req_notify_cq - Request completion notification on a CQ. + * @cq: The CQ to generate an event for. + * @flags: + * Must contain exactly one of %IB_CQ_SOLICITED or %IB_CQ_NEXT_COMP + * to request an event on the next solicited event or next work + * completion at any type, respectively. %IB_CQ_REPORT_MISSED_EVENTS + * may also be |ed in to request a hint about missed events, as + * described below. + * + * Return Value: + * < 0 means an error occurred while requesting notification + * == 0 means notification was requested successfully, and if + * IB_CQ_REPORT_MISSED_EVENTS was passed in, then no events + * were missed and it is safe to wait for another event. In + * this case is it guaranteed that any work completions added + * to the CQ since the last CQ poll will trigger a completion + * notification event. + * > 0 is only returned if IB_CQ_REPORT_MISSED_EVENTS was passed + * in. It means that the consumer must poll the CQ again to + * make sure it is empty to avoid missing an event because of a + * race between requesting notification and an entry being + * added to the CQ. This return value means it is possible + * (but not guaranteed) that a work completion has been added + * to the CQ since the last poll without triggering a + * completion notification event. + */ +int ib_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags); + +struct rdma_cm_id; +ibt_hca_hdl_t ib_get_ibt_hca_hdl(struct ib_device *device); + +ibt_channel_hdl_t +ib_get_ibt_channel_hdl(struct rdma_cm_id *cm); + #ifdef __cplusplus } #endif diff --git a/usr/src/uts/common/sys/ib/clients/of/sol_ofs/sol_cma.h b/usr/src/uts/common/sys/ib/clients/of/sol_ofs/sol_cma.h index b18d8405ce..d58e31cda2 100644 --- a/usr/src/uts/common/sys/ib/clients/of/sol_ofs/sol_cma.h +++ b/usr/src/uts/common/sys/ib/clients/of/sol_ofs/sol_cma.h @@ -140,8 +140,19 @@ typedef enum { #define SOL_CMA_DISCONNECT_OK(chanp) (((chanp)->chan_connect_flag == \ SOL_CMA_CONNECT_INITIATED) || SOL_CMAID_IS_CONNECTED(chanp)) +/* + * CMID_DESTROYED - Flag to indicate rdma_destroy_id has been + * called for this CMID + * + * EVENT_PROGRESS - RDMACM Event for this CMID been passed to + * the sol_ofs client. + * + * API_PROGRESS - rdma_resolve_addr() / rdma_resolve_route() / + * rdma_listen() is in progress. 
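+ *
+ * Sketch of how a caller might consult these flags (illustrative;
+ * chan_cmid_destroy_state is an assumed name for the field holding
+ * them):
+ *
+ *	if (chanp->chan_cmid_destroy_state &
+ *	    SOL_CMA_CALLER_API_PROGRESS) {
+ *		...defer rdma_destroy_id() until the API call returns...
+ *	}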
+ */ #define SOL_CMA_CALLER_CMID_DESTROYED 0x01 #define SOL_CMA_CALLER_EVENT_PROGRESS 0x02 +#define SOL_CMA_CALLER_API_PROGRESS 0x04 typedef enum { REQ_CMID_NONE = 0, @@ -211,6 +222,9 @@ typedef struct { /* Session ID for completion */ void *chan_session_id; + uint32_t chan_qp_num; + uint8_t chan_is_srq; + union { ibcma_chan_t chan_ib_xport; } un_xport; /* Transport specific fields */ @@ -308,10 +322,8 @@ cma_get_acpt_idp(struct rdma_cm_id *root_idp, void *qp_hdl) sol_cma_chan_t *root_chanp; root_chanp = (sol_cma_chan_t *)root_idp; - mutex_enter(&root_chanp->chan_mutex); acpt_idp = (struct rdma_cm_id *)avl_find( &root_chanp->chan_acpt_avl_tree, (void *)qp_hdl, NULL); - mutex_exit(&root_chanp->chan_mutex); return (acpt_idp); } #ifdef __cplusplus diff --git a/usr/src/uts/common/sys/ib/clients/of/sol_ofs/sol_kverb_impl.h b/usr/src/uts/common/sys/ib/clients/of/sol_ofs/sol_kverb_impl.h new file mode 100644 index 0000000000..07168b0a48 --- /dev/null +++ b/usr/src/uts/common/sys/ib/clients/of/sol_ofs/sol_kverb_impl.h @@ -0,0 +1,89 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#ifndef _SYS_IB_CLIENTS_OF_SOL_OFS_SOL_KVERB_IMPL_H +#define _SYS_IB_CLIENTS_OF_SOL_OFS_SOL_KVERB_IMPL_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/ib/ibtl/ibvti.h> + +/* + * If an async event is still outstanding, hdl is not freed. However, + * if the device state is IB_DEV_CLOSE, the device is about to be closed, + * so the event should be discarded.
+ */ +#define FIRE_QP_EVENT(clnt, hdl, ib_event, qpp, type) \ + rw_enter(&clnt->lock, RW_READER); \ + if (qpp && qpp->event_handler && \ + qpp->device->reg_state == IB_DEV_OPEN) { \ + ib_event.device = qpp->device; \ + ib_event.event = type; \ + ib_event.element.qp = qpp; \ + qpp->event_handler(&ib_event, qpp->qp_context); \ + } \ + rw_exit(&clnt->lock) + +#define FIRE_CQ_EVENT(clnt, hdl, ib_event, cqp, type) \ + rw_enter(&clnt->lock, RW_READER); \ + if (cqp && cqp->event_handler && \ + cqp->device->reg_state == IB_DEV_OPEN) { \ + ib_event.device = cqp->device; \ + ib_event.event = type; \ + ib_event.element.cq = cqp; \ + cqp->event_handler(&ib_event, cqp->cq_context); \ + } \ + rw_exit(&clnt->lock) + +#define IBTF2OF_PGSZ(hca_page_sz) ((hca_page_sz) << 10) +#define OF2IBTF_STATE(s) ((enum ibt_cep_state_e)(s)) +#define OF2IBTF_SRATE(r) ((enum ibt_srate_e)(r)) +#define OF2IBTF_PATH_MIG_STATE(s) ((ibt_cep_cmstate_t)((s)+1)) +#define OF2IBTF_PATH_MTU(m) ((ib_mtu_t)(m)) + +typedef unsigned int gfp_t; + +typedef struct sol_ofs_client_s { + ib_client_t *ib_client; + ibt_clnt_modinfo_t ibt_client; + ibt_clnt_hdl_t ibt_hdl; + uint_t hca_num; + uint_t hca_open_num; + llist_head_t device_list; + llist_head_t client_list; + krwlock_t lock; + enum { + IB_OFS_CLNT_UNINITIALIZED, + IB_OFS_CLNT_INITIALIZED + } state; +} ofs_client_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_IB_CLIENTS_OF_SOL_OFS_SOL_KVERB_IMPL_H */ diff --git a/usr/src/uts/common/sys/ib/clients/rdsv3/ib.h b/usr/src/uts/common/sys/ib/clients/rdsv3/ib.h new file mode 100644 index 0000000000..ff52bb29e2 --- /dev/null +++ b/usr/src/uts/common/sys/ib/clients/rdsv3/ib.h @@ -0,0 +1,359 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#ifndef _RDSV3_IB_H +#define _RDSV3_IB_H + +#include <sys/rds.h> + +#include <sys/ib/clients/rdsv3/rdsv3.h> +#include <sys/ib/clients/rdsv3/rdma_transport.h> + +#define RDSV3_FMR_SIZE 256 +#define RDSV3_FMR_POOL_SIZE (12 * 1024) + +#define RDSV3_IB_SEND_WRS 64 + +#define RDSV3_IB_MAX_SGE 8 +#define RDSV3_IB_RECV_SGE 2 + +#define RDSV3_IB_DEFAULT_RECV_WR 1024 +#define RDSV3_IB_DEFAULT_SEND_WR 256 + +#define RDSV3_IB_DEFAULT_RETRY_COUNT 2 + +/* minor versions supported */ +#define RDSV3_IB_SUPPORTED_PROTOCOLS 0x00000003 + +extern struct list rdsv3_ib_devices; + +/* + * IB posts RDSV3_FRAG_SIZE fragments of pages to the receive queues to + * try and minimize the amount of memory tied up in both the device and + * socket receive queues.
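/*
 * [Editor's note -- not part of this change set.] A quick sanity check
 * of the sizing: RDSV3_FRAG_SIZE is 1 << RDSV3_FRAG_SHIFT == 4 KB (see
 * rdsv3.h later in this changeset), and rdsv3_impl.h hard-codes
 * PAGE_SIZE to 4096, so RDSV3_PAGE_LAST_OFF, defined just below, works
 * out to (4096 / 4096 - 1) * 4096 == 0: exactly one fragment fits per
 * page, and that last (and only) fragment starts at offset 0.
 */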
+ */ +/* page offset of the final full frag that fits in the page */ +#define RDSV3_PAGE_LAST_OFF \ + (((PAGE_SIZE / RDSV3_FRAG_SIZE) - 1) * RDSV3_FRAG_SIZE) +struct rdsv3_page_frag { + struct list_node f_item; + caddr_t f_page; + unsigned long f_offset; + ibt_mi_hdl_t f_mapped; +}; + +struct rdsv3_ib_incoming { + struct list ii_frags; + struct rdsv3_incoming ii_inc; +}; + +struct rdsv3_ib_connect_private { + /* Add new fields at the end, and don't permute existing fields. */ + uint32_be_t dp_saddr; + uint32_be_t dp_daddr; + uint8_t dp_protocol_major; + uint8_t dp_protocol_minor; + uint16_be_t dp_protocol_minor_mask; /* bitmask */ + uint32_be_t dp_reserved1; + uint32_be_t dp_ack_seq; + uint32_be_t dp_credit; /* non-zero enables flow ctl */ +}; + +struct rdsv3_ib_send_work { + struct rdsv3_message *s_rm; + struct rdsv3_rdma_op *s_op; + ibt_wrc_opcode_t s_opcode; + unsigned long s_queued; +}; + +struct rdsv3_ib_recv_work { + struct rdsv3_ib_incoming *r_ibinc; + struct rdsv3_page_frag *r_frag; + ibt_all_wr_t r_wr; + ibt_wr_ds_t r_sge[2]; +}; + +struct rdsv3_ib_work_ring { + uint32_t w_nr; + uint32_t w_alloc_ptr; + uint32_t w_alloc_ctr; + uint32_t w_free_ptr; + atomic_t w_free_ctr; +}; + +struct rdsv3_ib_device; + +struct rdsv3_ib_connection { + + struct list_node ib_node; + struct rdsv3_ib_device *rds_ibdev; + struct rdsv3_connection *conn; + + /* alphabet soup, IBTA style */ + struct rdma_cm_id *i_cm_id; + struct ib_pd *i_pd; + struct rdsv3_hdrs_mr *i_mr; + struct ib_cq *i_send_cq; + struct ib_cq *i_recv_cq; + + /* tx */ + struct rdsv3_ib_work_ring i_send_ring; + struct rdsv3_message *i_rm; + struct rdsv3_header *i_send_hdrs; + uint64_t i_send_hdrs_dma; + struct rdsv3_ib_send_work *i_sends; + ibt_send_wr_t *i_send_wrs; + + /* rx */ + ddi_taskq_t *i_recv_tasklet; + struct mutex i_recv_mutex; + struct rdsv3_ib_work_ring i_recv_ring; + struct rdsv3_ib_incoming *i_ibinc; + uint32_t i_recv_data_rem; + struct rdsv3_header *i_recv_hdrs; + uint64_t i_recv_hdrs_dma; + struct rdsv3_ib_recv_work *i_recvs; + struct rdsv3_page_frag i_frag; + uint64_t i_ack_recv; /* last ACK received */ + processorid_t i_recv_tasklet_cpuid; + /* CPU to which the tasklet taskq should be bound */ + + /* sending acks */ + unsigned long i_ack_flags; +#ifndef KERNEL_HAS_ATOMIC64 + kmutex_t i_ack_lock; /* protect i_ack_next */ + uint64_t i_ack_next; /* next ACK to send */ +#else + atomic64_t i_ack_next; /* next ACK to send */ +#endif + struct rdsv3_header *i_ack; + ibt_send_wr_t i_ack_wr; + ibt_wr_ds_t i_ack_sge; + uint64_t i_ack_dma; + unsigned long i_ack_queued; + + /* + * Flow control related information + * + * Our algorithm uses a pair of variables that we need to access + * atomically - one for the send credits, and one for the posted + * recv credits we need to transfer to remote.
+ * Rather than protect them using a slow spinlock, we put both into + * a single atomic_t and update it using cmpxchg + */ + atomic_t i_credits; + + /* Protocol version specific information */ + unsigned int i_flowctl:1; /* enable/disable flow ctl */ + + /* Batched completions */ + unsigned int i_unsignaled_wrs; + long i_unsignaled_bytes; +}; + +/* This assumes that atomic_t is at least 32 bits */ +#define IB_GET_SEND_CREDITS(v) ((v) & 0xffff) +#define IB_GET_POST_CREDITS(v) ((v) >> 16) +#define IB_SET_SEND_CREDITS(v) ((v) & 0xffff) +#define IB_SET_POST_CREDITS(v) ((v) << 16) + +struct rdsv3_ib_ipaddr { + struct list_node list; + uint32_be_t ipaddr; +}; + +struct rdsv3_ib_device { + struct list_node list; + struct list ipaddr_list; + struct list conn_list; + ib_device_t *dev; + struct ib_pd *pd; + ibt_lkey_t local_dma_lkey; + struct rds_ib_mr_pool *mr_pool; + unsigned int fmr_max_remaps; + unsigned int max_fmrs; + unsigned int fmr_message_size; + int max_sge; + unsigned int max_wrs; + ibt_fmr_pool_hdl_t fmr_pool_hdl; + kmutex_t spinlock; /* protect the above */ + ibt_hca_attr_t hca_attr; +}; + +/* bits for i_ack_flags */ +#define IB_ACK_IN_FLIGHT 0 +#define IB_ACK_REQUESTED 1 + +/* Magic WR_ID for ACKs */ +#define RDSV3_IB_ACK_WR_ID (~(uint64_t)0) + +struct rdsv3_ib_statistics { + uint64_t s_ib_connect_raced; + uint64_t s_ib_listen_closed_stale; + uint64_t s_ib_tx_cq_call; + uint64_t s_ib_tx_cq_event; + uint64_t s_ib_tx_ring_full; + uint64_t s_ib_tx_throttle; + uint64_t s_ib_tx_sg_mapping_failure; + uint64_t s_ib_tx_stalled; + uint64_t s_ib_tx_credit_updates; + uint64_t s_ib_rx_cq_call; + uint64_t s_ib_rx_cq_event; + uint64_t s_ib_rx_ring_empty; + uint64_t s_ib_rx_refill_from_cq; + uint64_t s_ib_rx_refill_from_thread; + uint64_t s_ib_rx_alloc_limit; + uint64_t s_ib_rx_credit_updates; + uint64_t s_ib_ack_sent; + uint64_t s_ib_ack_send_failure; + uint64_t s_ib_ack_send_delayed; + uint64_t s_ib_ack_send_piggybacked; + uint64_t s_ib_ack_received; + uint64_t s_ib_rdma_mr_alloc; + uint64_t s_ib_rdma_mr_free; + uint64_t s_ib_rdma_mr_used; + uint64_t s_ib_rdma_mr_pool_flush; + uint64_t s_ib_rdma_mr_pool_wait; + uint64_t s_ib_rdma_mr_pool_depleted; +}; + +extern struct rdsv3_workqueue_struct_s *rds_ib_wq; + +/* ib.c */ +extern struct rdsv3_transport rdsv3_ib_transport; +extern void rdsv3_ib_add_one(ib_device_t *device); +extern void rdsv3_ib_remove_one(ib_device_t *device); +extern struct ib_client rdsv3_ib_client; + +extern unsigned int fmr_pool_size; +extern unsigned int fmr_message_size; +extern unsigned int rdsv3_ib_retry_count; + +extern kmutex_t ib_nodev_conns_lock; +extern struct list ib_nodev_conns; + +/* ib_cm.c */ +int rdsv3_ib_conn_alloc(struct rdsv3_connection *conn, int gfp); +void rdsv3_ib_conn_free(void *arg); +int rdsv3_ib_conn_connect(struct rdsv3_connection *conn); +void rdsv3_ib_conn_shutdown(struct rdsv3_connection *conn); +void rdsv3_conn_drop(struct rdsv3_connection *conn); +int rdsv3_ib_cm_handle_connect(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event); +int rdsv3_ib_cm_initiate_connect(struct rdma_cm_id *cm_id); +void rdsv3_ib_cm_connect_complete(struct rdsv3_connection *conn, + struct rdma_cm_event *event); + +/* ib_rdma.c */ +int rdsv3_ib_update_ipaddr(struct rdsv3_ib_device *rds_ibdev, + uint32_be_t ipaddr); +void rdsv3_ib_add_conn(struct rdsv3_ib_device *rds_ibdev, + struct rdsv3_connection *conn); +void rdsv3_ib_remove_conn(struct rdsv3_ib_device *rds_ibdev, + struct rdsv3_connection *conn); +void __rdsv3_ib_destroy_conns(struct list *list, kmutex_t 
*list_lock); +static inline void rdsv3_ib_destroy_nodev_conns(void) +{ + __rdsv3_ib_destroy_conns(&ib_nodev_conns, &ib_nodev_conns_lock); +} +static inline void rdsv3_ib_destroy_conns(struct rdsv3_ib_device *rds_ibdev) +{ + __rdsv3_ib_destroy_conns(&rds_ibdev->conn_list, &rds_ibdev->spinlock); +} + +int rdsv3_ib_create_mr_pool(struct rdsv3_ib_device *); +void rdsv3_ib_destroy_mr_pool(struct rdsv3_ib_device *); +void rdsv3_ib_get_mr_info(struct rdsv3_ib_device *rds_ibdev, + struct rdsv3_info_rdma_connection *iinfo); +void *rdsv3_ib_get_mr(struct rdsv3_iovec *args, unsigned long nents, + struct rdsv3_sock *rs, uint32_t *key_ret); +void rdsv3_ib_sync_mr(void *trans_private, int dir); +void rdsv3_ib_free_mr(void *trans_private, int invalidate); +void rdsv3_ib_flush_mrs(void); + +/* ib_recv.c */ +int rdsv3_ib_recv_init(void); +void rdsv3_ib_recv_exit(void); +int rdsv3_ib_recv(struct rdsv3_connection *conn); +int rdsv3_ib_recv_refill(struct rdsv3_connection *conn, int kptr_gfp, + int page_gfp, int prefill); +void rdsv3_ib_inc_purge(struct rdsv3_incoming *inc); +void rdsv3_ib_inc_free(struct rdsv3_incoming *inc); +int rdsv3_ib_inc_copy_to_user(struct rdsv3_incoming *inc, uio_t *uiop, + size_t size); +void rdsv3_ib_recv_cq_comp_handler(struct ib_cq *cq, void *context); +void rdsv3_ib_recv_tasklet_fn(void *data); +void rdsv3_ib_recv_init_ring(struct rdsv3_ib_connection *ic); +void rdsv3_ib_recv_clear_ring(struct rdsv3_ib_connection *ic); +void rdsv3_ib_recv_init_ack(struct rdsv3_ib_connection *ic); +void rdsv3_ib_attempt_ack(struct rdsv3_ib_connection *ic); +void rdsv3_ib_ack_send_complete(struct rdsv3_ib_connection *ic); +uint64_t rdsv3_ib_piggyb_ack(struct rdsv3_ib_connection *ic); + +/* ib_ring.c */ +void rdsv3_ib_ring_init(struct rdsv3_ib_work_ring *ring, uint32_t nr); +void rdsv3_ib_ring_resize(struct rdsv3_ib_work_ring *ring, uint32_t nr); +uint32_t rdsv3_ib_ring_alloc(struct rdsv3_ib_work_ring *ring, uint32_t val, + uint32_t *pos); +void rdsv3_ib_ring_free(struct rdsv3_ib_work_ring *ring, uint32_t val); +void rdsv3_ib_ring_unalloc(struct rdsv3_ib_work_ring *ring, uint32_t val); +int rdsv3_ib_ring_empty(struct rdsv3_ib_work_ring *ring); +int rdsv3_ib_ring_low(struct rdsv3_ib_work_ring *ring); +uint32_t rdsv3_ib_ring_oldest(struct rdsv3_ib_work_ring *ring); +uint32_t rdsv3_ib_ring_completed(struct rdsv3_ib_work_ring *ring, + uint32_t wr_id, uint32_t oldest); +extern rdsv3_wait_queue_t rdsv3_ib_ring_empty_wait; + +/* ib_send.c */ +void rdsv3_ib_xmit_complete(struct rdsv3_connection *conn); +int rdsv3_ib_xmit(struct rdsv3_connection *conn, struct rdsv3_message *rm, + unsigned int hdr_off, unsigned int sg, unsigned int off); +void rdsv3_ib_send_cq_comp_handler(struct ib_cq *cq, void *context); +void rdsv3_ib_send_init_ring(struct rdsv3_ib_connection *ic); +void rdsv3_ib_send_clear_ring(struct rdsv3_ib_connection *ic); +int rdsv3_ib_xmit_rdma(struct rdsv3_connection *conn, struct rdsv3_rdma_op *op); +void rdsv3_ib_send_add_credits(struct rdsv3_connection *conn, + unsigned int credits); +void rdsv3_ib_advertise_credits(struct rdsv3_connection *conn, + unsigned int posted); +int rdsv3_ib_send_grab_credits(struct rdsv3_ib_connection *ic, uint32_t wanted, + uint32_t *adv_credits, int need_posted, int max_posted); + +/* ib_stats.c */ +RDSV3_DECLARE_PER_CPU(struct rdsv3_ib_statistics, rdsv3_ib_stats); +#define rdsv3_ib_stats_inc(member) rdsv3_stats_inc_which(rdsv3_ib_stats, member) +unsigned int rdsv3_ib_stats_info_copy(struct rdsv3_info_iterator *iter, + unsigned int avail); + +/* ib_sysctl.c */ +int 
rdsv3_ib_sysctl_init(void); +void rdsv3_ib_sysctl_exit(void); +extern unsigned long rdsv3_ib_sysctl_max_send_wr; +extern unsigned long rdsv3_ib_sysctl_max_recv_wr; +extern unsigned long rdsv3_ib_sysctl_max_unsig_wrs; +extern unsigned long rdsv3_ib_sysctl_max_unsig_bytes; +extern unsigned long rdsv3_ib_sysctl_max_recv_allocation; +extern unsigned int rdsv3_ib_sysctl_flow_control; + +#endif /* _RDSV3_IB_H */ diff --git a/usr/src/uts/common/sys/ib/clients/rdsv3/info.h b/usr/src/uts/common/sys/ib/clients/rdsv3/info.h new file mode 100644 index 0000000000..cda82e0b0f --- /dev/null +++ b/usr/src/uts/common/sys/ib/clients/rdsv3/info.h @@ -0,0 +1,59 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#ifndef _RDSV3_INFO_H +#define _RDSV3_INFO_H + +struct rdsv3_info_iterator { + char *addr; + unsigned long offset; +}; + +struct rdsv3_info_lengths { + unsigned int nr; + unsigned int each; +}; + +struct rdsv3_sock; + +/* + * These functions must fill in the fields of @lens to reflect the size + * of the available info source. If the snapshot fits in @len then it + * should be copied using @iter. The caller will deduce if it was copied + * or not by comparing the lengths. + */ +typedef void (*rdsv3_info_func)(struct rsock *sock, unsigned int len, + struct rdsv3_info_iterator *iter, + struct rdsv3_info_lengths *lens); + +#define rdsv3_info_copy(iter, data, bytes) \ + bcopy(data, iter->addr + iter->offset, bytes); \ + iter->offset += bytes + +void rdsv3_info_register_func(int optname, rdsv3_info_func func); +void rdsv3_info_deregister_func(int optname, rdsv3_info_func func); +int rdsv3_info_getsockopt(struct rsock *sock, int optname, char *optval, + socklen_t *optlen); + +#endif /* _RDSV3_INFO_H */ diff --git a/usr/src/uts/common/sys/ib/clients/rdsv3/loop.h b/usr/src/uts/common/sys/ib/clients/rdsv3/loop.h new file mode 100644 index 0000000000..240a57aed5 --- /dev/null +++ b/usr/src/uts/common/sys/ib/clients/rdsv3/loop.h @@ -0,0 +1,33 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#ifndef _RDSV3_LOOP_H +#define _RDSV3_LOOP_H + +/* loop.c */ +extern struct rdsv3_transport rdsv3_loop_transport; + +void rdsv3_loop_exit(void); + +#endif /* _RDSV3_LOOP_H */ diff --git a/usr/src/uts/common/sys/ib/clients/rdsv3/rdma.h b/usr/src/uts/common/sys/ib/clients/rdsv3/rdma.h new file mode 100644 index 0000000000..b2e6322808 --- /dev/null +++ b/usr/src/uts/common/sys/ib/clients/rdsv3/rdma.h @@ -0,0 +1,120 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#ifndef _RDSV3_RDMA_H +#define _RDSV3_RDMA_H + +#include <sys/rds.h> +#include <sys/uio.h> + +#include <sys/ib/clients/rdsv3/rdsv3.h> + +struct rdsv3_mr { + /* for AVL tree */ + avl_node_t r_rb_node; + atomic_t r_refcount; + uint32_t r_key; + + /* A copy of the creation flags */ + unsigned int r_use_once:1; + unsigned int r_invalidate:1; + unsigned int r_write:1; + + /* + * This is for RDS_MR_DEAD. + * It would be nice & consistent to make this part of the above + * bit field here, but we need to use test_and_set_bit. 
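/*
 * [Editor's illustration -- not part of this change set.] The reason
 * r_state is a separate unsigned long rather than another :1 field
 * above: test_and_set_bit() (see rdsv3_impl.h later in this changeset)
 * needs the address of a whole word, which a C bit-field member cannot
 * provide.  A teardown path can then claim the MR exactly once:
 *
 *	if (!test_and_set_bit(RDSV3_MR_DEAD, &mr->r_state))
 *		... only the first caller reaches this point and
 *		... performs the actual teardown
 *
 * RDSV3_MR_DEAD is defined just below.
 */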
+ */ + unsigned long r_state; + /* back pointer to the socket that owns us */ + struct rdsv3_sock *r_sock; + struct rdsv3_transport *r_trans; + void *r_trans_private; +}; + +/* Flags for mr->r_state */ +#define RDSV3_MR_DEAD 0 + +struct rdsv3_rdma_sg { + ddi_umem_cookie_t umem_cookie; + struct rdsv3_iovec iovec; + ibt_send_wr_t swr; + ibt_mi_hdl_t mihdl; + ibt_hca_hdl_t hca_hdl; +}; + +struct rdsv3_rdma_op { + uint32_t r_key; + uint64_t r_remote_addr; + unsigned int r_write:1; + unsigned int r_fence:1; + unsigned int r_notify:1; + unsigned int r_recverr:1; + unsigned int r_mapped:1; + struct rdsv3_notifier *r_notifier; + unsigned int r_bytes; + unsigned int r_nents; + unsigned int r_count; + struct rdsv3_scatterlist *r_sg; + struct rdsv3_rdma_sg r_rdma_sg[1]; +}; + +inline rdsv3_rdma_cookie_t +rdsv3_rdma_make_cookie(uint32_t r_key, uint32_t offset) +{ + return (r_key | (((uint64_t)offset) << 32)); +} + +inline uint32_t +rdsv3_rdma_cookie_key(rdsv3_rdma_cookie_t cookie) +{ + return ((uint32_t)cookie); +} + +inline uint32_t +rdsv3_rdma_cookie_offset(rdsv3_rdma_cookie_t cookie) +{ + return (cookie >> 32); +} + +int rdsv3_get_mr(struct rdsv3_sock *rs, const void *optval, int optlen); +int rdsv3_free_mr(struct rdsv3_sock *rs, const void *optval, int optlen); +void rdsv3_rdma_drop_keys(struct rdsv3_sock *rs); +int rdsv3_cmsg_rdma_args(struct rdsv3_sock *rs, struct rdsv3_message *rm, + struct cmsghdr *cmsg); +int rdsv3_cmsg_rdma_dest(struct rdsv3_sock *rs, struct rdsv3_message *rm, + struct cmsghdr *cmsg); +int rdsv3_cmsg_rdma_map(struct rdsv3_sock *rs, struct rdsv3_message *rm, + struct cmsghdr *cmsg); +void rdsv3_rdma_free_op(struct rdsv3_rdma_op *ro); +void rdsv3_rdma_send_complete(struct rdsv3_message *rm, int); + +extern void __rdsv3_put_mr_final(struct rdsv3_mr *mr); +static inline void rdsv3_mr_put(struct rdsv3_mr *mr) +{ + if (atomic_dec_and_test(&mr->r_refcount)) + __rdsv3_put_mr_final(mr); +} + +#endif /* _RDSV3_RDMA_H */ diff --git a/usr/src/uts/common/sys/ib/clients/rdsv3/rdma_transport.h b/usr/src/uts/common/sys/ib/clients/rdsv3/rdma_transport.h new file mode 100644 index 0000000000..b3e5283cdb --- /dev/null +++ b/usr/src/uts/common/sys/ib/clients/rdsv3/rdma_transport.h @@ -0,0 +1,44 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 
+ */ + +#ifndef _RDSV3_RDMA_TRANSPORT_H +#define _RDSV3_RDMA_TRANSPORT_H + +#include "rdsv3.h" + +#define RDSV3_RDMA_RESOLVE_TIMEOUT_MS 5000 + +int rdsv3_rdma_cm_event_handler(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event); + +/* from rdma_transport.c */ +void rdsv3_rdma_init(); +void rdsv3_rdma_exit(void *); + +/* from ib.c */ +extern struct rdsv3_transport rdsv3_ib_transport; +int rdsv3_ib_init(void); +void rdsv3_ib_exit(void); + +#endif /* _RDSV3_RDMA_TRANSPORT_H */ diff --git a/usr/src/uts/common/sys/ib/clients/rdsv3/rdsv3.h b/usr/src/uts/common/sys/ib/clients/rdsv3/rdsv3.h new file mode 100644 index 0000000000..498852bc70 --- /dev/null +++ b/usr/src/uts/common/sys/ib/clients/rdsv3/rdsv3.h @@ -0,0 +1,790 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#ifndef _RDSV3_RDSV3_H +#define _RDSV3_RDSV3_H + +/* + * The name of this file is rds.h in ofed. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/sunndi.h> +#include <netinet/in.h> +#include <sys/synch.h> +#include <sys/stropts.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <inet/ip.h> +#include <sys/avl.h> +#include <sys/param.h> +#include <sys/rds.h> + +#include <sys/ib/ibtl/ibti.h> +#include <sys/ib/clients/of/rdma/ib_verbs.h> +#include <sys/ib/clients/of/rdma/ib_addr.h> +#include <sys/ib/clients/of/rdma/rdma_cm.h> +#include <sys/ib/clients/rdsv3/rdsv3_impl.h> +#include <sys/ib/clients/rdsv3/info.h> + +#define NIPQUAD(addr) \ + (unsigned char)((ntohl(addr) >> 24) & 0xFF), \ + (unsigned char)((ntohl(addr) >> 16) & 0xFF), \ + (unsigned char)((ntohl(addr) >> 8) & 0xFF), \ + (unsigned char)(ntohl(addr) & 0xFF) + +/* + * RDS Network protocol version + */ +#define RDS_PROTOCOL_3_0 0x0300 +#define RDS_PROTOCOL_3_1 0x0301 +#define RDS_PROTOCOL_VERSION RDS_PROTOCOL_3_1 +#define RDS_PROTOCOL_MAJOR(v) ((v) >> 8) +#define RDS_PROTOCOL_MINOR(v) ((v) & 255) +#define RDS_PROTOCOL(maj, min) (((maj) << 8) | min) + +/* + * XXX randomly chosen, but at least seems to be unused: + * # 18464-18768 Unassigned + * We should do better. We want a reserved port to discourage unpriv'ed + * userspace from listening. + * + * port 18633 was the version that had ack frames on the wire. 
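/*
 * [Editor's note -- not part of this change set.] Two worked examples
 * of the macros above, with conn a hypothetical
 * struct rdsv3_connection pointer:
 *
 *	cmn_err(CE_NOTE, "peer %u.%u.%u.%u", NIPQUAD(conn->c_faddr));
 *
 *	RDS_PROTOCOL(3, 1) == 0x0301 == RDS_PROTOCOL_3_1
 *	RDS_PROTOCOL_MAJOR(0x0301) == 3
 *	RDS_PROTOCOL_MINOR(0x0301) == 1
 */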
+ */ +#define RDSV3_PORT 18634 + +#include <sys/ib/clients/rdsv3/info.h> + +/* + * RDS trace facilities + */ +enum { + RDSV3_BIND = 0, + RDSV3_CONG, + RDSV3_CONNECTION, + RDSV3_RDMA, + RDSV3_PAGE, + RDSV3_SEND, + RDSV3_RECV, + RDSV3_THREADS, + RDSV3_INFO, + RDSV3_MESSAGE, + RDSV3_IB, + RDSV3_IB_CM, + RDSV3_IB_RDMA, + RDSV3_IB_RING, + RDSV3_IB_RECV, + RDSV3_IB_SEND, + RDSV3_TCP, + RDSV3_TCP_CONNECT, + RDSV3_TCP_LISTEN, + RDSV3_TCP_RECV, + RDSV3_TCP_SEND +}; + +enum { + RDSV3_ALWAYS = 0, + RDSV3_MINIMAL, + RDSV3_LOW, + RDSV3_MEDIUM, + RDSV3_HIGH, + RDSV3_VERBOSE +}; + +/* + * This is the sad making. Some kernels have a bug in the per_cpu() api which + * makes DEFINE_PER_CPU trigger an oops on insmod because the per-cpu section + * in the module is not cacheline-aligned. As much as we'd like to tell users + * with older kernels to stuff it, that's not reasonable. We'll roll our own + * until this doesn't have to build against older kernels. + */ +#define RDSV3_DEFINE_PER_CPU(type, var) type var[NR_CPUS] +#define RDSV3_DECLARE_PER_CPU(type, var) extern type var[NR_CPUS] +#define rdsv3_per_cpu(var, cpu) var[cpu] + +static inline ulong_t +ceil(ulong_t x, ulong_t y) +{ + return ((x + y - 1) / y); +} + +#define RDSV3_FRAG_SHIFT 12 +#define RDSV3_FRAG_SIZE ((unsigned int)(1 << RDSV3_FRAG_SHIFT)) + +#define RDSV3_CONG_MAP_BYTES (65536 / 8) +#define RDSV3_CONG_MAP_LONGS (RDSV3_CONG_MAP_BYTES / sizeof (unsigned long)) +#define RDSV3_CONG_MAP_PAGES (RDSV3_CONG_MAP_BYTES / PAGE_SIZE) +#define RDSV3_CONG_MAP_PAGE_BITS (PAGE_SIZE * 8) + +struct rdsv3_cong_map { + struct avl_node m_rb_node; + uint32_be_t m_addr; + rdsv3_wait_queue_t m_waitq; + struct list m_conn_list; + unsigned long m_page_addrs[RDSV3_CONG_MAP_PAGES]; +}; + + +/* + * This is how we will track the connection state: + * A connection is always in one of the following + * states. Updates to the state are atomic and imply + * a memory barrier. 
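/*
 * [Editor's sketch -- not part of this change set.] "Atomic" here
 * means every state change goes through a compare-and-swap on
 * c_state, so two racing updaters resolve cleanly; see
 * rdsv3_conn_transition() later in this header, used roughly like:
 *
 *	if (rdsv3_conn_transition(conn, RDSV3_CONN_DOWN,
 *	    RDSV3_CONN_CONNECTING))
 *		... this thread won the race and owns the
 *		... connect attempt
 *
 * The states themselves are enumerated just below.
 */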
+ */ +enum { + RDSV3_CONN_DOWN = 0, + RDSV3_CONN_CONNECTING, + RDSV3_CONN_DISCONNECTING, + RDSV3_CONN_UP, + RDSV3_CONN_ERROR, +}; + +/* Bits for c_flags */ +#define RDSV3_LL_SEND_FULL 0 +#define RDSV3_RECONNECT_PENDING 1 + +struct rdsv3_connection { + struct avl_node c_hash_node; + uint32_be_t c_laddr; + uint32_be_t c_faddr; + unsigned int c_loopback:1; + struct rdsv3_connection *c_passive; + + struct rdsv3_cong_map *c_lcong; + struct rdsv3_cong_map *c_fcong; + + struct mutex c_send_lock; /* protect send ring */ + struct rdsv3_message *c_xmit_rm; + unsigned long c_xmit_sg; + unsigned int c_xmit_hdr_off; + unsigned int c_xmit_data_off; + unsigned int c_xmit_rdma_sent; + + kmutex_t c_lock; /* protect msg queues */ + uint64_t c_next_tx_seq; + struct list c_send_queue; + struct list c_retrans; + + uint64_t c_next_rx_seq; + + struct rdsv3_transport *c_trans; + void *c_transport_data; + + atomic_t c_state; + unsigned long c_flags; + unsigned long c_reconnect_jiffies; + struct rdsv3_delayed_work_s c_send_w; + struct rdsv3_delayed_work_s c_recv_w; + struct rdsv3_delayed_work_s c_conn_w; + struct rdsv3_work_s c_down_w; + struct mutex c_cm_lock; /* protect conn state & cm */ + + struct list_node c_map_item; + unsigned long c_map_queued; + unsigned long c_map_offset; + unsigned long c_map_bytes; + + unsigned int c_unacked_packets; + unsigned int c_unacked_bytes; + + /* Protocol version */ + unsigned int c_version; +}; + +#define RDSV3_FLAG_CONG_BITMAP 0x01 +#define RDSV3_FLAG_ACK_REQUIRED 0x02 +#define RDSV3_FLAG_RETRANSMITTED 0x04 +#define RDSV3_MAX_ADV_CREDIT 255 + +/* + * Maximum space available for extension headers. + */ +#define RDSV3_HEADER_EXT_SPACE 16 + +struct rdsv3_header { + uint64_be_t h_sequence; + uint64_be_t h_ack; + uint32_be_t h_len; + uint16_be_t h_sport; + uint16_be_t h_dport; + uint8_t h_flags; + uint8_t h_credit; + uint8_t h_padding[4]; + uint16_be_t h_csum; + + uint8_t h_exthdr[RDSV3_HEADER_EXT_SPACE]; +}; + +/* Reserved - indicates end of extensions */ +#define RDSV3_EXTHDR_NONE 0 + +/* + * This extension header is included in the very + * first message that is sent on a new connection, + * and identifies the protocol level. This will help + * rolling updates if a future change requires breaking + * the protocol. + */ +#define RDSV3_EXTHDR_VERSION 1 +struct rdsv3_ext_header_version { + uint32_be_t h_version; +}; + +/* + * This extension header is included in the RDS message + * chasing an RDMA operation. + */ +#define RDSV3_EXTHDR_RDMA 2 +struct rdsv3_ext_header_rdma { + uint32_be_t h_rdma_rkey; +}; + +/* + * This extension header tells the peer about the + * destination <R_Key,offset> of the requested RDMA + * operation. + */ +#define RDSV3_EXTHDR_RDMA_DEST 3 +struct rdsv3_ext_header_rdma_dest { + uint32_be_t h_rdma_rkey; + uint32_be_t h_rdma_offset; +}; + +#define __RDSV3_EXTHDR_MAX 16 /* for now */ + +struct rdsv3_incoming { + atomic_t i_refcount; + struct list_node i_item; + struct rdsv3_connection *i_conn; + struct rdsv3_header i_hdr; + unsigned long i_rx_jiffies; + uint32_be_t i_saddr; + + rdsv3_rdma_cookie_t i_rdma_cookie; +}; + +/* + * m_sock_item and m_conn_item are on lists that are serialized under + * conn->c_lock. m_sock_item has additional meaning in that once it is empty + * the message will not be put back on the retransmit list after being sent. + * messages that are canceled while being sent rely on this. + * + * m_inc is used by loopback so that it can pass an incoming message straight + * back up into the rx path. 
It embeds a wire header which is also used by + * the send path, which is kind of awkward. + * + * m_sock_item indicates the message's presence on a socket's send or receive + * queue. m_rs will point to that socket. + * + * m_daddr is used by cancellation to prune messages to a given destination. + * + * The RDS_MSG_ON_SOCK and RDS_MSG_ON_CONN flags are used to avoid lock + * nesting. As paths iterate over messages on a sock, or conn, they must + * also lock the conn, or sock, to remove the message from those lists too. + * Testing the flag to determine if the message is still on the lists lets + * us avoid testing the list_head directly. That means each path can use + * the message's list_head to keep it on a local list while juggling locks + * without confusing the other path. + * + * m_ack_seq is an optional field set by transports who need a different + * sequence number range to invalidate. They can use this in a callback + * that they pass to rdsv3_send_drop_acked() to see if each message has been + * acked. The HAS_ACK_SEQ flag can be used to detect messages which haven't + * had ack_seq set yet. + */ +#define RDSV3_MSG_ON_SOCK 1 +#define RDSV3_MSG_ON_CONN 2 +#define RDSV3_MSG_HAS_ACK_SEQ 3 +#define RDSV3_MSG_ACK_REQUIRED 4 +#define RDSV3_MSG_RETRANSMITTED 5 +#define RDSV3_MSG_MAPPED 6 +#define RDSV3_MSG_PAGEVEC 7 + +struct rdsv3_message { + atomic_t m_refcount; + struct list_node m_sock_item; + struct list_node m_conn_item; + struct rdsv3_incoming m_inc; + uint64_t m_ack_seq; + uint32_be_t m_daddr; + unsigned long m_flags; + + /* + * Never access m_rs without holding m_rs_lock. + * Lock nesting is + * rm->m_rs_lock + * -> rs->rs_lock + */ + kmutex_t m_rs_lock; + struct rdsv3_sock *m_rs; + struct rdsv3_rdma_op *m_rdma_op; + rdsv3_rdma_cookie_t m_rdma_cookie; + struct rdsv3_mr *m_rdma_mr; + unsigned int m_nents; + unsigned int m_count; + struct rdsv3_scatterlist m_sg[1]; +}; + +/* + * The RDS notifier is used (optionally) to tell the application about + * completed RDMA operations. Rather than keeping the whole rds message + * around on the queue, we allocate a small notifier that is put on the + * socket's notifier_list. Notifications are delivered to the application + * through control messages. + */ +struct rdsv3_notifier { + list_node_t n_list; + uint64_t n_user_token; + int n_status; +}; + +/* + * struct rdsv3_transport - transport specific behavioural hooks + * + * @xmit: .xmit is called by rdsv3_send_xmit() to tell the transport to send + * part of a message. The caller serializes on the send_sem so this + * doesn't need to be reentrant for a given conn. The header must be + * sent before the data payload. .xmit must be prepared to send a + * message with no data payload. .xmit should return the number of + * bytes that were sent down the connection, including header bytes. + * Returning 0 tells the caller that it doesn't need to perform any + * additional work now. This is usually the case when the transport has + * filled the sending queue for its connection and will handle + * triggering the rds thread to continue the send when space becomes + * available. Returning -EAGAIN tells the caller to retry the send + * immediately. Returning -ENOMEM tells the caller to retry the send at + * some point in the future. + * + * @conn_shutdown: conn_shutdown stops traffic on the given connection. Once + * it returns the connection can not call rdsv3_recv_incoming(). 
+ * This will only be called once after conn_connect returns + * non-zero success. The caller serializes this with + * the send and connecting paths (xmit_* and conn_*). The + * transport is responsible for other serialization, including + * rdsv3_recv_incoming(). This is called in process context but + * should try hard not to block. + * + * @xmit_cong_map: This asks the transport to send the local bitmap down the + * given connection. XXX get a better story about the bitmap + * flag and header. + */ + +struct rdsv3_transport { + struct list_node t_item; + char *t_name; + unsigned int t_prefer_loopback:1; + + int (*laddr_check)(uint32_be_t addr); + int (*conn_alloc)(struct rdsv3_connection *conn, int gfp); + void (*conn_free)(void *data); + int (*conn_connect)(struct rdsv3_connection *conn); + void (*conn_shutdown)(struct rdsv3_connection *conn); + void (*xmit_prepare)(struct rdsv3_connection *conn); + void (*xmit_complete)(struct rdsv3_connection *conn); + int (*xmit)(struct rdsv3_connection *conn, struct rdsv3_message *rm, + unsigned int hdr_off, unsigned int sg, unsigned int off); + int (*xmit_cong_map)(struct rdsv3_connection *conn, + struct rdsv3_cong_map *map, unsigned long offset); + int (*xmit_rdma)(struct rdsv3_connection *conn, + struct rdsv3_rdma_op *op); + int (*recv)(struct rdsv3_connection *conn); + int (*inc_copy_to_user)(struct rdsv3_incoming *inc, uio_t *uio, + size_t size); + void (*inc_purge)(struct rdsv3_incoming *inc); + void (*inc_free)(struct rdsv3_incoming *inc); + + int (*cm_handle_connect)(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event); + int (*cm_initiate_connect)(struct rdma_cm_id *cm_id); + void (*cm_connect_complete)(struct rdsv3_connection *conn, + struct rdma_cm_event *event); + + unsigned int (*stats_info_copy)(struct rdsv3_info_iterator *iter, + unsigned int avail); + void (*exit)(void); + void *(*get_mr)(struct rdsv3_iovec *sg, unsigned long nr_sg, + struct rdsv3_sock *rs, uint32_t *key_ret); + void (*sync_mr)(void *trans_private, int direction); + void (*free_mr)(void *trans_private, int invalidate); + void (*flush_mrs)(void); +}; + +struct rdsv3_sock { + struct rsock *rs_sk; + + uint64_t rs_user_addr; + uint64_t rs_user_bytes; + + /* + * bound_addr used for both incoming and outgoing, no INADDR_ANY + * support. + */ + struct avl_node rs_bound_node; + uint32_be_t rs_bound_addr; + uint32_be_t rs_conn_addr; + uint16_be_t rs_bound_port; + uint16_be_t rs_conn_port; + + /* + * This is only used to communicate the transport between bind and + * initiating connections. All other trans use is referenced through + * the connection. + */ + struct rdsv3_transport *rs_transport; + + /* + * rdsv3_sendmsg caches the conn it used the last time around. + * This helps avoid costly lookups. + */ + struct rdsv3_connection *rs_conn; + kmutex_t rs_conn_lock; + + /* flag indicating we were congested or not */ + int rs_congested; + + /* rs_lock protects all these adjacent members before the newline */ + kmutex_t rs_lock; + struct list rs_send_queue; + uint32_t rs_snd_bytes; + int rs_rcv_bytes; + /* currently used for failed RDMAs */ + struct list rs_notify_queue; + + /* + * Congestion wake_up. If rs_cong_monitor is set, we use cong_mask + * to decide whether the application should be woken up. + * If not set, we use rs_cong_track to find out whether a cong map + * update arrived.
+ */ + uint64_t rs_cong_mask; + uint64_t rs_cong_notify; + struct list_node rs_cong_list; + unsigned long rs_cong_track; + + /* + * rs_recv_lock protects the receive queue, and is + * used to serialize with rdsv3_release. + */ + krwlock_t rs_recv_lock; + struct list rs_recv_queue; + + /* just for stats reporting */ + struct list_node rs_item; + + /* these have their own lock */ + kmutex_t rs_rdma_lock; + struct avl_tree rs_rdma_keys; + + /* Socket options - in case there will be more */ + unsigned char rs_recverr, + rs_cong_monitor; + + cred_t *rs_cred; + zoneid_t rs_zoneid; +}; + +inline struct rdsv3_sock * +rdsv3_sk_to_rs(const struct rsock *sk) +{ + return ((struct rdsv3_sock *)sk->sk_protinfo); +} + +inline struct rsock * +rdsv3_rs_to_sk(const struct rdsv3_sock *rs) +{ + return ((struct rsock *)rs->rs_sk); +} + +/* + * The stack assigns sk_sndbuf and sk_rcvbuf to twice the specified value + * to account for overhead. We don't account for overhead, we just apply + * the number of payload bytes to the specified value. + */ +inline int +rdsv3_sk_sndbuf(struct rdsv3_sock *rs) +{ + /* XXX */ + return (rdsv3_rs_to_sk(rs)->sk_sndbuf); +} + +inline int +rdsv3_sk_rcvbuf(struct rdsv3_sock *rs) +{ + /* XXX */ + return (rdsv3_rs_to_sk(rs)->sk_rcvbuf); +} + +struct rdsv3_statistics { + uint64_t s_conn_reset; + uint64_t s_recv_drop_bad_checksum; + uint64_t s_recv_drop_old_seq; + uint64_t s_recv_drop_no_sock; + uint64_t s_recv_drop_dead_sock; + uint64_t s_recv_deliver_raced; + uint64_t s_recv_delivered; + uint64_t s_recv_queued; + uint64_t s_recv_immediate_retry; + uint64_t s_recv_delayed_retry; + uint64_t s_recv_ack_required; + uint64_t s_recv_rdma_bytes; + uint64_t s_recv_ping; + uint64_t s_send_queue_empty; + uint64_t s_send_queue_full; + uint64_t s_send_sem_contention; + uint64_t s_send_sem_queue_raced; + uint64_t s_send_immediate_retry; + uint64_t s_send_delayed_retry; + uint64_t s_send_drop_acked; + uint64_t s_send_ack_required; + uint64_t s_send_queued; + uint64_t s_send_rdma; + uint64_t s_send_rdma_bytes; + uint64_t s_send_pong; + uint64_t s_page_remainder_hit; + uint64_t s_page_remainder_miss; + uint64_t s_copy_to_user; + uint64_t s_copy_from_user; + uint64_t s_cong_update_queued; + uint64_t s_cong_update_received; + uint64_t s_cong_send_error; + uint64_t s_cong_send_blocked; +}; + +/* af_rds.c */ +void rdsv3_sock_addref(struct rdsv3_sock *rs); +void rdsv3_sock_put(struct rdsv3_sock *rs); +void rdsv3_wake_sk_sleep(struct rdsv3_sock *rs); +void __rdsv3_wake_sk_sleep(struct rsock *sk); + +extern rdsv3_wait_queue_t rdsv3_poll_waitq; + +/* bind.c */ +int rdsv3_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa, + socklen_t len, cred_t *cr); +void rdsv3_remove_bound(struct rdsv3_sock *rs); +struct rdsv3_sock *rdsv3_find_bound(uint32_be_t addr, uint16_be_t port); + +/* conn.c */ +int rdsv3_conn_init(void); +void rdsv3_conn_exit(void); +struct rdsv3_connection *rdsv3_conn_create(uint32_be_t laddr, uint32_be_t faddr, + struct rdsv3_transport *trans, int gfp); +struct rdsv3_connection *rdsv3_conn_create_outgoing(uint32_be_t laddr, + uint32_be_t faddr, + struct rdsv3_transport *trans, int gfp); +void rdsv3_conn_destroy(struct rdsv3_connection *conn); +void rdsv3_conn_reset(struct rdsv3_connection *conn); +void rdsv3_conn_drop(struct rdsv3_connection *conn); +void rdsv3_for_each_conn_info(struct rsock *sock, unsigned int len, + struct rdsv3_info_iterator *iter, + struct rdsv3_info_lengths *lens, + int (*visitor)(struct rdsv3_connection *, void *), + size_t item_len); + +static inline int 
+rdsv3_conn_transition(struct rdsv3_connection *conn, int old, int new) +{ + return (atomic_cmpxchg(&conn->c_state, old, new) == old); +} + +inline int +rdsv3_conn_state(struct rdsv3_connection *conn) +{ + return (atomic_get(&conn->c_state)); +} + +inline int +rdsv3_conn_up(struct rdsv3_connection *conn) +{ + return (atomic_get(&conn->c_state) == RDSV3_CONN_UP); +} + +inline int +rdsv3_conn_connecting(struct rdsv3_connection *conn) +{ + return (atomic_get(&conn->c_state) == RDSV3_CONN_CONNECTING); +} + +/* recv.c */ +void rdsv3_inc_init(struct rdsv3_incoming *inc, struct rdsv3_connection *conn, + uint32_be_t saddr); +void rdsv3_inc_addref(struct rdsv3_incoming *inc); +void rdsv3_inc_put(struct rdsv3_incoming *inc); +void rdsv3_recv_incoming(struct rdsv3_connection *conn, uint32_be_t saddr, + uint32_be_t daddr, + struct rdsv3_incoming *inc, int gfp); +int rdsv3_recvmsg(struct rdsv3_sock *rs, uio_t *uio, + struct msghdr *msg, size_t size, int msg_flags); +void rdsv3_clear_recv_queue(struct rdsv3_sock *rs); +int rdsv3_notify_queue_get(struct rdsv3_sock *rs, struct msghdr *msg); +void rdsv3_inc_info_copy(struct rdsv3_incoming *inc, + struct rdsv3_info_iterator *iter, + uint32_be_t saddr, uint32_be_t daddr, int flip); + +/* page.c */ +int rdsv3_page_remainder_alloc(struct rdsv3_scatterlist *scat, + unsigned long bytes, int gfp); + +/* send.c */ +int rdsv3_sendmsg(struct rdsv3_sock *rs, uio_t *uio, struct nmsghdr *msg, + size_t payload_len); +void rdsv3_send_reset(struct rdsv3_connection *conn); +int rdsv3_send_xmit(struct rdsv3_connection *conn); +struct sockaddr_in; +void rdsv3_send_drop_to(struct rdsv3_sock *rs, struct sockaddr_in *dest); +typedef int (*is_acked_func)(struct rdsv3_message *rm, uint64_t ack); +void rdsv3_send_drop_acked(struct rdsv3_connection *conn, uint64_t ack, + is_acked_func is_acked); +int rdsv3_send_acked_before(struct rdsv3_connection *conn, uint64_t seq); +void rdsv3_send_remove_from_sock(struct list *messages, int status); +int rdsv3_send_pong(struct rdsv3_connection *conn, uint16_be_t dport); +struct rdsv3_message *rdsv3_send_get_message(struct rdsv3_connection *, + struct rdsv3_rdma_op *); + +/* rdma.c */ +void rdsv3_rdma_unuse(struct rdsv3_sock *rs, uint32_t r_key, int force); + +/* cong.c */ +void rdsv3_cong_init(void); +int rdsv3_cong_get_maps(struct rdsv3_connection *conn); +void rdsv3_cong_add_conn(struct rdsv3_connection *conn); +void rdsv3_cong_remove_conn(struct rdsv3_connection *conn); +void rdsv3_cong_set_bit(struct rdsv3_cong_map *map, uint16_be_t port); +void rdsv3_cong_clear_bit(struct rdsv3_cong_map *map, uint16_be_t port); +int rdsv3_cong_wait(struct rdsv3_cong_map *map, uint16_be_t port, int nonblock, + struct rdsv3_sock *rs); +void rdsv3_cong_queue_updates(struct rdsv3_cong_map *map); +void rdsv3_cong_map_updated(struct rdsv3_cong_map *map, uint64_t); +int rdsv3_cong_updated_since(unsigned long *recent); +void rdsv3_cong_add_socket(struct rdsv3_sock *); +void rdsv3_cong_remove_socket(struct rdsv3_sock *); +void rdsv3_cong_exit(void); +struct rdsv3_message *rdsv3_cong_update_alloc(struct rdsv3_connection *conn); + +/* stats.c */ +RDSV3_DECLARE_PER_CPU(struct rdsv3_statistics, rdsv3_stats); +#define rdsv3_stats_inc_which(which, member) do { \ + rdsv3_per_cpu(which, get_cpu()).member++; \ + put_cpu(); \ +} while (0) +#define rdsv3_stats_inc(member) rdsv3_stats_inc_which(rdsv3_stats, member) +#define rdsv3_stats_add_which(which, member, count) do { \ + rdsv3_per_cpu(which, get_cpu()).member += count; \ + put_cpu(); \ +} while (0) +#define 
rdsv3_stats_add(member, count) \ + rdsv3_stats_add_which(rdsv3_stats, member, count) +int rdsv3_stats_init(void); +void rdsv3_stats_exit(void); +void rdsv3_stats_info_copy(struct rdsv3_info_iterator *iter, + uint64_t *values, char **names, size_t nr); + + +/* sysctl.c */ +int rdsv3_sysctl_init(void); +void rdsv3_sysctl_exit(void); +extern unsigned long rdsv3_sysctl_sndbuf_min; +extern unsigned long rdsv3_sysctl_sndbuf_default; +extern unsigned long rdsv3_sysctl_sndbuf_max; +extern unsigned long rdsv3_sysctl_reconnect_min_jiffies; +extern unsigned long rdsv3_sysctl_reconnect_max_jiffies; +extern unsigned int rdsv3_sysctl_max_unacked_packets; +extern unsigned int rdsv3_sysctl_max_unacked_bytes; +extern unsigned int rdsv3_sysctl_ping_enable; +extern unsigned long rdsv3_sysctl_trace_flags; +extern unsigned int rdsv3_sysctl_trace_level; + +/* threads.c */ +int rdsv3_threads_init(); +void rdsv3_threads_exit(void); +extern struct rdsv3_workqueue_struct_s *rdsv3_wq; +void rdsv3_connect_worker(struct rdsv3_work_s *); +void rdsv3_shutdown_worker(struct rdsv3_work_s *); +void rdsv3_send_worker(struct rdsv3_work_s *); +void rdsv3_recv_worker(struct rdsv3_work_s *); +void rdsv3_connect_complete(struct rdsv3_connection *conn); + +/* transport.c */ +int rdsv3_trans_register(struct rdsv3_transport *trans); +void rdsv3_trans_unregister(struct rdsv3_transport *trans); +struct rdsv3_transport *rdsv3_trans_get_preferred(uint32_be_t addr); +unsigned int rdsv3_trans_stats_info_copy(struct rdsv3_info_iterator *iter, + unsigned int avail); +void rdsv3_trans_exit(void); + +/* message.c */ +struct rdsv3_message *rdsv3_message_alloc(unsigned int nents, int gfp); +struct rdsv3_message *rdsv3_message_copy_from_user(struct uio *uiop, + size_t total_len); +struct rdsv3_message *rdsv3_message_map_pages(unsigned long *page_addrs, + unsigned int total_len); +void rdsv3_message_populate_header(struct rdsv3_header *hdr, uint16_be_t sport, + uint16_be_t dport, uint64_t seq); +int rdsv3_message_add_extension(struct rdsv3_header *hdr, + unsigned int type, const void *data, unsigned int len); +int rdsv3_message_next_extension(struct rdsv3_header *hdr, + unsigned int *pos, void *buf, unsigned int *buflen); +int rdsv3_message_add_version_extension(struct rdsv3_header *hdr, + unsigned int version); +int rdsv3_message_get_version_extension(struct rdsv3_header *hdr, + unsigned int *version); +int rdsv3_message_add_rdma_dest_extension(struct rdsv3_header *hdr, + uint32_t r_key, uint32_t offset); +int rdsv3_message_inc_copy_to_user(struct rdsv3_incoming *inc, + uio_t *uio, size_t size); +void rdsv3_message_inc_purge(struct rdsv3_incoming *inc); +void rdsv3_message_inc_free(struct rdsv3_incoming *inc); +void rdsv3_message_addref(struct rdsv3_message *rm); +void rdsv3_message_put(struct rdsv3_message *rm); +void rdsv3_message_wait(struct rdsv3_message *rm); +void rdsv3_message_unmapped(struct rdsv3_message *rm); + +inline void +rdsv3_message_make_checksum(struct rdsv3_header *hdr) +{ + hdr->h_csum = 0; + hdr->h_csum = + rdsv3_ip_fast_csum((void *)hdr, sizeof (*hdr) >> 2); +} + +inline int +rdsv3_message_verify_checksum(const struct rdsv3_header *hdr) +{ + return (!hdr->h_csum || + rdsv3_ip_fast_csum((void *)hdr, sizeof (*hdr) >> 2) == 0); +} + +/* rdsv3_sc.c */ +extern boolean_t rdsv3_if_lookup_by_name(char *if_name); +extern int rdsv3_sc_path_lookup(ipaddr_t *localip, ipaddr_t *remip); +extern ipaddr_t rdsv3_scaddr_to_ibaddr(ipaddr_t addr); + +#ifdef __cplusplus +} +#endif + +#endif /* _RDSV3_RDSV3_H */ diff --git 
a/usr/src/uts/common/sys/ib/clients/rdsv3/rdsv3_debug.h b/usr/src/uts/common/sys/ib/clients/rdsv3/rdsv3_debug.h new file mode 100644 index 0000000000..f970d70209 --- /dev/null +++ b/usr/src/uts/common/sys/ib/clients/rdsv3/rdsv3_debug.h @@ -0,0 +1,139 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#ifndef _RDSV3_DEBUG_H +#define _RDSV3_DEBUG_H + +#ifdef __cplusplus +extern "C" { +#endif + +#define LABEL "RDSV3" + +/* + * warnings, console & syslog buffer. + * For non-recoverable or major errors. + */ +#define RDSV3_LOG_L0 0 + +/* + * syslog buffer or RDS trace buffer (console if booted with debug) + * For additional information on non-recoverable errors and + * warnings/informational messages for sys-admin types. + */ +#define RDSV3_LOG_L1 1 + +/* + * debug only + * for more verbose trace than L1, e.g. recoverable errors + * or interesting trace + */ +#define RDSV3_LOG_L2 2 + +/* + * debug only + * for more verbose trace than L2, e.g. informational messages + */ +#define RDSV3_LOG_L3 3 + +/* + * debug only + * for more verbose trace than L3, e.g. printing function entries... + */ +#define RDSV3_LOG_L4 4 + +/* + * debug only + * most verbose level. Used only for excessive trace, e.g. + * printing structures etc. + */ +#define RDSV3_LOG_L5 5 + +/* + * debug only + * for messages from softints, taskqs, intr handlers, timeout handlers, etc.
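/*
 * [Editor's example -- not part of this change set.] The
 * RDSV3_DPRINTF<n> macros defined just below pair a subsystem or
 * function name with a printf-style format, e.g.:
 *
 *	RDSV3_DPRINTF2("rdsv3_ib_conn_connect", "conn %p: ret %d",
 *	    conn, ret);
 *
 * On non-DEBUG builds each macro expands to "0 &&", so the trace
 * call short-circuits away entirely.
 */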
+ */ +#define RDSV3_LOG_LINTR 6 + + +#ifdef DEBUG +#define RDSV3_DPRINTF_INTR rdsv3_dprintf_intr +#define RDSV3_DPRINTF5 rdsv3_dprintf5 +#define RDSV3_DPRINTF4 rdsv3_dprintf4 +#define RDSV3_DPRINTF3 rdsv3_dprintf3 +#define RDSV3_DPRINTF2 rdsv3_dprintf2 +#define RDSV3_DPRINTF1 rdsv3_dprintf1 +#define RDSV3_DPRINTF0 rdsv3_dprintf0 + +void rdsv3_dprintf_intr( + char *name, + char *fmt, ...); +void rdsv3_dprintf5( + char *name, + char *fmt, ...); +void rdsv3_dprintf4( + char *name, + char *fmt, ...); +void rdsv3_dprintf3( + char *name, + char *fmt, ...); +void rdsv3_dprintf2( + char *name, + char *fmt, ...); +void rdsv3_dprintf1( + char *name, + char *fmt, ...); +void rdsv3_dprintf0( + char *name, + char *fmt, ...); +#else +#define RDSV3_DPRINTF_INTR 0 && +#define RDSV3_DPRINTF5 0 && +#define RDSV3_DPRINTF4 0 && +#define RDSV3_DPRINTF3 0 && +#define RDSV3_DPRINTF2 0 && +#define RDSV3_DPRINTF1 0 && +#define RDSV3_DPRINTF0 0 && +#endif + +void rdsv3_trace( + char *name, + uint8_t lvl, + char *fmt, ...); + +void rdsv3_vprintk( + char *name, + uint8_t lvl, + const char *fmt, + va_list ap); + +/* defined in rds_debug.c */ +void rdsv3_logging_initialization(); +void rdsv3_logging_destroy(); + +#ifdef __cplusplus +} +#endif + +#endif /* _RDSV3_DEBUG_H */ diff --git a/usr/src/uts/common/sys/ib/clients/rdsv3/rdsv3_impl.h b/usr/src/uts/common/sys/ib/clients/rdsv3/rdsv3_impl.h new file mode 100644 index 0000000000..d7a734138f --- /dev/null +++ b/usr/src/uts/common/sys/ib/clients/rdsv3/rdsv3_impl.h @@ -0,0 +1,402 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#ifndef _RDSV3_IMPL_H +#define _RDSV3_IMPL_H + +#include <sys/atomic.h> + +/* + * This file is only present in Solaris + */ + +#ifdef __cplusplus +extern "C" { +#endif + +extern dev_info_t *rdsv3_dev_info; + +#define uint16_be_t uint16_t +#define uint32_be_t uint32_t +#define uint64_be_t uint64_t + +/* + * RDS Well known service id + * Format: 0x1h00144Fhhhhhhhh + * "00144F" is the Sun OUI + * 'h' can be any hex-decimal digit. + */ +#define RDS_SERVICE_ID 0x1000144F00000001ULL + +/* + * Atomic operations + */ +typedef unsigned int atomic_t; +#define ATOMIC_INIT(a) a + +#define atomic_get(p) (*(p)) + +#define atomic_cmpset_long(p, c, n) \ + ((c == atomic_cas_uint(p, c, n)) ? 
c : -1) + +#define atomic_dec_and_test(a) \ + (atomic_dec_uint_nv((a)) == 0) + +#define atomic_cmpxchg(a, o, n) \ + atomic_cas_uint(a, o, n) + +#ifdef _LP64 +#define set_bit(b, p) \ + atomic_or_ulong(((volatile ulong_t *)(void *)(p)) + ((b) >> 6), \ + 1ul << ((b) & 0x3f)) + +#define clear_bit(b, p) \ + atomic_and_ulong(((volatile ulong_t *)(void *)(p)) + ((b) >> 6), \ + ~(1ul << ((b) & 0x3f))) + +#define test_bit(b, p) \ + (((volatile ulong_t *)(void *)(p))[(b) >> 6] & (1ul << ((b) & 0x3f))) + +#define test_and_set_bit(b, p) \ + atomic_set_long_excl(((ulong_t *)(void *)(p)) + \ + ((b) >> 6), ((b) & 0x3f)) +#define test_and_clear_bit(b, p) \ + !atomic_clear_long_excl(((ulong_t *)(void *)(p)) + ((b) >> 6), \ + ((b) & 0x3f)) +#else +#define set_bit(b, p) \ + atomic_or_uint(((volatile uint_t *)(void *)p) + (b >> 5), \ + 1ul << (b & 0x1f)) + +#define clear_bit(b, p) \ + atomic_and_uint(((volatile uint_t *)(void *)p) + (b >> 5), \ + ~(1ul << (b & 0x1f))) + +#define test_bit(b, p) \ + (((volatile uint_t *)(void *)p)[b >> 5] & (1ul << (b & 0x1f))) + +#define test_and_set_bit(b, p) \ + atomic_set_long_excl(((ulong_t *)(void *)p) + (b >> 5), (b & 0x1f)) +#define test_and_clear_bit(b, p) \ + !atomic_clear_long_excl(((ulong_t *)(void *)p) + (b >> 5), (b & 0x1f)) +#endif + + +uint_t rdsv3_one_sec_in_hz; + +#define jiffies 100 +#define HZ (drv_hztousec(1)) +#define container_of(m, s, name) \ + (void *)((uintptr_t)(m) - (uintptr_t)offsetof(s, name)) +#define ARRAY_SIZE(x) (sizeof (x) / sizeof (x[0])) +/* setting this to PAGESIZE throws build errors */ +#define PAGE_SIZE 4096 /* xxx - fix this */ +#define BITS_PER_LONG (sizeof (unsigned long) * 8) + +/* debug */ +#define RDSV3_PANIC() cmn_err(CE_PANIC, "Panic forced by RDSV3"); + +/* ERR */ +#define MAX_ERRNO 4095 +#define ERR_PTR(x) ((void *)(uintptr_t)x) +#define IS_ERR(ptr) (((uintptr_t)ptr) >= (uintptr_t)-MAX_ERRNO) +#define PTR_ERR(ptr) (int)(uintptr_t)ptr + +/* cpu */ +#define NR_CPUS 1 +#define put_cpu() +#define get_cpu() 0 + +#define MAX_SCHEDULE_TIMEOUT (~0UL>>1) + +#define RDMA_CM_EVENT_ADDR_CHANGE 14 + +/* list */ +/* copied and modified list_remove_node */ +#define list_remove_node(node) \ + if ((node)->list_next != NULL) { \ + (node)->list_prev->list_next = (node)->list_next; \ + (node)->list_next->list_prev = (node)->list_prev; \ + (node)->list_next = (node)->list_prev = NULL; \ + } + +#define list_splice(src, dst) { \ + list_create(dst, (src)->list_size, (src)->list_offset); \ + list_move_tail(dst, src); \ + } + +#define RDSV3_FOR_EACH_LIST_NODE(objp, listp, member) \ + for (objp = list_head(listp); objp; objp = list_next(listp, objp)) +#define RDSV3_FOR_EACH_LIST_NODE_SAFE(objp, tmp, listp, member) \ + for (objp = list_head(listp), tmp = (objp != NULL) ? \ + list_next(listp, objp) : NULL; \ + objp; \ + objp = tmp, tmp = (objp != NULL) ? 
\ + list_next(listp, objp) : NULL) + +/* simulate wait_queue_head_t */ +typedef struct rdsv3_wait_queue_s { + kmutex_t waitq_mutex; + kcondvar_t waitq_cv; +} rdsv3_wait_queue_t; + +#define rdsv3_init_waitqueue(waitqp) \ + mutex_init(&(waitqp)->waitq_mutex, NULL, MUTEX_DRIVER, NULL); \ + cv_init(&(waitqp)->waitq_cv, NULL, CV_DRIVER, NULL) + +#define rdsv3_exit_waitqueue(waitqp) \ + mutex_destroy(&(waitqp)->waitq_mutex); \ + cv_destroy(&(waitqp)->waitq_cv) + +#define rdsv3_wake_up(waitqp) { \ + mutex_enter(&(waitqp)->waitq_mutex); \ + cv_signal(&(waitqp)->waitq_cv); \ + mutex_exit(&(waitqp)->waitq_mutex); \ + } + +#define rdsv3_wake_up_all(waitqp) { \ + mutex_enter(&(waitqp)->waitq_mutex); \ + cv_broadcast(&(waitqp)->waitq_cv); \ + mutex_exit(&(waitqp)->waitq_mutex); \ + } + +#define rdsv3_wait_event(waitq, condition) \ +{ \ + mutex_enter(&(waitq).waitq_mutex); \ + while (!(condition)) { \ + cv_wait(&(waitq).waitq_cv, &(waitq).waitq_mutex); \ + } \ + mutex_exit(&(waitq).waitq_mutex); \ +} \ + +#ifndef __lock_lint +#define rdsv3_wait_event_interruptible_timeout(waitq, condition, timeo) \ +( \ +{ \ + long cv_return; \ + mutex_enter(&((waitq).waitq_mutex)); \ + cv_return = condition; \ + while (!(cv_return)) { \ + cv_return = cv_timedwait_sig(&((waitq).waitq_cv), \ + &((waitq).waitq_mutex), \ + timeo * drv_usectohz(1000000) + ddi_get_lbolt()); \ + if (cv_return == 0) { \ + break; \ + } \ + cv_return = condition; \ + } \ + mutex_exit(&((waitq).waitq_mutex)); \ + cv_return; \ +} \ +) +#else +#define rdsv3_wait_event_interruptible(waitq, condition) 0 +#define rdsv3_wait_event_interruptible_timeout(waitq, condition, timeo) 0 +#endif + +#define SOCK_DEAD 1ul + +/* socket */ +typedef struct rsock { + sock_upper_handle_t sk_upper_handle; + sock_upcalls_t *sk_upcalls; + + kmutex_t sk_lock; + ulong_t sk_flag; + rdsv3_wait_queue_t *sk_sleep; + int sk_sndbuf; + int sk_rcvbuf; + atomic_t sk_refcount; + + struct rdsv3_sock *sk_protinfo; +} rsock_t; + +typedef struct rdsv3_conn_info_s { + uint32_be_t c_laddr; + uint32_be_t c_faddr; +} rdsv3_conn_info_t; + +/* WQ */ +typedef struct rdsv3_workqueue_struct_s { + kmutex_t wq_lock; + uint_t wq_state; + int wq_pending; + list_t wq_queue; +} rdsv3_workqueue_struct_t; + +struct rdsv3_work_s; +typedef void (*rdsv3_work_func_t)(struct rdsv3_work_s *); +typedef struct rdsv3_work_s { + list_node_t work_item; + rdsv3_work_func_t func; +} rdsv3_work_t; + +/* simulate delayed_work */ +typedef struct rdsv3_delayed_work_s { + kmutex_t lock; + rdsv3_work_t work; + timeout_id_t timeid; + rdsv3_workqueue_struct_t *wq; +} rdsv3_delayed_work_t; + +#define RDSV3_INIT_WORK(wp, f) (wp)->func = f +#define RDSV3_INIT_DELAYED_WORK(dwp, f) \ + (dwp)->work.func = f; \ + mutex_init(&(dwp)->lock, NULL, MUTEX_DRIVER, NULL); \ + (dwp)->timeid = 0 + +/* simulate scatterlist */ +struct rdsv3_scatterlist { + caddr_t vaddr; + uint_t length; + ibt_wr_ds_t *sgl; + ibt_mi_hdl_t mihdl; +}; +#define rdsv3_sg_page(scat) (scat)->vaddr +#define rdsv3_sg_len(scat) (scat)->length +#define rdsv3_sg_set_page(scat, pg, len, off) \ + (scat)->vaddr = (caddr_t)(pg + off); \ + (scat)->length = len +#define rdsv3_ib_sg_dma_len(dev, scat) rdsv3_sg_len(scat) + +/* copied from sys/socket.h */ +#if defined(__sparc) +/* To maintain backward compatibility, alignment needs to be 8 on sparc. 
*/ +#define _CMSG_HDR_ALIGNMENT 8 +#else +/* for __i386 (and other future architectures) */ +#define _CMSG_HDR_ALIGNMENT 4 +#endif /* defined(__sparc) */ + +/* + * The cmsg headers (and macros dealing with them) were made available as + * part of UNIX95 and hence need to be protected with a _XPG4_2 define. + */ +#define _CMSG_DATA_ALIGNMENT (sizeof (int)) +#define _CMSG_HDR_ALIGN(x) (((uintptr_t)(x) + _CMSG_HDR_ALIGNMENT - 1) & \ + ~(_CMSG_HDR_ALIGNMENT - 1)) +#define _CMSG_DATA_ALIGN(x) (((uintptr_t)(x) + _CMSG_DATA_ALIGNMENT - 1) & \ + ~(_CMSG_DATA_ALIGNMENT - 1)) +#define CMSG_DATA(c) \ + ((unsigned char *)_CMSG_DATA_ALIGN((struct cmsghdr *)(c) + 1)) + +#define CMSG_FIRSTHDR(m) \ + (((m)->msg_controllen < sizeof (struct cmsghdr)) ? \ + (struct cmsghdr *)0 : (struct cmsghdr *)((m)->msg_control)) + +#define CMSG_NXTHDR(m, c) \ + (((c) == 0) ? CMSG_FIRSTHDR(m) : \ + ((((uintptr_t)_CMSG_HDR_ALIGN((char *)(c) + \ + ((struct cmsghdr *)(c))->cmsg_len) + sizeof (struct cmsghdr)) > \ + (((uintptr_t)((struct msghdr *)(m))->msg_control) + \ + ((uintptr_t)((struct msghdr *)(m))->msg_controllen))) ? \ + ((struct cmsghdr *)0) : \ + ((struct cmsghdr *)_CMSG_HDR_ALIGN((char *)(c) + \ + ((struct cmsghdr *)(c))->cmsg_len)))) + +/* Amount of space + padding needed for a message of length l */ +#define CMSG_SPACE(l) \ + ((unsigned int)_CMSG_HDR_ALIGN(sizeof (struct cmsghdr) + (l))) + +/* Value to be used in cmsg_len, does not include trailing padding */ +#define CMSG_LEN(l) \ + ((unsigned int)_CMSG_DATA_ALIGN(sizeof (struct cmsghdr)) + (l)) + +/* OFUV -> IB */ +#define RDSV3_IBDEV2HCAHDL(device) (device)->hca_hdl +#define RDSV3_QP2CHANHDL(qp) (qp)->ibt_qp +#define RDSV3_PD2PDHDL(pd) (pd)->ibt_pd +#define RDSV3_CQ2CQHDL(cq) (cq)->ibt_cq + +struct rdsv3_hdrs_mr { + ibt_lkey_t lkey; + caddr_t addr; + size_t size; + ibt_mr_hdl_t hdl; +}; + +/* rdsv3_impl.c */ +void rdsv3_trans_init(); +boolean_t rdsv3_capable_interface(struct lifreq *lifrp); +int rdsv3_do_ip_ioctl(ksocket_t so4, void **ipaddrs, int *size, int *nifs); +int rdsv3_do_ip_ioctl_old(ksocket_t so4, void **ipaddrs, int *size, int *nifs); +boolean_t rdsv3_isloopback(ipaddr_t addr); +void rdsv3_cancel_delayed_work(rdsv3_delayed_work_t *dwp); +void rdsv3_flush_workqueue(rdsv3_workqueue_struct_t *wq); +void rdsv3_queue_work(rdsv3_workqueue_struct_t *wq, rdsv3_work_t *wp); +void rdsv3_queue_delayed_work(rdsv3_workqueue_struct_t *wq, + rdsv3_delayed_work_t *dwp, uint_t delay); +struct rsock *rdsv3_sk_alloc(); +void rdsv3_sock_init_data(struct rsock *sk); +void rdsv3_sock_exit_data(struct rsock *sk); +void rdsv3_poll_wait(struct rsock *sk, rdsv3_wait_queue_t *waitq, short events); +void rdsv3_destroy_task_workqueue(rdsv3_workqueue_struct_t *wq); +rdsv3_workqueue_struct_t *rdsv3_create_task_workqueue(char *name); +int rdsv3_conn_constructor(void *buf, void *arg, int kmflags); +void rdsv3_conn_destructor(void *buf, void *arg); +int rdsv3_conn_compare(const void *conn1, const void *conn2); +void rdsv3_loop_init(); +int rdsv3_mr_compare(const void *mr1, const void *mr2); +int rdsv3_put_cmsg(struct nmsghdr *msg, int level, int type, size_t size, + void *payload); +int rdsv3_verify_bind_address(ipaddr_t addr); +int rdsv3_bind_node_compare(const void *a, const void *b); +void rdsv3_bind_tree_init(); +void rdsv3_bind_tree_exit(); +uint16_t rdsv3_ip_fast_csum(void *buffer, size_t length); +uint_t rdsv3_ib_dma_map_sg(struct ib_device *dev, struct rdsv3_scatterlist + *scat, uint_t num); +void rdsv3_ib_dma_unmap_sg(ib_device_t *dev, struct rdsv3_scatterlist *scat, + 
uint_t num); +inline void +rdsv3_sk_sock_hold(struct rsock *sk) +{ + atomic_add_32(&sk->sk_refcount, 1); +} +inline void +rdsv3_sk_sock_put(struct rsock *sk) +{ + if (atomic_dec_and_test(&sk->sk_refcount)) + rdsv3_sock_exit_data(sk); +} +inline int +rdsv3_sk_sock_flag(struct rsock *sk, uint_t flag) +{ + return (test_bit(flag, &sk->sk_flag)); +} +inline void +rdsv3_sk_sock_orphan(struct rsock *sk) +{ + set_bit(SOCK_DEAD, &sk->sk_flag); +} + +#define rdsv3_rcvtimeo(a, b) 3600 /* check this value on linux */ + +void rdsv3_ib_free_conn(void *arg); + +#ifdef __cplusplus +} +#endif + +#endif /* _RDSV3_IMPL_H */ diff --git a/usr/src/uts/common/sys/ib/clients/rdsv3/rdsv3_sc.h b/usr/src/uts/common/sys/ib/clients/rdsv3/rdsv3_sc.h new file mode 100644 index 0000000000..dc7b7924d7 --- /dev/null +++ b/usr/src/uts/common/sys/ib/clients/rdsv3/rdsv3_sc.h @@ -0,0 +1,55 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#ifndef _RDSV3_SC_H +#define _RDSV3_SC_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include <netinet/in.h> +#include <net/if.h> + +typedef struct rds_path_endpoint_s { + uint32_t iftype; + ipaddr_t ipaddr; + ipaddr_t node_ipaddr; + char *ifname; +} rds_path_endpoint_t; + +typedef struct rds_path_s { + rds_path_endpoint_t local; + rds_path_endpoint_t remote; +} rds_path_t; + +extern void rds_clif_name(char *name); +extern void rds_path_up(struct rds_path_s *path); +extern void rds_path_down(struct rds_path_s *path); + +#ifdef __cplusplus +} +#endif + +#endif /* _RDSV3_SC_H */ diff --git a/usr/src/uts/common/sys/rds.h b/usr/src/uts/common/sys/rds.h new file mode 100644 index 0000000000..99743e5bbf --- /dev/null +++ b/usr/src/uts/common/sys/rds.h @@ -0,0 +1,375 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +/* + * Copyright (c) 2008 Oracle. All rights reserved. 
+ * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +/* + * Include this file if the application uses rdsv3 sockets. + */ + +/* + * This file contains definitions from the ofed rds.h and rds_rdma.h + * header file. + */ +#ifndef _RDSV3_RDS_H +#define _RDSV3_RDS_H + +#include <sys/types.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define RDS_IB_ABI_VERSION 0x301 + +#define AF_RDS AF_INET_OFFLOAD +#define PF_RDS AF_INET_OFFLOAD + +#define SOL_RDS 272 + +/* + * setsockopt/getsockopt for SOL_RDS + */ +#define RDSV3_CANCEL_SENT_TO 1 +#define RDSV3_GET_MR 2 +#define RDSV3_FREE_MR 3 +/* deprecated: RDS_BARRIER 4 */ +#define RDSV3_RECVERR 5 +#define RDSV3_CONG_MONITOR 6 + +/* + * Control message types for SOL_RDS. + * + * RDS_CMSG_RDMA_ARGS (sendmsg) + * Request a RDMA transfer to/from the specified + * memory ranges. + * The cmsg_data is a struct rdsv3_rdma_args. + * RDS_CMSG_RDMA_DEST (recvmsg, sendmsg) + * Kernel informs application about intended + * source/destination of a RDMA transfer + * RDS_CMSG_RDMA_MAP (sendmsg) + * Application asks kernel to map the given + * memory range into a IB MR, and send the + * R_Key along in an RDS extension header. + * The cmsg_data is a struct rdsv3_get_mr_args, + * the same as for the GET_MR setsockopt. + * RDS_CMSG_RDMA_STATUS (recvmsg) + * Returns the status of a completed RDMA operation. + */ +#define RDSV3_CMSG_RDMA_ARGS 1 +#define RDSV3_CMSG_RDMA_DEST 2 +#define RDSV3_CMSG_RDMA_MAP 3 +#define RDSV3_CMSG_RDMA_STATUS 4 +#define RDSV3_CMSG_CONG_UPDATE 5 + +/* + * RDMA related types + */ + +/* + * This encapsulates a remote memory location. + * In the current implementation, it contains the R_Key + * of the remote memory region, and the offset into it + * (so that the application does not have to worry about + * alignment). 
+ */ +typedef uint64_t rdsv3_rdma_cookie_t; + +struct rdsv3_iovec { + uint64_t addr; + uint64_t bytes; +}; + +struct rdsv3_get_mr_args { + struct rdsv3_iovec vec; + uint64_t cookie_addr; + uint64_t flags; +}; + +struct rdsv3_free_mr_args { + rdsv3_rdma_cookie_t cookie; + uint64_t flags; +}; + +struct rdsv3_rdma_args { + rdsv3_rdma_cookie_t cookie; + struct rdsv3_iovec remote_vec; + uint64_t local_vec_addr; + uint64_t nr_local; + uint64_t flags; + uint64_t user_token; +}; + +struct rdsv3_rdma_notify { + uint64_t user_token; + int32_t status; +}; + +#define RDSV3_RDMA_SUCCESS 0 +#define RDSV3_RDMA_REMOTE_ERROR 1 +#define RDSV3_RDMA_CANCELED 2 +#define RDSV3_RDMA_DROPPED 3 +#define RDSV3_RDMA_OTHER_ERROR 4 + +/* + * Common set of flags for all RDMA related structs + */ +#define RDSV3_RDMA_READWRITE 0x0001 +#define RDSV3_RDMA_FENCE 0x0002 /* use FENCE for immediate send */ +#define RDSV3_RDMA_INVALIDATE 0x0004 /* invalidate R_Key after freeing MR */ +#define RDSV3_RDMA_USE_ONCE 0x0008 /* free MR after use */ +#define RDSV3_RDMA_DONTWAIT 0x0010 /* Don't wait in SET_BARRIER */ +#define RDSV3_RDMA_NOTIFY_ME 0x0020 /* Notify when operation completes */ + +/* + * Congestion monitoring. + * Congestion control in RDS happens at the host connection + * level by exchanging a bitmap marking congested ports. + * By default, a process sleeping in poll() is always woken + * up when the congestion map is updated. + * With explicit monitoring, an application can have more + * fine-grained control. + * The application installs a 64bit mask value in the socket, + * where each bit corresponds to a group of ports. + * When a congestion update arrives, RDS checks the set of + * ports that are now uncongested against the list bit mask + * installed in the socket, and if they overlap, we queue a + * cong_notification on the socket. + * + * To install the congestion monitor bitmask, use RDS_CONG_MONITOR + * with the 64bit mask. + * Congestion updates are received via RDS_CMSG_CONG_UPDATE + * control messages. 
+ * + * The correspondence between bits and ports is + * 1 << (portnum % 64) + */ +#define RDSV3_CONG_MONITOR_SIZE 64 +#define RDSV3_CONG_MONITOR_BIT(port) \ + (((unsigned int) port) % RDSV3_CONG_MONITOR_SIZE) +#define RDSV3_CONG_MONITOR_MASK(port) (1ULL << RDSV3_CONG_MONITOR_BIT(port)) + +/* rds-info related */ + +#define RDSV3_INFO_FIRST 10000 +#define RDSV3_INFO_COUNTERS 10000 +#define RDSV3_INFO_CONNECTIONS 10001 +/* 10002 aka RDS_INFO_FLOWS is deprecated */ +#define RDSV3_INFO_SEND_MESSAGES 10003 +#define RDSV3_INFO_RETRANS_MESSAGES 10004 +#define RDSV3_INFO_RECV_MESSAGES 10005 +#define RDSV3_INFO_SOCKETS 10006 +#define RDSV3_INFO_TCP_SOCKETS 10007 +#define RDSV3_INFO_IB_CONNECTIONS 10008 +#define RDSV3_INFO_CONNECTION_STATS 10009 +#define RDSV3_INFO_IWARP_CONNECTIONS 10010 +#define RDSV3_INFO_LAST 10010 + +#ifndef __lock_lint +#pragma pack(1) +struct rdsv3_info_counter { + uint8_t name[32]; + uint64_t value; +} __attribute__((packed)); +#pragma pack() +#else +struct rdsv3_info_counter { + uint8_t name[32]; + uint64_t value; +}; +#endif + +#define RDSV3_INFO_CONNECTION_FLAG_SENDING 0x01 +#define RDSV3_INFO_CONNECTION_FLAG_CONNECTING 0x02 +#define RDSV3_INFO_CONNECTION_FLAG_CONNECTED 0x04 + +#ifndef __lock_lint +#pragma pack(1) +struct rdsv3_info_connection { + uint64_t next_tx_seq; + uint64_t next_rx_seq; + uint32_t laddr; /* network order */ + uint32_t faddr; /* network order */ + uint8_t transport[15]; /* null term ascii */ + uint8_t flags; +} __attribute__((packed)); +#pragma pack() +#else +struct rdsv3_info_connection { + uint64_t next_tx_seq; + uint64_t next_rx_seq; + uint32_t laddr; /* network order */ + uint32_t faddr; /* network order */ + uint8_t transport[15]; /* null term ascii */ + uint8_t flags; +}; +#endif + +#ifndef __lock_lint +#pragma pack(1) +struct rdsv3_info_flow { + uint32_t laddr; /* network order */ + uint32_t faddr; /* network order */ + uint32_t bytes; + uint16_t lport; /* network order */ + uint16_t fport; /* network order */ +} __attribute__((packed)); +#pragma pack() +#else +struct rdsv3_info_flow { + uint32_t laddr; /* network order */ + uint32_t faddr; /* network order */ + uint32_t bytes; + uint16_t lport; /* network order */ + uint16_t fport; /* network order */ +}; +#endif + +#define RDSV3_INFO_MESSAGE_FLAG_ACK 0x01 +#define RDSV3_INFO_MESSAGE_FLAG_FAST_ACK 0x02 + +#ifndef __lock_lint +#pragma pack(1) +struct rdsv3_info_message { + uint64_t seq; + uint32_t len; + uint32_t laddr; /* network order */ + uint32_t faddr; /* network order */ + uint16_t lport; /* network order */ + uint16_t fport; /* network order */ + uint8_t flags; +} __attribute__((packed)); +#pragma pack() +#else +struct rdsv3_info_message { + uint64_t seq; + uint32_t len; + uint32_t laddr; /* network order */ + uint32_t faddr; /* network order */ + uint16_t lport; /* network order */ + uint16_t fport; /* network order */ + uint8_t flags; +}; +#endif + +#ifndef __lock_lint +#pragma pack(1) +struct rdsv3_info_socket { + uint32_t sndbuf; + uint32_t bound_addr; /* network order */ + uint32_t connected_addr; /* network order */ + uint16_t bound_port; /* network order */ + uint16_t connected_port; /* network order */ + uint32_t rcvbuf; + uint64_t inum; +} __attribute__((packed)); +#pragma pack() +#else +struct rdsv3_info_socket { + uint32_t sndbuf; + uint32_t bound_addr; /* network order */ + uint32_t connected_addr; /* network order */ + uint16_t bound_port; /* network order */ + uint16_t connected_port; /* network order */ + uint32_t rcvbuf; + uint64_t inum; +}; +#endif + +#ifndef 
__lock_lint +#pragma pack(1) +struct rdsv3_info_socket_v1 { + uint32_t sndbuf; + uint32_t bound_addr; /* network order */ + uint32_t connected_addr; /* network order */ + uint16_t bound_port; /* network order */ + uint16_t connected_port; /* network order */ + uint32_t rcvbuf; +} __attribute__((packed)); +#pragma pack() +#else +struct rdsv3_info_socket_v1 { + uint32_t sndbuf; + uint32_t bound_addr; /* network order */ + uint32_t connected_addr; /* network order */ + uint16_t bound_port; /* network order */ + uint16_t connected_port; /* network order */ + uint32_t rcvbuf; +}; +#endif + +#define RDS_IB_GID_LEN 16 +struct rdsv3_info_rdma_connection { + uint32_t src_addr; /* network order */ + uint32_t dst_addr; /* network order */ + uint8_t src_gid[RDS_IB_GID_LEN]; + uint8_t dst_gid[RDS_IB_GID_LEN]; + + uint32_t max_send_wr; + uint32_t max_recv_wr; + uint32_t max_send_sge; + uint32_t rdma_mr_max; + uint32_t rdma_mr_size; +}; + +#define rdsv3_info_ib_connection rdsv3_info_rdma_connection +#define rdma_fmr_max rdma_mr_max +#define rdma_fmr_size rdma_mr_size + +#ifdef __cplusplus +} +#endif + +#endif /* _RDSV3_RDS_H */ diff --git a/usr/src/uts/intel/Makefile.intel.shared b/usr/src/uts/intel/Makefile.intel.shared index 3d4ac210ce..3c7e4def72 100644 --- a/usr/src/uts/intel/Makefile.intel.shared +++ b/usr/src/uts/intel/Makefile.intel.shared @@ -309,6 +309,7 @@ DRV_KMODS += ral DRV_KMODS += ramdisk DRV_KMODS += random DRV_KMODS += rds +DRV_KMODS += rdsv3 DRV_KMODS += rpcib DRV_KMODS += rsm DRV_KMODS += rts @@ -732,6 +733,7 @@ MAC_KMODS += mac_ib SOCKET_KMODS += sockpfp SOCKET_KMODS += socksctp SOCKET_KMODS += socksdp +SOCKET_KMODS += sockrds # # kiconv modules (/kernel/kiconv): diff --git a/usr/src/uts/intel/rdsv3/Makefile b/usr/src/uts/intel/rdsv3/Makefile new file mode 100644 index 0000000000..238ba56640 --- /dev/null +++ b/usr/src/uts/intel/rdsv3/Makefile @@ -0,0 +1,94 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. +# + +# +# Path to the base of the uts directory tree (usually /usr/src/uts). +# +UTSBASE = ../.. + +# +# Define the module and object file sets. +# +MODULE = rdsv3 +OBJECTS = $(RDSV3_OBJS:%=$(OBJS_DIR)/%) +LINTS = $(RDSV3_OBJS:%.o=$(LINTS_DIR)/%.ln) +ROOTMODULE = $(ROOT_DRV_DIR)/$(MODULE) +CFLAGS += $(CCVERBOSE) $(_XPG4_2) +LDFLAGS += -dy -Nfs/sockfs -Nmisc/ksocket -Nmisc/ip -Nmisc/ibtl -Nmisc/ibcm -Nmisc/sol_ofs +CONF_SRCDIR = $(UTSBASE)/common/io/ib/clients/rdsv3 +# +# Include common rules. +# +include $(UTSBASE)/intel/Makefile.intel + +# CFLAGS += -DOFA_SOLARIS + +# +# Disable these lint checks since some errors suppressed here are +# in the OFED code, but we'd like to keep it as is as much as possible. +# Note. 
maintainers should endeavor to investigate and remove these for +# maximum lint coverage, but please do not carry these forward to new +# Makefiles blindly. +# +LINTTAGS += -erroff=E_STATIC_UNUSED +LINTTAGS += -erroff=E_CONSTANT_CONDITION +LINTTAGS += -erroff=E_FUNC_VAR_UNUSED +LINTTAGS += -erroff=E_SUSPICIOUS_COMPARISON +LINTTAGS += -erroff=E_BAD_PTR_CAST_ALIGN +LINTTAGS += -erroff=E_ASSIGN_NARROW_CONV +LINTTAGS += -erroff=E_FUNC_SET_NOT_USED + +# +# Define targets +# +ALL_TARGET = $(BINARY) $(SRC_CONFILE) +LINT_TARGET = $(MODULE).lint +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE) + +# +# Default build targets. +# +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +lint: $(LINT_DEPS) + +modlintlib: $(MODLINTLIB_DEPS) + +clean.lint: $(CLEAN_LINT_DEPS) + +install: $(INSTALL_DEPS) + +# +# Include common targets. +# +include $(UTSBASE)/intel/Makefile.targ + diff --git a/usr/src/uts/intel/sockrds/Makefile b/usr/src/uts/intel/sockrds/Makefile new file mode 100644 index 0000000000..64a8ccd3c8 --- /dev/null +++ b/usr/src/uts/intel/sockrds/Makefile @@ -0,0 +1,86 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. +# +# This makefile drives the production of the nca driver +# kernel module. +# +# intel architecture dependent +# + +# +# Path to the base of the uts directory tree (usually /usr/src/uts). +# +UTSBASE = ../.. + +# +# Define the module and object file sets. +# +MODULE = sockrds +OBJECTS = $(RDS_SOCK_MOD_OBJS:%=$(OBJS_DIR)/%) +LINTS = $(RDS_SOCK_MOD_OBJS:%.o=$(LINTS_DIR)/%.ln) +ROOTMODULE = $(ROOT_SOCK_DIR)/$(MODULE) + +# +# Include common rules. +# +include $(UTSBASE)/intel/Makefile.intel + +# +# Define targets +# +ALL_TARGET = $(BINARY) +LINT_TARGET = $(MODULE).lint +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) + +# +# lint pass one enforcement and OS version +# +CFLAGS += $(CCVERBOSE) + +LDFLAGS += -dy -Nfs/sockfs -Ndrv/ip -Ndrv/rdsv3 + +# +# Default build targets. +# +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +lint: $(LINT_DEPS) + +modlintlib: $(MODLINTLIB_DEPS) + +clean.lint: $(CLEAN_LINT_DEPS) + +install: $(INSTALL_DEPS) + +# +# Include common targets. 
+# +include $(UTSBASE)/intel/Makefile.targ diff --git a/usr/src/uts/sparc/Makefile.sparc.shared b/usr/src/uts/sparc/Makefile.sparc.shared index 5a9b506817..ada0e7d643 100644 --- a/usr/src/uts/sparc/Makefile.sparc.shared +++ b/usr/src/uts/sparc/Makefile.sparc.shared @@ -219,6 +219,7 @@ DRV_KMODS += dlpistub DRV_KMODS += vnic DRV_KMODS += xge DRV_KMODS += rds +DRV_KMODS += rdsv3 DRV_KMODS += chxge DRV_KMODS += smbsrv DRV_KMODS += vscan @@ -504,6 +505,7 @@ MAC_KMODS += mac_ib SOCKET_KMODS += sockpfp SOCKET_KMODS += socksctp SOCKET_KMODS += socksdp +SOCKET_KMODS += sockrds # # kiconv modules (/kernel/kiconv): diff --git a/usr/src/uts/sparc/rdsv3/Makefile b/usr/src/uts/sparc/rdsv3/Makefile new file mode 100644 index 0000000000..959610d555 --- /dev/null +++ b/usr/src/uts/sparc/rdsv3/Makefile @@ -0,0 +1,143 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. +# +# This makefile drives the production of the rds driver +# kernel module. +# +# sparc architecture dependent +# + +# +# Path to the base of the uts directory tree (usually /usr/src/uts). +# +UTSBASE = ../.. + +# +# Define the module and object file sets. +# +MODULE = rdsv3 +OBJECTS = $(RDSV3_OBJS:%=$(OBJS_DIR)/%) +LINTS = $(RDSV3_OBJS:%.o=$(LINTS_DIR)/%.ln) +ROOTMODULE = $(ROOT_DRV_DIR)/$(MODULE) +CONF_SRCDIR = $(UTSBASE)/common/io/ib/clients/rdsv3 +WARLOCK_OUT = $(RDSV3_OBJS:%.o=%.ll) +WARLOCK_OK = $(MODULE).ok +WLCMD_DIR = $(UTSBASE)/common/io/warlock + +# +# Include common rules. +# +include $(UTSBASE)/sparc/Makefile.sparc + +# +# Define targets +# +ALL_TARGET = $(BINARY) $(SRC_CONFILE) +LINT_TARGET = $(MODULE).lint +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE) + +# +# lint pass one enforcement and OS version +# +CFLAGS += $(CCVERBOSE) + +# +# Disable these lint checks since some errors suppressed here are +# in the OFED code, but we'd like to keep it as is as much as possible. +# Note. maintainers should endeavor to investigate and remove these for +# maximum lint coverage, but please do not carry these forward to new +# Makefiles blindly. +# +LINTTAGS += -erroff=E_STATIC_UNUSED +LINTTAGS += -erroff=E_CONSTANT_CONDITION +LINTTAGS += -erroff=E_FUNC_VAR_UNUSED +LINTTAGS += -erroff=E_SUSPICIOUS_COMPARISON +LINTTAGS += -erroff=E_BAD_PTR_CAST_ALIGN +LINTTAGS += -erroff=E_ASSIGN_NARROW_CONV +LINTTAGS += -erroff=E_FUNC_SET_NOT_USED +LINTTAGS += -erroff=E_FUNC_USED_VAR_ARG2 +LINTTAGS += -erroff=E_INCONS_ARG_USED2 + +LDFLAGS += -dy -Nfs/sockfs -Nmisc/ksocket -Ndrv/ip -Nmisc/ibtl -Nmisc/ibcm -Nmisc/sol_ofs + +# +# Default build targets. 
+# +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +lint: $(LINT_DEPS) + +modlintlib: $(MODLINTLIB_DEPS) lint64 + +clean.lint: $(CLEAN_LINT_DEPS) + +install: $(INSTALL_DEPS) + +$(ROOTLINK): $(ROOT_STRMOD_DIR) $(ROOTMODULE) + -$(RM) $@; ln $(ROOTMODULE) $@ + +# +# Include common targets. +# +include $(UTSBASE)/sparc/Makefile.targ + +# +# Defines for local commands. +# +WARLOCK = warlock +WLCC = wlcc +TOUCH = touch +TEST = test + +warlock: $(WARLOCK_OK) $(WARLOCK_OUT) + +$(WARLOCK_OK): $(WARLOCK_OUT) $(WLCMD_DIR)/rdsv3.wlcmd warlock_ddi.files + $(WARLOCK) -c $(WLCMD_DIR)/rdsv3.wlcmd $(WARLOCK_OUT) \ + -l ../warlock/ddi_dki_impl.ll + $(TOUCH) $@ + +%.ll: $(UTSBASE)/common/io/ib/clients/rdsv3/%.c \ + $(UTSBASE)/common/sys/ib/clients/rdsv3/ib.h \ + $(UTSBASE)/common/sys/ib/clients/rdsv3/info.h \ + $(UTSBASE)/common/sys/ib/clients/rdsv3/loop.h \ + $(UTSBASE)/common/sys/ib/clients/rdsv3/rdma.h \ + $(UTSBASE)/common/sys/ib/clients/rdsv3/rdma_transport.h \ + $(UTSBASE)/common/sys/ib/clients/rdsv3/rds.h \ + $(UTSBASE)/common/sys/ib/clients/rdsv3/rds_rdma.h \ + $(UTSBASE)/common/sys/ib/clients/rdsv3/rdsv3_atomic.h \ + $(UTSBASE)/common/sys/ib/clients/rdsv3/rdsv3_debug.h \ + $(UTSBASE)/common/sys/ib/clients/rdsv3/rdsv3_impl.h \ + $(UTSBASE)/common/sys/ib/clients/rdsv3/rdsv3_ofed_types.h \ + $(UTSBASE)/common/sys/ib/clients/rdsv3/rdsv3_sc.h + $(WLCC) $(CPPFLAGS) -DDEBUG -Dinline= -o $@ $< + +warlock_ddi.files: + @cd ../warlock; pwd; $(MAKE) warlock diff --git a/usr/src/uts/sparc/sockrds/Makefile b/usr/src/uts/sparc/sockrds/Makefile new file mode 100644 index 0000000000..a8e1a0702f --- /dev/null +++ b/usr/src/uts/sparc/sockrds/Makefile @@ -0,0 +1,88 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. +# + +# +# This makefile drives the production of the nca driver +# kernel module. +# +# sparc architecture dependent +# + +# +# Path to the base of the uts directory tree (usually /usr/src/uts). +# +UTSBASE = ../.. + +# +# Define the module and object file sets. +# +MODULE = sockrds +OBJECTS = $(RDS_SOCK_MOD_OBJS:%=$(OBJS_DIR)/%) +LINTS = $(RDS_SOCK_MOD_OBJS:%.o=$(LINTS_DIR)/%.ln) +ROOTMODULE = $(ROOT_SOCK_DIR)/$(MODULE) + +# +# Include common rules. +# +include $(UTSBASE)/sparc/Makefile.sparc + +# +# Define targets +# +ALL_TARGET = $(BINARY) +LINT_TARGET = $(MODULE).lint +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) + +# +# lint pass one enforcement and OS version +# +CFLAGS += $(CCVERBOSE) + +LDFLAGS += -dy -Nfs/sockfs -Ndrv/ip -Ndrv/rdsv3 + +# +# Default build targets. 
+#
+.KEEP_STATE:
+
+def: $(DEF_DEPS)
+
+all: $(ALL_DEPS)
+
+clean: $(CLEAN_DEPS)
+
+clobber: $(CLOBBER_DEPS)
+
+lint: $(LINT_DEPS)
+
+modlintlib: $(MODLINTLIB_DEPS)
+
+clean.lint: $(CLEAN_LINT_DEPS)
+
+install: $(INSTALL_DEPS)
+
+#
+# Include common targets.
+#
+include $(UTSBASE)/sparc/Makefile.targ
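
A few notes on the constructs this patch introduces, with short userland sketches; every demo_* name below is hypothetical and not part of the patch.

The DEBUG/non-DEBUG split in rdsv3_debug.h relies on a varargs-elimination trick: in non-DEBUG builds each RDSV3_DPRINTFn macro expands to "0 &&", which turns the whole call, argument list included, into a short-circuited comma expression that the compiler discards. A minimal sketch of the same idiom:

#include <stdio.h>
#include <stdarg.h>

#ifdef DEBUG
static void
demo_dprintf(char *name, char *fmt, ...)
{
	va_list ap;

	(void) printf("%s: ", name);
	va_start(ap, fmt);
	(void) vprintf(fmt, ap);
	va_end(ap);
	(void) printf("\n");
}
#define	DEMO_DPRINTF	demo_dprintf
#else
/*
 * A call such as DEMO_DPRINTF("demo", "value %d", 42) expands to
 * 0 && ("demo", "value %d", 42): the && short-circuits, the comma
 * expression is never evaluated, and no object code is emitted.
 */
#define	DEMO_DPRINTF	0 &&
#endif

int
main(void)
{
	DEMO_DPRINTF("demo", "value %d", 42);
	return (0);
}

One caveat of the idiom: argument expressions with side effects are skipped entirely in non-DEBUG builds, so callers must not depend on them.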
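
rdsv3_impl.h simulates Linux wait_event()/wake_up() with a kmutex/kcondvar pair; the condition is re-tested under the mutex after every wakeup, so spurious cv_signal() deliveries are harmless. The same pattern in userland pthreads terms, as a sketch only:

#include <pthread.h>

typedef struct demo_waitq {
	pthread_mutex_t	wq_mutex;
	pthread_cond_t	wq_cv;
} demo_waitq_t;

/* Analogue of rdsv3_wait_event(): block until *flagp is nonzero. */
static void
demo_wait_event(demo_waitq_t *wq, volatile int *flagp)
{
	(void) pthread_mutex_lock(&wq->wq_mutex);
	while (!*flagp)
		(void) pthread_cond_wait(&wq->wq_cv, &wq->wq_mutex);
	(void) pthread_mutex_unlock(&wq->wq_mutex);
}

/* Analogue of rdsv3_wake_up(): set the condition, then signal. */
static void
demo_wake_up(demo_waitq_t *wq, volatile int *flagp)
{
	(void) pthread_mutex_lock(&wq->wq_mutex);
	*flagp = 1;
	(void) pthread_cond_signal(&wq->wq_cv);
	(void) pthread_mutex_unlock(&wq->wq_mutex);
}

rdsv3_wait_event_interruptible_timeout() layers cv_timedwait_sig() over the same loop so that a signal or a timeout can end the wait early.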
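
rdsv3_impl.h also carries private copies of the CMSG_* macros because, as its comment notes, the sys/socket.h versions sit behind the _XPG4_2 guard. On the application side, the same traversal is done with the standard macros against the control buffer recvmsg() fills in. A consumer sketch, assuming the header above installs as <sys/rds.h> and the program is built in an XPG4v2 environment:

#include <sys/types.h>
#include <sys/socket.h>
#include <string.h>
#include <sys/rds.h>	/* assumed install path of the header above */

/*
 * Return the status carried by an RDSV3_CMSG_RDMA_STATUS control
 * message, or -1 if the received message had none.
 */
static int
demo_rdma_status(struct msghdr *msg)
{
	struct cmsghdr *cmsg;
	struct rdsv3_rdma_notify notify;

	for (cmsg = CMSG_FIRSTHDR(msg); cmsg != NULL;
	    cmsg = CMSG_NXTHDR(msg, cmsg)) {
		if (cmsg->cmsg_level != SOL_RDS ||
		    cmsg->cmsg_type != RDSV3_CMSG_RDMA_STATUS)
			continue;
		(void) memcpy(&notify, CMSG_DATA(cmsg), sizeof (notify));
		return (notify.status);
	}
	return (-1);
}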
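
The inline rdsv3_sk_sock_hold()/rdsv3_sk_sock_put() pair at the end of rdsv3_impl.h leans on atomic_dec_and_test(), which is true only for the caller that drops the last reference, so rdsv3_sock_exit_data() runs exactly once. A C11 analogue of the pattern:

#include <stdatomic.h>
#include <stdlib.h>

struct demo_obj {
	atomic_uint	refcnt;
};

/* Analogue of rdsv3_sk_sock_hold(). */
static void
demo_hold(struct demo_obj *op)
{
	atomic_fetch_add(&op->refcnt, 1);
}

/*
 * Analogue of rdsv3_sk_sock_put(): atomic_fetch_sub() returns the
 * previous value, so comparing it with 1 detects the transition to
 * zero, just as atomic_dec_uint_nv(...) == 0 does in the patch.
 */
static void
demo_put(struct demo_obj *op)
{
	if (atomic_fetch_sub(&op->refcnt, 1) == 1)
		free(op);
}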
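
The RDSV3_GET_MR socket option in sys/rds.h takes a struct rdsv3_get_mr_args describing the region to register; under the OFED semantics this header derives from, the kernel writes the resulting rdsv3_rdma_cookie_t (R_Key plus offset) to the user address given in cookie_addr. A hedged sketch of the call, where that fill-in behaviour is an assumption based on the comments above rather than anything shown in this diff:

#include <sys/types.h>
#include <stdint.h>
#include <sys/socket.h>
#include <sys/rds.h>	/* assumed install path */

static int
demo_get_mr(int fd, void *buf, size_t len, rdsv3_rdma_cookie_t *cookiep)
{
	struct rdsv3_get_mr_args args;

	args.vec.addr = (uint64_t)(uintptr_t)buf;
	args.vec.bytes = len;
	args.cookie_addr = (uint64_t)(uintptr_t)cookiep; /* kernel fills */
	args.flags = RDSV3_RDMA_USE_ONCE;	/* free the MR after one use */

	return (setsockopt(fd, SOL_RDS, RDSV3_GET_MR, &args, sizeof (args)));
}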
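
Finally, the congestion-monitoring comment in sys/rds.h maps every port to bit (port % 64) of a 64-bit mask installed with the RDSV3_CONG_MONITOR socket option; an update that uncongests any port whose bit overlaps the mask then arrives as an RDSV3_CMSG_CONG_UPDATE control message. Installing a mask for one port group, again as an illustrative sketch:

#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <sys/rds.h>	/* assumed install path */

/*
 * Request congestion notifications for the group of ports sharing
 * bit RDSV3_CONG_MONITOR_BIT(port), i.e. 1ULL << (port % 64).
 */
static int
demo_monitor_port(int fd, in_port_t port)
{
	uint64_t mask = RDSV3_CONG_MONITOR_MASK(port);

	return (setsockopt(fd, SOL_RDS, RDSV3_CONG_MONITOR,
	    &mask, sizeof (mask)));
}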